In [1]:
from pathlib import Path
import numpy as np, pandas as pd, re

# Directory the SBIC csv files are read from (here: the current working directory).
FPATH_SBIC = Path('./')

# Read 'SBIC.v2.agg.trn.csv' from that directory into the raw frame.
# An alternative, already processed file is not loaded:
# train_processed = pd.read_csv(FPATH_SBIC/'processed_SBIC.v2.agg.trn.csv')
train_raw = pd.read_csv(FPATH_SBIC.joinpath('SBIC.v2.agg.trn.csv'))

## Original Data Import

In [2]:
# Bare last expression: the notebook renders the raw frame as this cell's output.
train_raw

Unnamed: 0.1,Unnamed: 0,post,targetMinority,targetCategory,targetStereotype,whoTarget,intentYN,sexYN,offensiveYN,dataSource,hasBiasedImplication
0,0,"\n\nBill Kristol and Ben Shaprio, two turds in...",[],[],[],0.0,0.886667,0.0,1.000000,Gab,1
1,1,\n\nRose\n🌹Taylor‏ @RealRoseTaylor 6h6 hours a...,[],[],[],,0.220000,0.0,0.000000,Gab,1
2,2,\nCharlie Kirk‏\n\nJohnny Depp calls for death...,"[""liberals""]","[""social""]","[""miscontrue things"", ""take things in a weird ...",1.0,1.000000,0.0,1.000000,Gab,0
3,3,\nDavid Knight‏ \n\nNotice how quickly things ...,[],[],[],0.0,0.110000,0.0,0.166667,Gab,1
4,4,\nFinland fireball: Time-lapse video shows nig...,[],[],[],,0.000000,0.0,0.000000,Gab,1
...,...,...,...,...,...,...,...,...,...,...,...
35419,35498,👉 Illegally in the country after 5 deportation...,[],[],[],,0.220000,0.0,0.000000,Gab,1
35420,35500,💥Breaking💥\nJulian Assange is the gate keeper ...,[],[],[],0.0,0.000000,0.0,0.000000,Gab,1
35421,35501,📖 2Kings 22:19 because your heart was peniten...,[],[],[],,0.000000,0.0,0.000000,Gab,1
35422,35502,🚨#FAKENEWSAWARDS🚨\n\n🚨 who is #1 fake news ?🚨\...,[],[],[],0.0,0.553333,0.0,0.500000,Gab,1


## Filter for Target Classes

In [3]:
targets = ['women', 'black', 'asian', 'jewish', 'muslim', 'gay', 'lesbian', 'trans', 'bisexual', 'asexual']

# Keywords from `targets` that are all recorded under the single label 'LGBT'.
_LGBT_TARGETS = ('gay', 'lesbian', 'trans', 'bisexual', 'asexual')

# Filled as a side effect of has_class: one label per value that matched, in call order.
target_minority = []

def has_class(targetMinority):
    """Return True when `targetMinority` mentions one of the `targets` keywords.

    The comparison is case-insensitive and keywords are tried in the order of
    `targets`; only the first keyword that matches is used. On a match the
    label (the keyword itself, or 'LGBT' for the keywords in `_LGBT_TARGETS`)
    is appended to the module-level `target_minority` list, so every True
    result adds exactly one entry there.

    Values that are not strings (for example NaN for an empty csv field)
    never match and add nothing to `target_minority`.

    NOTE(review): matching is a plain substring test, so 'asian' also matches
    inside 'caucasian' - confirm whether that is intended.
    """
    if not isinstance(targetMinority, str):
        return False

    lowered = targetMinority.lower()  # lower-case once instead of once per keyword
    for target in targets:
        if target in lowered:
            target_minority.append('LGBT' if target in _LGBT_TARGETS else target)
            return True
    return False

# Keep only the rows whose targetMinority matches a target keyword. While the
# mask is computed, has_class appends one label per kept row to target_minority.
train_filtered = train_raw.loc[train_raw['targetMinority'].apply(has_class)]

# Pair each kept post with its label and renumber the rows from 0.
train_preproc = (
    train_filtered[['post']]
    .assign(target_minority=target_minority)
    .reset_index(drop=True)
)

print(train_preproc['target_minority'].value_counts())
train_preproc

women     3129
black     2823
jewish     931
muslim     474
LGBT       283
asian      264
Name: target_minority, dtype: int64


Unnamed: 0,post,target_minority
0,\nJust watched facial recognition technology f...,muslim
1,I propose a new law \n\n Any congressman or w...,women
2,!!! RT @mayasolovely: As a woman you shouldn't...,women
3,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,women
4,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,women
...,...,...
7899,“Woman is not made to be the admiration of all...,women
7900,"“You can drink an ugly woman pretty, faster th...",women
7901,"“You da bomb!” “No, you da bomb!” In America –...",muslim
7902,⁠Why do wh*te people like to play hockey? It’...,black


## Pre-Process Data + Add Integer Labels

In [35]:
# Patterns are applied in this order; each match is deleted from the text.
_URL_RE = re.compile(r'https?://\S+')            # a link, up to the next whitespace
_MENTION_RE = re.compile(r'@\S+')                # an @mention, also at the very end of the text
_ENTITY_RE = re.compile(r'&#?\w+;')              # an HTML entity such as &amp; or &#8217;
_DISALLOWED_RE = re.compile(r'[^a-zA-Z0-9\s\'\.\,\;\!\-]')  # anything but these characters

def textproc(text):
    """Clean one post and return the cleaned string, or NaN if nothing is left.

    Steps:
      1. delete links, @mentions and HTML entities;
      2. delete every character that is not a letter, digit, whitespace or
         one of ' . , ; ! -
      3. turn every run of whitespace (line breaks included) into one space
         and trim both ends, so words on neighbouring lines stay separated.

    Parameters
    ----------
    text : str
        Raw post text. Any non-string value (e.g. NaN) is treated as empty.

    Returns
    -------
    str or float
        The cleaned text, or ``np.nan`` when the input is not a string or no
        characters remain after cleaning.
    """
    if not isinstance(text, str):
        return np.nan

    processed = text
    # Links go first so that an '@' or '&' inside a link is removed with the link.
    for pattern in (_URL_RE, _MENTION_RE, _ENTITY_RE, _DISALLOWED_RE):
        processed = pattern.sub('', processed)

    # split() without arguments drops leading/trailing whitespace and splits on
    # every whitespace run, line breaks included.
    processed = ' '.join(processed.split())
    return processed if processed else np.nan

# Class names in order of first appearance; the integer label of a class is
# its position in this list.
classes = list(train_preproc.target_minority.unique())
class_to_label = {name: position for position, name in enumerate(classes)}
labels = [class_to_label[name] for name in train_preproc.target_minority]

# Build the cleaned frame in one expression from train_preproc (which is left
# untouched): cleaned post text, integer labels, then drop every row that
# holds a NaN (textproc returns NaN for posts with nothing left after cleaning).
train_postproc = (
    train_preproc
    .assign(post=train_preproc['post'].apply(textproc), labels=labels)
    .dropna()
)

print(classes)
train_postproc

['muslim', 'women', 'black', 'LGBT', 'jewish', 'asian']


Unnamed: 0,post,target_minority,labels
0,Just watched facial recognition technology fro...,muslim,0
1,I propose a new law Any congressman or woman ...,women,1
2,!!! RT As a woman you shouldn't complain about...,women,1
3,!!!!!!! RT Dawg!!!! RT You ever fuck a bitch a...,women,1
4,!!!!!!!!! RT she look like a tranny,women,1
...,...,...,...
7898,Post your dark jokes here! No joke is too dark...,black,2
7899,"Woman is not made to be the admiration of all,...",women,1
7900,"You can drink an ugly woman pretty, faster tha...",women,1
7901,"You da bomb! No, you da bomb! In America a co...",muslim,0


## Output to Required Format

In [36]:
# Write the cleaned posts and their integer labels to separate files,
# one value per row, without a header line or an index column.
for column, out_name in (('post', 'dataset.txt'), ('labels', 'labels.txt')):
    train_postproc[column].to_csv(out_name, header=False, index=False)

# One class name per line; the line position equals the integer label.
np.savetxt('classes.txt', classes, fmt='%s')