In [80]:
import pandas as pd
from itertools import chain
# Render floats in displayed frames with thousands separators and no decimals.
pd.options.display.float_format = '{:,.0f}'.format

In [81]:
def _load_parsed_csv(filename):
    """Read one parsed CSV dataset with pandas' Python parser engine."""
    return pd.read_csv(filename, engine='python')


# Row counts below are the original author's notes, not re-verified here.
# Tweets annotated for racism (noted as "13.4", presumably ~13.4k rows).
df_racism = _load_parsed_csv("twitter_racism_parsed_dataset.csv")
# Tweets annotated for sexism (noted as "14.8", presumably ~14.8k rows).
df_sexism = _load_parsed_csv("twitter_sexism_parsed_dataset.csv")
# Tweets annotated for racism and sexism (noted as ~16.8k rows).
df_both = _load_parsed_csv("twitter_parsed_dataset.csv")
# General cyber-bullying samples (noted as ~8k rows).
df_bullying = _load_parsed_csv("kaggle_parsed_dataset.csv")

In [82]:
# Non-null entries per column of the cyber-bullying frame; a column with a
# lower count than the others contains missing values.
df_bullying.count()

index       8799
oh_label    8799
Date        7557
Text        8799
dtype: int64

In [83]:
# df_racism, df_sexism and df_both share one column layout, so stack them
# first and then discard the two columns that are not needed afterwards.
twitter_frames = [df_racism, df_sexism, df_both]
df_del = pd.concat(twitter_frames).drop(columns=['index', 'id'])


In [84]:
# Peek at the first rows of the stacked Twitter frame after the column drop.
df_del.head()

Unnamed: 0,Text,Annotation,oh_label
0,@AAlwuhaib1977 Muslim mob violence against Hin...,racism,1
1,@Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG,none,0
2,@jncatron @isra_jourisra @AMPalestine Islamoph...,racism,1
3,"Finally I'm all caught up, and that sudden dea...",none,0
4,@carolinesinders @herecomesfran *hugs*,none,0


In [85]:
# Bring df_bullying into the same column layout as the Twitter frames (Text, Annotation, oh_label).

In [86]:
# Drop the columns that are not part of the shared (Text, Annotation, oh_label)
# layout. Rebinding the name instead of `del`-ing columns in place keeps this
# cell re-runnable: with errors='ignore' a second run is a no-op rather than a
# KeyError on the already-removed columns.
df_bullying = df_bullying.drop(columns=['Date', 'index'], errors='ignore')

In [87]:
# Translate the binary oh_label into the textual Annotation column:
# 1 -> 'abuse', 0 -> 'none'.
LABEL_NAMES = {0: 'none', 1: 'abuse'}

ohs = df_bullying['oh_label'].tolist()
label = [LABEL_NAMES.get(oh) for oh in ohs]

# Any value other than 0/1 (e.g. NaN) has no annotation. Report it by value
# here instead of letting a too-short label list fail later with an opaque
# length-mismatch error on column assignment.
unexpected = sorted({repr(oh) for oh, name in zip(ohs, label) if name is None})
if unexpected:
    raise ValueError(
        "oh_label must be 0 or 1; found unexpected value(s): " + ", ".join(unexpected)
    )

df_bullying['Annotation'] = label
# Keep only the three shared columns, in the same order as the Twitter frames.
df_bullying_final = df_bullying[['Text', 'Annotation', 'oh_label']]

In [88]:
# Check that the cyber-bullying frame now has the Text / Annotation / oh_label columns.
df_bullying_final.head()

Unnamed: 0,Text,Annotation,oh_label
0,"""You fuck your dad.""",abuse,1
1,"""i really don't understand your point.\xa0 It ...",none,0
2,"""A\\xc2\\xa0majority of Canadians can and has ...",none,0
3,"""listen if you dont wanna get married to a man...",none,0
4,"""C\xe1c b\u1ea1n xu\u1ed1ng \u0111\u01b0\u1edd...",none,0


In [89]:
# Finally, concatenate all datasets into a single frame.

In [90]:
# Stack the Twitter frame on top of the cyber-bullying frame. ignore_index is
# not set, so each source frame keeps its own row labels in the result.
df_final = pd.concat([df_del, df_bullying_final])

In [91]:
# Remove rows that are exact duplicates across all columns. Reassigning the
# result (rather than inplace=True) avoids mutating a frame other cells hold.
df_final = df_final.drop_duplicates()

In [92]:
# Display the combined, de-duplicated frame (the notebook shows a truncated view).
df_final

Unnamed: 0,Text,Annotation,oh_label
0,@AAlwuhaib1977 Muslim mob violence against Hin...,racism,1
1,@Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG,none,0
2,@jncatron @isra_jourisra @AMPalestine Islamoph...,racism,1
3,"Finally I'm all caught up, and that sudden dea...",none,0
4,@carolinesinders @herecomesfran *hugs*,none,0
...,...,...,...
8794,"""Never really gave it much thought. I just fig...",none,0
8795,"""Nadie se salva de la regla 34 xd""",none,0
8796,"""Question: Are you a boy or a girl?""",none,0
8797,"""Leave your email or phone number and maybe yo...",abuse,1


In [93]:
# Discard every row that has a missing value in any column.
df_dropped = df_final.dropna(how='any')

In [94]:
# Non-null count per column; after dropna all columns should report the same number.
df_dropped.count()

Text          25596
Annotation    25596
oh_label      25596
dtype: int64

In [95]:
# Flatten the cleaned frame into (tweet, annotation, label) records; the frame
# index is left out of the records.
records = df_dropped.to_records(index=False)
tweet_ann_label = list(records)

# Show one example (tweet, annotation, label) record. A cell only echoes its
# last expression, so the example must be printed explicitly to be visible.
print(tweet_ann_label[0])
len(tweet_ann_label)

25596

In [96]:
from nltk.tokenize import TweetTokenizer  # chosen because it can strip @handles

tknzr = TweetTokenizer(strip_handles=True, reduce_len=False, preserve_case=True)

lst = []
for tweet, ann, label in tweet_ann_label:
    # Skip anything that is not exactly a str; it cannot be tokenized.
    if type(tweet) is not str:
        continue
    tokenized = tknzr.tokenize(tweet)
    lst.append((tokenized, ann, label))

In [97]:
# Number of tokenized tweets; compare with len(tweet_ann_label) to see whether
# the str check skipped any record.
len(lst)

25596

In [98]:
import nltk
from nltk.corpus import stopwords
# English stop-word list, used by the token-cleaning cell.
stop_words = nltk.corpus.stopwords.words('english')
# NOTE(review): RegexpTokenizer is not referenced in the visible cells — confirm it is still needed.
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer 
# NOTE(review): `ps` is not referenced in the visible cells (lemmatization is used instead) — confirm.
ps = PorterStemmer() 

# Junk-token filter consulted by the cleaning loop via `token.lower() in remove`.
# Because `remove` is a str, that check is a SUBSTRING test: a token is dropped
# when it appears as a contiguous run anywhere in this string (e.g. 'rt', '34',
# '...', single punctuation marks).
# NOTE(review): this also drops fragments such as 'di' or 'wj' — confirm intent.
# The two literal backslashes are written as "\\" so the literal contains no
# invalid escape sequences (previously "\]" and "\w"); the runtime value is unchanged.
remove = "!$%&()*+, .-./:;<=>?@[\\]^_`{|}~]\\wdiwj/'0123456789rt......\"\x92\\n\r"

In [99]:
# Clean every tokenized tweet: drop URL tokens, junk tokens and English stop
# words, lemmatize what is left as verbs, and re-join the survivors into one
# space-separated string.
lemmatizer = nltk.WordNetLemmatizer()  # built once and reused for every token
stop_word_set = set(stop_words)        # hashed membership instead of a sequence scan

ft = []
for tokens, annotation, label in lst:
    filt = []
    for token in tokens:
        lowered = token.lower()  # all checks below are case-insensitive
        if 'http' in lowered:
            continue
        # `remove` is a str, so this is a substring test against that string.
        if lowered in remove:
            continue
        if lowered in stop_word_set:
            continue
        filt.append(lemmatizer.lemmatize(lowered, 'v'))
    untoken_filt = ' '.join(filt)
    ft.append((untoken_filt, annotation, int(label)))

# Tweets whose tokens were all filtered out carry no text and are discarded.
filtered_tweets = [entry for entry in ft if entry[0] != '']

In [100]:
# Spot-check one cleaned (text, annotation, label) triple.
filtered_tweets[1]

('islamophobia like idea naziphobia islam religion hate must outlaw',
 'racism',
 1)

In [101]:
# Turn the cleaned (text, annotation, label) triples back into a frame.
tweet_columns = ['Tokens', 'Annotation', 'Label']
df_tweets = pd.DataFrame(data=filtered_tweets, columns=tweet_columns)
df_tweets

Unnamed: 0,Tokens,Annotation,Label
0,muslim mob violence hindus bangladesh continue...,racism,1
1,islamophobia like idea naziphobia islam religi...,racism,1
2,finally i'm catch sudden death cook look like ...,none,0
3,hug,none,0
4,please please start use discernment blunt ster...,none,0
...,...,...,...
25279,never really give much think figure back door,none,0
25280,nadie se salva de la regla xd,none,0
25281,question boy girl,none,0
25282,leave email phone number maybe twit meet beat ...,abuse,1
