# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import operator
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Data Pre-processing

In [60]:
df_slur = pd.read_csv("Slangs List Big Data.csv")
df_merge = pd.read_csv("mergedDataSet.csv")
df_toxic = df_merge[(df_merge['merged_rating']==1) | (df_merge['merged_rating']==2)][['comment_text','merged_rating']]
sports_page_list = ['Arsenal_facebook_comments.csv','nba_facebook_comments.csv','NFL_facebook_comments.csv']
news_page_list =['cnn_facebook_comments.csv','FoxNews_facebook_comments.csv','TheYoungTurks_facebook_comments.csv']
entertainment_page_list = ['9gag_facebook_comments.csv','BuzzFeed_facebook_comments.csv','NowThisEntertainment_facebook_comments.csv']

def read_file(file_path):
    df = pd.read_csv(file_path)
    return df

def text_cleaning(comment):
    comment = comment.lower()                               # Converting comments into lowercase
    comment = comment.strip("b'").strip('b"')               # Removing b",b' from start and end of comment 
    comment = re.sub("\\[\\[(.*?)\\]\\]","",comment)        # Removing GIFs and images from comments
    comment = re.sub(r'\\x\S+',"",comment)                  # Removing unwanted text , emojis etc.
#   Replacing apostrophes
    comment = re.sub(r"'s",' is',comment)                   
    comment = re.sub(r"'re",' are',comment)
    comment = re.sub(r"'t",' not',comment)
    comment = re.sub(r"'m",' am',comment)
    comment = re.sub(r"'d",' would',comment)
    comment = re.sub(r"'ll",' will',comment)
    comment = re.sub(r"'ve",' have',comment)
    comment = re.sub('[.]', ' ', comment)
    
    comment = ''.join([c for c in comment if c not in ('!', '?' ,'.','\\','"',',','-','$','%',"'")]) #Removing Punctuations and other signs
    comment = re.sub(r'[0-9]',"",comment)                    # Removing numbers
    comment = re.sub(r'http\S+',"",comment)                  # Removing Url
    comment = ' '.join([c for c in comment.split() if c not in stop_words])   # Removing stopwords
    comment = ' '.join([lemmatizer.lemmatize(c) for c in comment.split()])    # Lemmatizing
    return comment

def page_merge(page_list):
    chunks =[]
    for page in page_list:
        df_page = read_file(page)
        chunks.append(df_page)
        df_concat = pd.concat(chunks,ignore_index=True)
    df_concat = df_concat['comment_message'].sample(n=20000,random_state =10)
    df_concat = df_concat.apply(lambda x: text_cleaning(x))
    df_concat = df_concat[df_concat.apply(lambda x: x is not "")]
    return df_concat

# Creating arrays for toxic categories

In [89]:
df_slur = df_slur.fillna('None')
homophobic_array = df_slur['Homophobic'].values
sexist_array = df_slur['Sexist'].values
racist_array = df_slur['Racist'].values

# Comments list from each page category

In [90]:
sports_comments = page_merge(sports_page_list)
news_comments = page_merge(news_page_list)
entertainment_comments = page_merge(entertainment_page_list)

In [91]:
sports_array = sports_comments.values
sports_corpus = " ".join([c for c in sports_array])

news_array = news_comments.values
news_corpus = " ".join([c for c in news_array])

entertainment_array = entertainment_comments.values
entertainment_corpus = " ".join([c for c in entertainment_array])

# Code to check toxic words in different pages.

In [113]:
# This is not general implementaion, need to create a function.Currently you have to use seperately for each page category
# and toxic category("Racist,Sexist,Homophobic")
d ={}
for i in entertainment_corpus.split():
    if i in racist_array:
        if i not in d:
            d[i]=1
        else:
            d[i]+=1
    else:
        d[i] =0


In [114]:
sorted_x = sorted(d.items(), key=operator.itemgetter(1),reverse=True)

In [115]:
sorted_x

[('redneck', 5),
 ('twinkie', 2),
 ('coolie', 1),
 ('miran', 0),
 ('koko', 0),
 ('stalin', 0),
 ('baxter', 0),
 ('darius', 0),
 ('graben', 0),
 ('nesse', 0),
 ('mrki', 0),
 ('blanchard', 0),
 ('fak', 0),
 ('importante', 0),
 ('aleena', 0),
 ('chadwick', 0),
 ('morawski', 0),
 ('attacknnasking', 0),
 ('ceuninck', 0),
 ('abortion', 0),
 ('winnnnn', 0),
 ('nem', 0),
 ('whipp', 0),
 ('iznenadim', 0),
 ('benghazi', 0),
 ('kimmy', 0),
 ('trisa', 0),
 ('sabbagh', 0),
 ('savla', 0),
 ('trecen', 0),
 ('naumoski', 0),
 ('prot', 0),
 ('montelongo', 0),
 ('mancini', 0),
 ('malone', 0),
 ('exists', 0),
 ('krider', 0),
 ('(april', 0),
 ('stairmaster', 0),
 ('goodenough', 0),
 ('bladimir', 0),
 ('stronger', 0),
 ('sin', 0),
 ('jankowski', 0),
 ('nwhen', 0),
 ('veerappan', 0),
 ('saba', 0),
 ('rudy', 0),
 ('nicolai', 0),
 ('aaah', 0),
 ('catie', 0),
 ('flu', 0),
 ('buehler', 0),
 ('yumi', 0),
 ('gay', 0),
 ('razanewakim', 0),
 ('hala', 0),
 ('abrille', 0),
 ('hshaha', 0),
 ('insanity', 0),
 ('type', 0