# Part 1

General Overview:

1. Understand the shape of the data
2. Data Exploration
3. Data Cleaning
4. Data Preprocessing for Model

In [1]:
import numpy as np
import pandas as pd
import re
import calendar

import seaborn as sns
import matplotlib.style as style


import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline

# grid: rgb(240,240,240)
# background: rgb(200,200,200)
style.use('fivethirtyeight')

In [2]:
#nlp
import string
import re    #for regex
import nltk
from nltk.corpus import stopwords # remove useless words or words that do not add value
from nltk.stem.lancaster import LancasterStemmer # Convert words to the infinitive form - very aggressive transformation.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
# Tweet tokenizer does not split at apostophes which is what we want
from nltk.tokenize import TweetTokenizer

### 1. Understand the shape of the data

In [3]:
# Missing values
def missing_values_func(df):
    """
    @author: Cristobal Zamorano Astudillo
    
    Personalize Missing Data Function
    
    Paramaters
    ----------
    df : DataFrame of interest
    
    Returns
    -------
    A string with counting all the features of the input DataFrame. If some values are missing, then 
    function will return a DataFrame with the following:
    - Index as the features with the missing values
    - A `Missing Values Count` Feature that tells the exact number of rows that has a misssing value in that feature index.
    - A `% of Total Values` that tells how much are the missing values of that feature with respect to the other missing values.
    """
    
    # Total missing values
    mis_val = df.isnull().sum()

    # Percentage of missing values
    mis_val_percent = 100 * df.isnull().sum() / len(df)

    # Make a table with the results
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)

    # Rename the columns
    mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values Count', 1 : '% of Total Values'})

    # Sort the table by percentage of missing descending
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)


    # Returns
    if mis_val_table_ren_columns.empty:
        return f'Your selected dataframe has  {df.shape[1]} features. There are  {mis_val_table_ren_columns.shape[0]} features that have missing values.'
    else:
        display(mis_val_table_ren_columns.style.background_gradient(cmap='Reds'))
        return f'Your selected dataframe has  {df.shape[1]} features. There are  {mis_val_table_ren_columns.shape[0]} features that have missing values.'
    
def getting_to_know(df, question=None):
    display(df.shape)
    display(df.columns)
    if question == 'y':
        display(df.head())
        display(df.tail())
    print('--------------------------END--------------------------------')

In [6]:
train = pd.read_csv('data/train.csv')

In [7]:
missing_values_func(train)

'Your selected dataframe has  8 features. There are  0 features that have missing values.'

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


So, above we can see a glimpse of what the data look like 

In [9]:
[train['comment_text'][i] for i in range(3)]

["Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
 "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)",
 "Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info."]

### 2. Data Exploration

In [10]:
getting_to_know(train, 'y')

(159571, 8)

Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0
159570,fff46fc426af1f9a,"""\nAnd ... I really don't think you understand...",0,0,0,0,0,0


--------------------------END--------------------------------


Let's see a quick rundown of the features counts and see how they relate to each other:

In [11]:
train.iloc[:, 2:].sum()

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64

In [12]:
train['toxic'].value_counts(normalize = True)

0    0.904156
1    0.095844
Name: toxic, dtype: float64

In [13]:
train['severe_toxic'].value_counts(normalize = True)

0    0.990004
1    0.009996
Name: severe_toxic, dtype: float64

In [14]:
train['obscene'].value_counts(normalize = True)

0    0.947052
1    0.052948
Name: obscene, dtype: float64

In [15]:
train['threat'].value_counts(normalize = True)

0    0.997004
1    0.002996
Name: threat, dtype: float64

In [16]:
train['insult'].value_counts(normalize = True)

0    0.950636
1    0.049364
Name: insult, dtype: float64

In [17]:
train['identity_hate'].value_counts(normalize = True)

0    0.991195
1    0.008805
Name: identity_hate, dtype: float64

In [18]:
no_toxic_train = train[(train['toxic'] == 0)
   & (train['severe_toxic'] == 0)
   & (train['obscene'] == 0) 
   & (train['threat'] == 0) 
   & (train['insult'] == 0) 
   & (train['identity_hate'] == 0)]

toxic_train = train[(train['toxic'] == 1)
   | (train['severe_toxic'] == 1)
   | (train['obscene'] == 1) 
   | (train['threat'] == 1) 
   | (train['insult'] == 1) 
   | (train['identity_hate'] == 1)]

In [19]:
train.shape[0] == (no_toxic_train.shape[0] + toxic_train.shape[0])

True

In [20]:
print(f'Proportion of the No toxic comments : {(no_toxic_train.shape[0] / train.shape[0])}')

Proportion of the No toxic comments : 0.8983211235124177


In [21]:
print(f'Proportion of the toxic comments : {(toxic_train.shape[0] / train.shape[0])}')

Proportion of the toxic comments : 0.10167887648758234


Data is clearly very umbalanced. 

#### Graphs

### 3. Data Cleaning

In [22]:
stop = stopwords.words('english')
stop

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [23]:
def clean_data(df):

#     # Lowercase letters
#     df['comment_text'] = df['comment_text'].str.lower()
    
    # Remove special characters (i.e. @, $, %)
    df['comment_text'] = df['comment_text'].str.replace("[^a-z0-9!@#\$%\^\&\*_\-,\.' ]", " ") 
    
    # Remove punctuations - if you don't want to remove this, comment below out
    df['comment_text'] = df['comment_text'].str.replace('[^\w\s]', "")
    
    # Remove random numbers
    df['comment_text'] = df['comment_text'].str.replace("[^a-z' ]", "")
    
    # Remove repeating characters and extra spaces 
    # Examples: 'whaaat' --> 'what' OR 'hello   bye' --> 'hello bye'
    # Note: This does change words like "look" to "lok" cuz of the double letters, 
    # if we don't want this, we can remove the \w in the regex pattern below.
    repeat_pattern = re.compile(r'(\w|\s)\1*')
    match_substitution = r'\1'
    df['comment_text'] = df['comment_text'].apply(lambda x: repeat_pattern.sub(match_substitution, x))
    
    # Remove all numbers with letters
    alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)

    # '[%s]' % re.escape(string.punctuation),' ' - replace punctuation with white space
    # .lower() - convert all strings to lowercase 
    punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

    # Remove all '\n' in the string and replace it with a space
    remove_n = lambda x: re.sub("\n", " ", x)

    # Remove all non-ascii characters 
    remove_non_ascii = lambda x: re.sub(r'[^\x00-\x7f]',r' ', x)


    

    # Apply all the lambda functions wrote previously through .map on the comments column
    df['comment_text'] = df['comment_text'].map(alphanumeric).map(punc_lower).map(remove_n).map(remove_non_ascii)
    
    # Remove stop words
#     pattern = re.compile(r'\b('+stop_words_arr+r')\b', flags=re.IGNORECASE).apply(str)
#     remove_stop_words = lambda x: re.sub('\s+', ' ', re.sub(pattern, '', x))
#     pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
#     df['comment_text'] = pattern.sub('', str(df['comment_text']))
    df['comment_text_clean']= df['comment_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

    

    
    return df

In [24]:
data = clean_data(train)
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text_clean
0,0000997932d777bf,xplanation hy the edits made under my usernam...,0,0,0,0,0,0,xplanation hy edits made username ardcore etal...
1,000103f0d9cfb60f,aw e matches this background colour m semingl...,0,0,0,0,0,0,aw e matches background colour semingly stuck ...
2,000113f07ec002fd,ey man m realy not trying to edit war ts just...,0,0,0,0,0,0,ey man realy trying edit war ts guy constantly...
3,0001b41b1c6bb37e,ore cant make any real sugestions on improvem...,0,0,0,0,0,0,ore cant make real sugestions improvement wond...
4,0001d958c54c6e35,ou sir are my hero ny chance you remember wha...,0,0,0,0,0,0,ou sir hero ny chance remember page thats


In [25]:
[data['comment_text'][i] for i in range(3)]

[' xplanation hy the edits made under my username ardcore etalica an were reverted hey werent vandalisms just closure on some s after voted at ew ork ols nd please dont remove the template from the talk page since m retired now',
 ' aw e matches this background colour m semingly stuck with hanks talk anuary ',
 ' ey man m realy not trying to edit war ts just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page e sems to care more about the formating than the actual info']

In [26]:
# data.to_csv('./data/output/cleaned_train_data.csv')

### 4. Data Preprocessing for Model

In [27]:
toxic_df = data.loc[:,['id', 'comment_text', 'toxic']]
severe_toxic_df = data.loc[:,['id', 'comment_text', 'severe_toxic']]
obscene_df = data.loc[:,['id', 'comment_text', 'obscene']]
threat_df = data.loc[:,['id', 'comment_text', 'threat']]
insult_df = data.loc[:,['id', 'comment_text', 'insult']]
id_hate_df = data.loc[:,['id', 'comment_text', 'identity_hate']]

In [28]:
toxic_features = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']
toxic_val = [data[e].value_counts()[1] for e in toxic_features]
dict_toxic_value = {k:v for k,v in zip(toxic_features, toxic_val)}
dict_toxic_value

{'toxic': 15294,
 'severe_toxic': 1595,
 'obscene': 8449,
 'threat': 478,
 'insult': 7877,
 'identity_hate': 1405}

In [29]:
toxic_1 = toxic_df[toxic_df['toxic'] == 1].sample(5000)
toxic_0 = toxic_df[toxic_df['toxic'] == 0].sample(5000)
data_toxic_balanced = pd.concat([toxic_1,toxic_0], axis = 0)
data_toxic_balanced.shape

(10000, 3)

In [30]:
severe_toxic_1 = severe_toxic_df[severe_toxic_df['severe_toxic'] == 1].sample(1595)
severe_toxic_0 = severe_toxic_df[severe_toxic_df['severe_toxic'] == 0].sample(1595)
data_severe_toxic_balanced = pd.concat([severe_toxic_1,severe_toxic_0], axis = 0)
data_severe_toxic_balanced.shape

(3190, 3)

In [31]:
obscene_1 = obscene_df[obscene_df['obscene'] == 1].sample(5000)
obscene_0 = obscene_df[obscene_df['obscene'] == 0].sample(5000)
data_obscene_balanced = pd.concat([obscene_1,obscene_0], axis = 0)
data_obscene_balanced.shape

(10000, 3)

In [32]:
threat_1 = threat_df[threat_df['threat'] == 1].sample(478)
threat_0 = threat_df[obscene_df['obscene'] == 0].sample(1912)
data_threat_balanced = pd.concat([threat_1,threat_0], axis = 0)
data_threat_balanced.shape

(2390, 3)

In [33]:
insult_1 = insult_df[insult_df['insult'] == 1].sample(5000)
insult_0 = insult_df[insult_df['insult'] == 0].sample(5000)
data_insult_balanced = pd.concat([insult_1,insult_0], axis = 0)
data_insult_balanced.shape

(10000, 3)

In [34]:
id_hate_1 = id_hate_df[id_hate_df['identity_hate'] == 1].sample(1405)
id_hate_0 = id_hate_df[id_hate_df['identity_hate'] == 0].sample(5620)
data_id_hate_balanced = pd.concat([id_hate_1,id_hate_0], axis = 0)
data_id_hate_balanced.shape

(7025, 3)

In [35]:
data_toxic_balanced.to_csv('./data/output/pre_process_data/data_toxic_balanced.csv')
data_severe_toxic_balanced.to_csv('./data/output/pre_process_data/data_severe_toxic_balanced.csv')
data_obscene_balanced.to_csv('./data/output/pre_process_data/data_obscene_balanced.csv')
data_threat_balanced.to_csv('./data/output/pre_process_data/data_threat_balanced.csv')
data_insult_balanced.to_csv('./data/output/pre_process_data/data_insult_balanced.csv')
data_id_hate_balanced.to_csv('./data/output/pre_process_data/data_id_hate_balanced.csv')

In [36]:
import wordcloud
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords

In [37]:
def wordcloud(df, label):
    
    # Print only rows where the toxic category label value is 1 (ie. the comment is toxic)
    subset=df[df[label]==1]
    text=subset.comment_text.values
    wc= WordCloud(background_color="black",max_words=4000)

    wc.generate(" ".join(text))

    plt.figure(figsize=(20,20))
    plt.subplot(221)
    plt.axis("off")
    plt.title("Words frequented in {}".format(label), fontsize=20)
    plt.imshow(wc.recolor(colormap= 'gist_earth' , random_state=244), alpha=0.98)

In [40]:
# wordcloud(data_id_hate_balanced,'identity_hate')

In [41]:
# wordcloud(data,'severe_toxic')