In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

from sklearn.pipeline import make_pipeline, Pipeline

# imbalance learn
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler, BorderlineSMOTE
from imblearn.under_sampling import NearMiss, RandomUnderSampler

import contractions
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet, brown
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

import re
import string

import warnings
warnings.filterwarnings('ignore')

stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/affan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/affan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
train = pd.read_csv('../data/raw/jigsaw/train.csv')
train.head(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [3]:
test = pd.read_csv('../data/raw/jigsaw/test.csv')
test_label = pd.read_csv('../data/raw/jigsaw/test_labels.csv')
test = test.merge(test_label, on='id', how='inner')
test.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [4]:
# merge all toxic labels
def is_toxic(row):    
    return 1 if row.sum() > 1 else -1 if row.sum() < 0 else 0

train['is_toxic'] = train.iloc[:, 2:].apply(is_toxic, axis=1)
train['is_toxic'].value_counts()

0    149706
1      9865
Name: is_toxic, dtype: int64

In [5]:
test['is_toxic'] = test.iloc[:, 2:].apply(is_toxic, axis=1)
test = test[test['is_toxic'] >= 0]
test['is_toxic'].value_counts()

0    59577
1     4401
Name: is_toxic, dtype: int64

In [6]:
# turns into binary label
train_toxic = train[['comment_text', 'is_toxic']]
display(train_toxic.head())

test_toxic = test[['comment_text', 'is_toxic']]
display(test_toxic.head())

Unnamed: 0,comment_text,is_toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0


Unnamed: 0,comment_text,is_toxic
5,Thank you for understanding. I think very high...,0
7,:Dear god this site is horrible.,0
11,"""::: Somebody will invariably try to add Relig...",0
13,""" \n\n It says it right there that it IS a typ...",0
14,""" \n\n == Before adding a new product to the l...",0


## Text Cleaning
Fix:
- Contractions

Remove:
- Links
- Punctuation
- Numbers
- White-Spaces
- Non-Ascii
- Emoji

In [7]:
def fix_contractions(text):
    return contractions.fix(text)

def rm_link(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)
    # return re.sub(r"https?://\S+|www\.\S+", "", text)

def rm_punct(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# handle case like "shut up okay?Im only 10 years old"
# become "shut up okay Im only 10 years old"
def rm_punct2(text):
    return re.sub(r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]', ' ', text)

def rm_number(text):
    return re.sub(r'\d+', '', text)

def rm_whitespaces(text):
    return re.sub(r' +', ' ', text)

def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

def rm_emoji(text):
    emojis = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE
    )
    return emojis.sub(r'', text)

# reference: https://towardsdatascience.com/yet-another-twitter-sentiment-analysis-part-1-tackling-class-imbalance-4d7a7f717d44
def spell_correction(text):
    return re.sub(r'(.)\1+', r'\1\1', text)

def clean_pipeline(text):
    fix_contr = fix_contractions(text)
    no_link = rm_link(fix_contr)
    no_punct = rm_punct2(no_link)
    no_number = rm_number(no_punct)
    no_whitespaces = rm_whitespaces(no_number)
    no_nonasci = rm_nonascii(no_whitespaces)
    no_emoji = rm_emoji(no_nonasci)
    spell_corrected = spell_correction(no_emoji)
    return spell_corrected.lower()

In [8]:
train_toxic.loc[:, 'comment_clean'] = train_toxic.loc[:, 'comment_text'].apply(clean_pipeline)
train_toxic.head()

Unnamed: 0,comment_text,is_toxic,comment_clean
0,Explanation\nWhy the edits made under my usern...,0,explanation\nwhy the edits made under my usern...
1,D'aww! He matches this background colour I'm s...,0,d aww he matches this background colour i am s...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man i am really not trying to edit war it ...
3,"""\nMore\nI can't make any real suggestions on ...",0,\nmore\ni cannot make any real suggestions on...
4,"You, sir, are my hero. Any chance you remember...",0,you sir are my hero any chance you remember wh...


## Text Preprocessing
- Tokenize
- Stopword removal
- POS Tagging (optional)
- Lemmatize

In [9]:
def tokenize(text):
    return word_tokenize(text)

def rm_stopwords(text):
    return [i for i in text if i not in stopwords]

def postag(text):
    wordnet_map = {
        "N":wordnet.NOUN, 
        "V":wordnet.VERB, 
        "J":wordnet.ADJ, 
        "R":wordnet.ADV
    }

    train_sents = brown.tagged_sents(categories='news')
    # not implemented yet
    return

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()    
    lemmas = [lemmatizer.lemmatize(t) for t in text]
    # make sure lemmas does not contains sotpwords
    return rm_stopwords(lemmas)

def preprocess_pipeline(text):
    tokens = tokenize(text)
    no_stopwords = rm_stopwords(tokens)
    lemmas = lemmatize(no_stopwords)
    return ' '.join(lemmas)

In [10]:
train_toxic.loc[:, 'comment_processed'] = train_toxic.loc[:, 'comment_clean'].apply(preprocess_pipeline)
train_toxic.head()

Unnamed: 0,comment_text,is_toxic,comment_clean,comment_processed
0,Explanation\nWhy the edits made under my usern...,0,explanation\nwhy the edits made under my usern...,explanation edits made username hardcore metal...
1,D'aww! He matches this background colour I'm s...,0,d aww he matches this background colour i am s...,aww match background colour seemingly stuck th...
2,"Hey man, I'm really not trying to edit war. It...",0,hey man i am really not trying to edit war it ...,hey man really trying edit war guy constantly ...
3,"""\nMore\nI can't make any real suggestions on ...",0,\nmore\ni cannot make any real suggestions on...,make real suggestion improvement wondered sect...
4,"You, sir, are my hero. Any chance you remember...",0,you sir are my hero any chance you remember wh...,sir hero chance remember page


## Prepare Predictor and Target Variable

In [11]:
# select feature subset
X = train_toxic['comment_processed']
y = train_toxic['is_toxic']

# bag of words
bow = CountVectorizer(min_df=50, max_df=10000)

# logistic regression
lr = LogisticRegression()

## Rebalancing Data
- Cross Validation
 - Feature Extraction
 - Oversampling / Undersampling
 - Evaluation
- Modelling

In [12]:
def cross_val_pipeline(X, y, bow, model, sampler=None, splits=5):

    # metrics list
    accuracy, precision, recall, f_score = [], [], [], []

    # kfold cross validation
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=42)
    for train, test in kfold.split(X, y):
        
        # bag of words
        train_bow = bow.fit_transform(X[train])
        test_bow = bow.transform(X[test])

        if sampler != None:

            # oversample / undersample
            X_resample, y_resample = sampler.fit_resample(train_bow, y[train])            

            # fit to model
            model.fit(X_resample, y_resample)
        
        else:
            model.fit(train_bow, y[train])

        prediction = model.predict(test_bow)

        # evaluate
        acc = accuracy_score(y[test], prediction)
        ppv = precision_score(y[test], prediction)  # precision
        tpr = recall_score(y[test], prediction)     # recall
        f1 = f1_score(y[test], prediction)

        accuracy.append(acc)
        precision.append(ppv)
        recall.append(tpr)
        f_score.append(f1)
    
    # print output
    print(f'accuracy: {np.mean(accuracy):.2f}')
    print(f'precision: {np.mean(precision):.2f}')
    print(f'recall: {np.mean(recall):.2f}')
    print(f'f1 score: {np.mean(f_score):.2f}')
    print('-'*50)

### Baseline (without sampler)

In [13]:
# baseline
cross_val_pipeline(X, y, bow, lr)

accuracy: 0.97
precision: 0.85
recall: 0.63
f1 score: 0.73
--------------------------------------------------


### SMOTE sampler
Sampled size:
- 15%
- 50%
- 75%
- 100%

In [14]:
# SMOTE sampler

# 15% sample size
print(f'Sample size: 15%')
smote = SMOTE(random_state=42, sampling_strategy=.15)
cross_val_pipeline(X, y, bow, lr, smote)

# 50% sample size
print(f'Sample size: 50%')
smote = SMOTE(random_state=42, sampling_strategy=.5)
cross_val_pipeline(X, y, bow, lr, smote)

# 75% sample size
print(f'Sample size: 75%')
smote = SMOTE(random_state=42, sampling_strategy=.75)
cross_val_pipeline(X, y, bow, lr, smote)

# 100% sample size
print(f'Sample size: 100%')
smote = SMOTE(random_state=42)
cross_val_pipeline(X, y, bow, lr, smote)

Sample size: 15%
accuracy: 0.97
precision: 0.79
recall: 0.72
f1 score: 0.76
--------------------------------------------------
Sample size: 50%
accuracy: 0.94
precision: 0.51
recall: 0.79
f1 score: 0.62
--------------------------------------------------
Sample size: 75%
accuracy: 0.92
precision: 0.42
recall: 0.80
f1 score: 0.56
--------------------------------------------------
Sample size: 100%
accuracy: 0.91
precision: 0.39
recall: 0.81
f1 score: 0.52
--------------------------------------------------


### ADASYN Sampler
Sampled size:
- 15%
- 50%
- 75%
- 100%

In [15]:
# ADASYN sampler

# 15% sample size
print('Sample size: 15%')
adasyn = ADASYN(random_state=42, sampling_strategy=.15)
cross_val_pipeline(X, y, bow, lr, adasyn)

# 50% sample size
print('Sample size: 50%')
adasyn = ADASYN(random_state=42, sampling_strategy=.5)
cross_val_pipeline(X, y, bow, lr, adasyn)

# 75% sample size
print('Sample size: 75%')
adasyn = ADASYN(random_state=42, sampling_strategy=.75)
cross_val_pipeline(X, y, bow, lr, adasyn)

# 100% sample size
print('Sample size: 100%')
adasyn = ADASYN(random_state=42, sampling_strategy=1)
cross_val_pipeline(X, y, bow, lr, adasyn)

Sample size: 15%
accuracy: 0.97
precision: 0.78
recall: 0.73
f1 score: 0.75
--------------------------------------------------
Sample size: 50%
accuracy: 0.92
precision: 0.43
recall: 0.80
f1 score: 0.56
--------------------------------------------------
Sample size: 75%
accuracy: 0.90
precision: 0.37
recall: 0.81
f1 score: 0.51
--------------------------------------------------
Sample size: 100%
accuracy: 0.89
precision: 0.34
recall: 0.81
f1 score: 0.48
--------------------------------------------------


### Borderline SMOTE Sampler
Sampled size:
- 15%
- 50%
- 75%
- 100%

In [16]:
# Borderline SMOTE sampler

# 15% sample size
print('Sample size: 15%')
bordersmote = BorderlineSMOTE(random_state=42, sampling_strategy=.15)
cross_val_pipeline(X, y, bow, lr, bordersmote)

# 50% sample size
print('Sample size: 50%')
bordersmote = BorderlineSMOTE(random_state=42, sampling_strategy=.5)
cross_val_pipeline(X, y, bow, lr, bordersmote)

# 75% sample size
print('Sample size: 75%')
bordersmote = BorderlineSMOTE(random_state=42, sampling_strategy=.75)
cross_val_pipeline(X, y, bow, lr, bordersmote)

# 100% sample size
print('Sample size: 100%')
bordersmote = BorderlineSMOTE(random_state=42, sampling_strategy=1)
cross_val_pipeline(X, y, bow, lr, bordersmote)

Sample size: 15%


KeyboardInterrupt: 

### Random Undersampler

In [None]:
# Random Undersampler
# 15% sample size
print(f'Sample size: 15%')
rus = RandomUnderSampler(random_state=42, sampling_strategy=.15)
cross_val_pipeline(X, y, bow, lr, rus)

# 50% sample size
print(f'Sample size: 50%')
rus = RandomUnderSampler(random_state=42, sampling_strategy=.5)
cross_val_pipeline(X, y, bow, lr, rus)

# 75% sample size
print(f'Sample size: 75%')
rus = RandomUnderSampler(random_state=42, sampling_strategy=.75)
cross_val_pipeline(X, y, bow, lr, rus)

# 100% sample size
print(f'Sample size: 100%')
rus = RandomUnderSampler(random_state=42)
cross_val_pipeline(X, y, bow, lr, rus)

Sample size: 15%
accuracy: 0.97
precision: 0.78
recall: 0.74
f1 score: 0.76
--------------------------------------------------
Sample size: 50%
accuracy: 0.96
precision: 0.62
recall: 0.85
f1 score: 0.71
--------------------------------------------------
Sample size: 75%
accuracy: 0.95
precision: 0.55
recall: 0.87
f1 score: 0.68
--------------------------------------------------
Sample size: 100%
accuracy: 0.94
precision: 0.50
recall: 0.89
f1 score: 0.64
--------------------------------------------------


## Near Miss
Sample size:
- 15%
- 50%
- 75%
- 100%

In [None]:
# NearMiss

# 15% sample size
print('Sample size 15%')
nm = NearMiss(version=3, sampling_strategy=.15, n_neighbors_ver3=3)
cross_val_pipeline(X, y, bow, lr, nm)

# 50% sample size
print('Sample size 50%')
nm = NearMiss(version=3, sampling_strategy=.5, n_neighbors_ver3=3)
cross_val_pipeline(X, y, bow, lr, nm)

# 75% sample size
print('Sample size 75%')
nm = NearMiss(version=3, sampling_strategy=.75, n_neighbors_ver3=3)
cross_val_pipeline(X, y, bow, lr, nm)

# 100% sample size
print('Sample size 100%')
nm = NearMiss(version=3, n_neighbors_ver3=3)
cross_val_pipeline(X, y, bow, lr, nm)

Sample size 15%
accuracy: 0.90
precision: 0.36
recall: 0.83
f1 score: 0.50
--------------------------------------------------
Sample size 50%
accuracy: 0.90
precision: 0.36
recall: 0.83
f1 score: 0.50
--------------------------------------------------
Sample size 75%
accuracy: 0.90
precision: 0.36
recall: 0.83
f1 score: 0.50
--------------------------------------------------
Sample size 100%
accuracy: 0.90
precision: 0.36
recall: 0.83
f1 score: 0.50
--------------------------------------------------
