# You're Toxic, I'm Slippin' Under: Toxic Comment Classification Challenge

#### STINTSY S13 Group 8
- VICENTE, Francheska Josefa
- VISTA, Sophia Danielle S.

## Requirements and Imports
Before starting, the libraries and files needed to build and train the models must be loaded into the notebook.

### Import
Several libraries are required to perform a thorough analysis of the dataset. Each of these libraries will be imported and described below:

#### Basic Libraries 
Import `numpy` and `pandas`.
- `numpy` contains a large collection of mathematical functions
- `pandas` contains functions that are designed for data manipulation and data analysis

In [None]:
import numpy as np
import pandas as pd

#### Natural Language Processing Libraries 
- `re` is a module that allows the use of regular expressions
- `nltk` provides functions for processing text data
- `stopwords` is a corpus from NLTK, which includes a compiled list of stopwords
- `Counter` is from Python's `collections` module, which is helpful for counting how often each token occurs
- `string` contains functions for string operations

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#### Machine Learning Libraries

In [None]:
import sys
!{sys.executable} -m pip install scikit-multilearn

from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import BinaryRelevance

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

### Datasets and Files


In [None]:
# Load the pre-cleaned training and test splits in one pass.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('cleaned_data/cleaned_train.csv', 'cleaned_data/cleaned_test.csv')
)

## Trying Different Models

In [None]:
def compute_accuracy(predictions, actual):
    """Return the percentage of entries in `predictions` that equal `actual`.

    The element-wise matches are summed with `np.sum` and divided by
    `len(predictions)`, then scaled to the 0-100 range.

    NOTE(review): for 2-D inputs the axis `np.sum` reduces over depends on
    the input type (ndarray vs. DataFrame) - confirm before relying on the
    result for multi-label predictions.
    """
    matches = predictions == actual
    n_samples = len(predictions)
    return np.sum(matches) / n_samples * 100

In [None]:
classes = train.columns [2:]

In [None]:
# Coerce every test comment to a NumPy string
# (presumably to guard against missing / non-string entries - verify against the cleaning step).
test['comment_text'] = test['comment_text'].apply(np.str_)

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
class mn_hyper_parameter:
    """Plain record of the MultinomialNB settings stored for one label."""

    def __init__(self, class_, alpha, fit_prior):
        # class_    : label (column name) these settings belong to
        # alpha     : value later passed as MultinomialNB(alpha=...)
        # fit_prior : value later passed as MultinomialNB(fit_prior=...)
        self.class_, self.alpha, self.fit_prior = class_, alpha, fit_prior

In [None]:
class lr_hyperparameter:
    """Plain record of the LogisticRegression settings stored for one label.

    The regularisation strength is exposed both as `c` (the original
    attribute name) and as `C` (the spelling scikit-learn uses and that the
    model-selection cells read), so either spelling works.
    """

    def __init__(self, class_, c, max_iter):
        # class_   : label (column name) these settings belong to
        # c        : value later passed as LogisticRegression(C=...)
        # max_iter : value later passed as LogisticRegression(max_iter=...)
        self.class_ = class_
        self.c = c
        # Alias: without it, `temp.C` in the model-selection cells raises AttributeError.
        self.C = c
        self.max_iter = max_iter

In [None]:
tf_idf_vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 5000)

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
count_vectorizer = CountVectorizer(stop_words = 'english', max_features = 5000)

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
count_test = count_vectorizer.transform(X_test)

### Classifier Chain: Logistic Regression

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
# Classifier chain over the count features with a logistic-regression base estimator.
base_lr = LogisticRegression(max_iter=300, C=10)
lr_cc = ClassifierChain(classifier=base_lr)
lr_cc.fit(count_train, y_train)

# Predictions on the training matrix itself; scored in the next cell.
predictions = lr_cc.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Write the submission file for the untuned logistic-regression classifier chain.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

# Densify once into a plain ndarray so every label column is a 1-D slice
# (todense() yields np.matrix, whose column slices stay 2-D).
predictions = lr_cc.predict(count_test).toarray()

# Iterate the label names directly instead of assuming there are exactly six.
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i]

sample_submission.to_csv('results/submission_class_lr_cc.csv', index=False)

#### Hyperparameter Tuning

In [None]:
parameters_lr = [
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [1, 12, 15],
        'classifier__max_iter': [600, 1800, 3000]
    }
]

In [None]:
lr_cc_tuned = GridSearchCV(ClassifierChain(), parameters_lr, scoring = 'accuracy')

In [None]:
# train
lr_cc_tuned.fit(count_train, y_train)
print (lr_cc_tuned.best_params_, lr_cc_tuned.best_score_)

In [None]:
predictions = lr_cc_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Write the submission file for the grid-searched logistic-regression classifier chain.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

# Densify once into a plain ndarray so every label column is a 1-D slice
# (todense() yields np.matrix, whose column slices stay 2-D).
predictions = lr_cc_tuned.predict(count_test).toarray()

# Iterate the label names directly instead of assuming there are exactly six.
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i]

sample_submission.to_csv('results/submission_class_lr_cc_tuned.csv', index=False)

### Classifier Chain: Multinomial Naive Bayes

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
# Classifier chain over the count features with a Multinomial Naive Bayes base estimator.
# `alpha` and `fit_prior` are MultinomialNB hyperparameters, so they are set on the
# base estimator: ClassifierChain's constructor does not accept them.
# `max_iter` is dropped because MultinomialNB has no iterative solver.
mn_cc = ClassifierChain(
    classifier=MultinomialNB(alpha=1.0, fit_prior=True),
)

mn_cc.fit(count_train, y_train)

# Predictions on the training matrix itself; scored in the next cell.
predictions = mn_cc.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Write the submission file for the untuned Multinomial Naive Bayes classifier chain.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

# Densify once into a plain ndarray so every label column is a 1-D slice
# (todense() yields np.matrix, whose column slices stay 2-D).
predictions = mn_cc.predict(count_test).toarray()

# Iterate the label names directly instead of assuming there are exactly six.
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i]

sample_submission.to_csv('results/submission_class_mn_cc.csv', index=False)

#### Hyperparameter Tuning

In [None]:
parameters_mn = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.7, 1.0],
        'classifier__fit_prior': [True, False]
    }
]

In [None]:
mn_cc_tuned = GridSearchCV(ClassifierChain(), parameters_mn, scoring = 'accuracy')

In [None]:
# train
mn_cc_tuned.fit(count_train, y_train)
print (mn_cc_tuned.best_params_, mn_cc_tuned.best_score_)

In [None]:
predictions = mn_cc_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Write the submission file for the grid-searched Multinomial Naive Bayes classifier chain.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

# Densify once into a plain ndarray so every label column is a 1-D slice
# (todense() yields np.matrix, whose column slices stay 2-D).
predictions = mn_cc_tuned.predict(count_test).toarray()

# Iterate the label names directly instead of assuming there are exactly six.
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i]

sample_submission.to_csv('results/submission_class_mn_cc_tuned.csv', index=False)

### Multinomial Naive Bayes using TF-IDF Vectorizer

#### Model Training

In [29]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [30]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [31]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [32]:
# Fit one independent MultinomialNB per label on the TF-IDF features.
# Training accuracy is printed by the following cell, so no predictions are
# computed here (the original predicted on the full training set and discarded the result).
arr_model = []
for class_ in classes:
    y_train = train[class_]
    # fit() returns the fitted estimator, so it can be appended directly.
    arr_model.append(MultinomialNB().fit(tf_idf_train, y_train))

In [33]:
# Report each label's accuracy on the training data (TF-IDF features).
for class_, fitted_model in zip(classes, arr_model):
    print("Class: ", class_)
    y_train = train[class_]
    predictions = fitted_model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))

Class:  toxic
95.135080935759
Class:  severe_toxic
99.07940665910473
Class:  obscene
97.44753119301126
Class:  threat
99.70295354419036
Class:  insult
96.97689429783607
Class:  identity_hate
99.18594230781282


In [34]:
# Fill the submission template with each per-label model's test predictions.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for class_, fitted_model in zip(classes, arr_model):
    predictions = fitted_model.predict(tf_idf_test)
    sample_submission[class_] = predictions

sample_submission.to_csv(f'results/submission_tfidf_nb.csv', index=False)

#### Hyperparameter Tuning

In [35]:
X = train ['comment_text']

In [36]:
hyperparameters = [{
    'alpha' : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000], 
    'fit_prior' : [False, True]
}]

In [37]:
# Tune MultinomialNB separately for every label on a stratified 75/25 hold-out split
# of the TF-IDF features, keeping the grid with the highest validation accuracy.
final_hyperparameters = []
classes = train.columns[2:]

for class_ in classes:
    print("Class: ", class_)
    labels = train[class_]

    X_train, X_val, y_train, y_val = train_test_split(
        X, labels, random_state=42, test_size=0.25, stratify=labels
    )

    # The vectorizer is fitted on the training part only; validation text is just transformed.
    X_train_sparse_matrix = tf_idf_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = tf_idf_vectorizer.transform(X_val)

    # Reset the search state for every label so a previous label's grid can never leak through.
    # Starting below zero guarantees the first grid is always recorded.
    best_score = -1.0
    best_grid = None
    model = MultinomialNB()

    for g in ParameterGrid(hyperparameters):
        model.set_params(**g)
        model.fit(X_train_sparse_matrix, y_train)

        # Only validation accuracy drives the selection.
        predictions = model.predict(X_validation_sparse_matrix)
        val_acc = compute_accuracy(predictions, y_val)

        # Strict comparison: on ties the earliest grid is kept.
        if val_acc > best_score:
            best_score = val_acc
            best_grid = g

    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = mn_hyper_parameter(class_, best_grid['alpha'], best_grid['fit_prior'])
    final_hyperparameters.append(temp)

Class:  toxic
Best accuracy:  95.00162935853407 %
Best grid:  {'alpha': 0.1, 'fit_prior': True}
Class:  severe_toxic
Best accuracy:  99.0675055774196 %
Best grid:  {'alpha': 1, 'fit_prior': True}
Class:  obscene
Best accuracy:  97.27270448449603 %
Best grid:  {'alpha': 0.1, 'fit_prior': True}
Class:  threat
Best accuracy:  99.69919534755472 %
Best grid:  {'alpha': 1, 'fit_prior': True}
Class:  insult
Best accuracy:  96.82400421126513 %
Best grid:  {'alpha': 0.1, 'fit_prior': True}
Class:  identity_hate
Best accuracy:  99.18030732208658 %
Best grid:  {'alpha': 1, 'fit_prior': True}


#### Model Selection

In [38]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [42]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [43]:
# Refit one MultinomialNB per label on the full TF-IDF training matrix,
# using the hyperparameters stored for that label during tuning.
classes = train.columns[2:]
arr_model = []
for class_, temp in zip(classes, final_hyperparameters):
    print("Class: ", class_)

    y_train = train[class_]
    model = MultinomialNB(alpha=temp.alpha, fit_prior=temp.fit_prior)
    model.fit(tf_idf_train, y_train)

    # Training-set accuracy of the refitted model.
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))

    arr_model.append(model)

Class:  toxic
95.2246962167311
Class:  severe_toxic
99.07940665910473
Class:  obscene
97.48763873134843
Class:  threat
99.70295354419036
Class:  insult
97.01073503330807
Class:  identity_hate
99.18594230781282


In [44]:
# Fill the submission template with the tuned per-label models' test predictions.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for class_, fitted_model in zip(classes, arr_model):
    predictions = fitted_model.predict(tf_idf_test)
    sample_submission[class_] = predictions

sample_submission.to_csv(f'results/submission_tf_idf_mn_tuned.csv', index=False)

### Multinomial Naive Bayes using Count Vectorizer

#### Model Training

In [45]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [46]:
count_train = count_vectorizer.fit_transform(X_train)

In [47]:
count_test = count_vectorizer.transform(X_test)

In [48]:
# Fit one independent MultinomialNB per label on the count features.
# Training accuracy is printed by the following cell, so no predictions are
# computed here (the original predicted on the full training set and discarded the result).
classes = train.columns[2:]
arr_model = []
for class_ in classes:
    y_train = train[class_]
    # fit() returns the fitted estimator, so it can be appended directly.
    arr_model.append(MultinomialNB().fit(count_train, y_train))

In [49]:
# Report each label's accuracy on the training data (count features).
for class_, fitted_model in zip(classes, arr_model):
    print("Class: ", class_)
    y_train = train[class_]
    predictions = fitted_model.predict(count_train)
    print(compute_accuracy(predictions, y_train))

Class:  toxic
94.87250189570786
Class:  severe_toxic
98.35057748588403
Class:  obscene
97.05146925193175
Class:  threat
98.91396306346391
Class:  insult
96.49059039549793
Class:  identity_hate
98.14627971247909


In [50]:
# Fill the submission template with each per-label model's test predictions.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for class_, fitted_model in zip(classes, arr_model):
    predictions = fitted_model.predict(count_test)
    sample_submission[class_] = predictions

sample_submission.to_csv(f'results/submission_count_nb.csv', index=False)

#### Hyperparameter Tuning

In [51]:
X = train ['comment_text']

In [52]:
hyperparameters = [{
    'alpha' : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
    'fit_prior' : [False, True]
}]

In [53]:
# Tune MultinomialNB separately for every label on a stratified 75/25 hold-out split
# of the count features, keeping the grid with the highest validation accuracy.
final_hyperparameters = []
classes = train.columns[2:]

for class_ in classes:
    print("Class: ", class_)
    labels = train[class_]

    X_train, X_val, y_train, y_val = train_test_split(
        X, labels, random_state=42, test_size=0.25, stratify=labels
    )

    # The vectorizer is fitted on the training part only; validation text is just transformed.
    X_train_sparse_matrix = count_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = count_vectorizer.transform(X_val)

    # Reset the search state for every label so a previous label's grid can never leak through.
    # Starting below zero guarantees the first grid is always recorded.
    best_score = -1.0
    best_grid = None
    model = MultinomialNB()

    for g in ParameterGrid(hyperparameters):
        model.set_params(**g)
        model.fit(X_train_sparse_matrix, y_train)

        # Only validation accuracy drives the selection.
        predictions = model.predict(X_validation_sparse_matrix)
        val_acc = compute_accuracy(predictions, y_val)

        # Strict comparison: on ties the earliest grid is kept.
        if val_acc > best_score:
            best_score = val_acc
            best_grid = g

    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = mn_hyper_parameter(class_, best_grid['alpha'], best_grid['fit_prior'])
    final_hyperparameters.append(temp)

Class:  toxic
Best accuracy:  94.85624044318553 %
Best grid:  {'alpha': 10, 'fit_prior': True}
Class:  severe_toxic
Best accuracy:  99.01486476324168 %
Best grid:  {'alpha': 1000, 'fit_prior': True}
Class:  obscene
Best accuracy:  97.12480886371043 %
Best grid:  {'alpha': 10, 'fit_prior': True}
Class:  threat
Best accuracy:  99.6641014714361 %
Best grid:  {'alpha': 1000, 'fit_prior': True}
Class:  insult
Best accuracy:  96.41791793046399 %
Best grid:  {'alpha': 0.1, 'fit_prior': True}
Class:  identity_hate
Best accuracy:  99.08003910460482 %
Best grid:  {'alpha': 1000, 'fit_prior': True}


#### Model Selection

In [54]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [55]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [56]:
# Refit one MultinomialNB per label on the full count training matrix,
# using the hyperparameters stored for that label during tuning.
classes = train.columns[2:]
arr_model = []
for class_, temp in zip(classes, final_hyperparameters):
    print("Class: ", class_)

    y_train = train[class_]
    model = MultinomialNB(alpha=temp.alpha, fit_prior=temp.fit_prior)
    model.fit(count_train, y_train)

    # Training-set accuracy of the refitted model.
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))

    arr_model.append(model)

Class:  toxic
94.79855362189872
Class:  severe_toxic
99.00357834443602
Class:  obscene
97.06212281680256
Class:  threat
99.6390321549655
Class:  insult
96.51315088581258
Class:  identity_hate
99.0580995293631


In [57]:
# Fill the submission template with the tuned per-label models' test predictions.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for class_, fitted_model in zip(classes, arr_model):
    predictions = fitted_model.predict(count_test)
    sample_submission[class_] = predictions

sample_submission.to_csv(f'results/submission_count_mn_tuned.csv', index=False)

### Logistic Regression using TF-IDF Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
# Transform the TEST comments with the vocabulary fitted on the training data.
# (The original transformed X_train here, so the "test" matrix was the training set.)
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
# Fit one default LogisticRegression per label on the TF-IDF features
# and print its accuracy on the training data.
classes = train.columns[2:]
arr_model = []
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]

    # fit() returns the estimator itself, so construction and fitting can be chained.
    model = LogisticRegression().fit(tf_idf_train, y_train)
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))

    arr_model.append(model)

In [None]:
# Fill the submission template with each per-label model's predictions on `tf_idf_test`.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for class_, fitted_model in zip(classes, arr_model):
    predictions = fitted_model.predict(tf_idf_test)
    sample_submission[class_] = predictions

sample_submission.to_csv(f'results/submission_tf_idf_log_reg.csv', index=False)

#### Hyperparameter Tuning

In [None]:
hyperparameters = [{
    'C' : [1, 12, 15],
    'max_iter' :[600, 1800, 3000, 4200]
}]

In [None]:
# Tune LogisticRegression separately for every label on a stratified 75/25 hold-out split
# of the TF-IDF features, keeping the grid with the highest validation accuracy.
final_hyperparameters = []
classes = train.columns[2:]

# Read the comments straight from `train` so this cell does not rely on an `X`
# left over from an earlier, unrelated section of the notebook.
comments = train['comment_text']

for class_ in classes:
    print("Class: ", class_)
    labels = train[class_]

    X_train, X_val, y_train, y_val = train_test_split(
        comments, labels, random_state=42, test_size=0.25, stratify=labels
    )

    # The vectorizer is fitted on the training part only; validation text is just transformed.
    X_train_sparse_matrix = tf_idf_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = tf_idf_vectorizer.transform(X_val)

    # Reset the search state for every label so a previous label's grid can never leak through.
    # Starting below zero guarantees the first grid is always recorded.
    best_score = -1.0
    best_grid = None
    model = LogisticRegression()

    for g in ParameterGrid(hyperparameters):
        model.set_params(**g)
        model.fit(X_train_sparse_matrix, y_train)

        # Only validation accuracy drives the selection.
        predictions = model.predict(X_validation_sparse_matrix)
        val_acc = compute_accuracy(predictions, y_val)

        # Strict comparison: on ties the earliest grid is kept.
        if val_acc > best_score:
            best_score = val_acc
            best_grid = g

    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = lr_hyperparameter(class_, best_grid['C'], best_grid['max_iter'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
# Refit the TF-IDF vocabulary on the full training comments, then project the
# TEST comments onto it. (The original transformed X_train for the test matrix.)
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
# Refit one LogisticRegression per label on the full TF-IDF training matrix,
# using the hyperparameters stored for that label during tuning.
classes = train.columns[2:]
arr_model = []
for class_, temp in zip(classes, final_hyperparameters):
    print("Class: ", class_)
    y_train = train[class_]

    # lr_hyperparameter stores the regularisation strength as lowercase `c`;
    # reading `temp.C` raised AttributeError.
    model = LogisticRegression(C=temp.c, max_iter=temp.max_iter)
    model.fit(tf_idf_train, y_train)

    # Training-set accuracy of the refitted model.
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))

    arr_model.append(model)

In [None]:
# Fill the submission template with the tuned per-label models' predictions on `tf_idf_test`.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for class_, fitted_model in zip(classes, arr_model):
    predictions = fitted_model.predict(tf_idf_test)
    sample_submission[class_] = predictions

sample_submission.to_csv(f'results/submission_tf_idf_log_reg_tuned.csv', index=False)

### Logistic Regression using Count Vectorizer

#### Model Training

In [58]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
# Fit one default LogisticRegression per label on the count features
# and print its accuracy on the training data.
classes = train.columns[2:]
arr_model = []
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]

    # fit() returns the estimator itself, so construction and fitting can be chained.
    model = LogisticRegression().fit(count_train, y_train)
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))

    arr_model.append(model)

In [None]:
# Report each label's accuracy on the training data (count features).
for class_, fitted_model in zip(classes, arr_model):
    print("Class: ", class_)
    y_train = train[class_]
    predictions = fitted_model.predict(count_train)
    print(compute_accuracy(predictions, y_train))

In [None]:
# Fill the submission template with each per-label model's test predictions.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for class_, fitted_model in zip(classes, arr_model):
    predictions = fitted_model.predict(count_test)
    sample_submission[class_] = predictions

sample_submission.to_csv(f'results/submission_count_log_reg.csv', index=False)

#### Hyperparameter Tuning

In [None]:
hyperparameters = [{
    'C' : [1, 12, 15],
    'max_iter' :[600, 1800, 3000, 4200]
}]

In [None]:
# Tune LogisticRegression separately for every label on a stratified 75/25 hold-out split
# of the count features, keeping the grid with the highest validation accuracy.
final_hyperparameters = []
classes = train.columns[2:]

# Read the comments straight from `train` so this cell does not rely on an `X`
# left over from an earlier, unrelated section of the notebook.
comments = train['comment_text']

for class_ in classes:
    print("Class: ", class_)
    labels = train[class_]

    X_train, X_val, y_train, y_val = train_test_split(
        comments, labels, random_state=42, test_size=0.25, stratify=labels
    )

    # The vectorizer is fitted on the training part only; validation text is just transformed.
    X_train_sparse_matrix = count_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = count_vectorizer.transform(X_val)

    # Reset the search state for every label so a previous label's grid can never leak through.
    # Starting below zero guarantees the first grid is always recorded.
    best_score = -1.0
    best_grid = None
    model = LogisticRegression()

    for g in ParameterGrid(hyperparameters):
        model.set_params(**g)
        model.fit(X_train_sparse_matrix, y_train)

        # Only validation accuracy drives the selection.
        predictions = model.predict(X_validation_sparse_matrix)
        val_acc = compute_accuracy(predictions, y_val)

        # Strict comparison: on ties the earliest grid is kept.
        if val_acc > best_score:
            best_score = val_acc
            best_grid = g

    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = lr_hyperparameter(class_, best_grid['C'], best_grid['max_iter'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
# Refit the count vocabulary on the full training comments, then project the
# TEST comments onto it. (The original transformed X_train for the test matrix.)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
# Refit one LogisticRegression per label on the full count training matrix,
# using the hyperparameters stored for that label during tuning.
classes = train.columns[2:]
arr_model = []
for class_, temp in zip(classes, final_hyperparameters):
    print("Class: ", class_)
    y_train = train[class_]

    # lr_hyperparameter stores the regularisation strength as lowercase `c`;
    # reading `temp.C` raised AttributeError.
    model = LogisticRegression(C=temp.c, max_iter=temp.max_iter)
    model.fit(count_train, y_train)

    # Training-set accuracy of the refitted model.
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))

    arr_model.append(model)

In [None]:
# Fill the submission template with the tuned per-label models' predictions on `count_test`.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for class_, fitted_model in zip(classes, arr_model):
    predictions = fitted_model.predict(count_test)
    sample_submission[class_] = predictions

sample_submission.to_csv(f'results/submission_count_log_reg_tuned.csv', index=False)