# You're Toxic, I'm Slippin' Under: Toxic Comment Classification Challenge

#### STINTSY S13 Group 8
- VICENTE, Francheska Josefa
- VISTA, Sophia Danielle S.

## Requirements and Imports
Before starting, the libraries and files needed for building and training the models must be loaded into the notebook.

### Import
Several libraries are required to perform a thorough analysis of the dataset. Each of these libraries will be imported and described below:

#### Basic Libraries 
Import `numpy` and `pandas`.
- `numpy` contains a large collection of mathematical functions
- `pandas` contains functions that are designed for data manipulation and data analysis

In [56]:
import numpy as np
import pandas as pd

#### Natural Language Processing Libraries 
- `re` is a module that allows the use of regular expressions
- `nltk` provides functions for processing text data
- `stopwords` is a corpus from NLTK, which includes a compiled list of stopwords
- `Counter` is from Python's `collections` module, which is helpful for counting tokens
- `string` contains functions for string operations

In [57]:
import sys
!{sys.executable} -m pip install gensim



In [58]:
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import string
import gensim

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#### Machine Learning Libraries

In [59]:
import sys
!{sys.executable} -m pip install scikit-multilearn

from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multioutput import MultiOutputClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score



### Datasets and Files


In [60]:
train = pd.read_csv('cleaned_data/cleaned_train.csv')
test = pd.read_csv('cleaned_data/cleaned_test.csv')

## Trying Different Models

In [61]:
def compute_accuracy(predictions, actual):
    """Return the percentage of predictions that equal the actual labels.

    The element-wise matches are summed with ``np.sum`` and divided by
    ``len(predictions)``, then scaled to the 0-100 range. When ``actual`` is
    a multi-column label frame the notebook output shows one percentage per
    label column.
    """
    matches = predictions == actual
    n_samples = len(predictions)
    return np.sum(matches) / n_samples * 100

In [62]:
classes = train.columns [2:]

In [63]:
# Coerce every comment to a string so that non-string cells in the loaded
# CSV files cannot reach the text vectorizers.
test['comment_text'] = test['comment_text'].apply(np.str_)
train['comment_text'] = train['comment_text'].apply(np.str_)

In [64]:
# Raw comment text is the only input feature used by the models below.
X_train = train['comment_text']
X_test = test['comment_text']
# Multi-label target: every column from 'toxic' onwards. It is defined here
# because the first model cells call fit(..., y_train) before any later cell
# assigns y_train, which would otherwise raise a NameError on a clean run.
y_train = train.loc[:, 'toxic':]

In [65]:
X = train ['comment_text']

In [66]:
class mn_hyper_parameter:
    """Chosen Multinomial Naive Bayes settings for a single label column."""

    def __init__(self, class_, alpha, fit_prior):
        # class_ is the label name; alpha and fit_prior are the values that
        # are later passed to MultinomialNB for that label.
        self.class_, self.alpha, self.fit_prior = class_, alpha, fit_prior

In [67]:
class lr_hyperparameter:
    """Chosen Logistic Regression settings for a single label column."""

    def __init__(self, class_, c, max_iter):
        # class_ is the label name; c and max_iter are the values that are
        # later passed to LogisticRegression as C and max_iter.
        self.class_, self.c, self.max_iter = class_, c, max_iter

In [68]:
tf_idf_vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 10000)

In [69]:
X_train

0         explanation why the edits made under my userna...
1         d aww he matches this background colour i am s...
2         hey man i am really not trying to edit war it ...
3         more i can not make any real suggestions on im...
4         you sir are my hero any chance you remember wh...
                                ...                        
159566    and for the second time of asking when your vi...
159567    you should be ashamed of yourself that is a ho...
159568    spitzer umm theres no actual article for prost...
159569    and it looks like it was actually you who put ...
159570    and i really do not think you understand i cam...
Name: comment_text, Length: 159571, dtype: object

In [70]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [71]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [72]:
count_vectorizer = CountVectorizer(stop_words = 'english', max_features = 10000)

In [73]:
count_train = count_vectorizer.fit_transform(X_train)

In [74]:
count_test = count_vectorizer.transform(X_test)

In [75]:
parameters_mn = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.5, 0.6, 0.7, 0.8, 1.0],
        'classifier__fit_prior': [True, False]
    }
]

In [76]:
parameters_lr = [
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [1, 12, 15],
        'classifier__max_iter': [600, 1800, 3000]
    }
]

### OneVsRest Classifier: Logistic Regression using Count Vector

In [78]:
from sklearn.multiclass import OneVsRestClassifier
lr_oc = OneVsRestClassifier(LogisticRegression(max_iter = 3000))
lr_oc.fit(count_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(max_iter=3000))

In [79]:
predictions = lr_oc.predict(count_train)
print(compute_accuracy(predictions, y_train))

toxic            96.740009
severe_toxic     99.272424
obscene          98.464633
threat           99.805729
insult           97.465705
identity_hate    99.340106
dtype: float64


In [80]:
predictions = lr_oc.predict(count_test)

In [81]:
# Build the submission from the template file: copy the test ids, then store
# column i of the prediction matrix under the i-th label name.
# NOTE(review): predictions come from predict(), i.e. hard labels; if the
# competition is scored on probabilities, predict_proba would be needed - confirm.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_oc_lr_count.csv', index=False)

### MultiOutput Classifier: Logistic Regression using TF-IDF Vector

In [None]:
lr_mo = MultiOutputClassifier(LogisticRegression(max_iter = 3000))
lr_mo.fit(tf_idf_train, y_train)

In [None]:
# lr_mo was fitted on the TF-IDF matrix, so its training accuracy has to be
# measured on that same matrix rather than on the count-vector features.
predictions = lr_mo.predict(tf_idf_train)
print(compute_accuracy(predictions, y_train))

In [None]:
predictions = lr_mo.predict(tf_idf_test)

In [None]:
# Fill the submission template: test ids plus one prediction column per label.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_mo_lr_tfidf.csv', index=False)

### MultiOutput Classifier: Multinomial Naive Bayes using Count Vector

In [None]:
mn_mo = MultiOutputClassifier(MultinomialNB())
mn_mo.fit(count_train, y_train)

In [None]:
predictions = mn_mo.predict(count_train)
print(compute_accuracy(predictions, y_train))

In [None]:
predictions = mn_mo.predict(count_test)

In [None]:
# Fill the submission template: test ids plus one prediction column per label.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_mo_mn_count.csv', index=False)

### MultiOutput Classifier: Logistic Regression using Count Vector

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
lr_mo = MultiOutputClassifier(LogisticRegression(class_weight = 'balanced', max_iter = 3000))
lr_mo.fit(count_train, y_train)

In [None]:
predictions = lr_mo.predict(count_train)
print(compute_accuracy(predictions, y_train))

In [None]:
predictions = lr_mo.predict(count_test)

In [None]:
# Fill the submission template: test ids plus one prediction column per label.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_mo_lr_count.csv', index=False)

In [None]:
parameters_lr_mo = [
    {
        'estimator__C': [1, 12, 15],
        'estimator__max_iter': [600, 1800, 3000],
        'estimator__class_weight' : ['balanced', None]
    }
]

In [None]:
# Randomized search over the MultiOutput logistic-regression grid.
# parameters_lr_mo spans 18 combinations and only a random subset is tried,
# so the seed is fixed to make the sampled candidates repeatable across runs.
lr_mo_tuned = RandomizedSearchCV(
    MultiOutputClassifier(LogisticRegression()),
    parameters_lr_mo,
    scoring='accuracy',
    n_jobs=3,
    verbose=10,
    random_state=42,
)

In [None]:
lr_mo_tuned.fit(count_train, y_train)

In [None]:
predictions = lr_mo_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions, y_train))

In [None]:
# Predict the test set with the tuned MultiOutput model and write one
# prediction column per label into the submission template.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = lr_mo_tuned.predict(count_test)
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_count_lr_mo_tuned.csv', index=False)

In [None]:
lr_mo_tuned.best_params_

### Classifier Chain: Multinomial Naive Bayes using Count Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
count_vectorizer = CountVectorizer(stop_words = 'english', max_features = 10000)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
mn_cc = ClassifierChain(
    classifier = MultinomialNB(alpha = 1.0, fit_prior = True),
)

mn_cc.fit(count_train, y_train)

predictions = mn_cc.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Predict the test set with the classifier chain; the result is converted
# with todense() so it can be sliced column by column into the template.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = mn_cc.predict(count_test).todense()
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_count_mn_cc.csv', index=False)

#### Hyperparameter Tuning

In [None]:
mn_cc_tuned = RandomizedSearchCV(ClassifierChain(), parameters_mn, scoring = 'accuracy', n_jobs = 3)

In [None]:
# train
mn_cc_tuned.fit(count_train, y_train)
print (mn_cc_tuned.best_params_, mn_cc_tuned.best_score_)

In [None]:
predictions = mn_cc_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Predict the test set with the tuned classifier chain; the result is
# converted with todense() so it can be sliced column by column.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = mn_cc_tuned.predict(count_test).todense()
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_count_mn_cc_tuned.csv', index=False)

### Classifier Chain: Multinomial Naive Bayes using TF-IDF Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
tf_idf_vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 5000)

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
mn_cc = ClassifierChain(
    classifier = MultinomialNB(alpha = 1.0, fit_prior = True),
)

mn_cc.fit(tf_idf_train, y_train)

predictions = mn_cc.predict(tf_idf_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Predict the test set with the TF-IDF classifier chain; the result is
# converted with todense() so it can be sliced column by column.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = mn_cc.predict(tf_idf_test).todense()
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_tfidf_mn_cc.csv', index=False)

#### Hyperparameter Tuning

### Binary Relevance: Logistic Regression using Count Vectorizer

#### Model Training

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
binary_lr = BinaryRelevance(classifier = LogisticRegression())

In [None]:
binary_lr.fit(count_train, y_train)

In [None]:
predictions = binary_lr.predict(count_train)
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Load the submission template, copy the test ids and compute the Binary
# Relevance predictions (converted with todense() for column slicing).
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = binary_lr.predict(count_test).todense()

In [None]:
# Store column i of the prediction matrix under the i-th label name, then
# write the finished submission file.
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_binary_lr_count.csv', index=False)

#### Hyperparameter Tuning

In [None]:
lr_br_tuned = RandomizedSearchCV(BinaryRelevance(), parameters_lr, scoring = 'accuracy', n_jobs = 3)

In [None]:
# train
lr_br_tuned.fit(count_train, y_train)
print (lr_br_tuned.best_params_, lr_br_tuned.best_score_)

In [None]:
predictions = lr_br_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Predict the test set with the tuned Binary Relevance model; the result is
# converted with todense() so it can be sliced column by column.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = lr_br_tuned.predict(count_test).todense()
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_binary_lr_count_tuned.csv', index=False)

#### Model Selection

### Binary Relevance: Multinomial Naive Bayes using TF-IDF Vectorizer

#### Model Training

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
binary_mn = BinaryRelevance(classifier = MultinomialNB())

In [None]:
binary_mn.fit(tf_idf_train, y_train)

In [None]:
predictions = binary_mn.predict(tf_idf_train)
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Predict the test set with the Binary Relevance Naive Bayes model; the
# result is converted with todense() so it can be sliced column by column.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = binary_mn.predict(tf_idf_test).todense()
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_binary_mn_tfidf.csv', index=False)

#### Hyperparameter Tuning

In [None]:
mn_br_tuned = RandomizedSearchCV(BinaryRelevance(), parameters_mn, scoring = 'accuracy', n_jobs = 3)

In [None]:
# train
mn_br_tuned.fit(tf_idf_train, y_train)
# Report the winning parameters and their cross-validated score; both are
# read from the fitted search object itself.
print(mn_br_tuned.best_params_, mn_br_tuned.best_score_)

In [None]:
predictions = mn_br_tuned.predict(tf_idf_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Predict the test set with the tuned Binary Relevance model; the result is
# converted with todense() so it can be sliced column by column.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = mn_br_tuned.predict(tf_idf_test).todense()
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_binary_mn_tfidf_tuned.csv', index=False)

#### Model Selection

### Binary Relevance: Multinomial Naive Bayes using Count Vectorizer

#### Model Training

In [None]:
count_vectorizer = CountVectorizer(max_features = 15000)

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
binary_mn = BinaryRelevance(classifier = MultinomialNB())

In [None]:
binary_mn.fit(count_train, y_train)

In [None]:
predictions = binary_mn.predict(count_train)
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Predict the test set with the Binary Relevance Naive Bayes model; the
# result is converted with todense() so it can be sliced column by column.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = binary_mn.predict(count_test).todense()
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_binary_mn_count.csv', index=False)

#### Hyperparameter Tuning

In [None]:
mn_br_tuned = RandomizedSearchCV(BinaryRelevance(), parameters_mn, scoring = 'accuracy', n_jobs = 3)

In [None]:
# train
mn_br_tuned.fit(count_train, y_train)

In [None]:
print (mn_br_tuned.best_params_, mn_br_tuned.best_score_)

In [None]:
predictions = mn_br_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
# Predict the test set with the tuned Binary Relevance model; the result is
# converted with todense() so it can be sliced column by column.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

predictions = mn_br_tuned.predict(count_test).todense()
for i, class_ in enumerate(classes):
    sample_submission[class_] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_binary_mn_count_tuned.csv', index=False)

#### Model Selection

### Multinomial Naive Bayes using TF-IDF Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
# Fit one independent MultinomialNB per label on the TF-IDF training matrix
# and keep the fitted models in label order.
arr_model = []
for class_ in classes:
    y_train = train[class_]
    model = MultinomialNB()
    model.fit(tf_idf_train, y_train)
    arr_model.append(model)

In [None]:
# Report the training accuracy of each stored per-label model.
for idx, class_ in enumerate(classes):
    print("Class: ", class_)
    y_train = train[class_]
    predictions = arr_model[idx].predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))

In [None]:
# Fill the submission template with the test-set predictions of each
# per-label model; arr_model holds the models in the same order as classes.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for idx, class_ in enumerate(classes):
    sample_submission[class_] = arr_model[idx].predict(tf_idf_test)

sample_submission.to_csv('results/submission_tfidf_nb.csv', index=False)

#### Hyperparameter Tuning

In [None]:
X = train ['comment_text']

In [None]:
hyperparameters = [{
    'alpha' : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000], 
    'fit_prior' : [False, True]
}]

In [None]:
# Per-label hyperparameter search for Multinomial Naive Bayes on TF-IDF
# features: hold out 25% of the data (stratified on the label), try every
# combination in `hyperparameters`, and keep the one with the highest
# validation accuracy.
final_hyperparameters = []
classes = train.columns[2:]
arr_model = []  # drop previously fitted models; refilled under Model Selection

for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    # Reset per label so a result from the previous label can never leak in.
    best_score = 0
    best_grid = None

    model = MultinomialNB()

    # Fixed seed so the split, and therefore the chosen grid, is repeatable.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_train, test_size=0.25, stratify=y_train, random_state=42
    )

    # The vectorizer is fitted on the training part only; the validation
    # part is transformed with that fitted vocabulary.
    X_train_sparse_matrix = tf_idf_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = tf_idf_vectorizer.transform(X_val)

    for g in ParameterGrid(hyperparameters):
        model.set_params(**g)
        model.fit(X_train_sparse_matrix, y_train)

        predictions = model.predict(X_validation_sparse_matrix)
        val_acc = compute_accuracy(predictions, y_val)

        # Strict '>' keeps the earliest grid entry when scores tie.
        if val_acc > best_score:
            best_score = val_acc
            best_grid = g

    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = mn_hyper_parameter(class_, best_grid['alpha'], best_grid['fit_prior'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
# Refit one MultinomialNB per label on the full TF-IDF training matrix using
# the stored hyperparameters, and report the training accuracy.
classes = train.columns[2:]
arr_model = []
for idx, class_ in enumerate(classes):
    print("Class: ", class_)

    y_train = train[class_]
    # final_hyperparameters is indexed by label position.
    temp = final_hyperparameters[idx]
    model = MultinomialNB(alpha=temp.alpha, fit_prior=temp.fit_prior)

    model.fit(tf_idf_train, y_train)
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))

    arr_model.append(model)

In [None]:
# Fill the submission template with the test-set predictions of each tuned
# per-label model; arr_model holds the models in the same order as classes.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for idx, class_ in enumerate(classes):
    sample_submission[class_] = arr_model[idx].predict(tf_idf_test)

sample_submission.to_csv('results/submission_tf_idf_mn_tuned.csv', index=False)

### Multinomial Naive Bayes using Count Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
# Fit one independent MultinomialNB per label on the count-vector training
# matrix and keep the fitted models in label order.
classes = train.columns[2:]
arr_model = []
for class_ in classes:
    y_train = train[class_]
    model = MultinomialNB()
    model.fit(count_train, y_train)
    arr_model.append(model)

In [None]:
# Report the training accuracy of each stored per-label model.
for idx, class_ in enumerate(classes):
    print("Class: ", class_)
    y_train = train[class_]
    predictions = arr_model[idx].predict(count_train)
    print(compute_accuracy(predictions, y_train))

In [None]:
# Fill the submission template with the test-set predictions of each
# per-label model; arr_model holds the models in the same order as classes.
sample_submission = pd.read_csv('data/sample_submission.csv')

sample_submission['id'] = test['id']
for idx, class_ in enumerate(classes):
    sample_submission[class_] = arr_model[idx].predict(count_test)
sample_submission.to_csv('results/submission_count_nb.csv', index=False)

#### Hyperparameter Tuning

In [None]:
X = train ['comment_text']

In [None]:
hyperparameters = [{
    'alpha' : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
    'fit_prior' : [False, True]
}]

In [None]:
# Per-label hyperparameter search for Multinomial Naive Bayes on count
# features: hold out 25% of the data (stratified on the label), try every
# combination in `hyperparameters`, and keep the one with the highest
# validation accuracy.
final_hyperparameters = []
classes = train.columns[2:]
arr_model = []  # drop previously fitted models; refilled under Model Selection

for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    # Reset per label so a result from the previous label can never leak in.
    best_score = 0
    best_grid = None

    model = MultinomialNB()

    # Fixed seed so the split, and therefore the chosen grid, is repeatable.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_train, test_size=0.25, stratify=y_train, random_state=42
    )

    # The vectorizer is fitted on the training part only; the validation
    # part is transformed with that fitted vocabulary.
    X_train_sparse_matrix = count_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = count_vectorizer.transform(X_val)

    for g in ParameterGrid(hyperparameters):
        model.set_params(**g)
        model.fit(X_train_sparse_matrix, y_train)

        predictions = model.predict(X_validation_sparse_matrix)
        val_acc = compute_accuracy(predictions, y_val)

        # Strict '>' keeps the earliest grid entry when scores tie.
        if val_acc > best_score:
            best_score = val_acc
            best_grid = g

    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = mn_hyper_parameter(class_, best_grid['alpha'], best_grid['fit_prior'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
# Refit one MultinomialNB per label on the full count-vector training matrix
# using the stored hyperparameters, and report the training accuracy.
classes = train.columns[2:]
arr_model = []
for idx, class_ in enumerate(classes):
    print("Class: ", class_)

    y_train = train[class_]
    # final_hyperparameters is indexed by label position.
    temp = final_hyperparameters[idx]
    model = MultinomialNB(alpha=temp.alpha, fit_prior=temp.fit_prior)

    model.fit(count_train, y_train)
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))

    arr_model.append(model)

In [None]:
# Fill the submission template with the test-set predictions of each tuned
# per-label model; arr_model holds the models in the same order as classes.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for idx, class_ in enumerate(classes):
    sample_submission[class_] = arr_model[idx].predict(count_test)

sample_submission.to_csv('results/submission_count_mn_tuned.csv', index=False)

### Logistic Regression using TF-IDF Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
# Fit one independent LogisticRegression per label on the TF-IDF training
# matrix, print its training accuracy, and keep the models in label order.
classes = train.columns[2:]
arr_model = []
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    model = LogisticRegression()

    model.fit(tf_idf_train, y_train)
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))
    arr_model.append(model)

In [None]:
# Fill the submission template with the test-set predictions of each
# per-label model; arr_model holds the models in the same order as classes.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for idx, class_ in enumerate(classes):
    sample_submission[class_] = arr_model[idx].predict(tf_idf_test)

sample_submission.to_csv('results/submission_tf_idf_log_reg.csv', index=False)

#### Hyperparameter Tuning

In [None]:
X = train ['comment_text']

In [None]:
hyperparameters = [{
    'C' : [1, 12, 15],
    'max_iter' :[600, 1800, 3000, 4200]
}]

In [None]:
# Per-label hyperparameter search for Logistic Regression on TF-IDF
# features: hold out 25% of the data (stratified on the label), try every
# combination in `hyperparameters`, and keep the one with the highest
# validation accuracy.
final_hyperparameters = []
classes = train.columns[2:]
arr_model = []  # drop previously fitted models; refilled under Model Selection

for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    # Reset per label so a result from the previous label can never leak in.
    best_score = 0
    best_grid = None

    model = LogisticRegression()

    # Fixed seed so the split, and therefore the chosen grid, is repeatable.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_train, test_size=0.25, stratify=y_train, random_state=42
    )

    # The vectorizer is fitted on the training part only; the validation
    # part is transformed with that fitted vocabulary.
    X_train_sparse_matrix = tf_idf_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = tf_idf_vectorizer.transform(X_val)

    for g in ParameterGrid(hyperparameters):
        model.set_params(**g)
        model.fit(X_train_sparse_matrix, y_train)

        predictions = model.predict(X_validation_sparse_matrix)
        val_acc = compute_accuracy(predictions, y_val)

        # Strict '>' keeps the earliest grid entry when scores tie.
        if val_acc > best_score:
            best_score = val_acc
            best_grid = g

    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = lr_hyperparameter(class_, best_grid['C'], best_grid['max_iter'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
# Refit one LogisticRegression per label on the full TF-IDF training matrix
# using the stored hyperparameters, and report the training accuracy.
classes = train.columns[2:]
arr_model = []
for idx, class_ in enumerate(classes):
    print("Class: ", class_)
    y_train = train[class_]
    # final_hyperparameters is indexed by label position.
    temp = final_hyperparameters[idx]
    model = LogisticRegression(C=temp.c, max_iter=temp.max_iter)

    model.fit(tf_idf_train, y_train)
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))
    arr_model.append(model)

In [None]:
# Fill the submission template with the test-set predictions of each tuned
# per-label model; arr_model holds the models in the same order as classes.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for idx, class_ in enumerate(classes):
    sample_submission[class_] = arr_model[idx].predict(tf_idf_test)

sample_submission.to_csv('results/submission_tf_idf_log_reg_tuned.csv', index=False)

### Logistic Regression using Count Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
# Fit one independent LogisticRegression per label on the count-vector
# training matrix, print its training accuracy, and keep the models in
# label order.
classes = train.columns[2:]
arr_model = []
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    model = LogisticRegression()

    model.fit(count_train, y_train)
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))
    arr_model.append(model)

In [None]:
# Report the training accuracy of each stored per-label model.
for idx, class_ in enumerate(classes):
    print("Class: ", class_)
    y_train = train[class_]
    predictions = arr_model[idx].predict(count_train)
    print(compute_accuracy(predictions, y_train))

In [None]:
# Fill the submission template with the test-set predictions of each
# per-label model; arr_model holds the models in the same order as classes.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for idx, class_ in enumerate(classes):
    sample_submission[class_] = arr_model[idx].predict(count_test)

sample_submission.to_csv('results/submission_count_log_reg.csv', index=False)

#### Hyperparameter Tuning

In [None]:
X = train ['comment_text']

In [None]:
hyperparameters = [{
    'C' : [1, 12, 15],
    'max_iter' :[600, 1800, 3000, 4200]
}]

In [None]:
# Per-label hyperparameter search for Logistic Regression on count
# features: hold out 25% of the data (stratified on the label), try every
# combination in `hyperparameters`, and keep the one with the highest
# validation accuracy.
final_hyperparameters = []
classes = train.columns[2:]
arr_model = []  # drop previously fitted models; refilled under Model Selection

for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    # Reset per label so a result from the previous label can never leak in.
    best_score = 0
    best_grid = None

    model = LogisticRegression()

    # Fixed seed so the split, and therefore the chosen grid, is repeatable.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_train, test_size=0.25, stratify=y_train, random_state=42
    )

    # The vectorizer is fitted on the training part only; the validation
    # part is transformed with that fitted vocabulary.
    X_train_sparse_matrix = count_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = count_vectorizer.transform(X_val)

    for g in ParameterGrid(hyperparameters):
        model.set_params(**g)
        model.fit(X_train_sparse_matrix, y_train)

        predictions = model.predict(X_validation_sparse_matrix)
        val_acc = compute_accuracy(predictions, y_val)

        # Strict '>' keeps the earliest grid entry when scores tie.
        if val_acc > best_score:
            best_score = val_acc
            best_grid = g

    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = lr_hyperparameter(class_, best_grid['C'], best_grid['max_iter'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
# Refit one LogisticRegression per label on the full count-vector training
# matrix using the stored hyperparameters, and report the training accuracy.
classes = train.columns[2:]
arr_model = []
for idx, class_ in enumerate(classes):
    print("Class: ", class_)
    y_train = train[class_]
    # final_hyperparameters is indexed by label position.
    temp = final_hyperparameters[idx]
    model = LogisticRegression(C=temp.c, max_iter=temp.max_iter)

    model.fit(count_train, y_train)
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))
    arr_model.append(model)

In [None]:
# Fill the submission template with the test-set predictions of each tuned
# per-label model; arr_model holds the models in the same order as classes.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['id'] = test['id']

for idx, class_ in enumerate(classes):
    sample_submission[class_] = arr_model[idx].predict(count_test)

sample_submission.to_csv('results/submission_count_log_reg_tuned.csv', index=False)