# You're Toxic, I'm Slippin' Under: Toxic Comment Classification Challenge

#### STINTSY S13 Group 8
- VICENTE, Francheska Josefa
- VISTA, Sophia Danielle S.

## Requirements and Imports
Before starting, the relevant libraries and files in building and training the model should be loaded into the notebook first.

### Import
Several libraries are required to perform a thorough analysis of the dataset. Each of these libraries will be imported and described below:

#### Basic Libraries 
Import `numpy` and `pandas`.
- `numpy` contains a large collection of mathematical functions
- `pandas` contains functions that are designed for data manipulation and data analysis

In [121]:
import numpy as np
import pandas as pd

#### Natural Language Processing Libraries 
- `re` is a module that allows the use of regular expressions
- `nltk` provides functions for processing text data
- `stopwords` is a corpus from NLTK, which includes a compiled list of stopwords
- `Counter` is from Python's `collections` module, which is helpful for tokenization
- `string` contains functions for string operations

In [122]:
import sys
!{sys.executable} -m pip install gensim



In [123]:
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
import string
import gensim

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#### Machine Learning Libraries

In [124]:
import sys
!{sys.executable} -m pip install scikit-multilearn

from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multioutput import MultiOutputClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import log_loss
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score



### Datasets and Files


In [125]:
train = pd.read_csv('cleaned_data/cleaned_train.csv')
test = pd.read_csv('cleaned_data/cleaned_test.csv')

## Trying different Models

In [126]:
test ['comment_text'] = test ['comment_text'].apply(lambda x: np.str_(x))
train ['comment_text'] = train ['comment_text'].apply(lambda x: np.str_(x))

In [127]:
X_train = train ['comment_text']
y_train = train.loc [ : , 'toxic' : ]

X_test = test ['comment_text']

In [128]:
classes = train.columns [2:]

In [129]:
class mn_hyper_parameter:
    def __init__(self, class_, alpha, fit_prior):
        self.class_ = class_
        self.alpha = alpha
        self.fit_prior = fit_prior

In [130]:
class lr_hyperparameter:
    def __init__(self, class_, c, max_iter):
        self.class_ = class_
        self.c = c
        self.max_iter = max_iter

#### Helper Functions

In [131]:
def compute_accuracy(predictions, actual):
    accuracy = np.sum (predictions == actual) / len (predictions) * 100
    return accuracy

In [132]:
def to_submission_csv(predictions, filename):
    sample_submission = pd.read_csv('data/sample_submission.csv')
    sample_submission ['id'] = test ['id'] 
    counter = 0

    for i in range (6):
        sample_submission[classes [i]] = predictions[:, i : i + 1]

    sample_submission.to_csv(f'results/' + filename + '.csv', index = False) 

In [133]:
def to_submission_csv_multiclass (predictions, filename):
    sample_submission = pd.read_csv('data/sample_submission.csv')
    sample_submission ['id'] = test ['id'] 
    counter = 0

    for i in range (6):
        temp = list(zip(*predictions[i]))
        sample_submission[classes [i]] = temp[1]

    sample_submission.to_csv(f'results/' + filename + '.csv', index = False) 

### TF-IDF Vectorizer

In [134]:
tfidf_vectorizer = TfidfVectorizer()

In [135]:
tfidf_train = tf_idf_vectorizer.fit_transform(X_train)

In [136]:
tfidf_test = tf_idf_vectorizer.transform(X_test)

In [137]:
count_vectorizer = CountVectorizer()

In [138]:
count_train = count_vectorizer.fit_transform(X_train)

In [139]:
count_test = count_vectorizer.transform(X_test)

In [140]:
parameters_mn = [
    {
        'classifier': [MultinomialNB()],
        'classifier__alpha': [0.5, 0.6, 0.7, 0.8, 1.0],
        'classifier__fit_prior': [True, False]
    }
]

In [141]:
parameters_lr = [
    {
        'classifier': [LogisticRegression()],
        'classifier__C': [1, 12, 15],
        'classifier__max_iter': [600, 1800, 3000]
    }
]

In [78]:
predictions_mnb_count = np.zeros((len(test), len(classes)))
predictions_mnb_tfidf = np.zeros((len(test), len(classes)))

for i in range(6):
    print('Fitting', classes[i] + '...')
    
    mnb = MultinomialNB()
    
    mnb.fit(count_train, y_train[classes[i]])
    print('Count Vectors:', compute_accuracy(mnb.predict(count_train), y_train[classes[i]]))
    predictions_mnb_count[:,i] = mnb.predict_proba(count_test)[:,1]
    
    mnb.fit(tfidf_train, y_train[classes[i]])
    print('TF-IDF Vectors:', compute_accuracy(mnb.predict(tfidf_train), y_train[classes[i]]))
    predictions_mnb_tfidf[:,i] = mnb.predict_proba(tfidf_test)[:,1]

Fitting toxic...
Count Vectors: 95.13696097661855
TF-IDF Vectors: 92.36828747078103
Fitting severe_toxic...
Count Vectors: 98.641983819115
TF-IDF Vectors: 98.99104473870565
Fitting obscene...
Count Vectors: 96.70867513520626
TF-IDF Vectors: 95.38449968979326
Fitting threat...
Count Vectors: 99.55505699657206
TF-IDF Vectors: 99.6973134216117
Fitting insult...
Count Vectors: 96.46301646289113
TF-IDF Vectors: 95.35629907689994
Fitting identity_hate...
Count Vectors: 98.77233331871079
TF-IDF Vectors: 99.11074067343063


In [79]:
to_submission_csv(predictions_mnb_count, 'submission_mnb_count')
to_submission_csv(predictions_mnb_tfidf, 'submission_mnb_tfidf')

In [80]:
pd.DataFrame(
    data={'private': [0.84551, 0.82510], 'public': [0.85581, 0.83586]}, 
    index=['submission_mnb_count.csv', 'submission_mnb_tfidf.csv']
)

Unnamed: 0,private,public
submission_mnb_count.csv,0.84551,0.85581
submission_mnb_tfidf.csv,0.8251,0.83586


In [81]:
parameters_mnb = [{
    'alpha' : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
    'fit_prior' : [False, True]
}]

In [None]:
predictions_mnb_count_tuned = np.zeros((len(test), len(classes)))
predictions_mnb_tfidf_tuned = np.zeros((len(test), len(classes)))

for i in range(6):
    print('Fitting', classes[i] + '...')
    
    mnb_tuned = GridSearchCV(MultinomialNB(), parameters_mnb, scoring='f1')
    mnb_tuned.fit(count_train, y_train[classes[i]])
    print('Count Vectors:', compute_accuracy(mnb_tuned.predict(count_train), y_train[classes[i]]), mnb_tuned.best_params_)
    predictions_mnb_count_tuned[:,i] = mnb_tuned.predict_proba(count_test)[:,1]
    
    mnb_tuned = GridSearchCV(MultinomialNB(), parameters_mnb, scoring='f1')
    mnb_tuned.fit(tf_idf_train, y_train[classes[i]])
    print('TF-IDF Vectors:', compute_accuracy(mnb_tuned.predict(tfidf_train), y_train[classes[i]]), mnb_tuned.best_params_)
    predictions_mnb_tfidf_tuned[:,i] = mnb_tuned.predict_proba(tfidf_test)[:,1]

Fitting toxic...
Count Vectors: 95.13696097661855 {'alpha': 1, 'fit_prior': True}
TF-IDF Vectors: 97.48262528905627 {'alpha': 0.001, 'fit_prior': True}
Fitting severe_toxic...
Count Vectors: 98.72031885492977 {'alpha': 0.001, 'fit_prior': True}
TF-IDF Vectors: 99.42157409554368 {'alpha': 0.001, 'fit_prior': True}
Fitting obscene...
Count Vectors: 96.70867513520626 {'alpha': 1, 'fit_prior': True}
TF-IDF Vectors: 98.62067668937338 {'alpha': 0.001, 'fit_prior': True}
Fitting threat...


In [None]:
to_submission_csv(predictions_mnb_count_tuned, 'submission_mnb_count_tuned')
to_submission_csv(predictions_mnb_tfidf_tuned, 'submission_mnb_tfidf_tuned')

In [89]:
pd.DataFrame(
    data={'private': [0.75966, 0.82930], 'public': [0.76208, 0.83952]}, 
    index=['submission_mnb_count_tuned.csv', 'submission_mnb_tfidf_tuned.csv']
)

Unnamed: 0,private,public
submission_mnb_count_tuned.csv,0.75966,0.76208
submission_mnb_tfidf_tuned.csv,0.8293,0.83952


### Ensemble Models: RandomForestClassifier

In [40]:
predictions_rf_count = np.zeros((len(test), len(classes)))
predictions_rf_tfidf = np.zeros((len(test), len(classes)))

for i in range(6):
    print('Fitting', classes[i] + '...')
    
    rf = RandomForestClassifier(n_estimators=1000, max_leaf_nodes=20, n_jobs=-1)
    
    rf.fit(count_train, y_train[classes[i]])
    print('Count Vectors:', compute_accuracy(rf.predict(count_train), y_train[classes[i]]))
    predictions_rf_count[:,i] = rf.predict_proba(count_test)[:,1]
    
    rf.fit(tfidf_train, y_train[classes[i]])
    print('TF-IDF Vectors:', compute_accuracy(rf.predict(tfidf_train), y_train[classes[i]]))
    predictions_rf_tfidf[:,i] = rf.predict_proba(tfidf_test)[:,1]

Fitting toxic...
Count Vectors: 90.41555169799024
TF-IDF Vectors: 90.41555169799024
Fitting severe_toxic...
Count Vectors: 99.00044494300343
TF-IDF Vectors: 99.00044494300343
Fitting obscene...
Count Vectors: 94.7051782592075
TF-IDF Vectors: 94.7051782592075
Fitting threat...
Count Vectors: 99.70044682304429
TF-IDF Vectors: 99.70170018361732
Fitting insult...
Count Vectors: 95.06363938309592
TF-IDF Vectors: 95.06363938309592
Fitting identity_hate...
Count Vectors: 99.11951419744189
TF-IDF Vectors: 99.11951419744189


In [42]:
to_submission_csv(predictions_rf_count, 'submission_rf_count')
to_submission_csv(predictions_rf_tfidf, 'submission_rf_tfidf')

In [76]:
pd.DataFrame(
    data={'private': [0.96735, 0.96784], 'public': [0.96725, 0.96710]}, 
    index=['submission_rf_count.csv', 'submission_rf_tfidf.csv']
)

Unnamed: 0,private,public
submission_rf_count.csv,0.96735,0.96725
submission_rf_tfidf.csv,0.96784,0.9671


#### Hyperparameter Tuning

In [65]:
parameters_rf = [{
    'n_estimators' : [500, 1000, 1500],
    'min_samples_split' : [2, 10, 20],
    'max_leaf_nodes' : [15, 20, 25],
    'min_samples_leaf' : [1, 5, 10],
}]

In [68]:
predictions_rf_count_tuned = np.zeros((len(test), len(classes)))
predictions_rf_tfidf_tuned = np.zeros((len(test), len(classes)))

for i in range(6):
    print('Fitting', classes[i] + '...')
    rf_tuned = GridSearchCV(RandomForestClassifier(n_jobs=-1), parameters_rf, scoring='accuracy', verbose=1)
    
    rf_tuned.fit(count_train, y_train[classes[i]])
    print('Count Vectors:', compute_accuracy(rf_tuned.predict(count_train), y_train[classes[i]]))
    predictions_rf_count_tuned[:,i] = rf_tuned.predict_proba(count_test)[:,1]
    print(rf_tuned.best_params_, rf_tuned.best_score_)
    
    rf_tuned.fit(tf_idf_train, y_train[classes[i]])
    print('TF-IDF Vectors:', compute_accuracy(rf_tuned.predict(tfidf_train), y_train[classes[i]]))
    predictions_rf_tfidf_tuned[:,i] = rf_tuned.predict_proba(tfidf_test)[:,1]
    print(rf_tuned.best_params_, rf_tuned.best_score_)

Fitting toxic...
Fitting 5 folds for each of 81 candidates, totalling 405 fits


KeyboardInterrupt: 

In [None]:
to_submission_csv(predictions_rf_count_tuned, 'submission_rf_count_tuned')
to_submission_csv(predictions_rf_tfidf_tuned, 'submission_rf_tfidf_tuned')

### Ensemble Models: GradientBoostingClassifier

In [53]:
predictions_gbc_count = np.zeros((len(test), len(classes)))
predictions_gbc_tfidf = np.zeros((len(test), len(classes)))

for i in range(6):
    print('Fitting', classes[i] + '...')
    
    gbc = GradientBoostingClassifier(verbose=2)
    
    gbc.fit(count_train, y_train[classes[i]])
    print('Count Vectors:', compute_accuracy(gbc.predict(count_train), y_train[classes[i]]))
    predictions_gbc_count[:,i] = gbc.predict_proba(count_test)[:,1]
    
    gbc.fit(tfidf_train, y_train[classes[i]])
    print('TF-IDF Vectors:', compute_accuracy(gbc.predict(tfidf_train), y_train[classes[i]]))
    predictions_gbc_tfidf[:,i] = gbc.predict_proba(tfidf_test)[:,1]

Fitting toxic...
      Iter       Train Loss   Remaining Time 
         1           0.5865            2.41m
         2           0.5667            2.30m
         3           0.5524            2.20m
         4           0.5425            2.14m
         5           0.5329            2.10m
         6           0.5258            2.10m
         7           0.5180            2.10m
         8           0.5124            2.10m
         9           0.5050            2.10m
        10           0.5007            2.09m
        11           0.4942            2.06m
        12           0.4897            2.03m
        13           0.4840            2.02m
        14           0.4796            2.00m
        15           0.4761            1.99m
        16           0.4699            1.97m
        17           0.4667            1.93m
        18           0.4623            1.90m
        19           0.4596            1.87m
        20           0.4573            1.85m
        21           0.4547          

        80           0.3567            1.06m
        81           0.3558            1.01m
        82           0.3552           57.21s
        83           0.3539           54.05s
        84           0.3532           50.87s
        85           0.3525           47.67s
        86           0.3515           44.48s
        87           0.3508           41.30s
        88           0.3501           38.12s
        89           0.3496           34.93s
        90           0.3489           31.75s
        91           0.3483           28.57s
        92           0.3477           25.39s
        93           0.3469           22.21s
        94           0.3458           19.03s
        95           0.3450           15.86s
        96           0.3444           12.68s
        97           0.3438            9.51s
        98           0.3429            6.34s
        99           0.3424            3.17s
       100           0.3417            0.00s
TF-IDF Vectors: 94.43382569514512
Fitting severe_toxic.

        58           0.0540            2.30m
        59           0.0538            2.25m
        60           0.0536            2.19m
        61           0.0535            2.14m
        62           0.0532            2.08m
        63           0.0529            2.03m
        64           0.0527            1.97m
        65           0.0527            1.93m
        66           0.0525            1.87m
        67           0.0523            1.82m
        68           0.0522            1.76m
        69           0.0519            1.71m
        70           0.0519            1.65m
        71           0.0517            1.60m
        72           0.0517            1.54m
        73           0.0515            1.49m
        74           0.0514            1.43m
        75           0.0513            1.38m
        76           0.0510            1.32m
        77           0.0508            1.27m
        78           0.0506            1.21m
        79           0.0505            1.16m
        80

        37           0.1965            3.28m
        38           0.1957            3.23m
        39           0.1948            3.17m
        40           0.1937            3.12m
        41           0.1926            3.07m
        42           0.1908            3.02m
        43           0.1896            2.97m
        44           0.1887            2.91m
        45           0.1877            2.86m
        46           0.1869            2.81m
        47           0.1862            2.76m
        48           0.1853            2.71m
        49           0.1845            2.65m
        50           0.1836            2.60m
        51           0.1816            2.55m
        52           0.1810            2.50m
        53           0.1802            2.45m
        54           0.1796            2.40m
        55           0.1788            2.35m
        56           0.1778            2.29m
        57           0.1772            2.24m
        58           0.1757            2.19m
        59

        60 325854827083698219568101057963223825814381632814998035103744.0000           50.14s
        61 325854827083698219568101057963223825814381632814998035103744.0000           48.80s
        62 325854827083698219568101057963223825814381632814998035103744.0000           47.47s
        63 325854827083698219568101057963223825814381632814998035103744.0000           46.13s
        64 325854827083698219568101057963223825814381632814998035103744.0000           44.81s
        65 325854827083698219568101057963223825814381632814998035103744.0000           43.50s
        66 325854827083698219568101057963223825814381632814998035103744.0000           42.21s
        67 325854827083698219568101057963223825814381632814998035103744.0000           40.91s
        68 325854827083698219568101057963223825814381632814998035103744.0000           39.61s
        69 325854827083698219568101057963223825814381632814998035103744.0000           38.32s
        70 3258548270836982195681010579632238258143816328149

        96    35116343.4071           12.23s
        97    35116343.4071            9.17s
        98    35116343.4071            6.11s
        99    35116343.4071            3.06s
       100    35116343.4071            0.00s
TF-IDF Vectors: 99.78066189971862
Fitting insult...
      Iter       Train Loss   Remaining Time 
         1           0.3446            2.23m
         2           0.3304            2.12m
         3           0.3197            2.06m
         4           0.3116            2.01m
         5           0.3040            2.00m
         6           0.2994            1.97m
         7           0.2953            1.95m
         8           0.2903            1.92m
         9           0.2856            1.91m
        10           0.2830            1.89m
        11           0.2796            1.86m
        12           0.2771            1.84m
        13           0.2739            1.82m
        14           0.2715            1.79m
        15           0.2697            1.77m
  

        75           0.1994            1.28m
        76           0.1989            1.23m
        77           0.1979            1.18m
        78           0.1974            1.13m
        79           0.1970            1.07m
        80           0.1967            1.02m
        81           0.1964           58.33s
        82           0.1960           55.27s
        83           0.1957           52.19s
        84           0.1949           49.12s
        85           0.1945           46.05s
        86           0.1941           42.98s
        87           0.1937           39.90s
        88           0.1926           36.83s
        89           0.1922           33.76s
        90           0.1918           30.69s
        91           0.1915           27.62s
        92           0.1910           24.55s
        93           0.1901           21.48s
        94           0.1898           18.41s
        95           0.1894           15.35s
        96           0.1891           12.28s
        97

        53           0.0492            2.38m
        54           0.0491            2.33m
        55           0.0490            2.28m
        56           0.0486            2.23m
        57           0.0484            2.18m
        58           0.0481            2.13m
        59           0.0479            2.08m
        60           0.0474            2.03m
        61           0.0473            1.98m
        62           0.0473            1.92m
        63           0.0470            1.87m
        64           0.0465            1.82m
        65           0.0462            1.77m
        66           0.0460            1.72m
        67           0.0458            1.67m
        68           0.0456            1.62m
        69           0.0454            1.57m
        70           0.0452            1.51m
        71           0.0449            1.46m
        72           0.0447            1.41m
        73           0.0445            1.36m
        74           0.0444            1.31m
        75

In [54]:
to_submission_csv(predictions_gbc_count, 'submission_gbc_count')
to_submission_csv(predictions_gbc_tfidf, 'submission_gbc_tfidf')

In [75]:
pd.DataFrame(
    data={'private': [0.90663, 0.92569], 'public': [0.92024, 0.93239]}, 
    index=['submission_gbc_count.csv', 'submission_gbc_tfidf.csv']
)

Unnamed: 0,private,public
submission_gbc_count.csv,0.90663,0.92024
submission_gbc_tfidf.csv,0.92569,0.93239


### Ensemble Models: AdaBoostClassifier

In [58]:
predictions_adb_count = np.zeros((len(test), len(classes)))
predictions_adb_tfidf = np.zeros((len(test), len(classes)))

for i in range(6):
    print('Fitting', classes[i] + '...')
    
    adb = AdaBoostClassifier()
    
    adb.fit(count_train, y_train[classes[i]])
    print('Count Vectors:', compute_accuracy(adb.predict(count_train), y_train[classes[i]]))
    predictions_adb_count[:,i] = adb.predict_proba(count_test)[:,1]
    
    adb.fit(tfidf_train, y_train[classes[i]])
    print('TF-IDF Vectors:', compute_accuracy(adb.predict(tfidf_train), y_train[classes[i]]))
    predictions_adb_tfidf[:,i] = adb.predict_proba(tfidf_test)[:,1]

Fitting toxic...
Count Vectors: 94.5434947452858
TF-IDF Vectors: 94.8054471050504
Fitting severe_toxic...
Count Vectors: 98.9615907652393
TF-IDF Vectors: 98.99981826271691
Fitting obscene...
Count Vectors: 97.11977740316223
TF-IDF Vectors: 97.49139881306755
Fitting threat...
Count Vectors: 99.69480670046562
TF-IDF Vectors: 99.69606006103866
Fitting insult...
Count Vectors: 96.59211260191388
TF-IDF Vectors: 96.79327697388624
Fitting identity_hate...
Count Vectors: 99.15084821176781
TF-IDF Vectors: 99.17278202179594


In [59]:
to_submission_csv(predictions_adb_count, 'submission_adb_count')
to_submission_csv(predictions_adb_tfidf, 'submission_adb_tfidf')

In [77]:
pd.DataFrame(
    data={'private': [0.93539, 0.93830], 'public': [0.94218, 0.94145]}, 
    index=['submission_adb_count.csv', 'submission_adb_tfidf.csv']
)

Unnamed: 0,private,public
submission_adb_count.csv,0.93539,0.94218
submission_adb_tfidf.csv,0.9383,0.94145


### OneVsRestClassifier Classifier: Logistic Regression using Count Vector

In [None]:
from sklearn.multiclass import OneVsRestClassifier
lr_oc = OneVsRestClassifier(LogisticRegression(max_iter = 3000))
lr_oc.fit(count_train, y_train)

In [None]:
predictions = lr_oc.predict(count_train)
print(compute_accuracy(predictions, y_train))

In [None]:
predictions = lr_oc.predict(count_test)

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_oc_lr_count.csv', index = False) 

### MultiOutput Classifier: Logistic Regression

#### Model Training

In [142]:
X_train = train ['comment_text']
X_test = test ['comment_text']
y_train = train.loc [ : , 'toxic' : ]
y_train

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [143]:
lr_mo_count = MultiOutputClassifier(LogisticRegression(class_weight = 'balanced', max_iter = 3000))
lr_mo_count.fit(count_train, y_train)

MultiOutputClassifier(estimator=LogisticRegression(class_weight='balanced',
                                                   max_iter=3000))

In [144]:
lr_mo_tf = MultiOutputClassifier(LogisticRegression(class_weight = 'balanced', max_iter = 3000))
lr_mo_tf.fit(tf_idf_train, y_train)

MultiOutputClassifier(estimator=LogisticRegression(class_weight='balanced',
                                                   max_iter=3000))

In [146]:
predictions = lr_mo_tf.predict(tf_idf_train)
print('TF-IDF Vectors: ' , compute_accuracy(predictions, y_train))

predictions = lr_mo_count.predict(count_train)
print('Count Vectors: ', compute_accuracy(predictions, y_train))

TF-IDF Vectors:  toxic            94.403118
severe_toxic     97.335982
obscene          97.324702
threat           99.067500
insult           95.850750
identity_hate    97.123537
dtype: float64
Count Vectors:  toxic            97.955142
severe_toxic     98.672064
obscene          98.804294
threat           99.741808
insult           97.874927
identity_hate    98.947177
dtype: float64


In [149]:
predictions_lr_mo_tf = lr_mo_tf.predict_proba(tf_idf_test)
predictions_lr_mo_count = lr_mo_count.predict_proba(count_test)

In [150]:
to_submission_csv_multiclass(predictions_lr_mo_tf, 'submission_mo_lr_tf')
to_submission_csv_multiclass(predictions_lr_mo_count, 'submission_mo_lr_count')

In [154]:
pd.DataFrame(
    data={'private': [0.94036, 0.97063], 'public': [0.94400, 0.97183]}, 
    index=['submission_mo_lr_count.csv', 'submission_mo_lr_tf.csv']
)

Unnamed: 0,private,public
submission_mo_lr_count.csv,0.94036,0.944
submission_mo_lr_tf.csv,0.97063,0.97183


#### Hyperparameter Tuning

In [151]:
parameters_lr_mo = [
    {
        'estimator__C': [1, 12, 15],
        'estimator__max_iter': [600, 1800, 3000],
        'estimator__class_weight' : ['balanced', None]
    }
]

In [152]:
predictions_lr_count_tuned = np.zeros((len(test), len(classes)))
predictions_lr_tfidf_tuned = np.zeros((len(test), len(classes)))

In [155]:
estimator = MultiOutputClassifier(LogisticRegression ())
lr_mo_tuned = GridSearchCV(estimator, parameters_lr_mo, n_jobs = -1, verbose = 10, scoring = 'f1')
lr_mo_tuned.fit(tf_idf_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits




GridSearchCV(estimator=MultiOutputClassifier(estimator=LogisticRegression()),
             n_jobs=-1,
             param_grid=[{'estimator__C': [1, 12, 15],
                          'estimator__class_weight': ['balanced', None],
                          'estimator__max_iter': [600, 1800, 3000]}],
             scoring='f1', verbose=10)

In [156]:
predictions = lr_mo_tuned.predict(tf_idf_train)
print('TF-IDF Vectors: ', compute_accuracy(predictions, y_train), lr_mo_tuned.best_params_)

TF-IDF Vectors:  toxic            94.403118
severe_toxic     97.335982
obscene          97.324702
threat           99.067500
insult           95.850750
identity_hate    97.123537
dtype: float64 {'estimator__C': 1, 'estimator__class_weight': 'balanced', 'estimator__max_iter': 600}


In [157]:
predictions_lr_tfidf_tuned = lr_mo_tuned.predict_proba(tf_idf_test)
to_submission_csv_multiclass(predictions_lr_tfidf_tuned, 'submission_mo_lr_tf_tuned')

In [158]:
lr_mo_tuned = GridSearchCV(estimator, parameters_lr_mo, n_jobs = -1, verbose = 10, scoring = 'f1')
lr_mo_tuned.fit(count_train, y_train)   

Fitting 5 folds for each of 18 candidates, totalling 90 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

GridSearchCV(estimator=MultiOutputClassifier(estimator=LogisticRegression()),
             n_jobs=-1,
             param_grid=[{'estimator__C': [1, 12, 15],
                          'estimator__class_weight': ['balanced', None],
                          'estimator__max_iter': [600, 1800, 3000]}],
             scoring='f1', verbose=10)

In [159]:
predictions = lr_mo_tuned.predict(count_train)
print('Count Vectors: ', compute_accuracy(predictions, y_train), lr_mo_tuned.best_params_)

Count Vectors:  toxic            97.936342
severe_toxic     98.640730
obscene          98.735359
threat           99.741808
insult           97.858633
identity_hate    98.945924
dtype: float64 {'estimator__C': 1, 'estimator__class_weight': 'balanced', 'estimator__max_iter': 600}


In [160]:
predictions_lr_count_tuned = lr_mo_tuned.predict_proba(count_test)
to_submission_csv_multiclass(predictions_lr_count_tuned, 'submission_mo_lr_count_tuned')

In [161]:
pd.DataFrame(
    data={'private': [0.93996, 0.97063], 'public': [0.94410, 0.97183]}, 
    index=['submission_mo_lr_count_tuned.csv', 'submission_mo_lr_tf_tuned.csv']
)

Unnamed: 0,private,public
submission_mo_lr_count_tuned.csv,0.93996,0.9441
submission_mo_lr_tf_tuned.csv,0.97063,0.97183


### MultiOutput Classifier: Multinomial Naive Bayes using Count Vector

In [None]:
mn_mo = MultiOutputClassifier(MultinomialNB())
mn_mo.fit(count_train, y_train)

In [None]:
predictions = mn_mo.predict(count_train)
print(compute_accuracy(predictions, y_train))

In [None]:
predictions = mn_mo.predict(count_test)

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_mo_mn_count.csv', index = False) 

### Classifier Chain: Multinomial Naive Bayes using Count Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
count_vectorizer = CountVectorizer(stop_words = 'english', max_features = 10000)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
mn_cc = ClassifierChain(
    classifier = MultinomialNB(alpha = 1.0, fit_prior = True),
)

mn_cc.fit(count_train, y_train)

predictions = mn_cc.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

predictions = mn_cc.predict(count_test)
predictions = predictions.todense()
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_count_mn_cc.csv', index = False) 

#### Hyperparameter Tuning

In [None]:
mn_cc_tuned = RandomizedSearchCV(ClassifierChain(), parameters_mn, scoring = 'accuracy', n_jobs = 3)

In [None]:
# train
mn_cc_tuned.fit(count_train, y_train)
print (mn_cc_tuned.best_params_, mn_cc_tuned.best_score_)

In [None]:
predictions = mn_cc_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

predictions = mn_cc_tuned.predict(count_test)
predictions = predictions.todense()
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_count_mn_cc_tuned.csv', index = False) 

### Classifier Chain: Multinomial Naive Bayes using TF-IDF Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
tf_idf_vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 5000)

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
mn_cc = ClassifierChain(
    classifier = MultinomialNB(alpha = 1.0, fit_prior = True),
)

mn_cc.fit(tf_idf_train, y_train)

predictions = mn_cc.predict(tf_idf_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

predictions = mn_cc.predict(tf_idf_test)
predictions = predictions.todense()
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_tfidf_mn_cc.csv', index = False) 

#### Hyperparameter Tuning

### Binary Relevance: Logistic Regression using Count Vectorizer

#### Model Training

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
binary_lr = BinaryRelevance(classifier = LogisticRegression())

In [None]:
binary_lr.fit(count_train, y_train)

In [None]:
predictions = binary_lr.predict(count_train)
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

predictions = binary_lr.predict(count_test)
predictions = predictions.todense()

In [None]:
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_binary_lr_count.csv', index = False) 

#### Hyperparameter Tuning

In [None]:
lr_br_tuned = RandomizedSearchCV(BinaryRelevance(), parameters_lr, scoring = 'accuracy', n_jobs = 3)

In [None]:
# train
lr_br_tuned.fit(count_train, y_train)
print (lr_br_tuned.best_params_, lr_br_tuned.best_score_)

In [None]:
predictions = lr_br_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

predictions = lr_br_tuned.predict(count_test)
predictions = predictions.todense()
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_binary_lr_count_tuned.csv', index = False) 

#### Model Selection

### Binary Relevance: Multinomial Naive Bayes using TF-IDF Vectorizer

#### Model Training

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
binary_mn = BinaryRelevance(classifier = MultinomialNB())

In [None]:
binary_mn.fit(tf_idf_train, y_train)

In [None]:
predictions = binary_mn.predict(tf_idf_train)
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

predictions = binary_mn.predict(tf_idf_test)
predictions = predictions.todense()
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_binary_mn_tfidf.csv', index = False) 

#### Hyperparameter Tuning

In [None]:
mn_br_tuned = RandomizedSearchCV(BinaryRelevance(), parameters_mn, scoring = 'accuracy', n_jobs = 3)

In [None]:
# train
mn_br_tuned.fit(tf_idf_train, y_train)
print (mn_br_tuned.best_params_, v.best_score_)

In [None]:
predictions = mn_br_tuned.predict(tf_idf_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

predictions = mn_br_tuned.predict(tf_idf_test)
predictions = predictions.todense()
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_binary_mn_tfidf_tuned.csv', index = False) 

#### Model Selection

### Binary Relevance: Multinomial Naive Bayes using Count Vectorizer

#### Model Training

In [None]:
count_vectorizer = CountVectorizer(max_features = 15000)

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
y_train = train.loc [ : , 'toxic' : ]
y_train

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
binary_mn = BinaryRelevance(classifier = MultinomialNB())

In [None]:
binary_mn.fit(count_train, y_train)

In [None]:
predictions = binary_mn.predict(count_train)
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id']
counter = 0

predictions = binary_mn.predict(count_test)
predictions = predictions.todense()
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv('results/submission_binary_mn_count.csv', index = False) 

#### Hyperparameter Tuning

In [None]:
mn_br_tuned = RandomizedSearchCV(BinaryRelevance(), parameters_mn, scoring = 'accuracy', n_jobs = 3)

In [None]:
# train
mn_br_tuned.fit(count_train, y_train)

In [None]:
print (mn_br_tuned.best_params_, mn_br_tuned.best_score_)

In [None]:
predictions = mn_br_tuned.predict(count_train)

In [None]:
print(compute_accuracy(predictions.todense(), y_train))

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

predictions = mn_br_tuned.predict(count_test)
predictions = predictions.todense()
for i in range (6):
    sample_submission[classes [i]] = predictions[:, i : i + 1]

sample_submission.to_csv(f'results/submission_binary_mn_count_tuned.csv', index = False) 

#### Model Selection

### Multinomial Naive Bayes using TF-IDF Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
arr_model = []
counter = 0
for class_ in classes:
    y_train = train[class_]
    model = MultinomialNB ()
    model.fit(tf_idf_train, y_train)
    predictions = model.predict(tf_idf_train)
    arr_model.append(model)
    counter = counter + 1

In [None]:
counter = 0
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    predictions = arr_model [counter].predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))
    counter = counter + 1

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 

counter = 0
for class_ in classes:
    predictions = arr_model [counter].predict(tf_idf_test)
    sample_submission [class_] = predictions
    counter = counter + 1
    
sample_submission.to_csv(f'results/submission_tfidf_nb.csv', index = False) 

#### Hyperparameter Tuning

In [None]:
X = train ['comment_text']

In [None]:
hyperparameters = [{
    'alpha' : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000], 
    'fit_prior' : [False, True]
}]

In [None]:
final_hyperparameters = []
classes = train.columns [2:]
arr_model = []
counter = 0

for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    best_score = 0
    
    model = MultinomialNB ()
    
    X_train, X_val, y_train, y_val = train_test_split (X, y_train, test_size = 0.25, stratify = y_train)
    
    X_train_sparse_matrix = tf_idf_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = tf_idf_vectorizer.transform(X_val)
    
    for g in ParameterGrid(hyperparameters):

        model.set_params(**g)

        model.fit(X_train_sparse_matrix, y_train)
        predictions = model.predict (X_train_sparse_matrix)
        train_acc = compute_accuracy (predictions, y_train)

        predictions = model.predict (X_validation_sparse_matrix)
        val_acc = compute_accuracy (predictions, y_val)

        if val_acc > best_score:
            best_score = val_acc
            best_grid = g
    
    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = mn_hyper_parameter (class_, best_grid['alpha'], best_grid['fit_prior'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
classes = train.columns [2:]
arr_model = []
counter = 0
for class_ in classes:
    print("Class: ", class_)
    
    y_train = train[class_]
    temp = final_hyperparameters [counter]
    model = MultinomialNB (alpha = temp.alpha, fit_prior = temp.fit_prior)

    model.fit(tf_idf_train, y_train)
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))
    
    arr_model.append(model)
    counter = counter + 1

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

for class_ in classes:
    predictions = arr_model [counter].predict(tf_idf_test)
    sample_submission [class_] = predictions
    counter = counter + 1
    
sample_submission.to_csv(f'results/submission_tf_idf_mn_tuned.csv', index = False) 

### Multinomial Naive Bayes using Count Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
classes = train.columns [2:]
arr_model = []
counter = 0
for class_ in classes:
    y_train = train[class_]
    model = MultinomialNB ()
    model.fit(count_train, y_train)
    
    predictions = model.predict(count_train)
    arr_model.append(model)
    counter = counter + 1

In [None]:
counter = 0
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    predictions = arr_model [counter].predict(count_train)
    print(compute_accuracy(predictions, y_train))
    counter = counter + 1

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')

sample_submission ['id'] = test ['id'] 
counter = 0
for class_ in classes:
    predictions = arr_model [counter].predict(count_test)
    sample_submission [class_] = predictions
    counter = counter + 1
sample_submission.to_csv(f'results/submission_count_nb.csv', index = False) 

#### Hyperparameter Tuning

In [None]:
X = train ['comment_text']

In [None]:
hyperparameters = [{
    'alpha' : [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
    'fit_prior' : [False, True]
}]

In [None]:
final_hyperparameters = []
classes = train.columns [2:]
arr_model = []
counter = 0

for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    best_score = 0
    
    model = MultinomialNB ()
    
    X_train, X_val, y_train, y_val = train_test_split (X, y_train, test_size = 0.25, stratify = y_train)
    
    X_train_sparse_matrix = count_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = count_vectorizer.transform(X_val)
    
    for g in ParameterGrid(hyperparameters):

        model.set_params(**g)

        model.fit(X_train_sparse_matrix, y_train)
        predictions = model.predict (X_train_sparse_matrix)
        train_acc = compute_accuracy (predictions, y_train)

        predictions = model.predict (X_validation_sparse_matrix)
        val_acc = compute_accuracy (predictions, y_val)

        if val_acc > best_score:
            best_score = val_acc
            best_grid = g
    
    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = mn_hyper_parameter (class_, best_grid['alpha'], best_grid['fit_prior'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
classes = train.columns [2:]
arr_model = []
counter = 0
for class_ in classes:
    print("Class: ", class_)
    
    y_train = train[class_]
    temp = final_hyperparameters [counter]
    model = MultinomialNB (alpha = temp.alpha, fit_prior = temp.fit_prior)

    model.fit(count_train, y_train)
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))
    
    arr_model.append(model)
    counter = counter + 1

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

for class_ in classes:
    predictions = arr_model [counter].predict(count_test)
    sample_submission [class_] = predictions
    counter = counter + 1
    
sample_submission.to_csv(f'results/submission_count_mn_tuned.csv', index = False) 

### Logistic Regression using TF-IDF Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)

In [None]:
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [31]:
classes = train.columns [2:]
arr_model = []
counter = 0
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    model = LogisticRegression (n_jobs=-1)

    model.fit(tf_idf_train, y_train)
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))
    arr_model.append(model)
    counter = counter + 1

Class:  toxic
96.23615819917153
Class:  severe_toxic
99.12578100030707
Class:  obscene
97.95514222509102
Class:  threat
99.73366087822976
Class:  insult
97.39551672923025
Class:  identity_hate
99.24109017302642


In [41]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

for class_ in classes:
    predictions = arr_model [counter].predict_proba(tf_idf_test)[:,1]
    sample_submission [class_] = predictions
    counter = counter + 1
    
sample_submission.to_csv(f'results/submission_logreg_1.csv', index = False) 

PermissionError: [Errno 13] Permission denied: 'results/submission_logreg_1.csv'

#### Hyperparameter Tuning

In [None]:
X = train ['comment_text']

In [None]:
hyperparameters = [{
    'C' : [1, 12, 15],
    'max_iter' :[600, 1800, 3000, 4200]
}]

In [None]:
final_hyperparameters = []
classes = train.columns [2:]
arr_model = []
counter = 0

for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    best_score = 0
    
    model = LogisticRegression ()
    
    X_train, X_val, y_train, y_val = train_test_split (X, y_train, test_size = 0.25, stratify = y_train)
    
    X_train_sparse_matrix = tf_idf_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = tf_idf_vectorizer.transform(X_val)
    
    for g in ParameterGrid(hyperparameters):

        model.set_params(**g)

        model.fit(X_train_sparse_matrix, y_train)
        predictions = model.predict (X_train_sparse_matrix)
        train_acc = compute_accuracy (predictions, y_train)

        predictions = model.predict (X_validation_sparse_matrix)
        val_acc = compute_accuracy (predictions, y_val)

        if val_acc > best_score:
            best_score = val_acc
            best_grid = g
    
    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = lr_hyperparameter (class_, best_grid['C'], best_grid['max_iter'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
tf_idf_train = tf_idf_vectorizer.fit_transform(X_train)
tf_idf_test = tf_idf_vectorizer.transform(X_test)

In [None]:
classes = train.columns [2:]
arr_model = []
counter = 0
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    temp = final_hyperparameters [counter]
    model = LogisticRegression (C = temp.c, max_iter = temp.max_iter)

    model.fit(tf_idf_train, y_train)
    predictions = model.predict(tf_idf_train)
    print(compute_accuracy(predictions, y_train))
    arr_model.append(model)
    counter = counter + 1

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

for class_ in classes:
    predictions = arr_model [counter].predict(tf_idf_test)
    sample_submission [class_] = predictions
    counter = counter + 1
    
sample_submission.to_csv(f'results/submission_tf_idf_log_reg_tuned.csv', index = False) 

### Logistic Regression using Count Vectorizer

#### Model Training

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)

In [None]:
count_test = count_vectorizer.transform(X_test)

In [None]:
classes = train.columns [2:]
arr_model = []
counter = 0
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    model = LogisticRegression ()

    model.fit(count_train, y_train)
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))
    arr_model.append(model)
    counter = counter + 1

In [None]:
counter = 0
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    predictions = arr_model [counter].predict(count_train)
    print(compute_accuracy(predictions, y_train))
    counter = counter + 1

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

for class_ in classes:
    predictions = arr_model [counter].predict(count_test)
    sample_submission [class_] = predictions
    counter = counter + 1
    
sample_submission.to_csv(f'results/submission_count_log_reg.csv', index = False) 

#### Hyperparameter Tuning

In [None]:
X = train ['comment_text']

In [None]:
hyperparameters = [{
    'C' : [1, 12, 15],
    'max_iter' :[600, 1800, 3000, 4200]
}]

In [None]:
final_hyperparameters = []
classes = train.columns [2:]
arr_model = []
counter = 0

for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    best_score = 0
    
    model = LogisticRegression ()
    
    X_train, X_val, y_train, y_val = train_test_split (X, y_train, test_size = 0.25, stratify = y_train)
    
    X_train_sparse_matrix = count_vectorizer.fit_transform(X_train)
    X_validation_sparse_matrix = count_vectorizer.transform(X_val)
    
    for g in ParameterGrid(hyperparameters):

        model.set_params(**g)

        model.fit(X_train_sparse_matrix, y_train)
        predictions = model.predict (X_train_sparse_matrix)
        train_acc = compute_accuracy (predictions, y_train)

        predictions = model.predict (X_validation_sparse_matrix)
        val_acc = compute_accuracy (predictions, y_val)

        if val_acc > best_score:
            best_score = val_acc
            best_grid = g
    
    print("Best accuracy: ", best_score, "%")
    print("Best grid: ", best_grid)
    temp = lr_hyperparameter (class_, best_grid['C'], best_grid['max_iter'])
    final_hyperparameters.append(temp)

#### Model Selection

In [None]:
X_train = train ['comment_text']
X_test = test ['comment_text']

In [None]:
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

In [None]:
classes = train.columns [2:]
arr_model = []
counter = 0
for class_ in classes:
    print("Class: ", class_)
    y_train = train[class_]
    temp = final_hyperparameters [counter]
    model = LogisticRegression (C = temp.c, max_iter = temp.max_iter)

    model.fit(count_train, y_train)
    predictions = model.predict(count_train)
    print(compute_accuracy(predictions, y_train))
    arr_model.append(model)
    counter = counter + 1

In [None]:
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission ['id'] = test ['id'] 
counter = 0

for class_ in classes:
    predictions = arr_model [counter].predict(count_test)
    sample_submission [class_] = predictions
    counter = counter + 1
    
sample_submission.to_csv(f'results/submission_count_log_reg_tuned.csv', index = False) 