## Logistic Regression Multilabel Classifier

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report
from ast import literal_eval
import warnings 
warnings.filterwarnings("ignore")

pd.options.display.max_colwidth = 63

### Import Training/Test Set 

In [2]:
df_train = pd.read_csv('../data/train_cleaned.zip', compression='zip')
df_train.shape

(159571, 17)

In [3]:
df_train.iloc[[1]]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxicity,clean_text,tokenized,tokenized_no_stopwords,tokenized_stemmed,tokenized_pos,tokenized_lemmatized,tokenized_lemmatized_pos,tokenized_bigram
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm seemingly stuc...,0,0,0,0,0,0,0,daww he matches this background colour i am seemingly stuck...,"['daww', 'he', 'matches', 'this', 'background', 'colour', '...","['daww', 'matches', 'background', 'colour', 'seemingly', 's...","['daww', 'match', 'background', 'colour', 'seemingli', 'stu...","[('daww', 'NN'), ('matches', 'NNS'), ('background', 'IN'), ...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","[('daww', 'matches'), ('matches', 'background'), ('backgrou..."


In [4]:
df_test = pd.read_csv('../data/test_cleaned.zip', compression='zip')
df_test.shape

(63978, 17)

## Text Encoding

In [5]:
data = 'tokenized_stemmed'
labels = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

#### Encoding Training Data

In [6]:
X_train = df_train[data].apply(lambda x: literal_eval(x))
y_train = df_train[labels]

In [7]:
X_train[1]

['daww',
 'match',
 'background',
 'colour',
 'seemingli',
 'stuck',
 'thank',
 'talk',
 '2151',
 'januari',
 '11',
 '2016',
 'utc']

In [8]:
y_train.iloc[1,:]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 1, dtype: int64

#### Encoding Training Data

In [9]:
X_test = df_test[data].apply(lambda x: literal_eval(x))
y_test = df_test[labels]

## Logistic Regression with OneVsRest

In [10]:
lr_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x)), 
    ('tfidf', TfidfTransformer()),  
    ('ovr-lr', OneVsRestClassifier(LogisticRegression(random_state=42))) ])

In [11]:
lr_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function <lambda> at 0x7fd7038beaf0>,
                                 max_features=15000, min_df=5)),
                ('tfidf', TfidfTransformer()),
                ('ovr-lr',
                 OneVsRestClassifier(estimator=LogisticRegression(random_state=42)))])

In [12]:
y_pred = lr_pipeline.predict(X_test)
print(classification_report(y_test,y_pred ))

              precision    recall  f1-score   support

           0       0.65      0.73      0.68      6090
           1       0.39      0.31      0.34       367
           2       0.75      0.64      0.69      3691
           3       0.40      0.15      0.22       211
           4       0.73      0.53      0.61      3427
           5       0.68      0.26      0.38       712

   micro avg       0.68      0.61      0.65     14498
   macro avg       0.60      0.43      0.49     14498
weighted avg       0.68      0.61      0.64     14498
 samples avg       0.07      0.06      0.06     14498



## Hyper-Parameters tuning

#### Coarse RandomizedSearch

In [20]:
lr_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x)), 
    ('tfidf', TfidfTransformer()),  
    ('ovr-lr', OneVsRestClassifier(LogisticRegression(solver='newton-cg',max_iter=100,random_state=42))) ])

In [21]:
lr_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'ovr-lr', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'ovr-lr__estimator__C', 'ovr-lr__estimator__class_weight', 'ovr-lr__estimator__dual', 'ovr-lr__estimator__fit_intercept', 'ovr-lr__estimator__intercept_scaling', 'ovr-lr__estimator__l1_ratio', 'ovr-lr__estimator__max_iter', 'ovr-lr__estimator__multi_class', 'ovr-lr__estimator__n_jobs', 'ovr-lr__estimator__penalty', 'ovr-lr__estimator__random_state', 'ovr-lr__estimator__solver', 'ovr-lr__estimator__tol', 'ovr-lr__estimator__verbose', 'ovr-lr__estimator__warm_start', 'ovr-lr__estimator', 'ovr-lr__n_jobs'])

In [22]:
params = {
          'ovr-lr__estimator__C': [1, 2, 3, 5],
         }

In [23]:
rs = RandomizedSearchCV(estimator=lr_pipeline, param_distributions=params, n_jobs=-1, cv=4)
rs_results = rs.fit(X_train, y_train)

In [24]:
print(f"best parameters:\n{rs_results.best_params_}\naccuracy: {rs_results.best_score_}")

best parameters:
{'ovr-lr__estimator__C': 2}
accuracy: 0.9205557415218367


#### Fine RandomizedSearch

In [25]:
params = {'ovr-lr__estimator__C': list(np.arange(1.8,2.4,0.01)),}

In [26]:
rs = RandomizedSearchCV(estimator=lr_pipeline, param_distributions=params, n_jobs=-1, cv=4)
rs_results = rs.fit(X_train, y_train)

In [27]:
print(f"Best parameters:\n{rs_results.best_params_}\naccuracy (training set): {rs_results.best_score_}")

Best parameters:
{'ovr-lr__estimator__C': 2.3100000000000005}
accuracy (training set): 0.9205808074765545


### Retrain and test the model

In [28]:
lr_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x)), 
    ('tfidf', TfidfTransformer()),  
    ('ovr-lr', OneVsRestClassifier(LogisticRegression(C=2.31, random_state=42))) ])

In [29]:
lr_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function <lambda> at 0x7fd7038beca0>,
                                 max_features=15000, min_df=5)),
                ('tfidf', TfidfTransformer()),
                ('ovr-lr',
                 OneVsRestClassifier(estimator=LogisticRegression(C=2.31,
                                                                  random_state=42)))])

In [30]:
y_pred = lr_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.76      0.68      6090
           1       0.38      0.34      0.36       367
           2       0.72      0.67      0.69      3691
           3       0.42      0.21      0.28       211
           4       0.71      0.55      0.62      3427
           5       0.65      0.31      0.42       712

   micro avg       0.65      0.65      0.65     14498
   macro avg       0.58      0.47      0.51     14498
weighted avg       0.66      0.65      0.64     14498
 samples avg       0.07      0.06      0.06     14498

