## SVM Multilabel Classifier

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report

import warnings 
warnings.filterwarnings("ignore")

from ast import literal_eval

In [3]:
pd.options.display.max_colwidth = 63

### Import Training/Test Set 

In [4]:
df_train = pd.read_csv('../train_cleaned.zip', compression='zip')
df_train.shape

(159571, 16)

In [5]:
df_train.iloc[[1]]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,tokenized,tokenized_no_stopwords,tokenized_stemmed,tokenized_pos,tokenized_lemmatized,tokenized_lemmatized_pos,tokenized_bigram
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm seemingly stuc...,0,0,0,0,0,0,daww he matches this background colour i am seemingly stuck...,"['daww', 'he', 'matches', 'this', 'background', 'colour', '...","['daww', 'matches', 'background', 'colour', 'seemingly', 's...","['daww', 'match', 'background', 'colour', 'seemingli', 'stu...","[('daww', 'NN'), ('matches', 'NNS'), ('background', 'IN'), ...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","[('daww', 'matches'), ('matches', 'background'), ('backgrou..."


In [6]:
df_test = pd.read_csv('../test_cleaned.zip', compression='zip')
df_test.shape

(63978, 16)

## Text Encoding

In [7]:
data = 'tokenized_stemmed'
labels = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

##### Encoding Training Data

In [8]:
X_train = df_train[data].apply(lambda x: literal_eval(x))
y_train = df_train[labels]

In [9]:
X_train[1]

['daww',
 'match',
 'background',
 'colour',
 'seemingli',
 'stuck',
 'thank',
 'talk',
 '2151',
 'januari',
 '11',
 '2016',
 'utc']

In [10]:
y_train.loc[1,:]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 1, dtype: int64

##### Encoding Test Data

In [11]:
X_test = df_test[data].apply(lambda x: literal_eval(x))
y_test = df_test[labels]

## LinearSVC with OneVsRest

In [12]:
lsvc_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x)), 
    ('tfidf', TfidfTransformer()),  
    ('ovr-svc', OneVsRestClassifier(LinearSVC(random_state=42))) ])

In [13]:
lsvc_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function <lambda> at 0x7f9a18e04430>,
                                 max_features=15000, min_df=5)),
                ('tfidf', TfidfTransformer()),
                ('ovr-svc',
                 OneVsRestClassifier(estimator=LinearSVC(random_state=42)))])

In [14]:
y_pred = lsvc_pipeline.predict(X_test)
print(classification_report(y_test,y_pred ))

              precision    recall  f1-score   support

           0       0.58      0.79      0.67      6090
           1       0.38      0.34      0.36       367
           2       0.68      0.69      0.68      3691
           3       0.39      0.25      0.30       211
           4       0.67      0.57      0.62      3427
           5       0.64      0.33      0.44       712

   micro avg       0.62      0.67      0.64     14498
   macro avg       0.56      0.49      0.51     14498
weighted avg       0.62      0.67      0.64     14498
 samples avg       0.07      0.06      0.06     14498



## Hyper-Parameters tuning (coarse+fine gridsearch)

#### Coarse Search

In [15]:
lsvc_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x)), 
    ('tfidf', TfidfTransformer()),  
    ('ovr-svc', OneVsRestClassifier(LinearSVC(random_state=42))) ])

In [74]:
lsvc_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'ovr-svc', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'ovr-svc__estimator__C', 'ovr-svc__estimator__class_weight', 'ovr-svc__estimator__dual', 'ovr-svc__estimator__fit_intercept', 'ovr-svc__estimator__intercept_scaling', 'ovr-svc__estimator__loss', 'ovr-svc__estimator__max_iter', 'ovr-svc__estimator__multi_class', 'ovr-svc__estimator__penalty', 'ovr-svc__estimator__random_state', 'ovr-svc__estimator__tol', 'ovr-svc__estimator__verbose', 'ovr-svc__estimator', 'ovr-svc__n_jobs'])

In [75]:
params = {'vect__ngram_range': [(1, 1), (1, 2),(1,3)],
          'tfidf__use_idf': (True, False),
          'ovr-svc__estimator__C': [0.2, 0.5, 1, 2, 3, 5],
         }

In [77]:
rs = RandomizedSearchCV(estimator=lsvc_pipeline, param_distributions=params, 
                        n_jobs=-1, cv=4, scoring='f1_micro')
rs_results = rs.fit(X_train, y_train)

In [78]:
print(f"Best parameters:\n{rs_results.best_params_}\nf1 weighted avg(training set): {rs_results.best_score_}")


Best parameters:
{'vect__ngram_range': (1, 3), 'tfidf__use_idf': False, 'ovr-svc__estimator__C': 2}
f1 weighted avg(training set): 0.7179229165420002


#### Fine GrideSearchCV

In [53]:
params = {'vect__ngram_range': [(1, 1), (1, 2),(1,3)],
          'tfidf__use_idf': (True, False),
          'ovr-svc__estimator__C': list(np.arange(0.5,3,0.05)),
         }

In [54]:
rs = RandomizedSearchCV(estimator=lsvc_pipeline, param_distributions=params, 
                        n_jobs=-1, cv=4, scoring='f1_weighted')
rs_results = rs.fit(X_train, y_train)

In [55]:
print(f"Best parameters:\n{rs_results.best_params_}\nf1 weighted avg(training set): {rs_results.best_score_}")

Best parameters:
{'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'ovr-svc__estimator__C': 1.750000000000001}
f1 weighted avg(training set): 0.71128864681083


### Retrain and test the model

In [57]:
lsvc_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x,ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=False)), 
    ('ovr-svc', OneVsRestClassifier(LinearSVC(C=1.75, random_state=42))) ]) 

In [58]:
lsvc_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function <lambda> at 0x7fbb24f89f70>,
                                 max_features=15000, min_df=5,
                                 ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('ovr-svc',
                 OneVsRestClassifier(estimator=LinearSVC(C=1.75,
                                                         random_state=42)))])

In [59]:
y_pred = lsvc_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.80      0.67      6090
           1       0.38      0.35      0.36       367
           2       0.67      0.69      0.68      3691
           3       0.37      0.29      0.33       211
           4       0.67      0.58      0.62      3427
           5       0.62      0.36      0.46       712

   micro avg       0.61      0.68      0.64     14498
   macro avg       0.55      0.51      0.52     14498
weighted avg       0.62      0.68      0.64     14498
 samples avg       0.07      0.06      0.06     14498

