## SVM Multilabel Classifier

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report

import warnings 
warnings.filterwarnings("ignore")

from ast import literal_eval

In [2]:
pd.options.display.max_colwidth = 63

### Import Training/Test Set 

In [3]:
df_train = pd.read_csv('../data/train_cleaned.zip', compression='zip')
df_train.shape

(159571, 17)

In [4]:
df_train.iloc[[1]]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxicity,clean_text,tokenized,tokenized_no_stopwords,tokenized_stemmed,tokenized_pos,tokenized_lemmatized,tokenized_lemmatized_pos,tokenized_bigram
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm seemingly stuc...,0,0,0,0,0,0,0,daww he matches this background colour i am seemingly stuck...,"['daww', 'he', 'matches', 'this', 'background', 'colour', '...","['daww', 'matches', 'background', 'colour', 'seemingly', 's...","['daww', 'match', 'background', 'colour', 'seemingli', 'stu...","[('daww', 'NN'), ('matches', 'NNS'), ('background', 'IN'), ...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","[('daww', 'matches'), ('matches', 'background'), ('backgrou..."


In [5]:
df_test = pd.read_csv('../data/test_cleaned.zip', compression='zip')
df_test.shape

(63978, 17)

## Training set resampling

In [15]:
np.random.seed(42)

In [43]:
temp_df = df_train.iloc[np.random.choice(np.where(df_train['severe_toxic']==1)[0], size=10000)]
df_resampled = df_train.append(temp_df, ignore_index=True)

temp_df = df_train.iloc[np.random.choice(np.where(df_train['threat']==1)[0], size=10000)]
df_resampled = df_resampled.append(temp_df, ignore_index=True)

temp_df = df_train.iloc[np.random.choice(np.where(df_train['identity_hate']==1)[0], size=10000)]
df_resampled = df_resampled.append(temp_df, ignore_index=True)

## Text Encoding

In [44]:
data = 'tokenized_stemmed'
labels = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

##### Encoding Training Data

In [45]:
X_train = df_resampled[data].apply(lambda x: literal_eval(x))
y_train = df_resampled[labels]

In [46]:
X_train[1]

['daww',
 'match',
 'background',
 'colour',
 'seemingli',
 'stuck',
 'thank',
 'talk',
 '2151',
 'januari',
 '11',
 '2016',
 'utc']

In [47]:
y_train.loc[1,:]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 1, dtype: int64

##### Encoding Test Data

In [48]:
X_test = df_test[data].apply(lambda x: literal_eval(x))
y_test = df_test[labels]

## LinearSVC with OneVsRest

In [49]:
lsvc_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, analyzer=lambda x: x)), 
    ('tfidf', TfidfTransformer()),  
    ('ovr-svc', OneVsRestClassifier(LinearSVC(random_state=42))) ])

In [50]:
lsvc_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function <lambda> at 0x7efe97a8f130>,
                                 min_df=5)),
                ('tfidf', TfidfTransformer()),
                ('ovr-svc',
                 OneVsRestClassifier(estimator=LinearSVC(random_state=42)))])

In [51]:
y_pred = lsvc_pipeline.predict(X_test)
print(classification_report(y_test,y_pred ))

              precision    recall  f1-score   support

           0       0.57      0.80      0.66      6090
           1       0.21      0.51      0.29       367
           2       0.66      0.69      0.67      3691
           3       0.27      0.42      0.33       211
           4       0.60      0.56      0.58      3427
           5       0.40      0.48      0.44       712

   micro avg       0.56      0.68      0.62     14498
   macro avg       0.45      0.58      0.50     14498
weighted avg       0.58      0.68      0.62     14498
 samples avg       0.07      0.06      0.06     14498



## Hyper-Parameters tuning (coarse+fine gridsearch)

#### Coarse Search

In [52]:
lsvc_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, analyzer=lambda x: x)), 
    ('tfidf', TfidfTransformer()),  
    ('ovr-svc', OneVsRestClassifier(LinearSVC(random_state=42))) ])

In [53]:
lsvc_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'ovr-svc', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'ovr-svc__estimator__C', 'ovr-svc__estimator__class_weight', 'ovr-svc__estimator__dual', 'ovr-svc__estimator__fit_intercept', 'ovr-svc__estimator__intercept_scaling', 'ovr-svc__estimator__loss', 'ovr-svc__estimator__max_iter', 'ovr-svc__estimator__multi_class', 'ovr-svc__estimator__penalty', 'ovr-svc__estimator__random_state', 'ovr-svc__estimator__tol', 'ovr-svc__estimator__verbose', 'ovr-svc__estimator', 'ovr-svc__n_jobs'])

In [119]:
params = {
    'vect__ngram_range': [(1, 1), (1, 2), (1,3)],
    'tfidf__use_idf': (True, False),
    'ovr-svc__estimator__C': [0.0001, 00.1, 0.1, 1.0],
}

In [120]:
rs = RandomizedSearchCV(estimator=lsvc_pipeline, param_distributions=params, scoring='f1_weighted',
                            n_jobs=-1, cv=3, verbose=4)
rs_results = rs.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [123]:
print(f"Best parameters:\n{rs_results.best_params_}\nf1_weighted (training set): {rs_results.best_score_}")

Best parameters:
{'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'ovr-svc__estimator__C': 1.0}
f1_weighted (training set): 0.7825954938904663


#### Fine GrideSearchCV

params = {'vect__ngram_range': [(1, 1), (1, 2),(1,3)],
          'tfidf__use_idf': (True, False),
          'ovr-svc__estimator__C': list(np.arange(0.25,0.75,0.02)),
         }

### Retrain and test the model

In [130]:
lsvc_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, analyzer=lambda x: x, ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)), 
    ('ovr-svc', OneVsRestClassifier(LinearSVC(C=1.0, random_state=42))) ]) 

In [131]:
lsvc_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function <lambda> at 0x7efe60f381f0>,
                                 min_df=5)),
                ('tfidf', TfidfTransformer()),
                ('ovr-svc',
                 OneVsRestClassifier(estimator=LinearSVC(random_state=42)))])

In [132]:
y_pred = lsvc_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.80      0.66      6090
           1       0.21      0.51      0.29       367
           2       0.66      0.69      0.67      3691
           3       0.27      0.42      0.33       211
           4       0.60      0.56      0.58      3427
           5       0.40      0.48      0.44       712

   micro avg       0.56      0.68      0.62     14498
   macro avg       0.45      0.58      0.50     14498
weighted avg       0.58      0.68      0.62     14498
 samples avg       0.07      0.06      0.06     14498

