## SVM Multilabel Classifier

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

import warnings 
warnings.filterwarnings("ignore")

from ast import literal_eval

In [22]:
pd.options.display.max_colwidth = 63

### Import Training/Test Set 

In [23]:
df_train = pd.read_csv('../train_cleaned.zip', compression='zip')
df_train.shape

(159571, 16)

In [24]:
df_train.iloc[[1]]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,tokenized,tokenized_no_stopwords,tokenized_stemmed,tokenized_pos,tokenized_lemmatized,tokenized_lemmatized_pos,tokenized_bigram
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm seemingly stuc...,0,0,0,0,0,0,daww he matches this background colour i am seemingly stuck...,"['daww', 'he', 'matches', 'this', 'background', 'colour', '...","['daww', 'matches', 'background', 'colour', 'seemingly', 's...","['daww', 'match', 'background', 'colour', 'seemingli', 'stu...","[('daww', 'NN'), ('matches', 'NNS'), ('background', 'IN'), ...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","[('daww', 'matches'), ('matches', 'background'), ('backgrou..."


In [25]:
df_test = pd.read_csv('../test_cleaned.zip', compression='zip')
df_test.shape

(63978, 16)

## Text Encoding

In [26]:
feature = 'tokenized_stemmed'
labels = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

##### Encoding Training Data

In [27]:
X_train = df_train[feature].apply(lambda x: literal_eval(x))
y_train = df_train[labels]

In [28]:
X_train[1]

['daww',
 'match',
 'background',
 'colour',
 'seemingli',
 'stuck',
 'thank',
 'talk',
 '2151',
 'januari',
 '11',
 '2016',
 'utc']

In [29]:
y_train.loc[1,:]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 1, dtype: int64

##### Encoding Test Data

In [30]:
X_test = df_test[feature].apply(lambda x: literal_eval(x))
y_test = df_test[labels]

In [31]:
df_train["tags"] = "" 

for label in labels:
    df_train.loc[df_train[label]==1, 'tags'] = df_train['tags']+' '+label

In [32]:
tags = dict(df_train.tags.value_counts())
tags

{'': 143346,
 ' toxic': 5666,
 ' toxic obscene insult': 3800,
 ' toxic obscene': 1758,
 ' toxic insult': 1215,
 ' toxic severe_toxic obscene insult': 989,
 ' toxic obscene insult identity_hate': 618,
 ' obscene': 317,
 ' insult': 301,
 ' toxic severe_toxic obscene insult identity_hate': 265,
 ' obscene insult': 181,
 ' toxic severe_toxic obscene': 158,
 ' toxic identity_hate': 136,
 ' toxic insult identity_hate': 134,
 ' toxic obscene threat insult': 131,
 ' toxic threat': 113,
 ' toxic severe_toxic obscene threat insult': 64,
 ' toxic obscene threat insult identity_hate': 56,
 ' identity_hate': 54,
 ' toxic severe_toxic': 41,
 ' toxic obscene identity_hate': 35,
 ' toxic severe_toxic obscene threat insult identity_hate': 31,
 ' insult identity_hate': 28,
 ' threat': 22,
 ' obscene insult identity_hate': 18,
 ' toxic threat insult': 16,
 ' toxic severe_toxic insult': 14,
 ' toxic obscene threat': 11,
 ' toxic severe_toxic threat': 11,
 ' toxic threat identity_hate': 7,
 ' toxic severe_

## LinearSVC with OneVsRest

In [33]:
#Classi di minoranza -> si cerca di agire solo su queste
to_resample = ['severe_toxic', 'threat', 'identity_hate']

In [34]:
vect = CountVectorizer(min_df=5, analyzer=lambda x: x, ngram_range=(1, 2)) 
X_train_tok = vect.fit_transform(X_train)
X_test_tok = vect.transform(X_test)

tfidf = TfidfTransformer() 
X_train_vec = tfidf. fit_transform(X_train_tok)
X_test_vec = tfidf. fit_transform(X_test_tok)

In [35]:
preds = list()

for label in labels:
    print(label)
    y_train = df_train[label].values

    if label in to_resample:
        ros = RandomOverSampler(random_state=42, sampling_strategy=0.3)
        X_resampled, y_resampled = ros.fit_resample(X_train_vec, y_train)
    else:
        #rus = RandomUnderSampler(random_state=42)
        #X_resampled, y_resampled = rus.fit_resample(X_train_vec, y_train)
        
        X_resampled, y_resampled = X_train_vec, y_train

    print(sorted(Counter(y_resampled).items()))
    
  svm = LinearSVC(C=0.5, random_state=42)
    svm.fit(X_resampled, y_resampled)
    
    preds.append(svm.predict(X_test_vec))

toxic
[(0, 144277), (1, 15294)]
severe_toxic
[(0, 157976), (1, 47392)]
obscene
[(0, 151122), (1, 8449)]
threat
[(0, 159093), (1, 47727)]
insult
[(0, 151694), (1, 7877)]
identity_hate
[(0, 158166), (1, 47449)]


In [18]:
preds[0], preds[1], preds[2], preds[3], preds[4], preds[5]

(array([0, 0, 0, ..., 1, 1, 0]),
 array([0, 0, 0, ..., 0, 1, 0]),
 array([0, 0, 0, ..., 0, 1, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 1, 0]),
 array([0, 0, 0, ..., 1, 1, 0]))

In [19]:
y_pred = np.column_stack([preds[0], preds[1], preds[2], preds[3], preds[4], preds[5]])
y_pred

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [1, 0, 0, 0, 0, 1],
       [1, 1, 1, 0, 1, 1],
       [0, 0, 0, 0, 0, 0]])

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.76      0.68      6090
           1       0.16      0.76      0.26       367
           2       0.71      0.66      0.69      3691
           3       0.24      0.56      0.34       211
           4       0.71      0.53      0.61      3427
           5       0.29      0.62      0.40       712

   micro avg       0.57      0.67      0.61     14498
   macro avg       0.46      0.65      0.50     14498
weighted avg       0.63      0.67      0.64     14498
 samples avg       0.06      0.06      0.06     14498

