## Adaboost Multilabel Classifier

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import confusion_matrix, classification_report
from ast import literal_eval
import warnings 
warnings.filterwarnings("ignore")

pd.options.display.max_colwidth = 63

## Import Training/Test Set 

In [2]:
df_train = pd.read_csv('../train_cleaned.zip', compression='zip')
df_train.shape

(159571, 16)

In [3]:
df_train.iloc[[1]]

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_text,tokenized,tokenized_no_stopwords,tokenized_stemmed,tokenized_pos,tokenized_lemmatized,tokenized_lemmatized_pos,tokenized_bigram
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm seemingly stuc...,0,0,0,0,0,0,daww he matches this background colour i am seemingly stuck...,"['daww', 'he', 'matches', 'this', 'background', 'colour', '...","['daww', 'matches', 'background', 'colour', 'seemingly', 's...","['daww', 'match', 'background', 'colour', 'seemingli', 'stu...","[('daww', 'NN'), ('matches', 'NNS'), ('background', 'IN'), ...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","['daww', 'match', 'background', 'colour', 'seemingly', 'stu...","[('daww', 'matches'), ('matches', 'background'), ('backgrou..."


In [4]:
df_test = pd.read_csv('../test_cleaned.zip', compression='zip')
df_test.shape

(63978, 16)

## Text Encoding

In [5]:
data = 'tokenized_stemmed'
labels = ['toxic', 'severe_toxic', 'obscene', 'threat','insult', 'identity_hate']

#### Encoding Training Data

In [6]:
X_train = df_train[data].apply(lambda x: literal_eval(x))
y_train = df_train[labels]

In [7]:
X_train[1]

['daww',
 'match',
 'background',
 'colour',
 'seemingli',
 'stuck',
 'thank',
 'talk',
 '2151',
 'januari',
 '11',
 '2016',
 'utc']

In [8]:
y_train.loc[1,:]

toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
Name: 1, dtype: int64

#### Encoding Training Data

In [9]:
X_test = df_test[data].apply(lambda x: literal_eval(x))
y_test = df_test[labels]

## Adaboost with OneVsRest

In [10]:
ada_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x)),   
    ('tfidf', TfidfTransformer()),  
    ('ovr-ada', OneVsRestClassifier(AdaBoostClassifier(n_estimators=100, random_state=42))) ])

In [11]:
ada_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function <lambda> at 0x7f7efce52160>,
                                 max_features=15000, min_df=5)),
                ('tfidf', TfidfTransformer()),
                ('ovr-ada',
                 OneVsRestClassifier(estimator=AdaBoostClassifier(n_estimators=100,
                                                                  random_state=42)))])

In [12]:
y_pred = ada_pipeline.predict(X_test)
print(classification_report(y_test,y_pred ))

              precision    recall  f1-score   support

           0       0.62      0.67      0.65      6090
           1       0.36      0.41      0.38       367
           2       0.71      0.59      0.64      3691
           3       0.32      0.30      0.31       211
           4       0.68      0.44      0.53      3427
           5       0.53      0.33      0.41       712

   micro avg       0.63      0.57      0.60     14498
   macro avg       0.54      0.46      0.49     14498
weighted avg       0.64      0.57      0.59     14498
 samples avg       0.06      0.05      0.05     14498



## Hyper-Parameters tuning

In [10]:
ada_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x)),  
    ('tfidf', TfidfTransformer()),  
    ('ovr-ada', OneVsRestClassifier(AdaBoostClassifier(random_state=42))) ])

In [11]:
ada_pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'vect', 'tfidf', 'ovr-ada', 'vect__analyzer', 'vect__binary', 'vect__decode_error', 'vect__dtype', 'vect__encoding', 'vect__input', 'vect__lowercase', 'vect__max_df', 'vect__max_features', 'vect__min_df', 'vect__ngram_range', 'vect__preprocessor', 'vect__stop_words', 'vect__strip_accents', 'vect__token_pattern', 'vect__tokenizer', 'vect__vocabulary', 'tfidf__norm', 'tfidf__smooth_idf', 'tfidf__sublinear_tf', 'tfidf__use_idf', 'ovr-ada__estimator__algorithm', 'ovr-ada__estimator__base_estimator', 'ovr-ada__estimator__learning_rate', 'ovr-ada__estimator__n_estimators', 'ovr-ada__estimator__random_state', 'ovr-ada__estimator', 'ovr-ada__n_jobs'])

In [12]:
params = {'vect__ngram_range': [(1, 1), (1, 2)],
          'tfidf__use_idf': (True, False),
          'ovr-ada__estimator__learning_rate':list(np.arange(0.1,1.0,0.1)),
          'ovr-ada__estimator__n_estimators':list(np.arange(30,110,10)),
         }

In [13]:
rs = RandomizedSearchCV(estimator=ada_pipeline, param_distributions=params, 
                        n_jobs=-1, cv=4, scoring='f1_weighted')
rs_results = rs.fit(X_train, y_train)

In [14]:
print(f"best parameters:\n{rs_results.best_params_}\nf1 weighted avg(training set): {rs_results.best_score_}")

best parameters:
{'vect__ngram_range': (1, 2), 'tfidf__use_idf': False, 'ovr-ada__estimator__n_estimators': 100, 'ovr-ada__estimator__learning_rate': 0.9}
f1 weighted avg(training set): 0.6460597472005528


### Retrain and test the model

In [15]:
ada_pipeline = Pipeline([
    ('vect', CountVectorizer(min_df=5, max_features=15000, analyzer=lambda x: x,ngram_range=(1, 2))),   
    ('tfidf', TfidfTransformer(use_idf=False)), 
    ('ovr-ada', OneVsRestClassifier(AdaBoostClassifier(n_estimators=100, learning_rate=0.9, random_state=42))) ])

In [16]:
ada_pipeline.fit(X_train,y_train)

Pipeline(steps=[('vect',
                 CountVectorizer(analyzer=<function <lambda> at 0x7fbc77448dc0>,
                                 max_features=15000, min_df=5,
                                 ngram_range=(1, 2))),
                ('tfidf', TfidfTransformer(use_idf=False)),
                ('ovr-ada',
                 OneVsRestClassifier(estimator=AdaBoostClassifier(learning_rate=0.9,
                                                                  n_estimators=100,
                                                                  random_state=42)))])

In [17]:
y_pred = ada_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.63      0.64      0.64      6090
           1       0.36      0.39      0.37       367
           2       0.72      0.57      0.64      3691
           3       0.36      0.27      0.31       211
           4       0.68      0.45      0.54      3427
           5       0.59      0.35      0.44       712

   micro avg       0.65      0.55      0.60     14498
   macro avg       0.56      0.45      0.49     14498
weighted avg       0.65      0.55      0.59     14498
 samples avg       0.06      0.05      0.05     14498

