# CUNEF MUCD 2021/2022
## News Classification
Autor:  
- Antonio Tello Gómez

# 3. Hyperparameter Optimization

In [1]:
#Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


#Functionalities
from collections import Counter
import sys, os
import warnings
warnings.filterwarnings('ignore')

#NLP
import string
import re
import nltk


# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV

#Metrics
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, roc_auc_score, roc_curve

# Custom Transformer
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../src'))
from src.Preprocessor import TextPreprocessor

# Models
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier

In [4]:
def evaluate_model(ytest, ypred, ypred_proba = None):
    if ypred_proba is not None:
        print('ROC-AUC score of the model: {}'.format(roc_auc_score(ytest, ypred_proba[:, 1])))
    print('Accuracy of the model: {}\n'.format(accuracy_score(ytest, ypred)))
    print('Classification report: \n{}\n'.format(classification_report(ytest, ypred)))
    print('Confusion matrix: \n{}\n'.format(confusion_matrix(ytest, ypred)))

## Load the data

In [5]:
xtrain = pd.read_csv('../data/train/X_train.csv')['full_text']
ytrain = pd.read_csv('../data/train/y_train.csv')['label']
xtest = pd.read_csv('../data/test/X_test.csv')['full_text']
ytest = pd.read_csv('../data/test/y_test.csv')['label']

## LightGBM

In [14]:
pipe = Pipeline(steps=[
    ("preprocessor", TextPreprocessor()),
    ("vectorizer", TfidfVectorizer()),
    ("clf", LGBMClassifier(random_state=2022))]) 

In [16]:
#Hyperparameters for lightgbm
params = {
    "preprocessor__remove_numbers": [True, False],
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1,3)],
    #"vectorizer__max_features": [None, 5000, 10000, 50000],
    #"vectorizer__use_idf": [True, False],
    #"vectorizer__smooth_idf": [True, False],
    "vectorizer__min_df": [0.01],
    "vectorizer__max_df": [0.995],
    "vectorizer__norm": ["l1", "l2"],
    'clf__learning_rate': [0.1, 0.05], 
    #'clf__n_estimators' : [200, 300, 400], 
    'clf__importance_type' : ['split']
}

In [16]:
%%time
lgbm = GridSearchCV(pipe, params, n_jobs=-1, verbose=1, cv=5, scoring='accuracy')
lgbm.fit(xtrain, ytrain)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: total: 1min 24s
Wall time: 12min 4s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor', TextPreprocessor()),
                                       ('vectorizer', TfidfVectorizer()),
                                       ('clf',
                                        LGBMClassifier(random_state=2022))]),
             n_jobs=-1,
             param_grid={'clf__importance_type': ['split'],
                         'clf__learning_rate': [0.1, 0.05],
                         'preprocessor__remove_numbers': [True, False],
                         'vectorizer__max_df': [0.995],
                         'vectorizer__min_df': [0.01],
                         'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
                         'vectorizer__norm': ['l1', 'l2']},
             scoring='accuracy', verbose=1)

In [17]:
best_parameters = lgbm.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__importance_type: 'split'
	clf__learning_rate: 0.1
	preprocessor__remove_numbers: False
	vectorizer__max_df: 0.995
	vectorizer__min_df: 0.01
	vectorizer__ngram_range: (1, 2)
	vectorizer__norm: 'l2'


In [24]:
pickle.dump(lgbm, open('../models/' + 'optim_lgbm' + '.pkl', 'wb'))

In [14]:
lgbm = pickle.load(open('../models/' + 'optim_lgbm' + '.pkl', 'rb'))

In [11]:
%%time
ypred = lgbm.predict(xtest)
ypred_proba = lgbm.predict_proba(xtest)

CPU times: total: 14.8 s
Wall time: 13.7 s


In [12]:
evaluate_model(ytest,ypred, ypred_proba)

ROC-AUC score of the model: 0.9860422175471104
Accuracy of the model: 0.9435637285986049

Classification report: 
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       789
           1       0.94      0.95      0.94       788

    accuracy                           0.94      1577
   macro avg       0.94      0.94      0.94      1577
weighted avg       0.94      0.94      0.94      1577


Confusion matrix: 
[[741  48]
 [ 41 747]]



### Adjusting Threshold 

One of the advantages of the LightGBMClassifier compared to the SGDClassifier (with loss hinge) is that we can predict probabilities and hence, play with the threshold.

The Geometric Mean or G-Mean is a metric for imbalanced classification that, if optimized, will seek a balance between the sensitivity and the specificity.  
G-Mean = sqrt(Sensitivity * Specificity)

In [13]:
# keep probabilities for the positive outcome only
yhat = ypred_proba[:, 1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(ytest, yhat)

gmeans = np.sqrt(tpr * (1-fpr))
# locate the index of the largest g-mean
ix = np.argmax(gmeans)
print('Best Threshold=%f, G-Mean=%.3f' % (thresholds[ix], gmeans[ix]))

ypred_new_threshold = (ypred_proba[:,1]>thresholds[ix]).astype(int)
evaluate_model(ytest,ypred_new_threshold,ypred_proba)

Best Threshold=0.525891, G-Mean=0.945
ROC-AUC score of the model: 0.9860422175471104
Accuracy of the model: 0.9448319594166138

Classification report: 
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       789
           1       0.94      0.95      0.94       788

    accuracy                           0.94      1577
   macro avg       0.94      0.94      0.94      1577
weighted avg       0.94      0.94      0.94      1577


Confusion matrix: 
[[745  44]
 [ 43 745]]



## SGDClassifier

In [46]:
pipe = Pipeline(steps=[
    ("preprocessor", TextPreprocessor()),
    ("vectorizer", TfidfVectorizer()),
    ("clf", SGDClassifier(random_state=2022))]) 

In [47]:
#Hyperparameters for sgd
params = {
    #"preprocessor__remove_numbers": [True, False],      #Removing numbers does not improve score
    "vectorizer__min_df": [0.005],
    "vectorizer__max_df": [0.995],
    "vectorizer__ngram_range": [(1, 1), (1, 2), (1,3)],
    "vectorizer__norm": ["l1", "l2"],
    "clf__alpha": [0.0001, 0.001],
    "clf__penalty": ["l2", "elasticnet"],
    "clf__loss": ["hinge", "log", "squared_hinge"],
}

In [48]:
%%time
sgd = GridSearchCV(pipe, params, n_jobs=-1, verbose=1, cv=5, scoring='accuracy')
sgd.fit(xtrain, ytrain)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
CPU times: total: 52.6 s
Wall time: 22min 14s


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('preprocessor', TextPreprocessor()),
                                       ('vectorizer', TfidfVectorizer()),
                                       ('clf',
                                        SGDClassifier(random_state=2022))]),
             n_jobs=-1,
             param_grid={'clf__alpha': [0.0001, 0.001],
                         'clf__loss': ['hinge', 'log', 'squared_hinge'],
                         'clf__penalty': ['l2', 'elasticnet'],
                         'vectorizer__max_df': [0.995],
                         'vectorizer__min_df': [0.005],
                         'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
                         'vectorizer__norm': ['l1', 'l2']},
             scoring='accuracy', verbose=1)

In [49]:
best_parameters = sgd.best_estimator_.get_params()
for param_name in sorted(params.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	clf__alpha: 0.0001
	clf__loss: 'hinge'
	clf__penalty: 'elasticnet'
	vectorizer__max_df: 0.995
	vectorizer__min_df: 0.005
	vectorizer__ngram_range: (1, 3)
	vectorizer__norm: 'l2'


In [53]:
pickle.dump(sgd, open('../models/' + 'optim_sgd' + '.pkl', 'wb'))

In [2]:
sgd = pickle.load(open('../models/' + 'optim_sgd' + '.pkl', 'rb'))

In [6]:
%%time
ypred = sgd.predict(xtest)

CPU times: total: 9.06 s
Wall time: 9.29 s


In [7]:
evaluate_model(ytest,ypred)

Accuracy of the model: 0.9441978440076094

Classification report: 
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       789
           1       0.94      0.95      0.94       788

    accuracy                           0.94      1577
   macro avg       0.94      0.94      0.94      1577
weighted avg       0.94      0.94      0.94      1577


Confusion matrix: 
[[741  48]
 [ 40 748]]



## Conclusion

We will move forward with the LightGBMClassifier even though it has a slightly lower accuracy it gives us more flexibility as we can play with the threshold.