# CUNEF MUCD 2021/2022
## News Classification
Author:  
- Antonio Tello Gómez

# 2. Model Selection

In [2]:
#Basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle


#Functionalities
from collections import Counter
import sys, os
import warnings
warnings.filterwarnings('ignore')

#NLP
import string
import re
import nltk


# Machine Learning
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import train_test_split

#Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Custom Transformer
sys.path.append(os.path.abspath('..'))
sys.path.append(os.path.abspath('../src'))
from src.Preprocessor import TextPreprocessor

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import SGDClassifier

## Load the Data

In [45]:
# Load the dataset written by the cleaning step.
# NOTE(review): the original note says the un-preprocessed text is what we
# feed to the models, so that the custom TextPreprocessor is exercised —
# presumably the `full_text` column holds that raw text; confirm.
df = pd.read_csv('../data/fake_or_real_news_clean.csv')

## Train Test Split

In [48]:
# Hold out 25% of the articles for testing, keeping the label proportions
# identical in both partitions (stratified) and fixing the seed for repeatability.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['full_text'],
    df['label'],
    test_size=0.25,
    stratify=df['label'],
    random_state=2022,
)

In [49]:
# Persist every partition to its own folder (train/ and test/).
# Each entry is: (series to write, destination file, extra to_csv options).
# The label series get an explicit 'label' header; the text series keep their own.
exports = (
    (xtrain, '../data/train/X_train.csv', {}),
    (ytrain, '../data/train/y_train.csv', {'header': ['label']}),
    (xtest, '../data/test/X_test.csv', {}),
    (ytest, '../data/test/y_test.csv', {'header': ['label']}),
)
for series, destination, extra in exports:
    series.to_csv(destination, index=False, **extra)

## Base Model

The base model is a dummy classifier that always predicts the most frequent class, i.e., it labels every news article as real.  
It serves as a baseline against which the performance of the other models is compared.

In [50]:
# Baseline pipeline: same preprocessing and TF-IDF steps as the real models,
# but the final estimator simply predicts the most frequent training class.
base_model = Pipeline(steps=[
    ("preprocessor", TextPreprocessor()),
    ("vectorizer", TfidfVectorizer()),
    ("clf", DummyClassifier(strategy="most_frequent")),
])

In [51]:
# Fit the whole baseline pipeline (preprocessor, vectorizer, dummy classifier) on the training split
base_model.fit(xtrain, ytrain)

Pipeline(steps=[('preprocessor', TextPreprocessor()),
                ('vectorizer', TfidfVectorizer()),
                ('clf', DummyClassifier(strategy='most_frequent'))])

In [52]:
# Accuracy of the baseline on the held-out test split
base_model.score(xtest, ytest)

0.5003170577045022

In [53]:
# Predicted labels and per-class scores for the test split;
# both are passed to evaluate_model to build the baseline report
ypred = base_model.predict(xtest)
ypred_proba = base_model.predict_proba(xtest)

Function to evaluate a model using popular metrics:  
ROC-AUC, accuracy, precision, recall, F1-score, confusion matrix...

In [4]:
def evaluate_model(ytest, ypred, ypred_proba=None):
    """Print a text report of classification metrics for a set of predictions.

    Args:
        ytest: True labels of the evaluated samples.
        ypred: Labels predicted by the model for the same samples.
        ypred_proba: Optional 2-D array of per-class scores. Its column at
            index 1 is passed to ``roc_auc_score``. When omitted (``None``),
            the ROC-AUC line is not printed.

    Returns:
        None. The ROC-AUC (if scores are given), accuracy, classification
        report and confusion matrix are written to standard output.
    """
    # ROC-AUC needs scores rather than hard labels, so it is optional
    if ypred_proba is not None:
        positive_scores = ypred_proba[:, 1]
        auc = roc_auc_score(ytest, positive_scores)
        print('ROC-AUC score of the model: {}'.format(auc))

    accuracy = accuracy_score(ytest, ypred)
    print('Accuracy of the model: {}\n'.format(accuracy))

    report = classification_report(ytest, ypred)
    print('Classification report: \n{}\n'.format(report))

    matrix = confusion_matrix(ytest, ypred)
    print('Confusion matrix: \n{}\n'.format(matrix))

In [55]:
# Full report for the baseline; ROC-AUC is included because the scores are supplied
evaluate_model(ytest, ypred, ypred_proba)

ROC-AUC score of the model: 0.5
Accuracy of the model: 0.5003170577045022

Classification report: 
              precision    recall  f1-score   support

           0       0.50      1.00      0.67       789
           1       0.00      0.00      0.00       788

    accuracy                           0.50      1577
   macro avg       0.25      0.50      0.33      1577
weighted avg       0.25      0.50      0.33      1577


Confusion matrix: 
[[789   0]
 [788   0]]



The accuracy of the base model is 0.5, as expected for a constant prediction on a balanced test set (789 vs. 788 articles).


## Training and Model Evaluation

Now that we have a baseline, we can start comparing the performance of the other models against it.

We are going to use the following models:

In [5]:
# Candidate estimators, all instantiated with their default hyperparameters.
# The list order is the order in which they are trained and reported.
classifiers =[
    LogisticRegression(),           # sklearn.linear_model
    MultinomialNB(),                # sklearn.naive_bayes
    Perceptron(),                   # sklearn.linear_model
    PassiveAggressiveClassifier(),  # sklearn.linear_model
    SGDClassifier(),                # sklearn.linear_model
    RandomForestClassifier(),       # sklearn.ensemble
    XGBClassifier(),                # xgboost
    LGBMClassifier()                # lightgbm
    ]

This loop fits each model on the training data, saves the fitted pipeline to disk, and then prints a report with its metrics on the test data.

In [6]:
%%time
# Seed NumPy's global random number generator before any training starts
# (presumably so estimators that draw from it give repeatable results — confirm).
np.random.seed(2022)
for classifier in classifiers:

    # Pipeline: identical preprocessing and vectorization for every candidate,
    # only the final estimator changes
    model = Pipeline(steps=[
        ("preprocessor", TextPreprocessor()),
        ("vectorizer", TfidfVectorizer()),
        ("clf", classifier)])

    # Fit And Predict Model
    model = model.fit(xtrain, ytrain)
    ypred = model.predict(xtest)

    # Persist the fitted pipeline, one file per estimator class name.
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails; the previous bare open() left the handle dangling.
    model_path = '../models/model_selection/' + classifier.__class__.__name__ + '.pkl'
    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    print('Model Saved')

    # Print Metrics (no scores are passed, so ROC-AUC is not reported here)
    print(classifier)
    evaluate_model(ytest, ypred)
    print('------------------------------------------------------')

Model Saved
LogisticRegression()
Accuracy of the model: 0.9226379201014585

Classification report: 
              precision    recall  f1-score   support

           0       0.95      0.89      0.92       789
           1       0.90      0.95      0.92       788

    accuracy                           0.92      1577
   macro avg       0.92      0.92      0.92      1577
weighted avg       0.92      0.92      0.92      1577


Confusion matrix: 
[[705  84]
 [ 38 750]]

------------------------------------------------------
Model Saved
MultinomialNB()
Accuracy of the model: 0.8344958782498415

Classification report: 
              precision    recall  f1-score   support

           0       0.76      0.98      0.86       789
           1       0.97      0.69      0.81       788

    accuracy                           0.83      1577
   macro avg       0.87      0.83      0.83      1577
weighted avg       0.87      0.83      0.83      1577


Confusion matrix: 
[[775  14]
 [247 541]]

--------

## Model Selection


Choosing a model is not trivial, and the right choice depends mostly on the particular problem. In classification problems, accuracy on the test set is the most popular metric for assessing the performance of a model.  
In our case, the dataset is balanced, we have no particular requirements regarding cost-sensitivity, time, or computational resources, and we do not have to deploy our model. Therefore, I will use accuracy to select the best model.  
Nevertheless, we can imagine a scenario in which we were developing a real tool or app to detect fake news. In that case, we might want to detect as many fake news articles as possible, even if that means classifying some real news as fake, or we might have a hard time creating a balanced dataset because most news is real. In such cases, metrics like accuracy would not be very useful.


### Winners
Winners: **SGDClassifier and LightGBM**  
Special Mention: Logistic Regression, Passive Aggressive Classifier  
In the following notebook we will optimize the hyperparameters of the winners. 