In [1]:
# Importing libraries
import os
import pickle
import warnings

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     learning_curve)
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


warnings.filterwarnings('ignore')

In [2]:
# Seed NumPy's global random number generator so that repeated runs are repeatable.
np.random.seed(42)


In [3]:
# Make sure the NLTK stop-word corpus is available locally, then collect the
# English entries into a plain list that the CountVectorizer steps can consume.
nltk.download('stopwords')
stopwords_list = [word for word in stopwords.words('english')]


[nltk_data] Downloading package stopwords to C:\Users\yassi.DESKTOP-5N
[nltk_data]     OV12J\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Helper function to display the evaluation metrics of the different models
def show_eval_scores(model, test_set, model_name, text_col='news', label_col='label'):
    """Print accuracy, F1, precision and recall of a fitted model on a labelled set.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``; it is called with the text
        column of ``test_set``.
    test_set
        Object indexable by column name (e.g. a DataFrame) that holds the
        input texts and the true labels.
    model_name
        Name shown in the report header.
    text_col
        Column holding the model input. Defaults to ``'news'``, the value
        that used to be hard-coded.
    label_col
        Column holding the true labels. Defaults to ``'label'``, the value
        that used to be hard-coded.

    Returns
    -------
    None
        The scores are only printed.
    """
    y_true = test_set[label_col]
    y_pred = model.predict(test_set[text_col])

    # Compute every metric before printing so a failing metric does not
    # leave a half-printed report behind.
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print('Report for ---> {}'.format(model_name))
    print('Accuracy is: {}'.format(accuracy))
    print('F1 score is: {}'.format(f1))
    print('Precision score is: {}'.format(precision))
    print('Recall score is: {}'.format(recall))

In [6]:
# Read the three pre-split CSV files (train / validation / test) into DataFrames.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../datasets/train.csv',
        '../datasets/valid.csv',
        '../datasets/test.csv',
    )
)

Viewing random rows of all the datasets

In [17]:
train_data.sample(5)

Unnamed: 0,label,news
3842,True,Polling shows that nearly 74 percent of Nation...
6480,False,I left the city with $43 million in the bank.
4521,False,Says she couldn't take stimulus money because ...
4026,True,The United States is the only industrialized c...
10111,False,The Health Care and Education Reconciliation A...


In [18]:
valid_data.sample(5)

Unnamed: 0,label,news
824,True,Al-Qaida has grown fourfold in five years.
548,True,"Under the clear letter of the law, (Justice Cl..."
870,True,"For immigrants with visa overstays, we make no..."
1047,True,The governors budget proposal reduces the stat...
1155,True,Says the director of NASA says its main missio...


In [19]:
test_data.sample(5)

Unnamed: 0,label,news
38,True,"The Fed created $1.2 trillion out of nothing, ..."
734,True,Says Rick Scott stripped women of access to pu...
138,True,Says NFL Commissioner Roger Goodell interviewe...
128,True,The federal government reviewed and verified h...
700,True,"In 1981, Matagorda, Brazoria, and Galveston Co..."


In [20]:
# Report the shape of each split; sep='\n' keeps one split per output line.
print(
    'Train dataset size: {}'.format(train_data.shape),
    'Valid dataset size: {}'.format(valid_data.shape),
    'Test dataset size: {}'.format(test_data.shape),
    sep='\n',
)

Train dataset size: (10240, 2)
Valid dataset size: (1284, 2)
Test dataset size: (1267, 2)


Combining train_data and valid_data into a single training set: GridSearchCV with 5-fold cross-validation is used to tune the hyperparameters of the different models, so a separate validation split is not needed.

In [7]:
# Stack the train and validation frames row-wise and renumber the index from 0.
training_set = pd.concat(
    objs=[train_data, valid_data],
    ignore_index=True,
)
print('Training set size: {}'.format(training_set.shape))
# Last expression of the cell: display five randomly drawn rows.
training_set.sample(5)

Training set size: (11524, 2)


Unnamed: 0,label,news
8138,True,Were the only advanced democracy in the world ...
2271,True,"Says under Mayor Cory Booker, Newark has seen ..."
9234,True,Dan Webster would force victims of rape and in...
7441,True,Rep. Paul Ryans budget proposal cuts nothing f...
3465,False,"(Big banks) have invested over $300,000 in (Jo..."


Creating a CountVectorizer object and analyzing the training set

In [8]:
# Fit a default CountVectorizer on the raw text of the combined training set
# and keep the resulting document-term counts.
# NOTE(review): neither `countV` nor `train_count` is referenced again in the
# visible part of this notebook (every pipeline builds its own
# CountVectorizer) — confirm whether this cell is still needed.
countV = CountVectorizer()
train_count = countV.fit_transform(training_set['news'].values)

#### Building and tuning Logistic Regression pipeline 

In [74]:
# lr_pipeline = Pipeline([
#     ('lrCV', CountVectorizer(stop_words=stopwords_list)),
#     ('lr_clf', LogisticRegression(random_state=42, n_jobs=-1))
# ])

In [75]:
# param_grid = [
#     {
#         'lrCV__lowercase': [True, False],
#         'lrCV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'lr_clf__C': [0.0001, 0.00005, 0.00001]
#     }
# ]

# lr_gs = GridSearchCV(lr_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# lr_gs.fit(training_set['news'], training_set['label'])

In [76]:
# lr_gs.best_params_

In [77]:
# lr_gs.best_score_

In [9]:
# Bag-of-words + logistic regression with fixed hyperparameters
# (presumably the best ones from the commented-out grid search above — verify).
lr_pipeline = Pipeline(steps=[
    (
        'lrCV',
        CountVectorizer(lowercase=True, ngram_range=(1, 1), stop_words=stopwords_list),
    ),
    (
        'lr_clf',
        LogisticRegression(C=0.0001, n_jobs=-1, random_state=42),
    ),
])

In [10]:
lr_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(steps=[('lrCV',
                 CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('lr_clf',
                 LogisticRegression(C=0.0001, n_jobs=-1, random_state=42))])

In [11]:
# Score the fitted logistic-regression pipeline on the held-out test split.
# NOTE(review): in the recorded output below, recall is 1.0 and precision equals
# accuracy, which is what happens when every test row is predicted as the
# positive class — confirm the model is not degenerate (C=0.0001 is very small).
show_eval_scores(lr_pipeline, test_data, 'Logistic Regression Count Vectorizer')

Report for ---> Logistic Regression Count Vectorizer
Accuracy is: 0.56353591160221
F1 score is: 0.7208480565371024
Precision score is: 0.56353591160221
Recall score is: 1.0


#### Building and tuning Naive Bayes pipeline

In [92]:
# nb_pipeline = Pipeline([
#     ('nb_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('nb_clf', MultinomialNB())
# ])

In [93]:
# param_grid = {
#     'nb_clf__alpha': [i/10.0 for i in range(60, 71)],
#     'nb_CV__lowercase': [True, False],
#     'nb_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)]
# }

# nb_gs = GridSearchCV(nb_pipeline, param_grid, scoring = 'f1', cv=5, n_jobs=-1, verbose=1)
# nb_gs.fit(training_set['news'], training_set['label'])

In [94]:
# nb_gs.best_params_

In [95]:
# nb_gs.best_score_

In [12]:
# Bag-of-words (1- to 4-grams) + multinomial Naive Bayes with fixed hyperparameters
# (presumably the best ones from the commented-out grid search above — verify).
nb_pipeline = Pipeline(steps=[
    (
        'nb_CV',
        CountVectorizer(lowercase=True, ngram_range=(1, 4), stop_words=stopwords_list),
    ),
    (
        'nb_clf',
        MultinomialNB(alpha=6.8),
    ),
])

In [13]:
nb_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(steps=[('nb_CV',
                 CountVectorizer(ngram_range=(1, 4),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('nb_clf', MultinomialNB(alpha=6.8))])

In [29]:
show_eval_scores(nb_pipeline, test_data, 'Naive Bayes Count Vectorizer')

Report for ---> Naive Bayes Count Vectorizer
Accuracy is: 0.6203630623520127
F1 score is: 0.7326292384658143
Precision score is: 0.6073732718894009
Recall score is: 0.9229691876750701


#### Building and tuning SVM classifier pipeline

In [103]:
# svm_pipeline = Pipeline([
#     ('svm_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('svm_clf', SVC(random_state=42))
# ])

In [104]:
# param_grid = [
#     {
#         'svm_CV__lowercase': [True, False],
#         'svm_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'svm_clf__kernel': ['poly'],
#         'svm_clf__degree': [1, 2, 3]
#     },
#     {
#         'svm_CV__lowercase': [True, False],
#         'svm_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4)],
#         'svm_clf__kernel': ['rbf'],
#         'svm_clf__gamma': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
#     }
# ]

# svm_gs = GridSearchCV(svm_pipeline, param_grid, scoring='f1', n_jobs=-1, cv=5, verbose=1)
# svm_gs.fit(training_set['news'], training_set['label'])

In [105]:
# svm_gs.best_params_

In [106]:
# svm_gs.best_score_

In [14]:
# Case-sensitive bag-of-words + RBF-kernel SVM with fixed hyperparameters
# (presumably the best ones from the commented-out grid search above — verify).
svm_pipeline = Pipeline(steps=[
    (
        'svm_CV',
        CountVectorizer(lowercase=False, ngram_range=(1, 1), stop_words=stopwords_list),
    ),
    (
        'svm_clf',
        SVC(kernel='rbf', gamma=1.0, random_state=42),
    ),
])

In [15]:
svm_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(steps=[('svm_CV',
                 CountVectorizer(lowercase=False,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('svm_clf', SVC(gamma=1.0, random_state=42))])

In [16]:
show_eval_scores(svm_pipeline, test_data, 'SVM Classifier Count Vectorizer')

Report for ---> SVM Classifier Count Vectorizer
Accuracy is: 0.5666929755327546
F1 score is: 0.7211782630777045
Precision score is: 0.5657370517928287
Recall score is: 0.9943977591036415


#### Building and Tuning Random Forest Classifier pipeline 

In [117]:
# rf_pipeline = Pipeline([
#     ('rf_CV', CountVectorizer(stop_words=stopwords_list)),
#     ('rf_clf', RandomForestClassifier(n_jobs=-1, random_state=42))
# ])

In [118]:
# param_grid = {
#     'rf_CV__lowercase': [True, False],
#     'rf_CV__ngram_range': [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     'rf_clf__n_estimators': [200, 300, 400, 500],
#     'rf_clf__max_depth': [i for i in range(8, 13)],
#     'rf_clf__max_features': ['auto', 'sqrt', 'log2']
# }
# rf_gs = GridSearchCV(rf_pipeline, param_grid, scoring='f1', cv=5, verbose=1, n_jobs=-1)
# rf_gs.fit(training_set['news'], training_set['label'])

In [119]:
# rf_gs.best_params_

In [120]:
# rf_gs.best_score_

In [17]:
# Case-sensitive bag-of-words + random forest with fixed hyperparameters
# (presumably the best ones from the commented-out grid search above — verify).
rf_pipeline = Pipeline(steps=[
    (
        'rf_CV',
        CountVectorizer(lowercase=False, ngram_range=(1, 1), stop_words=stopwords_list),
    ),
    (
        'rf_clf',
        RandomForestClassifier(
            n_estimators=300,
            max_depth=12,
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [18]:
rf_pipeline.fit(training_set['news'], training_set['label'])

Pipeline(steps=[('rf_CV',
                 CountVectorizer(lowercase=False,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...])),
                ('rf_clf',
                 RandomForestClassifier(max_depth=12, n_estimators=300,
                                        n_jobs=-1, random_state=42))])

In [116]:
show_eval_scores(rf_pipeline, test_data, 'Random Forest Classifier Count Vectorizer')

Report for ---> Random Forest Classifier Count Vectorizer
Accuracy is: 0.5651144435674822
F1 score is: 0.7215765538150581
Precision score is: 0.5644268774703557
Recall score is: 1.0


#### Building a Voting Classifier using the above created models 

In [19]:
# Fresh, unfitted random-forest pipeline for the ensemble; same settings as `rf_pipeline`.
rf_voting_pipeline = Pipeline(steps=[
    (
        'rf_CV',
        CountVectorizer(lowercase=False, ngram_range=(1, 1), stop_words=stopwords_list),
    ),
    (
        'rf_clf',
        RandomForestClassifier(
            n_estimators=300,
            max_depth=12,
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [20]:
# Fresh, unfitted SVM pipeline for the ensemble. Unlike `svm_pipeline`, the SVC
# is built with probability=True so that it can supply class probabilities.
svm_voting_pipeline = Pipeline(steps=[
    (
        'svm_CV',
        CountVectorizer(lowercase=False, ngram_range=(1, 1), stop_words=stopwords_list),
    ),
    (
        'svm_clf',
        SVC(kernel='rbf', gamma=1.0, probability=True, random_state=42),
    ),
])

In [21]:
# Fresh, unfitted Naive Bayes pipeline for the ensemble; same settings as `nb_pipeline`.
nb_voting_pipeline = Pipeline(steps=[
    (
        'nb_CV',
        CountVectorizer(lowercase=True, ngram_range=(1, 4), stop_words=stopwords_list),
    ),
    (
        'nb_clf',
        MultinomialNB(alpha=6.8),
    ),
])

In [22]:
# Fresh, unfitted logistic-regression pipeline for the ensemble; same settings as `lr_pipeline`.
lr_voting_pipeline = Pipeline(steps=[
    (
        'lrCV',
        CountVectorizer(lowercase=True, ngram_range=(1, 1), stop_words=stopwords_list),
    ),
    (
        'lr_clf',
        LogisticRegression(C=0.0001, n_jobs=-1, random_state=42),
    ),
])

In [23]:
# Ensemble of the four text pipelines. voting='soft' combines the members'
# predicted class probabilities instead of counting hard votes.
voting_classifier = VotingClassifier(
    estimators=[
        ('lr', lr_voting_pipeline),
        ('nb', nb_voting_pipeline),
        ('svm', svm_voting_pipeline),
        ('rf', rf_voting_pipeline),
    ],
    voting='soft',
    n_jobs=-1,
)

This step takes a while (about 10 min).

In [24]:
voting_classifier.fit(training_set['news'], training_set['label'])

VotingClassifier(estimators=[('lr',
                              Pipeline(steps=[('lrCV',
                                               CountVectorizer(stop_words=['i',
                                                                           'me',
                                                                           'my',
                                                                           'myself',
                                                                           'we',
                                                                           'our',
                                                                           'ours',
                                                                           'ourselves',
                                                                           'you',
                                                                           "you're",
                                                                           "you'v

In [25]:
show_eval_scores(voting_classifier, test_data, 'Voting Classifier(soft) Count Vectorizer')

Report for ---> Voting Classifier(soft) Count Vectorizer
Accuracy is: 0.6045777426992897
F1 score is: 0.7319422150882826
Precision score is: 0.5922077922077922
Recall score is: 0.957983193277311


#### Saving the voting classifier model for future use

In [28]:
# Persist the fitted voting classifier to disk.
model_path = os.path.join('../models', 'voting_classifier_count_vectorizer.pkl')
# Create the target directory if it is missing so the dump does not fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# The context manager closes (and flushes) the file even if pickling raises;
# the previous bare open(...) call never closed the handle.
with open(model_path, 'wb') as model_file:
    pickle.dump(voting_classifier, model_file)