In [1]:
import pandas as pd
import numpy as np
import pickle
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
df=pd.read_pickle(r'preprocessed_df.pkl')

In [3]:
df.Date=[str(x).replace('00:00:00','') for x in df.Date]

In [4]:
'''We now no more require the actual Headlines'''
df=df.drop('Headlines',axis=1)

In [5]:
df.head()

Unnamed: 0,Date,Label,headlines_str,headlines_words,word2vec
0,2000-01-03,0,hindrance operation extract leaked report scor...,"[hindrance, operation, extract, leaked, report...","[0.019919062, 0.051673025, -0.025186863, 0.062..."
1,2000-01-04,0,scorecard best lake scene leader german sleaze...,"[scorecard, best, lake, scene, leader, german,...","[0.038909823, 0.049511578, 0.07234156, 0.05103..."
2,2000-01-05,0,coventry caught counter flo united rival road ...,"[coventry, caught, counter, flo, united, rival...","[-0.028612338, 0.047972914, 0.013764942, 0.097..."
3,2000-01-06,1,pilgrim know progress thatcher facing ban mcil...,"[pilgrim, know, progress, thatcher, facing, ba...","[-0.021918356, 0.04694813, 0.03538333, 0.06647..."
4,2000-01-07,1,hitch horlocks beckham united survive breast c...,"[hitch, horlocks, beckham, united, survive, br...","[-0.014015404, 0.045476243, 0.021973068, 0.047..."


# Models

In [6]:
from sklearn import  svm, naive_bayes, neighbors, ensemble
from sklearn.linear_model import LogisticRegression

In [7]:
lr_model = LogisticRegression(n_jobs=-1)
nb_model = naive_bayes.MultinomialNB()
knn_model = neighbors.KNeighborsClassifier(n_jobs=-1)
svc_model = svm.SVC(probability=True, gamma="scale",)
rf_model = ensemble.RandomForestClassifier(n_estimators=100,n_jobs=-1)
et_model = ensemble.ExtraTreesClassifier(n_estimators=100,n_jobs=-1)
ada_model = ensemble.AdaBoostClassifier()

models = ["lr_model", "nb_model", "knn_model", "svc_model", "rf_model", "et_model", "ada_model"]

In [8]:
def baseline_model_filter(modellist, X, y):
    ''' 1. split the train data further into train and validation (17%). 
        2. fit the train data into each model of the model list
        3. get the classification report based on the model performance on validation data
    '''
    X_train, X_valid, y_train, y_valid = X[:3471],X[3471:],y[:3471],y[3471:]
    for model_name in modellist:
        curr_model = eval(model_name)
        curr_model.fit(X_train, y_train) 
        print(f'{model_name} \n report:{classification_report(y_valid, curr_model.predict(X_valid))}')

In [9]:
df.head()

Unnamed: 0,Date,Label,headlines_str,headlines_words,word2vec
0,2000-01-03,0,hindrance operation extract leaked report scor...,"[hindrance, operation, extract, leaked, report...","[0.019919062, 0.051673025, -0.025186863, 0.062..."
1,2000-01-04,0,scorecard best lake scene leader german sleaze...,"[scorecard, best, lake, scene, leader, german,...","[0.038909823, 0.049511578, 0.07234156, 0.05103..."
2,2000-01-05,0,coventry caught counter flo united rival road ...,"[coventry, caught, counter, flo, united, rival...","[-0.028612338, 0.047972914, 0.013764942, 0.097..."
3,2000-01-06,1,pilgrim know progress thatcher facing ban mcil...,"[pilgrim, know, progress, thatcher, facing, ba...","[-0.021918356, 0.04694813, 0.03538333, 0.06647..."
4,2000-01-07,1,hitch horlocks beckham united survive breast c...,"[hitch, horlocks, beckham, united, survive, br...","[-0.014015404, 0.045476243, 0.021973068, 0.047..."


In [10]:
''' It's a Time Series data and the previous data will be prediciting the upcoming data. 
    The Stocks follows yearly seasonality so it will be a good practice 
    to split data at the year end. '''

train = df[df['Date'] < '20150101'].drop('Date',axis=1)
test = df[df['Date'] > '20141231'].drop('Date',axis=1)

## Trying using different word embedding techniques to filter the baseline models

## `Bag-Of-Words`

In [11]:
count_vect = CountVectorizer(analyzer='word')
X = count_vect.fit_transform(train.headlines_str).toarray()
y = train.Label

In [12]:
baseline_model_filter(models, X, y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.54      0.36      0.43       243
           1       0.55      0.72      0.62       261

    accuracy                           0.55       504
   macro avg       0.55      0.54      0.53       504
weighted avg       0.55      0.55      0.53       504

nb_model 
 report:              precision    recall  f1-score   support

           0       0.42      0.11      0.17       243
           1       0.51      0.86      0.64       261

    accuracy                           0.50       504
   macro avg       0.46      0.48      0.41       504
weighted avg       0.47      0.50      0.41       504

knn_model 
 report:              precision    recall  f1-score   support

           0       0.62      0.11      0.18       243
           1       0.53      0.94      0.68       261

    accuracy                           0.54       504
   macro avg       0.57      0.52      0.43       504
weighted avg       

## `Word level TF-IDF`

In [13]:
tfidf_vect = TfidfVectorizer(analyzer='word')
X = tfidf_vect.fit_transform(train.headlines_str).toarray()
y = train.Label

In [14]:
baseline_model_filter(models, X, y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.45      0.13      0.20       243
           1       0.51      0.85      0.64       261

    accuracy                           0.50       504
   macro avg       0.48      0.49      0.42       504
weighted avg       0.48      0.50      0.43       504

nb_model 
 report:              precision    recall  f1-score   support

           0       0.00      0.00      0.00       243
           1       0.52      1.00      0.68       261

    accuracy                           0.52       504
   macro avg       0.26      0.50      0.34       504
weighted avg       0.27      0.52      0.35       504

knn_model 
 report:              precision    recall  f1-score   support

           0       0.55      0.47      0.50       243
           1       0.56      0.64      0.60       261

    accuracy                           0.56       504
   macro avg       0.55      0.55      0.55       504
weighted avg       

## `Word2Vec`

In [37]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

In [38]:
X = scaler.fit_transform(np.array(list(train.word2vec)))
y = train.Label

In [39]:
baseline_model_filter(models, X, y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.50      0.24      0.33       243
           1       0.52      0.77      0.62       261

    accuracy                           0.52       504
   macro avg       0.51      0.51      0.48       504
weighted avg       0.51      0.52      0.48       504

nb_model 
 report:              precision    recall  f1-score   support

           0       0.00      0.00      0.00       243
           1       0.52      1.00      0.68       261

    accuracy                           0.52       504
   macro avg       0.26      0.50      0.34       504
weighted avg       0.27      0.52      0.35       504

knn_model 
 report:              precision    recall  f1-score   support

           0       0.49      0.31      0.38       243
           1       0.52      0.70      0.60       261

    accuracy                           0.51       504
   macro avg       0.51      0.51      0.49       504
weighted avg       

## Best Baseline Model

We'll select multiple model from the process we'll finetune them and get the result

1. Naive Baye's Model with Count Vectorizer --> Decent F1 Scores and 86% accurate in predicting 1
2. Rf with Count Vectorizer --> Decent F1 Scores and 86% accurate in predicting 1
3. KNN Model with TFID Vectorizer --> Highest Accuracy among all Models