In [5]:
import pandas as pd
import numpy as np
import pickle
%matplotlib inline
pd.options.display.max_rows = 6000
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [6]:
# Load the preprocessed dataset (columns seen in the preview: Label, news, doc_vec, news_str).
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('label_news_docvec_newsstr.pkl', 'rb') as pkl_file:
    df = pickle.load(pkl_file)
df.head()

Unnamed: 0,Label,news,doc_vec,news_str
0,0,"[georgia, two, russian, warplane, country, mov...","[-0.0046601472, 0.046057668, 0.035470575, 0.10...",georgia two russian warplane country move brin...
1,1,"[wont, america, nato, help, wont, help, help, ...","[-0.01796527, 0.026893076, 0.05216946, 0.11043...",wont america nato help wont help help iraq put...
2,0,"[adorable, sang, opening, ceremony, wa, fake, ...","[0.020226372, 0.05665661, 0.038335405, 0.09110...",adorable sang opening ceremony wa fake russia ...
3,0,"[america, refuse, israel, weapon, attack, iran...","[0.009319111, 0.04263116, 0.062353328, 0.08478...",america refuse israel weapon attack iran repor...
4,1,"[expert, admit, legalise, drug, south, osetia,...","[0.01713654, 0.04969087, 0.062367942, 0.105228...",expert admit legalise drug south osetia pictur...


In [3]:
# Peek at the first 20 components of the first row's document vector.
# The column is selected by name (not by position) so the cell keeps working if columns are reordered.
df['doc_vec'].iloc[0][:20]

array([-0.00466015,  0.04605767,  0.03547058,  0.10155483, -0.03422928,
       -0.01389613,  0.0094356 , -0.14502455,  0.03542512,  0.0929451 ,
       -0.02405647, -0.13372633, -0.0746136 ,  0.03855896, -0.08115924,
        0.10458709, -0.0641892 ,  0.0719087 ,  0.0019091 , -0.12026239],
      dtype=float32)

In [4]:
# Show the joined news string of the first row, selected by column name rather than position.
df['news_str'].iloc[0]

'georgia two russian warplane country move brink war musharraf impeached columns troop roll south ossetia footage fighting youtube tank moving towards capital south ossetia reportedly completely destroyed georgian artillery fire afghan child raped new jersey official sick three year old wa raped nothing russian tank entered south ossetia georgia shoot two russian jet georgia invades south ossetia russia warned would intervene side combatent trial nothing sham salim haman sentenced year kept longer anyway feel troop retreat osettain capital presumably leaving several hundred people killed video america prep georgia war russia gives green light israel attack iran america veto israeli military ops class action lawsuit behalf american public fbi georgia war nyt top story opening ceremony olympics fucking disgrace yet proof decline journalism china tell bush stay country affair world war start invades south ossetia russia get involved nato absorb georgia unleash full scale war faces islamis

In [7]:
# The first N_TRAIN rows (~85%) are used for training and validation;
# the remaining rows (~15%) are held out for the final test.
N_TRAIN = 1688
df_train = df.iloc[:N_TRAIN, :]

# Models

In [9]:
from sklearn import svm, naive_bayes, neighbors, ensemble
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

# One fixed seed for every stochastic estimator so the reports are repeatable
# after a kernel restart (same value as the train/validation split seed).
MODEL_SEED = 100

lr_model = LogisticRegression()
nb_model = naive_bayes.GaussianNB()
knn_model = neighbors.KNeighborsClassifier()
# probability=True enables predict_proba on the SVC at the cost of extra internal cross-validation.
svc_model = svm.SVC(probability=True, gamma="scale", random_state=MODEL_SEED)
rf_model = ensemble.RandomForestClassifier(n_estimators=100, random_state=MODEL_SEED)
et_model = ensemble.ExtraTreesClassifier(n_estimators=100, random_state=MODEL_SEED)
ada_model = ensemble.AdaBoostClassifier(random_state=MODEL_SEED)
# `eta` is an alias of `learning_rate`; passing both (0.3 vs 0.1) made the effective
# step size ambiguous, so only `learning_rate` is kept. The deprecated `silent=1`
# flag is replaced by its successor `verbosity=0`.
xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0,
                              reg_alpha=4, objective='binary:logistic', verbosity=0, subsample=0.8,
                              random_state=MODEL_SEED)

# Names (not objects) of the estimators above; baseline_model_filter resolves them by name.
models = ["lr_model", "nb_model", "knn_model", "svc_model", "rf_model", "et_model", "ada_model", "xgb_model"]

In [7]:
def baseline_model_filter(modellist, X, y):
    '''Fit every candidate model and print its validation classification report.

    1. Split the given data further into train and validation (17%) with a fixed seed.
    2. Fit each model of the model list on the train part.
    3. Print the classification report of each model on the validation part.

    Parameters
    ----------
    modellist : iterable
        Each entry is either the *name* of a model variable defined in this
        namespace (str) or an estimator instance exposing ``fit``/``predict``.
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target labels aligned with ``X``.

    Raises
    ------
    NameError
        If a string entry does not name a variable in this namespace.

    Notes
    -----
    The models are fitted in place, so the objects passed in (or named) are
    left in their fitted state after the call. Nothing is returned.
    '''
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.17, random_state=100)
    for entry in modellist:
        if isinstance(entry, str):
            # Plain namespace lookup instead of eval(): a string in the list can
            # only ever name an existing variable, never execute an expression.
            try:
                curr_model = globals()[entry]
            except KeyError:
                raise NameError(f"name '{entry}' is not defined") from None
            model_name = entry
        else:
            # Estimator instance passed directly; label the report with its class name.
            curr_model = entry
            model_name = type(entry).__name__
        curr_model.fit(X_train, y_train)
        print(f'{model_name} \n report:{classification_report(y_valid, curr_model.predict(X_valid))}')

# Compare the baseline models across different text-representation techniques

# Bag-Of-Words

In [21]:
# Bag-of-words: one column per word, built from the joined news strings.
# NOTE(review): the vectorizer is fitted on all of df_train before the train/validation
# split inside baseline_model_filter, so the vocabulary also sees validation rows.
count_vect = CountVectorizer(analyzer='word')
bow_sparse = count_vect.fit_transform(df_train['news_str'])

# Densify the sparse matrix (presumably because some models in the list need dense input).
X = bow_sparse.toarray()
y = df_train['Label']

In [22]:
# Print the validation report of every baseline model on the bag-of-words features.
baseline_model_filter(modellist=models, X=X, y=y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.46      0.42      0.44       122
           1       0.60      0.64      0.62       165

    accuracy                           0.54       287
   macro avg       0.53      0.53      0.53       287
weighted avg       0.54      0.54      0.54       287

nb_model 
 report:              precision    recall  f1-score   support

           0       0.39      0.39      0.39       122
           1       0.55      0.55      0.55       165

    accuracy                           0.48       287
   macro avg       0.47      0.47      0.47       287
weighted avg       0.48      0.48      0.48       287

knn_model 
 report:              precision    recall  f1-score   support

           0       0.43      0.43      0.43       122
           1       0.58      0.58      0.58       165

    accuracy                           0.51       287
   macro avg       0.50      0.50      0.50       287
weighted avg       

# Word level TF-IDF

In [23]:
# Word-level TF-IDF features built from the joined news strings.
# NOTE(review): fitted on all of df_train before the train/validation split,
# so the vocabulary and IDF weights also see validation rows.
tfidf_vect = TfidfVectorizer(analyzer='word')
word_tfidf_sparse = tfidf_vect.fit_transform(df_train['news_str'])

# Densify the sparse matrix (presumably because some models in the list need dense input).
X = word_tfidf_sparse.toarray()
y = df_train['Label']

In [27]:
# Print the validation report of every baseline model on the word-level TF-IDF features.
baseline_model_filter(modellist=models, X=X, y=y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.45      0.20      0.28       122
           1       0.58      0.81      0.68       165

    accuracy                           0.55       287
   macro avg       0.51      0.51      0.48       287
weighted avg       0.52      0.55      0.51       287

nb_model 
 report:              precision    recall  f1-score   support

           0       0.40      0.37      0.38       122
           1       0.56      0.59      0.57       165

    accuracy                           0.49       287
   macro avg       0.48      0.48      0.48       287
weighted avg       0.49      0.49      0.49       287

knn_model 
 report:              precision    recall  f1-score   support

           0       0.44      0.47      0.45       122
           1       0.58      0.55      0.57       165

    accuracy                           0.52       287
   macro avg       0.51      0.51      0.51       287
weighted avg       

# Character Level TF-IDF 

In [8]:
# Character-level TF-IDF features built from the joined news strings.
# NOTE(review): fitted on all of df_train before the train/validation split,
# so the IDF weights also see validation rows.
tfidf_chars_vect = TfidfVectorizer(analyzer='char')
char_tfidf_sparse = tfidf_chars_vect.fit_transform(df_train['news_str'])

# Densify the sparse matrix (presumably because some models in the list need dense input).
X = char_tfidf_sparse.toarray()
y = df_train['Label']

# Print the validation report of every baseline model on these features.
baseline_model_filter(modellist=models, X=X, y=y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.00      0.00      0.00       122
           1       0.57      1.00      0.73       165

    accuracy                           0.57       287
   macro avg       0.29      0.50      0.37       287
weighted avg       0.33      0.57      0.42       287

nb_model 
 report:              precision    recall  f1-score   support

           0       0.50      0.35      0.41       122
           1       0.61      0.74      0.67       165

    accuracy                           0.57       287
   macro avg       0.55      0.55      0.54       287
weighted avg       0.56      0.57      0.56       287

knn_model 
 report:              precision    recall  f1-score   support

           0       0.48      0.49      0.48       122
           1       0.61      0.60      0.61       165

    accuracy                           0.55       287
   macro avg       0.55      0.55      0.55       287
weighted avg       

# Word2vec

In [9]:
# Use the precomputed per-document vectors stored in the `doc_vec` column.
# np.stack builds one row per document and raises if the vectors differ in length,
# instead of silently producing a ragged object array.
X = np.stack(df_train['doc_vec'].tolist())
# Take the labels from df_train (the same rows as X) rather than re-slicing df
# with a second hard-coded row count that could drift out of sync.
y = df_train['Label'].to_numpy()
baseline_model_filter(models, X, y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.38      0.02      0.05       122
           1       0.57      0.97      0.72       165

    accuracy                           0.57       287
   macro avg       0.47      0.50      0.38       287
weighted avg       0.49      0.57      0.43       287

nb_model 
 report:              precision    recall  f1-score   support

           0       0.44      0.44      0.44       122
           1       0.59      0.58      0.58       165

    accuracy                           0.52       287
   macro avg       0.51      0.51      0.51       287
weighted avg       0.52      0.52      0.52       287

knn_model 
 report:              precision    recall  f1-score   support

           0       0.40      0.34      0.37       122
           1       0.56      0.62      0.59       165

    accuracy                           0.50       287
   macro avg       0.48      0.48      0.48       287
weighted avg       

# Best baseline model

__Gaussian Naive Bayes model with character-level TF-IDF features__

```
nb_model
 report:              precision    recall  f1-score   support

           0       0.50      0.35      0.41       122
           1       0.61      0.74      0.67       165

    accuracy                           0.57       287
   macro avg       0.55      0.55      0.54       287
weighted avg       0.56      0.57      0.56       287
```


## Evaluate the best baseline on the held-out test data

In [8]:
# Held-out test rows: everything after the training slice. The boundary is derived
# from df_train so the two splits can never overlap or leave a gap.
df_test = df.iloc[len(df_train):, :]
y_test = df_test['Label']
X_test = df_test['news_str']  # raw text; vectorised later with the training-fitted vectorizer

In [13]:
# Refit the character-level TF-IDF on the training portion only, so no
# test-set statistics leak into the features.
tfidf_chars_vect = TfidfVectorizer(analyzer='char')
train_news = tfidf_chars_vect.fit(df_train['news_str'])  # fit() returns the fitted vectorizer itself

# Project both splits with the training-fitted vectorizer and densify them.
X_train_trans, X_test_trans = (
    train_news.transform(texts).toarray() for texts in (df_train['news_str'], X_test)
)
y_train = df_train['Label']

# Refit the chosen baseline on the whole training portion (train + validation rows).
nb_model.fit(X_train_trans, y_train)

# Score once on the held-out test rows.
test_predictions = nb_model.predict(X_test_trans)
test_report = classification_report(y_test, test_predictions)
print(f'{nb_model} \n report:{test_report}')

GaussianNB(priors=None, var_smoothing=1e-09) 
 report:              precision    recall  f1-score   support

           0       0.53      0.21      0.30       147
           1       0.51      0.81      0.63       151

    accuracy                           0.52       298
   macro avg       0.52      0.51      0.47       298
weighted avg       0.52      0.52      0.47       298

