In [1]:
import pandas as pd
import numpy as np
import scipy
import os
import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import pickle
pd.options.display.max_rows = 6000
import warnings
warnings.filterwarnings("ignore")


import gensim
from gensim import corpora, models, matutils
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction import stop_words
from nltk.stem import WordNetLemmatizer


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
with open('label_news_docvec_newsstr.pkl', 'rb') as r:
    df = pickle.load(r)
df.head()

Unnamed: 0,Label,news,doc_vec,news_str
0,0,"[georgia, two, russian, warplane, country, mov...","[-0.0046601472, 0.046057668, 0.035470575, 0.10...",georgia two russian warplane country move brin...
1,1,"[wont, america, nato, help, wont, help, help, ...","[-0.01796527, 0.026893076, 0.05216946, 0.11043...",wont america nato help wont help help iraq put...
2,0,"[adorable, sang, opening, ceremony, wa, fake, ...","[0.020226372, 0.05665661, 0.038335405, 0.09110...",adorable sang opening ceremony wa fake russia ...
3,0,"[america, refuse, israel, weapon, attack, iran...","[0.009319111, 0.04263116, 0.062353328, 0.08478...",america refuse israel weapon attack iran repor...
4,1,"[expert, admit, legalise, drug, south, osetia,...","[0.01713654, 0.04969087, 0.062367942, 0.105228...",expert admit legalise drug south osetia pictur...


# Topic modeling to vectors  (unsupervised learning)

data allocation:<br>
1. for topic modeling: 15% data set aside for testing. use the 85% for topic modeling. 
2. apply the topic model to the testing data to get the topic vectors.
3. create the final train, valid, test files for AWS

topic modeling models:<br>
1. use HDP to decide the topic size
2. use LDA to determine the topics

In [6]:
df_HDP_train = df.news[:1688]
df_LDA_train = df.news_str[:1688]   # 85% of the news_str data is used to train topic model
df_LDA_test = df.news_str[1688:]
print(len(df_LDA_train), len(df_LDA_test))

1688 298


### HDP corpus and dictionary (need to be bag of words format)

In [7]:
id2word_hdp = gensim.corpora.Dictionary(df_HDP_train)
id2word_hdp.filter_extremes(no_below=10, no_above=0.30)
id2word_hdp.compactify()
id2word_hdp.save('train_dict_hdp')
corpus_hdp = [id2word_hdp.doc2bow(doc) for doc in df_HDP_train]

### Use HDP model to decide the maxium topic numbers

In [8]:
from gensim.models import HdpModel
hdp = HdpModel(corpus_hdp, id2word_hdp)

In [9]:
len(hdp.print_topics())

20

In [10]:
hdp.print_topics(num_topics=20)

[(0,
  '0.003*amp + 0.003*ukraine + 0.003*syria + 0.002*isis + 0.002*oil + 0.002*japan + 0.002*drug + 0.002*city + 0.002*germany + 0.002*power'),
 (1,
  '0.003*wikileaks + 0.002*city + 0.002*syria + 0.002*drug + 0.002*egypt + 0.002*strike + 0.002*amp + 0.002*internet + 0.002*pakistan + 0.002*take'),
 (2,
  '0.002*oil + 0.001*japan + 0.001*get + 0.001*gaza + 0.001*libya + 0.001*killing + 0.001*saudi + 0.001*claim + 0.001*billion + 0.001*back'),
 (3,
  '0.001*video + 0.001*libyan + 0.001*egypt + 0.001*bin + 0.001*libya + 0.001*arrested + 0.001*laden + 0.001*election + 0.001*protester + 0.001*afghanistan'),
 (4,
  '0.002*invest + 0.001*wikileaks + 0.001*mayor + 0.001*plan + 0.001*trick + 0.001*famine + 0.001*fire + 0.001*church + 0.001*iraq + 0.001*get'),
 (5,
  '0.002*gaza + 0.002*wikileaks + 0.002*video + 0.001*iranian + 0.001*system + 0.001*killing + 0.001*east + 0.001*kill + 0.001*australia + 0.001*music'),
 (6,
  '0.001*soldier + 0.001*palestinian + 0.001*wikileaks + 0.001*canada + 0

### LDA modeling for topic vectors

In [11]:
# we need to have more stopwords for topic modeling than for word2vec.
# NLTK + SKlearn + self definded
sk_stop = list(stop_words.ENGLISH_STOP_WORDS)
mywords = ['whilst', 'say', 'says', 'today','yesterday', 'news', 'tomorrow','iii', 'ii', 'like', 'ha','wa']
final_stop = stopwords.words('english') + mywords + sk_stop

In [12]:
tfv = TfidfVectorizer(stop_words = final_stop, ngram_range = (1, 2), max_df = 0.95)
doc_word = tfv.fit_transform(df_LDA_train).transpose()
corpus = matutils.Sparse2Corpus(doc_word)
id2word = dict((v, k) for k, v in tfv.vocabulary_.items())

In [13]:
import logging
logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

import warnings
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    lda = models.LdaModel(corpus=corpus, num_topics=20, id2word=id2word, passes=5, random_state = 200 )

In [14]:
lda.print_topics()

[(0,
  '0.000*"new" + 0.000*"war" + 0.000*"israel" + 0.000*"police" + 0.000*"people" + 0.000*"year" + 0.000*"russia" + 0.000*"world" + 0.000*"government" + 0.000*"attack"'),
 (1,
  '0.000*"israeli" + 0.000*"israel" + 0.000*"mumbai" + 0.000*"russia" + 0.000*"war" + 0.000*"china" + 0.000*"new" + 0.000*"year" + 0.000*"police" + 0.000*"fukushima"'),
 (2,
  '0.000*"korea" + 0.000*"israel" + 0.000*"china" + 0.000*"libya" + 0.000*"north" + 0.000*"government" + 0.000*"new" + 0.000*"year" + 0.000*"world" + 0.000*"killed"'),
 (3,
  '0.000*"russia" + 0.000*"china" + 0.000*"government" + 0.000*"new" + 0.000*"iran" + 0.000*"world" + 0.000*"people" + 0.000*"america" + 0.000*"year" + 0.000*"russian"'),
 (4,
  '0.001*"israel" + 0.001*"new" + 0.001*"world" + 0.001*"year" + 0.001*"government" + 0.001*"china" + 0.001*"police" + 0.001*"russia" + 0.001*"people" + 0.001*"war"'),
 (5,
  '0.000*"israel" + 0.000*"new" + 0.000*"ukraine" + 0.000*"world" + 0.000*"isis" + 0.000*"gaza" + 0.000*"russia" + 0.000*"sta

### Make topic vectors

#### TRAIN vectors

In [15]:
range(len(df_LDA_train))

range(0, 1688)

In [16]:
len(corpus)

1688

In [17]:
top_vecs_train = []
for i in range(len(df_LDA_train)):
    doc_topics = lda.get_document_topics(corpus[i], minimum_probability=0.0)
    doc_top_vec = [doc_topics[num][1] for num in range(20)]
    top_vecs_train.append(doc_top_vec)

print(len(top_vecs_train))
top_vecs_train[5]

1688


[0.01226648,
 0.01226648,
 0.01226648,
 0.01226648,
 0.7669369,
 0.01226648,
 0.01226648,
 0.01226648,
 0.01226648,
 0.01226648,
 0.01226648,
 0.01226648,
 0.012266482,
 0.01226648,
 0.01226648,
 0.01226648,
 0.01226648,
 0.01226648,
 0.01226648,
 0.01226648]

#### TEST vectors

In [18]:
# use the topic model from train data to get get data topic vectors
doc_word_test = tfv.fit_transform(df_LDA_test).transpose()
corpus_test = matutils.Sparse2Corpus(doc_word_test)

top_vecs_test = []
for i in range(len(df_LDA_test)):
    doc_topics_test = lda.get_document_topics(corpus_test[i], minimum_probability=0.0)
    doc_top_vec_test = [doc_topics_test[num][1] for num in range(20)]
    top_vecs_test.append(doc_top_vec_test)

print(len(top_vecs_test))
top_vecs_test[1]

298


[0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.23040833,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828,
 0.040504828]

In [19]:
# combine the two lists then convert to a Seires adding to the full dataframe for train, valid, test split.
top_vecs = pd.Series(top_vecs_train + top_vecs_test, name = 'top_vecs')

In [20]:
df['top_vecs'] = top_vecs
df.head()

Unnamed: 0,Label,news,doc_vec,news_str,top_vecs
0,0,"[georgia, two, russian, warplane, country, mov...","[-0.0046601472, 0.046057668, 0.035470575, 0.10...",georgia two russian warplane country move brin...,"[0.012460786, 0.012460786, 0.012460786, 0.0124..."
1,1,"[wont, america, nato, help, wont, help, help, ...","[-0.01796527, 0.026893076, 0.05216946, 0.11043...",wont america nato help wont help help iraq put...,"[0.01266635, 0.01266635, 0.01266635, 0.0126663..."
2,0,"[adorable, sang, opening, ceremony, wa, fake, ...","[0.020226372, 0.05665661, 0.038335405, 0.09110...",adorable sang opening ceremony wa fake russia ...,"[0.013141308, 0.013141308, 0.013141308, 0.0131..."
3,0,"[america, refuse, israel, weapon, attack, iran...","[0.009319111, 0.04263116, 0.062353328, 0.08478...",america refuse israel weapon attack iran repor...,"[0.0119563425, 0.0119563425, 0.0119563425, 0.0..."
4,1,"[expert, admit, legalise, drug, south, osetia,...","[0.01713654, 0.04969087, 0.062367942, 0.105228...",expert admit legalise drug south osetia pictur...,"[0.012844525, 0.012844525, 0.012844525, 0.0128..."


In [22]:
# combine the two vector columns for later modeling
df['vectors'] = pd.Series([list(df.doc_vec[row]) + list(df.top_vecs[row]) for row in range(len(df))])

print(len(df.vectors[0]))
print(len(df.vectors))
df.head()

320
1986


Unnamed: 0,Label,news,doc_vec,news_str,top_vecs,vectors
0,0,"[georgia, two, russian, warplane, country, mov...","[-0.0046601472, 0.046057668, 0.035470575, 0.10...",georgia two russian warplane country move brin...,"[0.012460786, 0.012460786, 0.012460786, 0.0124...","[-0.0046601472, 0.046057668, 0.035470575, 0.10..."
1,1,"[wont, america, nato, help, wont, help, help, ...","[-0.01796527, 0.026893076, 0.05216946, 0.11043...",wont america nato help wont help help iraq put...,"[0.01266635, 0.01266635, 0.01266635, 0.0126663...","[-0.01796527, 0.026893076, 0.05216946, 0.11043..."
2,0,"[adorable, sang, opening, ceremony, wa, fake, ...","[0.020226372, 0.05665661, 0.038335405, 0.09110...",adorable sang opening ceremony wa fake russia ...,"[0.013141308, 0.013141308, 0.013141308, 0.0131...","[0.020226372, 0.05665661, 0.038335405, 0.09110..."
3,0,"[america, refuse, israel, weapon, attack, iran...","[0.009319111, 0.04263116, 0.062353328, 0.08478...",america refuse israel weapon attack iran repor...,"[0.0119563425, 0.0119563425, 0.0119563425, 0.0...","[0.009319111, 0.04263116, 0.062353328, 0.08478..."
4,1,"[expert, admit, legalise, drug, south, osetia,...","[0.01713654, 0.04969087, 0.062367942, 0.105228...",expert admit legalise drug south osetia pictur...,"[0.012844525, 0.012844525, 0.012844525, 0.0128...","[0.01713654, 0.04969087, 0.062367942, 0.105228..."


In [31]:
with open('df_final_6col.pkl', 'wb') as f:
    pickle.dump(df, f)

In [2]:
with open('df_final_6col.pkl', 'rb') as f:
    df = pickle.load(f)

# Modeling
## from the results below, our Naive Bayes base line model is still the best

In [3]:
from sklearn import  svm, naive_bayes, neighbors, ensemble
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report, accuracy_score

lr_model = LogisticRegression()
nb_model = naive_bayes.GaussianNB()
knn_model = neighbors.KNeighborsClassifier()
svc_model = svm.SVC(probability=True, gamma="scale")
rf_model = ensemble.RandomForestClassifier(n_estimators=100)
et_model = ensemble.ExtraTreesClassifier(n_estimators=100)
ada_model = ensemble.AdaBoostClassifier()
xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, 
                              reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8)

models = ["lr_model", "nb_model", "knn_model", "svc_model", "rf_model", "et_model", "ada_model", "xgb_model"]

In [4]:
def baseline_model_filter(modellist, X, y):
    ''' 1. split the train data further into train and validation (17%). 
        2. fit the train data into each model of the model list
        3. get the classification report based on the model performance on validation data
    '''
    X_train, X_valid, y_train, y_valid = train_test_split(X,y, test_size = 0.17, random_state = 100)
    for model_name in modellist:
        curr_model = eval(model_name)
        curr_model.fit(X_train, y_train) 
        print(f'{model_name} \n report:{classification_report(y_valid, curr_model.predict(X_valid))}')

## doc2vec + topic vectors

In [5]:
X = np.array(list(df.vectors[:1688]))
y = np.array(list(df.Label[:1688]))

baseline_model_filter(models, X, y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.38      0.02      0.05       122
           1       0.57      0.97      0.72       165

    accuracy                           0.57       287
   macro avg       0.47      0.50      0.38       287
weighted avg       0.49      0.57      0.43       287

nb_model 
 report:              precision    recall  f1-score   support

           0       0.40      0.37      0.38       122
           1       0.56      0.59      0.58       165

    accuracy                           0.50       287
   macro avg       0.48      0.48      0.48       287
weighted avg       0.49      0.50      0.49       287

knn_model 
 report:              precision    recall  f1-score   support

           0       0.41      0.36      0.38       122
           1       0.57      0.62      0.59       165

    accuracy                           0.51       287
   macro avg       0.49      0.49      0.49       287
weighted avg       

## topic vectors only

In [6]:
X = np.array(list(df.top_vecs[:1688]))
y = np.array(list(df.Label[:1688]))

baseline_model_filter(models, X, y)

lr_model 
 report:              precision    recall  f1-score   support

           0       0.00      0.00      0.00       122
           1       0.57      1.00      0.73       165

    accuracy                           0.57       287
   macro avg       0.29      0.50      0.37       287
weighted avg       0.33      0.57      0.42       287

nb_model 
 report:              precision    recall  f1-score   support

           0       0.35      0.19      0.25       122
           1       0.55      0.75      0.64       165

    accuracy                           0.51       287
   macro avg       0.45      0.47      0.44       287
weighted avg       0.47      0.51      0.47       287

knn_model 
 report:              precision    recall  f1-score   support

           0       0.45      0.41      0.43       122
           1       0.59      0.64      0.61       165

    accuracy                           0.54       287
   macro avg       0.52      0.52      0.52       287
weighted avg       