In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
newsData = pd.read_csv('Combined_News_DJIA.csv')
#remove all non-alphabetic char from headlines.
df = newsData.copy()
df.replace("[^a-zA-Z]"," ",regex=True, inplace=True)
for i in df.columns:
    if i=='Date':
        continue
    if i=='Label':
        continue
    df[i] = df[i].str.lower()
df['Date'] = newsData['Date']
df.replace("^b ","",regex=True, inplace=True)
df.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,georgia downs two russian warplanes as count...,breaking musharraf to be impeached,russia today columns of troops roll into sout...,russian tanks are moving towards the capital o...,afghan children raped with impunity u n of...,russian tanks have entered south ossetia w...,breaking georgia invades south ossetia russi...,the enemy combatent trials are nothing but a...,...,georgia invades south ossetia if russia gets...,al qaeda faces islamist backlash,condoleezza rice the us would not act to pre...,this is a busy day the european union has ap...,georgia will withdraw soldiers from iraq...,why the pentagon thinks attacking iran is a ba...,caucasus in crisis georgia invades south osse...,indian shoe manufactory and again in a seri...,visitors suffering from mental illnesses banne...,no help for mexico s kidnapping surge
1,2008-08-11,1,why wont america and nato help us if they won...,bush puts foot down on georgian conflict,jewish georgian minister thanks to israeli tr...,georgian army flees in disarray as russians ad...,olympic opening ceremony fireworks faked,what were the mossad with fraudulent new zeala...,russia angered by israeli military sale to geo...,an american citizen living in s ossetia blames...,...,israel and the us behind the georgian aggressi...,do not believe tv neither russian nor georgi...,riots are still going on in montreal canada ...,china to overtake us as largest manufacturer,war in south ossetia pics,israeli physicians group condemns state torture,russia has just beaten the united states over...,perhaps the question about the georgia rus...,russia is so much better at war,so this is what it s come to trading sex for ...
2,2008-08-12,0,remember that adorable year old who sang at ...,russia ends georgia operation,if we had no sexual harassment we would have ...,al qa eda is losing support in iraq because of...,ceasefire in georgia putin outmaneuvers the w...,why microsoft and intel tried to kill the xo ...,stratfor the russo georgian war and the balan...,i m trying to get a sense of this whole georgi...,...,u s troops still in georgia did you know the...,why russias response to georgia was right,gorbachev accuses u s of making a serious bl...,russia georgia and nato cold war two,remember that adorable year old who led you...,war in georgia the israeli connection,all signs point to the us encouraging georgia ...,christopher king argues that the us and nato a...,america the new mexico,bbc news asia pacific extinction by man n...
3,2008-08-13,0,u s refuses israel weapons to attack iran r...,when the president ordered to attack tskhinval...,israel clears troops who killed reuters camer...,britain s policy of being tough on drugs is ...,body of year old found in trunk latest ra...,china has moved million quake survivors i...,bush announces operation get all up in russia ...,russian forces sink georgian ships,...,elephants extinct by,us humanitarian missions soon in georgia if ...,georgia s ddos came from us sources,russian convoy heads into georgia violating t...,israeli defence minister us against strike on...,gorbachev we had no choice,witness russian forces head towards tbilisi i...,quarter of russians blame u s for conflict ...,georgian president says us military will take...,nobel laureate aleksander solzhenitsyn a...
4,2008-08-14,1,all the experts admit that we should legalise ...,war in south osetia pictures made by a ru...,swedish wrestler ara abrahamian throws away me...,russia exaggerated the death toll in south oss...,missile that killed inside pakistan may have...,rushdie condemns random house s refusal to pub...,poland and us agree to missle defense deal in...,will the russians conquer tblisi bet on it n...,...,bank analyst forecast georgian crisis days e...,georgia confict could set back russia s us rel...,war in the caucasus is as much the product of ...,non media photos of south ossetia georgia co...,georgian tv reporter shot by russian sniper du...,saudi arabia mother moves to block child marr...,taliban wages war on humanitarian aid workers,russia world can forget about georgia s t...,darfur rebels accuse sudan of mounting major a...,philippines peace advocate say muslims need ...


In [3]:
#remove NA and replace with empty string
df.fillna(' ', inplace=True)
df.isnull().sum()

Date     0
Label    0
Top1     0
Top2     0
Top3     0
Top4     0
Top5     0
Top6     0
Top7     0
Top8     0
Top9     0
Top10    0
Top11    0
Top12    0
Top13    0
Top14    0
Top15    0
Top16    0
Top17    0
Top18    0
Top19    0
Top20    0
Top21    0
Top22    0
Top23    0
Top24    0
Top25    0
dtype: int64

In [4]:
def mergeCol(row):
    return ' '.join(row['Top1':'Top25'])
df['headlines'] = df.apply(mergeCol, axis = 1)

In [5]:
#Tokenizing
tokenized_text = df['headlines'].apply(lambda x: x.split())
tokenized_text.head()

0    [georgia, downs, two, russian, warplanes, as, ...
1    [why, wont, america, and, nato, help, us, if, ...
2    [remember, that, adorable, year, old, who, san...
3    [u, s, refuses, israel, weapons, to, attack, i...
4    [all, the, experts, admit, that, we, should, l...
Name: headlines, dtype: object

In [6]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

#Stemming
tokenized_text = tokenized_text.apply(lambda x: [stemmer.stem(i)
                                                  for i in x])

In [7]:
for i in range (len(tokenized_text)):
    tokenized_text[i] = ' '.join(tokenized_text[i])
df['tidy_text'] = tokenized_text

In [8]:
all_headline_words = ' '.join([text for text in df['headlines']])
print(f'Total unique words in headline are {len(set(all_headline_words.split()))}')

all_tidytext_words = ' '.join([text for text in df['tidy_text']])
print(f'Total unique words in tidy text are {len(set(all_tidytext_words.split()))}')

Total unique words in headline are 33270
Total unique words in tidy text are 21809


In [9]:
#Vectorized the corpus on the whole data. Using unigram does not make sense, we can test bigrams or higher for better accuracy. Taking maximum features.

countvector=CountVectorizer(ngram_range=(2,2),max_features=10000, stop_words = 'english') #todo play with max_feature and ngram
countvector.fit(df['headlines'])

CountVectorizer(max_features=10000, ngram_range=(2, 2), stop_words='english')

In [10]:
#Seperate the test and train data
train = df[df['Date'] < '2015-01-01']
test = df[df['Date'] > '2014-12-31']

In [11]:
#convert the data
#X_train = countvector.transform(train['headlines'])
#X_test = countvector.transform(test['headlines'])

# Attempt to find best max_feature and ngram for CountVectorizer on headlines

In [12]:
max_features_num = [500,600,700,800,900,1000]
ngram = [1,2,3,4,5]
scores_df = pd.DataFrame(np.zeros((len(max_features_num),len(max_features_num))))
for i in range(len(max_features_num)):
    for j in ngram:
        countvector=CountVectorizer(ngram_range=(j,j),max_features=max_features_num[i],stop_words = 'english')
        countvector.fit(df['headlines'])
        X_train = countvector.transform(train['headlines'])
        X_test = countvector.transform(test['headlines'])
        #traindataset=countvector.fit_transform(headlines)
        #test_dataset = countvector.transform(test_transform)

        xgb = XGBClassifier(random_state =1)
        xgb.fit(pd.DataFrame(X_train.todense(), columns=countvector.get_feature_names()),train['Label'])
        predictions = xgb.predict(pd.DataFrame(X_test.todense(), columns=countvector.get_feature_names()))
        score=accuracy_score(test['Label'],predictions)
        print('max number of features used : {}'.format(max_features_num[i]))
        print('ngram_range ({},{})'.format(j,j))
        print(score)
        matrix=confusion_matrix(test['Label'],predictions)
        print('confusion matrix : {}'.format(matrix))
        print('===============================')
        
        scores_df.iloc[j,i] = score
        
#Rename column titles to max_feature_num
scores_df.columns = max_features_num



max number of features used : 500
ngram_range (1,1)
0.5
confusion matrix : [[ 71 115]
 [ 74 118]]
max number of features used : 500
ngram_range (2,2)
0.48677248677248675
confusion matrix : [[ 69 117]
 [ 77 115]]
max number of features used : 500
ngram_range (3,3)
0.5555555555555556
confusion matrix : [[ 57 129]
 [ 39 153]]
max number of features used : 500
ngram_range (4,4)
0.5238095238095238
confusion matrix : [[ 15 171]
 [  9 183]]
max number of features used : 500
ngram_range (5,5)
0.5052910052910053
confusion matrix : [[  1 185]
 [  2 190]]
max number of features used : 600
ngram_range (1,1)
0.4523809523809524
confusion matrix : [[ 66 120]
 [ 87 105]]
max number of features used : 600
ngram_range (2,2)
0.5529100529100529
confusion matrix : [[ 85 101]
 [ 68 124]]
max number of features used : 600
ngram_range (3,3)
0.5582010582010583
confusion matrix : [[ 59 127]
 [ 40 152]]
max number of features used : 600
ngram_range (4,4)
0.5238095238095238
confusion matrix : [[ 15 171]
 [  9 183

max number of features used : 900
ngram_range (2,2)
0.5158730158730159
confusion matrix : [[ 82 104]
 [ 79 113]]
max number of features used : 900
ngram_range (3,3)
0.5502645502645502
confusion matrix : [[ 52 134]
 [ 36 156]]
max number of features used : 900
ngram_range (4,4)
0.5238095238095238
confusion matrix : [[ 15 171]
 [  9 183]]
max number of features used : 900
ngram_range (5,5)
0.5052910052910053
confusion matrix : [[  1 185]
 [  2 190]]
max number of features used : 1000
ngram_range (1,1)
0.47883597883597884
confusion matrix : [[ 65 121]
 [ 76 116]]
max number of features used : 1000
ngram_range (2,2)
0.5185185185185185
confusion matrix : [[ 80 106]
 [ 76 116]]
max number of features used : 1000
ngram_range (3,3)
0.5502645502645502
confusion matrix : [[ 52 134]
 [ 36 156]]
max number of features used : 1000
ngram_range (4,4)
0.5238095238095238
confusion matrix : [[ 15 171]
 [  9 183]]
max number of features used : 1000
ngram_range (5,5)
0.5052910052910053
confusion matrix : 

In [13]:
#Max Feature_num vs ngram range

scores_df
#Highest accuracy resides with ngram (1,1)

Unnamed: 0,500,600,700,800,900,1000
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.5,0.452381,0.529101,0.510582,0.470899,0.478836
2,0.486772,0.55291,0.531746,0.505291,0.515873,0.518519
3,0.555556,0.558201,0.550265,0.550265,0.550265,0.550265
4,0.52381,0.52381,0.52381,0.52381,0.52381,0.52381
5,0.505291,0.505291,0.505291,0.505291,0.505291,0.505291


In [14]:
max_features_num = [500,600,700,800,900,1000]
ngram = [1,2,3,4,5]
scores_df_tt = pd.DataFrame(np.zeros((len(max_features_num),len(max_features_num))))
for i in range(len(max_features_num)):
    for j in ngram:
        countvector=CountVectorizer(ngram_range=(j,j),max_features=max_features_num[i],stop_words = 'english')
        countvector.fit(df['tidy_text'])
        X_train = countvector.transform(train['tidy_text'])
        X_test = countvector.transform(test['tidy_text'])
        #traindataset=countvector.fit_transform(headlines)
        #test_dataset = countvector.transform(test_transform)

        xgb = XGBClassifier(random_state =1)
        xgb.fit(pd.DataFrame(X_train.todense(), columns=countvector.get_feature_names()),train['Label'])
        predictions = xgb.predict(pd.DataFrame(X_test.todense(), columns=countvector.get_feature_names()))
        score=accuracy_score(test['Label'],predictions)
        print('max number of features used : {}'.format(max_features_num[i]))
        print('ngram_range ({},{})'.format(j,j))
        print(score)
        matrix=confusion_matrix(test['Label'],predictions)
        print('confusion matrix : {}'.format(matrix))
        print('===============================')
        
        scores_df_tt.iloc[j,i] = score
        
#Rename column titles to max_feature_num
scores_df_tt.columns = max_features_num



max number of features used : 500
ngram_range (1,1)
0.4708994708994709
confusion matrix : [[ 72 114]
 [ 86 106]]
max number of features used : 500
ngram_range (2,2)
0.5079365079365079
confusion matrix : [[ 72 114]
 [ 72 120]]
max number of features used : 500
ngram_range (3,3)
0.5211640211640212
confusion matrix : [[ 48 138]
 [ 43 149]]
max number of features used : 500
ngram_range (4,4)
0.5079365079365079
confusion matrix : [[ 12 174]
 [ 12 180]]
max number of features used : 500
ngram_range (5,5)
0.5052910052910053
confusion matrix : [[  1 185]
 [  2 190]]
max number of features used : 600
ngram_range (1,1)
0.4894179894179894
confusion matrix : [[ 73 113]
 [ 80 112]]
max number of features used : 600
ngram_range (2,2)
0.48412698412698413
confusion matrix : [[ 68 118]
 [ 77 115]]
max number of features used : 600
ngram_range (3,3)
0.5317460317460317
confusion matrix : [[ 53 133]
 [ 44 148]]
max number of features used : 600
ngram_range (4,4)
0.5079365079365079
confusion matrix : [[ 12

max number of features used : 900
ngram_range (1,1)
0.4894179894179894
confusion matrix : [[ 69 117]
 [ 76 116]]
max number of features used : 900
ngram_range (2,2)
0.5132275132275133
confusion matrix : [[ 79 107]
 [ 77 115]]
max number of features used : 900
ngram_range (3,3)
0.5396825396825397
confusion matrix : [[ 52 134]
 [ 40 152]]
max number of features used : 900
ngram_range (4,4)
0.5079365079365079
confusion matrix : [[ 12 174]
 [ 12 180]]
max number of features used : 900
ngram_range (5,5)
0.5052910052910053
confusion matrix : [[  1 185]
 [  2 190]]
max number of features used : 1000
ngram_range (1,1)
0.5132275132275133
confusion matrix : [[ 71 115]
 [ 69 123]]
max number of features used : 1000
ngram_range (2,2)
0.5185185185185185
confusion matrix : [[ 74 112]
 [ 70 122]]
max number of features used : 1000
ngram_range (3,3)
0.5502645502645502
confusion matrix : [[ 56 130]
 [ 40 152]]
max number of features used : 1000
ngram_range (4,4)
0.5079365079365079
confusion matrix : [[

In [15]:
#Max Feature_num vs ngram range

scores_df_tt
#Highest accuracy still resides with ngram (1,1)

Unnamed: 0,500,600,700,800,900,1000
0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.470899,0.489418,0.47619,0.478836,0.489418,0.513228
2,0.507937,0.484127,0.513228,0.515873,0.513228,0.518519
3,0.521164,0.531746,0.537037,0.542328,0.539683,0.550265
4,0.507937,0.507937,0.507937,0.507937,0.507937,0.507937
5,0.505291,0.505291,0.505291,0.505291,0.505291,0.505291


# XGBOOST Hyperparameters
Accuracy seemed to be highest on tidy text over headlines. ngram (1,1) seems highest overall. And we'll keep max features to 1000

In [16]:
countvector=CountVectorizer(ngram_range=(1,1),max_features=1000,stop_words = 'english')
countvector.fit(df['tidy_text'])
X_train = countvector.transform(train['tidy_text'])
X_test = countvector.transform(test['tidy_text'])

xgb = XGBClassifier(random_state =1)
param_grid = {
    'n_estimators': [500,550,600,650],
    'colsample_bytree': [0.75,0.8,0.85],
    'max_depth': [None],
    'reg_alpha': [1],
    'reg_lambda': [2, 5, 10],
    'subsample': [0.55, 0.6, .65,0.9],
    'learning_rate':[0.5],
    'gamma':[.5,1,2],
    'min_child_weight':[0.01],
    'sampling_method': ['uniform']
}

clf_xgb = RandomizedSearchCV(xgb, param_distributions = param_grid, cv = 5, verbose = True, n_jobs = -1)
best_clf_xgb = clf_xgb.fit(pd.DataFrame(X_train.todense(), columns=countvector.get_feature_names()),train['Label'])

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [None]:
print(best_clf_xgb.best_score_)
print(best_clf_xgb.best_params_)

In [None]:
best_clf_xgb

In [None]:
#best_clf_xgb = best_clf_xgb.best_estimator_

best_clf_xgb.fit(pd.DataFrame(X_train.todense(), columns=countvector.get_feature_names()),train['Label'])
predictions = best_clf_xgb.predict(pd.DataFrame(X_test.todense(), columns=countvector.get_feature_names()))
score=accuracy_score(test['Label'],predictions)
print(score)
matrix=confusion_matrix(test['Label'],predictions)
print('confusion matrix :')
matrix=confusion_matrix(test['Label'],predictions)
print(matrix)

# Try CatBoost on same countvector

In [None]:
cb=CatBoostClassifier(random_state=1)
cb.fit(pd.DataFrame(X_train.todense(), columns=countvector.get_feature_names()),train['Label'])
predictions = cb.predict(pd.DataFrame(X_test.todense(), columns=countvector.get_feature_names()))
matrix=confusion_matrix(test['Label'],predictions)
score=accuracy_score(test['Label'],predictions)
print(score)
print('===============')
print(matrix)