In [17]:
import string
import unicodedata
import sys
import collections
import random
import math
import os
from collections import Counter
from ast import literal_eval
import regex as re
import pickle
from functools import reduce
from datetime import datetime 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import xgboost as xgb
from IPython.display import display

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import explained_variance_score

from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import tensorflow as tf
from keras.models import Sequential, Model, load_model
from keras.layers import (  Dense, Conv1D, Activation, MaxPool1D, 
                            Embedding, Flatten, Reshape, concatenate, 
                            Input, Dropout, LSTM, AveragePooling1D, Masking )
from keras import optimizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
import h5py as h5py

%matplotlib inline

## Merging data into a single dataframe

In [21]:
df = pd.DataFrame()

for filename in os.listdir(os.path.join('Data')):
    if filename[-3:] == 'csv':
        stockDF = pd.read_csv(os.path.join(os.getcwd(), 'Data', filename), sep='|')
        stockDF['SYM'] = filename[:-4]
        df = df.append(stockDF, ignore_index=True)   
    
df.head()

Unnamed: 0,Difference,date,nextClose,nextDay,prevClose,prevDay,text,SYM
0,-1.3,2011-06-08,376.55,2011-06-09,377.85,2011-06-07,Airtel commences 3G services in J&K,MERGEAIRTEL
1,5.35,2011-06-13,379.3,2011-06-14,373.95,2011-06-10,"Airtel dances to African tune, sees more 3G li...",MERGEAIRTEL
2,3.8,2011-06-16,380.55,2011-06-17,376.75,2011-06-15,TCIL may approach company law board against Bh...,MERGEAIRTEL
3,20.2,2011-06-17,389.85,2011-06-20,369.65,2011-06-16,Malkani bullish on Bharti Airtel,MERGEAIRTEL
4,20.2,2011-06-17,389.85,2011-06-20,369.65,2011-06-16,Hold Bharti Airtel: Angel Broking,MERGEAIRTEL


In [22]:
df.describe()

Unnamed: 0,Difference,nextClose,prevClose
count,30045.0,30045.0,30045.0
mean,-2.832062,1089.146976,1091.979038
std,91.934622,775.223777,778.694248
min,-2648.65,162.05,155.9
25%,-12.1,405.95,405.85
50%,0.95,938.0,934.25
75%,14.9,1436.4,1445.05
max,303.05,4365.9,4359.85


## Data Cleaning Functions

- Delete stopwords, punctuation, number, lemmatize words

In [23]:
stops = stopwords.words('english')
porter = PorterStemmer()
lemma = WordNetLemmatizer()

tbl = dict.fromkeys(i for i in range(sys.maxunicode)
                      if unicodedata.category(chr(i)).startswith('P') or i == 36 or i == ord('`'))


def remove_punctuation(text):
    '''
    From https://stackoverflow.com/questions/11066400/remove-punctuation-from-unicode-formatted-strings

    '''
    return text.translate(tbl)


def remove_stopwords(text, ret_format='str'):
    tokens = filter(lambda x: x not in stops, map(porter.stem, word_tokenize(text)))
    if ret_format == 'list':
        return list(tokens)
    elif ret_format == 'str':
        return ' '.join(tokens)
    else:
        raise Exception('Invalid format')


def restore_arr(a):
    '''
        Converts strings to python list
        
        params:
            a: String -> Input string to be converted to array
        return:
            list
        
        Usage with pandas:
            train_mod = pd.read_csv('modified_train.csv', converters={'description_norm': restore_arr})
    '''
    return [x.replace("'", "") for x in a[:-1][1:].split(', ')]
        
    

def restore_int_arr(a):
    return [int(x.replace("'", "")) for x in a[:-1][1:].split(', ')]
    

def restore_float_arr(a):
    ret = [float(x.replace("'", "")) for x in a[:-1][1:].split(', ')]
    if len(ret) == 1:
        return ret[0]
    else:
        return ret
    
def lemmatize(a):
    return [lemma.lemmatize(x) for x in a.split()]

def remove_numbers(a):
    ans = []
    for s in a.split():
        try:
            g = int(s)
        except ValueError:
            ans.append(s)
            
    return ' '.join(ans)

In [24]:
df['text_norm'] = df['text']\
                    .apply(str.lower)\
                    .apply(remove_punctuation)\
                    .apply(remove_numbers)\
                    .apply(remove_stopwords)\
                    .apply(lambda x: x.split())

In [25]:
df['date'] = df['date'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))

In [26]:
df['target'] = (df['Difference'] > 0).astype('int')

In [27]:
last_date = datetime.strptime('2016 Dec 31', '%Y %b %d')
train = df.copy(deep=True)

means = train['Difference'].mean()
std = train['Difference'].std()

train['Difference'] = (train['Difference']-train['Difference'].mean())/train['Difference'].std()

## Attempt 2: XGBoost 

In [28]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
count_vect = TfidfVectorizer()
X = count_vect.fit_transform(train['text'])
X.toarray()

x_train, x_test, y_train, y_test = train_test_split(
                                        X, 
                                        train['Difference'], 
                                        test_size=0.1, 
                                        random_state=4)

In [29]:
xgbModel = xgb.XGBRegressor(n_estimators=100, learning_rate=0.09, gamma=0, subsample=0.75,
                           colsample_bytree=1, max_depth=100, objective='reg:linear')

xgbModel.fit(x_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.09, max_delta_step=0,
       max_depth=100, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

### XGBoost

In [30]:
p = xgbModel.predict(x_test)
print(explained_variance_score(p,y_test))
# for pp, yy in zip(p, y_test):
#     print(pp, " -> ", yy)
# for pp, yy in zip(p, y_test):
#     print((pp*std + means), " -> ", (yy*std + means))

-1.55771311079


In [31]:
print(X.shape)
p = xgbModel.predict(X)
# for i, j in zip(p, [(pp*std + means) for pp in p]):
#     print(i,"->",j)
# print()
train['predict'] = [(pp*std + means) for pp in p] 
train.head()

(30045, 11812)


Unnamed: 0,Difference,date,nextClose,nextDay,prevClose,prevDay,text,SYM,text_norm,target,predict
0,0.016665,2011-06-08,376.55,2011-06-09,377.85,2011-06-07,Airtel commences 3G services in J&K,MERGEAIRTEL,"[airtel, commenc, 3g, servic, jk]",0,25.030433
1,0.088999,2011-06-13,379.3,2011-06-14,373.95,2011-06-10,"Airtel dances to African tune, sees more 3G li...",MERGEAIRTEL,"[airtel, danc, african, tune, see, 3g, licens]",1,1.877295
2,0.072139,2011-06-16,380.55,2011-06-17,376.75,2011-06-15,TCIL may approach company law board against Bh...,MERGEAIRTEL,"[tcil, may, approach, compani, law, board, bha...",1,2.023366
3,0.250527,2011-06-17,389.85,2011-06-20,369.65,2011-06-16,Malkani bullish on Bharti Airtel,MERGEAIRTEL,"[malkani, bullish, bharti, airtel]",1,20.305157
4,0.250527,2011-06-17,389.85,2011-06-20,369.65,2011-06-16,Hold Bharti Airtel: Angel Broking,MERGEAIRTEL,"[hold, bharti, airtel, angel, broke]",1,-0.388998


In [32]:
train.to_csv('Main.csv', sep='|', index=False)

# Conclusions:

- The neural network model shows promise, however requires a much larger dataset than the approx. 30000 news headlines scraped
- XGboost performs significantly better with the smaller dataset (63% accuracy on the validation set)
- This indicates that it is possible to make a model for predicting the rise or fall of stock prices to better than random chance, especially if a larger and more diverse dataset is obtained