In [9]:
#Implementation of Xiaodong Li paper on sentiment analysis - NIFTY 50 stocks 
#Harvard IV dictionary
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import os,re,math
from sklearn import preprocessing
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
LabeledSentence = gensim.models.doc2vec.LabeledSentence
from pythonds.basic.stack import Stack

In [46]:
#Functions
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

def clean_data(data_frame):
    return data_frame.fillna(method='ffill')

def load_data(stock_list,daterange,field):
	x = pd.DataFrame(index = daterange) 
	for symbol in stock_list:
		pdf = pd.read_csv("~/projects/trading/hist2/{}.csv".format(symbol), index_col = "Date", parse_dates = True, usecols = ['Date',field], na_values = ['null'])
		pdf.columns = [field]
		x = x.join(pdf)
	x = clean_data(x)
	return x

def roundoff(x):
	if x>0:
		return 1
	else:
		return 0
    
def count(text,ndim):
    """
    counts words of the given sentiment
    """
    x = np.zeros(ndim)
    for word in text:
        #print "**",word
        word = (re.sub('[^a-zA-Z]+', '', word))#.upper()
        try:
            x += dict[word]
        except:
            pass
    return x

def vectorize(row,ndim):
    temp = np.zeros(ndim)
    for i in range(1,ndim+1):
        try:
            if math.isnan(row[i]):   
                pass
        except:
            temp[i-1] = 1
    return temp
    

In [64]:
#Necessary static data
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}

neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')


stock_list = ['AMBUJACEM','ASIANPAINT','AUROPHARMA','AXISBANK','BAJFINANCE','BPCL','BOSCHLTD','CIPLA','DRREDDY','EICHERMOT','GAIL','HCLTECH','HDFCBANK','HEROMOTOCO','HINDALCO','HINDPETRO','HINDUNILVR','HDFC','ITC','ICICIBANK','IOC','INDUSINDBK','INFY','KOTAKBANK','LT','LUPIN','M&M','MARUTI','NTPC','ONGC','POWERGRID','RELIANCE','SBIN','SUNPHARMA','TCS','TATAMOTORS','TATASTEEL','TECHM','UPL','ULTRACEMCO','VEDL','WIPRO','YESBANK','ZEEL']

#print item
#Training dates
start_date = "2011-01-01"
end_date = "2017-12-31"
daterange = pd.bdate_range(start_date,end_date).tolist()
start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
date_generated = [start + datetime.timedelta(days=x) for x in range(0, (end-start).days)]
ndim = 172
datelist = []
for date in date_generated:
    	datelist.append(date.strftime("%Y-%m-%d"))
        
#Word list
temp = pd.read_csv("/home/harsha/projects/P2/winter/harvard_iv.csv", index_col = "Entry")
#print temp
#Cleaning the word list

dict = {}
index1 = iter(temp.index)
s = Stack()
#count = 0
for word in index1:
    #print word   
    dict[re.sub('[^a-zA-Z]+', '', word)] = vectorize(temp.ix[word],ndim)
    #print re.sub('[^a-zA-Z]+', '', word),dict[re.sub('[^a-zA-Z]+', '', word)]
print len(dict.keys())

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


8629


In [67]:
newsdf_t = pd.DataFrame(columns = ['News','C2C+1','C2O+1','O2C+0','O2C+1','DVOL+0','DVOL+1'])
for symbol in stock_list:
    print symbol
    #C2C+1: Close day 0 to close day+1, 
    #C2O+1: Close to next open, O2C+0: same day open to close, 
    #O2C+1:Next day open to close DVOL0:same day change in volume compared to prev day,
    #DVOL+1: Next day change in volume compared to present day
    newsdf_x = pd.DataFrame(index = datelist, columns = ['News','C2C+1','C2O+1','O2C+0','O2C+1','DVOL+0','DVOL+1']) #Tokenized
    newsdf_y = pd.DataFrame(index = datelist, columns = ['News','C2C+1','C2O+1','O2C+0','O2C+1','DVOL+0','DVOL+1']) #Full text
    newsdfindex = newsdf_x.index 
    closedf = load_data([symbol],datelist,"Close")
    opendf = load_data([symbol],datelist,"Open")
    volumedf = load_data([symbol],datelist,"Volume")
    covdf = closedf.join(opendf, how = 'inner') #Close,Open,Volume 
    covdf = covdf.join(volumedf, how = 'inner')
    #print closedf  	
    
    for i in range(0,len(datelist)): ## newsdfindex changed to datelist
        try:
            text = open("/home/harsha/projects/P2/1692018/data_full/{}/{}".format(symbol,datelist[i]),'r')
            text = text.read()
            #print text
            #text = "A abandon abandon abandon ABANDON bad good "
            text = neg_pattern.sub(lambda x: negations_dic[x.group()], text)
            text = re.sub("[^a-zA-Z]", " ", text)
            tokenizer = RegexpTokenizer(r'\w+')
            text2 = tokenizer.tokenize(text)
            proxy_clean = []
            for word in text2:
                word = str(word)
                word = word.lower()
                proxy_clean.append(word)
            clean = [word.upper() for word in proxy_clean if word not in stopwords.words('english')]
            #print clean
            newsdf_x.ix[datelist[i]]["News"] = list(count(clean,ndim))
            #print newsdf_x.ix[datelist[i]]["News"],"*" 
            
        except Exception as e: 
            #print(e)
            pass
        
    for i in range(0,len(datelist)):
        try:
            newsdf_x.ix[datelist[i]]["C2C+1"] = roundoff((covdf.values[i+1][0]-covdf.values[i][0])/covdf.values[i][0])
            newsdf_x.ix[datelist[i]]["C2O+1"] = roundoff((covdf.values[i+1][1]-covdf.values[i][0])/covdf.values[i][0])
            newsdf_x.ix[datelist[i]]["O2C+0"] = roundoff((covdf.values[i][0]-covdf.values[i][1])/covdf.values[i][1])
            newsdf_x.ix[datelist[i]]["O2C+1"] = roundoff((covdf.values[i+1][0]-covdf.values[i+1][1])/covdf.values[i+1][1])
            newsdf_x.ix[datelist[i]]["DVOL+0"] = roundoff((covdf.values[i][2]-covdf.values[i-1][2])/covdf.values[i-1][2])
            newsdf_x.ix[datelist[i]]["DVOL+1"] = roundoff((covdf.values[i+1][2]-covdf.values[i][2])/covdf.values[i][2])                                     
        except:
            pass
    print newsdf_x.shape    
    newsdf_t = pd.concat([newsdf_t,newsdf_x], ignore_index = True)
"""
x_train, x_test, y_train, y_test = train_test_split(np.array(newsdf2['News']),np.array(newsdf2['Effect']), test_size=0.2)
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')"""
#print newsdf_t.shape

AMBUJACEM


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#

(2556, 7)
ASIANPAINT
(2556, 7)
AUROPHARMA
(2556, 7)
AXISBANK
(2556, 7)
BAJFINANCE
(2556, 7)
BPCL
(2556, 7)
BOSCHLTD
(2556, 7)
CIPLA
(2556, 7)
DRREDDY
(2556, 7)
EICHERMOT
(2556, 7)
GAIL
(2556, 7)
HCLTECH
(2556, 7)
HDFCBANK
(2556, 7)
HEROMOTOCO
(2556, 7)
HINDALCO
(2556, 7)
HINDPETRO
(2556, 7)
HINDUNILVR
(2556, 7)
HDFC
(2556, 7)
ITC
(2556, 7)
ICICIBANK
(2556, 7)
IOC
(2556, 7)
INDUSINDBK
(2556, 7)
INFY
(2556, 7)
KOTAKBANK
(2556, 7)
LT
(2556, 7)
LUPIN
(2556, 7)
M&M
(2556, 7)
MARUTI
(2556, 7)
NTPC
(2556, 7)
ONGC
(2556, 7)
POWERGRID
(2556, 7)
RELIANCE
(2556, 7)
SBIN
(2556, 7)
SUNPHARMA
(2556, 7)
TCS
(2556, 7)
TATAMOTORS
(2556, 7)
TATASTEEL
(2556, 7)
TECHM
(2556, 7)
UPL
(2556, 7)
ULTRACEMCO
(2556, 7)
VEDL
(2556, 7)
WIPRO
(2556, 7)
YESBANK
(2556, 7)
ZEEL
(2556, 7)


"\nx_train, x_test, y_train, y_test = train_test_split(np.array(newsdf2['News']),np.array(newsdf2['Effect']), test_size=0.2)\nx_train = labelizeTweets(x_train, 'TRAIN')\nx_test = labelizeTweets(x_test, 'TEST')"

In [68]:
#Ignoring the ones with zero vectors in newsdf_t2
#print newsdf_t
newsdf_t1 = newsdf_t.dropna()
#print newsdf_t1
newsdf_t2 = pd.DataFrame(columns = ['News','C2C+1','C2O+1','O2C+0','O2C+1','DVOL+0','DVOL+1'])
for i in range(0,len(newsdf_t1.index)):
    if(np.sum(np.array(newsdf_t1.values[i][0])) != 0):
        newsdf_t2 = newsdf_t2.append(newsdf_t1.ix[newsdf_t1.index[i]])
        print "***"
    else:
        pass
print newsdf_t2    
print newsdf_t1.shape, newsdf_t2.shape
newsdf_t2.to_csv("/home/harsha/projects/P2/winter/sentivectors_harvard.csv")

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***


***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
***
                                                     News C2C+1 C2O+1 O2C+0  \
5       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, ...     0     0     0   
10      [3.0, 0.0, 2.0, 2.0, 0.0, 0.0, 3.0, 0.0, 0.0, ...     1     1     0   
20      [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...     0     1     0   
33      [3.0, 0.0, 3

In [69]:
#newsdf_t2 is the final data to be trained on. Now creating different datasets for different predictions
#like close to close, close to open etc.
#C2C+1 = 1
#C20+1 = 2
#O2C+0 = 3 
#O2C+1 = 4
#DVOL+0 = 5
#DVOL+1 = 6
#print newsdf_t2

#1
x_train_1, x_test_1, y_train_1, y_test_1 = train_test_split(np.array(newsdf_t2['News']),np.array(newsdf_t2['C2C+1']), test_size=0.2)
#print np.array(x_train_1[1])
temp1 = np.concatenate([np.array(x_train_1[i]).reshape((1,172)) for i in range(0,len(x_train_1))])
temp1_t = np.concatenate([np.array(x_test_1[i]).reshape((1,172)) for i in range(0,len(x_test_1))])
#_t => test
#x_train_1 = labelizeTweets(x_train_1, 'TRAIN')
#x_test_1 = labelizeTweets(x_test_1, 'TEST')

#2
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(np.array(newsdf_t2['News']),np.array(newsdf_t2['C2O+1']), test_size=0.2)
temp2 = np.concatenate([np.array(x_train_2[i]).reshape((1,ndim)) for i in range(0,len(x_train_2))])
temp2_t = np.concatenate([np.array(x_test_2[i]).reshape((1,ndim)) for i in range(0,len(x_test_2))])
#
#x_train_2 = labelizeTweets(x_train_2, 'TRAIN')
#x_test_2 = labelizeTweets(x_test_2, 'TEST')

#3
x_train_3, x_test_3, y_train_3, y_test_3 = train_test_split(np.array(newsdf_t2['News']),np.array(newsdf_t2['O2C+0']), test_size=0.2)
temp3 = np.concatenate([np.array(x_train_3[i]).reshape((1,ndim)) for i in range(0,len(x_train_3))])
temp3_t = np.concatenate([np.array(x_test_3[i]).reshape((1,ndim)) for i in range(0,len(x_test_3))])
#x_train_3 = labelizeTweets(x_train_3, 'TRAIN')
#x_test_3 = labelizeTweets(x_test_3, 'TEST')

#4
x_train_4, x_test_4, y_train_4, y_test_4 = train_test_split(np.array(newsdf_t2['News']),np.array(newsdf_t2['O2C+1']), test_size=0.2)
temp4 = np.concatenate([np.array(x_train_4[i]).reshape((1,ndim)) for i in range(0,len(x_train_4))])
temp4_t = np.concatenate([np.array(x_test_4[i]).reshape((1,ndim)) for i in range(0,len(x_test_4))])
#
#x_train_4 = labelizeTweets(x_train_4, 'TRAIN')
#x_test_4 = labelizeTweets(x_test_4, 'TEST')

#5
x_train_5, x_test_5, y_train_5, y_test_5 = train_test_split(np.array(newsdf_t2['News']),np.array(newsdf_t2['DVOL+0']), test_size=0.2)
temp5 = np.concatenate([np.array(x_train_5[i]).reshape((1,ndim)) for i in range(0,len(x_train_5))])
temp5_t = np.concatenate([np.array(x_test_5[i]).reshape((1,ndim)) for i in range(0,len(x_test_5))])
#x_train_5 = labelizeTweets(x_train_5, 'TRAIN')
#x_test_5 = labelizeTweets(x_test_5, 'TEST')

#6
x_train_6, x_test_6, y_train_6, y_test_6 = train_test_split(np.array(newsdf_t2['News']),np.array(newsdf_t2['DVOL+1']), test_size=0.2)
temp6 = np.concatenate([np.array(x_train_6[i]).reshape((1,ndim)) for i in range(0,len(x_train_6))])
temp6_t = np.concatenate([np.array(x_test_6[i]).reshape((1,ndim)) for i in range(0,len(x_test_6))])
#x_train_6 = labelizeTweets(x_train_6, 'TRAIN')
#x_test_6 = labelizeTweets(x_test_6, 'TEST')


print len(x_train_1),len(x_train_2),len(x_train_3),len(x_train_4),len(x_train_5),len(x_train_6)

24920 24920 24920 24920 24920 24920


In [None]:
#Classifier - Neural Network
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=ndim))
model.add(Dropout(0.2))
model.add(Dense(32, activation='softmax', input_dim=ndim))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
          loss='binary_crossentropy',
          metrics=['accuracy'])


#print temp
hist = model.fit(temp2, y_train_2, epochs=50, batch_size=32, verbose=2)


score = model.evaluate(temp2_t, y_test_2, batch_size=32, verbose=2)

preds = model.predict(temp1_t)
plt.plot(hist.history["loss"])
#plt.plot(hist.history["val_loss"])
plt.title("model loss")
plt.ylabel("loss")
plt.xlabel("epoch")
plt.legend(["train", "val"], loc="upper left")
plt.show()
#tsne_plot(news_w2v)
#xdf.ix[symbol]["Accuracy"] = score[1]*100
print "Model accuracy for = ", score[1]*100, "%"
print score

In [79]:
n = len(preds)
for i in range(0,n):
    print preds[i],"\n"

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.00864639] 

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.3628497] 

[0.3673793] 

[0.3568997] 

[0.3673793] 

[0.3628497] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.02064585] 

[0.3673793] 

[0.36731583] 

[0.3673793] 

[0.35728145] 

[0.3673793] 

[0.3673793] 

[0.3674339] 

[0.3673793] 

[0.3673793] 

[0.36244792] 

[0.00095423] 

[0.36349872] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.07771378] 

[0.3673793] 

[0.3673793] 

[0.3732203] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36593089] 

[0.37219682] 

[0.37304828] 

[0.36244792] 

[0.3673793] 

[0.3566621] 

[0.3670274] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3732209] 

[0.3568997] 

[0.0004107] 

[0.3673793] 

[0.3673793] 

[0.3673

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3628497] 

[0.3673793] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.3732209] 

[0.36296704] 

[0.36671516] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.362448] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3661881] 

[0.3673793] 

[0.36245918] 

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.3628497] 

[0.3673793] 

[0.3568997] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.37321982] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.37284052] 

[0.3669917] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36737916] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.19998637] 

[0.36226702] 

[0.3673786] 

[0.3673793] 

[0.36244792] 

[0.3673793] 

[0.3673793]

[0.3673793] 

[0.3568997] 

[0.3673793] 

[0.3673793] 

[0.36023784] 

[0.3673793] 

[0.37322015] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.36244792] 

[0.3673793] 

[0.36732122] 

[0.36244792] 

[0.36294815] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.7511763] 

[0.3673793] 

[0.3673793] 

[0.3732209] 

[0.3624636] 

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673274] 

[0.3673793] 

[0.36244792] 

[0.36244792] 

[0.3673793] 

[0.36691085] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.35695434] 

[0.3732209] 

[0.3568997] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.32761362] 

[0.3673793] 

[0.3568997] 

[0.3673793] 

[0.37322012] 

[0.3673793] 

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36730847] 

[0.3673793] 

[0.3628726] 

[0.3673793] 

[0.00036958] 

[0.3673793] 

[0.3

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3628497] 

[0.36736807] 

[0.36728] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36737916] 

[0.3034842] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36786056] 

[0.3673793] 

[0.3673793] 

[0.3569006] 

[0.3732148] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.3673792] 

[0.3673793] 

[0.3628497] 

[0.3673793] 

[0.36265793] 

[0.3673793] 

[0.3673793] 

[0.36289668] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36292374] 

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36284915] 

[0.35691437] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36737582] 

[0.3673793] 

[0.3673793] 

[0.3563688] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.01888063] 

[0.36244792] 

[0.3673793] 

[0.3732209] 

[0.75309205] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.00560

[0.3673793] 

[0.3673793] 

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.0288696] 

[0.3673793] 

[0.35690135] 

[0.02630573] 

[0.36244792] 

[0.36260954] 

[0.3628497] 

[0.3673793] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673667] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36737925] 

[0.3673793] 

[0.37322044] 

[0.37322083] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.67701346] 

[0.7305131] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.34945306] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3628497] 

[0.3673793] 

[0.3628497] 

[0.00441384] 

[0.36737883] 

[0.3628497] 

[0.36737916] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36738002] 

[0.3673793] 

[0.3673793] 

[0.3732209] 

[0.36251652] 

[0.3673793] 

[0.36281353] 

[0.36246264] 

[0.36244792] 

[0.3673793] 

[0

[0.3673793] 

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3569782] 

[0.37321976] 

[0.3628497] 

[0.35700592] 

[0.3673793] 

[0.36244792] 

[0.3673793] 

[0.3673793] 

[0.3732209] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3628559] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673795] 

[0.3624384] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.37322062] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.37318516] 

[0.3673793] 

[0.3673356] 

[0.3673793] 

[0.3673793] 

[0.36244792] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3732209] 

[0.3568997] 

[0.3673793] 

[0.36244792] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.36737806] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36736363] 

[0.3673793] 

[0.36244792] 

[0.367379

[0.3732209] 

[0.3732209] 

[0.36737257] 

[0.70736915] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.7530551] 

[0.3673793] 

[0.3673793] 

[0.7529596] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.36726573] 

[0.21702315] 

[0.3732209] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.35722083] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3624051] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36736345] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36739454] 

[0.3673793] 

[0.3673793] 

[0.75304055] 

[0.36627686] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.36285007] 

[0.3628497] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.37309846] 

[0.3673793] 

[0.3673793] 

[0.3628497] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.35690135] 

[0.36240134] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3673793] 

[0.3635633] 

[0.3673793] 

[0.3673793] 

[0.3673