In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

In [3]:
# Load / Clean
# Daily sentiment scores; the date column is parsed as YYYY-MM-DD.
sentiments = pd.read_csv("../data/sentiment/sentiment_days.csv")
sentiments['date'] = pd.to_datetime(sentiments['date'], format='%Y-%m-%d')

# Ticker table, minus its company column.
tickers = pd.read_csv("../data/asx-tickers.csv").drop(columns=['company'])

# Relative price changes; dates are stored as YYYYMMDD. The 'Unnamed: 0'
# column is discarded (presumably an index written out with the CSV).
stocks = pd.read_csv("../data/percent_change/relative_change.csv")
stocks['date'] = pd.to_datetime(stocks['date'], format='%Y%m%d')
stocks = stocks.drop(columns=['Unnamed: 0'])

# Keep only rows dated strictly between 2005-12-31 and 2017-12-31.
stocks = stocks.loc[(stocks['date'] > '2005-12-31') & (stocks['date'] < '2017-12-31')]

# Join / Label
# Attach the remaining ticker-table columns to every stock row.
industries = pd.merge(stocks, tickers, on='ticker')

# prior_date is the sentiment date shifted forward by one day, so the inner
# join pairs each stock row with the sentiment recorded one calendar day
# earlier; stock rows without such a sentiment row are dropped.
sentiments['prior_date'] = sentiments['date'] + datetime.timedelta(days=1)
data = pd.merge(industries, sentiments, how='inner',
                left_on='date', right_on='prior_date')

# Binary label: 1 where rel_difference_close_open is positive, otherwise 0.
is_positive = data['rel_difference_close_open'] > 0
data['label'] = np.where(is_positive, 1, 0)

# Preview
data.head(3)

Unnamed: 0,ticker,date_x,close,rel_difference_close_open,rel_difference_close_close,industry,date_y,mode,Sum_QDAP,Sum,Mean_QDAP,Mean,prior_date,label
0,AAC,2006-01-03,1.64,0.015244,0.051829,"Food, Beverage & Tobacco",2006-01-02,neutral,-4.304762,negative,-0.031887,negative,2006-01-03,1
1,AAR,2006-01-03,0.019,-0.052632,-0.052632,Materials,2006-01-02,neutral,-4.304762,negative,-0.031887,negative,2006-01-03,0
2,AAU,2006-01-03,0.7,0.028571,0.028571,Media,2006-01-02,neutral,-4.304762,negative,-0.031887,negative,2006-01-03,1


In [4]:
# Select a single industry
# Industry whose rows are used by the cells below.
industry = 'Food, Beverage & Tobacco'

# Bucket the joined rows by industry and pull out the one of interest;
# get_group raises KeyError when no row carries this industry.
groups = data.groupby(by='industry')
tick_dat = groups.get_group(industry)

# Preview
tick_dat.head(n=3)

Unnamed: 0,ticker,date_x,close,rel_difference_close_open,rel_difference_close_close,industry,date_y,mode,Sum_QDAP,Sum,Mean_QDAP,Mean,prior_date,label
0,AAC,2006-01-03,1.64,0.015244,0.051829,"Food, Beverage & Tobacco",2006-01-02,neutral,-4.304762,negative,-0.031887,negative,2006-01-03,1
92,BUG,2006-01-03,0.58,0.0,0.0,"Food, Beverage & Tobacco",2006-01-02,neutral,-4.304762,negative,-0.031887,negative,2006-01-03,0
105,CCL,2006-01-03,7.91,0.001264,0.02402,"Food, Beverage & Tobacco",2006-01-02,neutral,-4.304762,negative,-0.031887,negative,2006-01-03,1


In [5]:
# Get the training test data
# One seed shared by the split and the classifier, so a fresh
# Restart & Run All reproduces the same partition and the same score.
RANDOM_STATE = 1

# NOTE(review): this is a random row-level split. In the preview output,
# different tickers on the same date carry the same Sum_QDAP value, so
# same-day rows can land in both train and test. Consider splitting by date
# instead — TODO confirm whether the row-level split is intended.
train, test = train_test_split(tick_dat, test_size=0.2, random_state=RANDOM_STATE)

# Single feature (Sum_QDAP) and the binary label, kept as one-column frames.
train_X = train[['Sum_QDAP']]
train_Y = train[['label']]
test_X = test[['Sum_QDAP']]
test_Y = test[['label']]

print(train_X.shape)

# Build the model
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(40, 2),
                    random_state=RANDOM_STATE)
# The label frames are flattened to 1-D arrays before being passed to
# fit/score.
model = clf.fit(train_X, train_Y.values.ravel())
# Mean accuracy on the held-out 20%.
print(model.score(test_X, test_Y.values.ravel()))

(45171, 1)
0.6217125653059418
