In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the news training records from the feather snapshot.
news_feather_path = '../input/00-final-project-data-extract/news_train_df.feather'
news_train_df = pd.read_feather(news_feather_path)

In [None]:
# Keep only the news records whose assetCodes mention BAC.N, then drop the
# timestamp / identifier / free-text columns listed below.
# regex=False matches 'BAC.N' literally (as a regex the '.' would match any
# character), and na=False turns missing assetCodes into non-matches so the
# boolean mask never contains NaN.
is_bac_news = news_train_df['assetCodes'].str.contains('BAC.N', case=False, regex=False, na=False)
news_bac = news_train_df[is_bac_news].drop(
    ['sourceTimestamp', 'firstCreated', 'sourceId', 'headline', 'provider', 'headlineTag', 'assetCodes', 'assetName'], axis=1)

In [None]:
# Columns of news_bac that are rescaled to the [0, 1] range.
news_scaled_cols = [
    'urgency', 'takeSequence', 'bodySize', 'companyCount', 'marketCommentary',
    'sentenceCount', 'wordCount',
    'firstMentionSentence', 'relevance', 'sentimentClass',
    'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
    'sentimentWordCount', 'noveltyCount12H', 'noveltyCount24H',
    'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H',
    'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D',
    'volumeCounts7D']

# Cast marketCommentary to int8 before it is passed to the scaler.
news_bac['marketCommentary'] = news_bac['marketCommentary'].astype(np.int8)

# Fit the scaler on records dated before 2016 only: rows from 2016 are held
# out as the evaluation set later in the notebook, so their min/max must not
# influence the scaling. The fitted scaler is then applied to every row, which
# means 2016 values may fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
news_fit_rows = news_bac['time'].dt.year < 2016
scaler.fit(news_bac.loc[news_fit_rows, news_scaled_cols])
news_bac[news_scaled_cols] = scaler.transform(news_bac[news_scaled_cols])

In [None]:
def print_remaining(lbl, cnt):
    """Print how many news records the filter named `lbl` left unselected.

    lbl: label of the filter being reported on.
    cnt: number of news records that filter did not select.
    """
    message = '{} selected all but {} news record(s)'.format(lbl, cnt)
    print(message)
    
# Number of news records that do NOT carry the BAC.N asset code. regex=False
# matches the code literally ('.' is otherwise a regex wildcard), case=False
# mirrors the filter used to build news_bac, and na=False counts missing
# values as non-matches.
is_bac_news = news_train_df['assetCodes'].str.contains('BAC.N', case=False, regex=False, na=False)
print_remaining('assetCodes', (~is_bac_news).sum())

# Collect the upper-case codes quoted in the BAC records. str.extract yields
# NaN for records without a match, so those are dropped before building the
# alternation pattern ('|'.join raises TypeError on a NaN entry).
# NOTE(review): str.extract keeps only the FIRST quoted code per record; use
# str.extractall if every code of each record should be collected.
code_pattern = r'\'(?P<val>[A-Z]+)\''
subjects_bac  = news_bac['subjects'].str.extract(code_pattern)['val'].dropna().unique()
audiences_bac = news_bac['audiences'].str.extract(code_pattern)['val'].dropna().unique()

# Number of news records that share none of those subject / audience codes.
subjects_hit  = news_train_df['subjects'].str.contains('|'.join(subjects_bac), case=False, regex=True, na=False)
audiences_hit = news_train_df['audiences'].str.contains('|'.join(audiences_bac), case=False, regex=True, na=False)
print_remaining('subjects',  (~subjects_hit).sum())
print_remaining('audiences', (~audiences_hit).sum())

# The full news frame is not referenced again in this cell; release the name.
del news_train_df

In [None]:
# Remove the subjects and audiences columns from news_bac.
news_bac = news_bac.drop(['subjects', 'audiences'], axis=1)

In [None]:
# Aggregate the news records into one row per day (mean of every column),
# discard days on which every aggregated value is missing, store the result
# as float32 and turn the 'time' index back into a regular column.
news_bac = (
    news_bac
    .resample('1D', on='time')
    .mean()
    .dropna(how='all')
    .astype(np.float32)
    .reset_index()
)

In [None]:
# Load the BAC market data and back-fill gaps: each missing value takes the
# next valid value found further down the same column.
bac = pd.read_feather('../input/02-final-project-labels/bac_market_data.feather').bfill()

In [None]:
# Columns of bac that are rescaled to the [0, 1] range.
market_scaled_cols = [
    'volume', 'close', 'open',
    'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
    'returnsOpenNextMktres10']

# Fit the scaler on rows dated before 2016 only: rows from 2016 are held out
# as the evaluation set later in the notebook, so their min/max must not
# influence the scaling. The fitted scaler is then applied to every row, which
# means 2016 values may fall slightly outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
market_fit_rows = bac['time'].dt.year < 2016
scaler.fit(bac.loc[market_fit_rows, market_scaled_cols])
bac[market_scaled_cols] = scaler.transform(bac[market_scaled_cols])

In [None]:
# Left-join the daily news aggregates onto the market rows by 'time', then
# forward-fill so rows without a match inherit the last values seen above them.
market_and_news_bac = bac.merge(news_bac, how='left', on='time').ffill()

In [None]:
# Chronological split: rows before 2016 form the training set, rows from
# calendar year 2016 form the hold-out set.
row_year = market_and_news_bac['time'].dt.year
trn = market_and_news_bac[row_year < 2016]
tst = market_and_news_bac[row_year == 2016]

In [None]:
# Model input columns: market features followed by the daily news aggregates.
# 'returnsOpenNextMktres10' is deliberately NOT a feature: it is a
# forward-looking ("Next") return that is not known at prediction time, so
# feeding it to the model leaks future information into the inputs.
# NOTE(review): the label y presumably derives from that column -- confirm
# against the notebook that produced bac_market_data.feather.
X_cols = [
    # market features
    'volume', 'close', 'open',
    'returnsClosePrevRaw1', 'returnsOpenPrevRaw1',
    'returnsClosePrevMktres1', 'returnsOpenPrevMktres1',
    'returnsClosePrevRaw10', 'returnsOpenPrevRaw10',
    'returnsClosePrevMktres10', 'returnsOpenPrevMktres10',
    # news features
    'urgency', 'takeSequence',
    'bodySize', 'companyCount', 'marketCommentary', 'sentenceCount',
    'wordCount', 'firstMentionSentence', 'relevance', 'sentimentClass',
    'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
    'sentimentWordCount', 'noveltyCount12H', 'noveltyCount24H',
    'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H',
    'volumeCounts24H', 'volumeCounts3D', 'volumeCounts5D',
    'volumeCounts7D']
# Label column.
y_cols = ['y']

In [None]:
# Feature and label frames for the training and hold-out periods.
trn_X, trn_y = trn[X_cols], trn[y_cols]
tst_X, tst_y = tst[X_cols], tst[y_cols]

In [None]:
from keras.layers import Input, Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Model
import time #helper libraries

In [None]:
def df_to_3d(df):
    """Return the frame's values as a 3-D array of shape (rows, 1, columns).

    df: two-dimensional frame whose `.values` array is reshaped.
    """
    arr = df.values
    n_rows, n_cols = arr.shape[0], arr.shape[1]
    # Insert a length-1 middle axis between the row and column axes.
    return arr.reshape(n_rows, 1, n_cols)

In [None]:
# Input: sequences of length 1 with one value per feature column.
n_features = trn_X.shape[1]
inputs = Input(shape=(1, n_features))

# Two stacked LSTM layers, each followed by 20% dropout. The first returns its
# full output sequence so that the second LSTM receives a 3-D input.
x = LSTM(50, activation='relu', return_sequences=True)(inputs)
x = Dropout(0.2)(x)
x = LSTM(100, activation='relu')(x)
x = Dropout(0.2)(x)

# Single output unit with tanh activation.
predictions = Dense(1, activation='tanh')(x)

# Assemble and compile the model, reporting how long that takes.
start = time.time()
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer='rmsprop', loss='mse')
elapsed = time.time() - start
print('compilation time : ', elapsed)

In [None]:
# Train for 30 epochs in batches of 64, reporting the loss on the hold-out
# rows after every epoch.
fit_inputs = df_to_3d(trn_X)
validation_pair = (df_to_3d(tst_X), tst_y)
model.fit(fit_inputs, trn_y, epochs=30, batch_size=64, validation_data=validation_pair)

In [None]:
# Predict for every row: the training rows first, then the hold-out rows.
all_X = pd.concat([trn_X, tst_X])
preds = model.predict(df_to_3d(all_X))

In [None]:
# preds was produced from the rows of trn followed by the rows of tst, so the
# dates and labels are taken from that same row sequence. Reading them from
# bac instead only lines up when bac holds exactly those rows in exactly that
# order (time-sorted, nothing dated after 2016).
scored_rows = pd.concat([trn, tst])
eval_model = pd.DataFrame({'date': scored_rows['time'].dt.date,
                           'X': preds.flatten(),
                           'y': scored_rows['y']})

In [None]:
def score_model(X, y):
    """Score each prediction against its label on a [0, 1] scale (1 = exact).

    Both inputs are shifted by +1 and compared with the relative difference
    |a - b| / (|a| + |b|), which lies in [0, 1]. The score is one minus that
    value, so identical values score 1 and the score falls towards 0 as the
    two values diverge. The magnitude of the difference is used, so over- and
    under-prediction are penalised equally; mapping the signed difference
    through (1 - d) / 2 instead would give an exact prediction only 0.5.

    X: sequence of predicted values.
    y: sequence of true values, same length as X.

    Returns a list with one score per (prediction, label) pair.
    """
    adj_X = np.asarray(X, dtype=float) + 1
    adj_y = np.asarray(y, dtype=float) + 1
    gap = np.abs(adj_X - adj_y)
    scale = np.abs(adj_X) + np.abs(adj_y)
    # scale is zero only when both shifted values are zero, i.e. the pair is
    # identical; leave the relative difference at 0 there instead of 0/0 = NaN.
    rel_diff = np.divide(gap, scale, out=np.zeros_like(scale), where=scale != 0)
    return list(1 - rel_diff)

In [None]:
# Score every prediction against its label and store the result per row.
predicted_vals = eval_model['X'].values
actual_vals = eval_model['y'].values
eval_model['scores'] = score_model(predicted_vals, actual_vals)

In [None]:
def plot_vs_time(data_frame, column, calculation='mean', span=10):
    """Plot an exponentially smoothed per-date aggregate of `column`.

    data_frame: frame holding a 'date' column and the column to plot.
    column: name of the column to aggregate and plot.
    calculation: per-date aggregation, one of 'mean', 'count' or 'nunique'.
    span: span of the exponentially weighted mean used for smoothing.

    Raises ValueError when `calculation` is not one of the supported names.
    """
    supported = ('mean', 'count', 'nunique')
    # Reject unknown aggregations up front instead of failing later on an
    # unassigned local variable.
    if calculation not in supported:
        raise ValueError(
            'calculation must be one of {}, got {!r}'.format(supported, calculation))

    group_temp = data_frame.groupby('date')[column].agg(calculation).reset_index()
    # Smooth only the aggregated values: running ewm over the whole frame
    # would also hand it the 'date' column, which is not numeric data.
    group_temp[column] = group_temp[column].ewm(span=span).mean()

    fig, ax = plt.subplots(figsize=(10, 3))
    ax.plot(group_temp['date'], group_temp[column])
    ax.set_xlabel('Time')
    ax.set_ylabel(column)
    ax.set_ylim((0, 1))
    ax.set_title('%s versus time' % column)

In [None]:
# Smoothed daily mean of the per-row scores over time.
plot_vs_time(eval_model, column='scores')

In [None]:
# Overall score: mean of the per-row scores.
mean_score = eval_model['scores'].mean()
print('Eval model score is {:.5f}'.format(mean_score))