In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from datetime import date, datetime, timedelta
from sklearn.metrics import accuracy_score

In [3]:
market_train_df = pd.read_csv("/content/gdrive/My Drive/Forecasting Stock Prices Using News Sentiment Analysis/Datasets/market_train_full.csv")
market_train_df.drop('Unnamed: 0', inplace = True, axis = 1)
market_train_df['time'] = pd.to_datetime(market_train_df['time'], errors='coerce')
print(f'{market_train_df.shape[0]} samples and {market_train_df.shape[1]} features in the training market dataset.')

4072956 samples and 16 features in the training market dataset.


In [4]:
market_train_df.head()

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe
0,2007-02-01 22:00:00+00:00,A.N,Agilent Technologies Inc,2606900.0,32.19,32.17,0.005938,0.005312,,,-0.00186,0.000622,,,0.034672,1.0
1,2007-02-01 22:00:00+00:00,AAI.N,AirTran Holdings Inc,2051600.0,11.12,11.08,0.004517,-0.007168,,,-0.078708,-0.088066,,,0.027803,0.0
2,2007-02-01 22:00:00+00:00,AAP.N,Advance Auto Parts Inc,1164800.0,37.51,37.99,-0.011594,0.025648,,,0.014332,0.045405,,,0.024433,1.0
3,2007-02-01 22:00:00+00:00,AAPL.O,Apple Inc,23747329.0,84.74,86.23,-0.011548,0.016324,,,-0.048613,-0.037182,,,-0.007425,1.0
4,2007-02-01 22:00:00+00:00,ABB.N,ABB Ltd,1208600.0,18.02,18.01,0.011791,0.025043,,,0.012929,0.020397,,,-0.017994,1.0


In [5]:
news_train_df = pd.read_csv("/content/gdrive/My Drive/Forecasting Stock Prices Using News Sentiment Analysis/Datasets/news_train.csv")
news_train_df.drop('Unnamed: 0', inplace = True, axis = 1)
news_train_df['time'] = pd.to_datetime(news_train_df['time'] , errors='coerce')
print(f'{market_train_df.shape[0]} samples and {market_train_df.shape[1]} features in the training news dataset.')

4072956 samples and 16 features in the training news dataset.


In [6]:
news_train_df.head()

Unnamed: 0,time,sourceTimestamp,firstCreated,sourceId,headline,urgency,takeSequence,provider,subjects,audiences,bodySize,companyCount,headlineTag,marketCommentary,sentenceCount,wordCount,assetCodes,assetName,firstMentionSentence,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,sentimentWordCount,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D
0,2007-01-01 04:29:32+00:00,2007-01-01 04:29:32+00:00,2007-01-01 04:29:32+00:00,e58c6279551b85cf,China's Daqing pumps 43.41 mln tonnes of oil i...,3,1,RTRS,"{'ENR', 'ASIA', 'CN', 'NGS', 'EMRG', 'RTRS', '...","{'Z', 'O', 'OIL'}",1438,1,,False,11,275,"{'0857.HK', '0857.F', '0857.DE', 'PTR.N'}",PetroChina Co Ltd,6,0.235702,-1,0.500739,0.419327,0.079934,73,0,0,0,0,0,0,0,3,6,7
1,2007-01-01 07:03:35+00:00,2007-01-01 07:03:34+00:00,2007-01-01 07:03:34+00:00,5a31c4327427f63f,"FEATURE-In kidnapping, finesse works best",3,1,RTRS,"{'FEA', 'CA', 'LATAM', 'MX', 'INS', 'ASIA', 'I...","{'PGE', 'PCO', 'G', 'ESN', 'MD', 'PCU', 'DNP',...",4413,1,FEATURE,False,55,907,{'STA.N'},Travelers Companies Inc,8,0.447214,-1,0.600082,0.345853,0.054064,62,1,1,1,1,1,1,1,3,3,3
2,2007-01-01 11:29:56+00:00,2007-01-01 11:29:56+00:00,2007-01-01 11:29:56+00:00,1cefd27a40fabdfe,PRESS DIGEST - Wall Street Journal - Jan 1,3,1,RTRS,"{'RET', 'ENR', 'ID', 'BG', 'US', 'PRESS', 'IQ'...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",2108,2,PRESS DIGEST,False,15,388,"{'WMT.DE', 'WMT.N'}",Wal-Mart Stores Inc,14,0.377964,-1,0.450049,0.295671,0.25428,67,0,0,0,0,0,0,0,5,11,17
3,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,23768af19dc69992,PRESS DIGEST - New York Times - Jan 1,3,1,RTRS,"{'FUND', 'FIN', 'CA', 'SFWR', 'INS', 'PUB', 'B...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",1776,6,PRESS DIGEST,False,14,325,"{'GOOG.O', 'GOOG.OQ', 'GOOGa.DE'}",Google Inc,13,0.149071,-1,0.752917,0.162715,0.084368,83,0,0,0,0,0,0,0,5,13,15
4,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,2007-01-01 12:08:37+00:00,23768af19dc69992,PRESS DIGEST - New York Times - Jan 1,3,1,RTRS,"{'FUND', 'FIN', 'CA', 'SFWR', 'INS', 'PUB', 'B...","{'T', 'DNP', 'PSC', 'U', 'D', 'M', 'RNP', 'PTD...",1776,6,PRESS DIGEST,False,14,325,{'XMSR.O'},XM Satellite Radio Holdings Inc,11,0.149071,-1,0.699274,0.20936,0.091367,102,0,0,0,0,0,0,0,0,0,0


In [7]:
start_date = date(2015,1,1)

In [8]:
# Removing NaN
column_market = ['returnsClosePrevMktres1','returnsOpenPrevMktres1','returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
column_raw = ['returnsClosePrevRaw1', 'returnsOpenPrevRaw1','returnsClosePrevRaw10', 'returnsOpenPrevRaw10']
for i in range(len(column_raw)):
    market_train_df[column_market[i]] = market_train_df[column_market[i]].fillna(market_train_df[column_raw[i]])

# removing anomalies from Market Data (See EDA)

market_train_df['close_to_open'] =  np.abs(market_train_df['close'] / market_train_df['open'])

market_train_df['assetName_mean_open'] = market_train_df.groupby('assetName')['open'].transform('mean')
market_train_df['assetName_mean_close'] = market_train_df.groupby('assetName')['close'].transform('mean')

# if open price is too far from mean open price for this company, replace it. Otherwise replace close price.
for i, row in market_train_df.loc[market_train_df['close_to_open'] >= 2].iterrows():
    if np.abs(row['assetName_mean_open'] - row['open']) > np.abs(row['assetName_mean_close'] - row['close']):
        market_train_df.iloc[i,5] = row['assetName_mean_open']
    else:
        market_train_df.iloc[i,4] = row['assetName_mean_close']
        
        
for i, row in market_train_df.loc[market_train_df['close_to_open'] <= 0.5].iterrows():
    if np.abs(row['assetName_mean_open'] - row['open']) > np.abs(row['assetName_mean_close'] - row['close']):
        market_train_df.iloc[i,5] = row['assetName_mean_open']
    else:
        market_train_df.iloc[i,4] = row['assetName_mean_close']

In [9]:
# Reducing timeframe to recent market activities
market_train_df = market_train_df.loc[market_train_df['time'].dt.date >= start_date]
news_train_df = news_train_df.loc[news_train_df['time'].dt.date >= start_date]

In [10]:
market_train_df.head()

Unnamed: 0,time,assetCode,assetName,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe,close_to_open,assetName_mean_open,assetName_mean_close
3180756,2015-01-02 22:00:00+00:00,A.N,Agilent Technologies Inc,1530798.0,40.56,41.18,-0.006856,-0.002638,-0.006383,0.01427,0.022619,0.076704,-0.01511,0.000847,-0.024209,1.0,0.984944,38.588223,38.60723
3180757,2015-01-02 22:00:00+00:00,AAL.O,American Airlines Group Inc,10756705.0,53.91,54.28,0.005221,0.00705,0.005505,0.016444,0.104713,0.124741,0.096103,0.110196,-0.039886,1.0,0.993183,40.424093,40.43318
3180758,2015-01-02 22:00:00+00:00,AAN.N,Aaron's Inc,899151.0,30.62,30.81,0.001636,0.0013,0.001943,0.005414,0.086586,0.10351,0.039357,0.020739,0.004615,0.0,0.993833,28.00248,28.010568
3180759,2015-01-02 22:00:00+00:00,AAP.N,Advance Auto Parts Inc,509983.0,158.56,160.85,-0.00452,0.001307,-0.004355,0.007037,-0.005831,0.023083,0.016424,0.065064,-0.042488,0.0,0.985763,84.275408,84.301749
3180760,2015-01-02 22:00:00+00:00,AAPL.O,Apple Inc,53204626.0,109.33,111.39,-0.009513,-0.012675,-0.009066,0.001857,-0.000731,0.039862,-0.030449,-0.0213,0.010835,1.0,0.981506,262.22331,262.096429


In [11]:
news_train_df.head()

Unnamed: 0,time,sourceTimestamp,firstCreated,sourceId,headline,urgency,takeSequence,provider,subjects,audiences,bodySize,companyCount,headlineTag,marketCommentary,sentenceCount,wordCount,assetCodes,assetName,firstMentionSentence,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,sentimentWordCount,noveltyCount12H,noveltyCount24H,noveltyCount3D,noveltyCount5D,noveltyCount7D,volumeCounts12H,volumeCounts24H,volumeCounts3D,volumeCounts5D,volumeCounts7D
7274526,2015-01-01 00:00:04+00:00,2015-01-01 00:00:04+00:00,2015-01-01 00:00:04+00:00,6068f71eba9220bf,Beijing Court Rules in Favor of eLong in Dispu...,3,1,PRN,"{'LAW', 'EMRG', 'ITSE', 'CLJ', 'CYCS', 'SHOP',...","{'CNR', 'PRN'}",8219,3,,False,35,1437,{'LONG.O'},eLong Inc,1,1.0,-1,0.809405,0.129914,0.060681,1403,0,0,0,0,0,0,0,0,0,0
7274527,2015-01-01 00:00:04+00:00,2015-01-01 00:00:04+00:00,2015-01-01 00:00:04+00:00,6068f71eba9220bf,Beijing Court Rules in Favor of eLong in Dispu...,3,1,PRN,"{'LAW', 'EMRG', 'ITSE', 'CLJ', 'CYCS', 'SHOP',...","{'CNR', 'PRN'}",8219,3,,False,35,1437,"{'EXPE.O', 'EXPE.OQ'}",Expedia Inc,24,0.149071,-1,0.819121,0.125242,0.055638,277,0,0,0,0,0,0,0,0,0,0
7274528,2015-01-01 00:23:05+00:00,2015-01-01 00:23:05+00:00,2015-01-01 00:23:05+00:00,4fd4749f9f045e29,SYSTEMAX'S GLOBAL INDUSTRIAL BUSINESS SIGNS DE...,1,1,RTRS,"{'MRG', 'BLR', 'RETE', 'AMERS', 'CYCS', 'SHOP'...","{'E', 'U'}",0,1,,False,1,19,{'SYX.N'},Systemax Inc,1,1.0,1,0.034652,0.12202,0.843328,19,0,0,0,0,0,0,0,0,0,0
7274529,2015-01-01 00:23:07+00:00,2015-01-01 00:23:07+00:00,2015-01-01 00:23:07+00:00,5c9dd7be0e29a5c0,Systemax's Global Industrial Business Signs De...,3,1,PRN,"{'MRG', 'RETE', 'CYCS', 'NEWR', 'SHOP', 'BACT'...","{'CNR', 'PRN'}",4795,1,,False,24,781,{'SYX.N'},Systemax Inc,1,1.0,1,0.09743,0.156117,0.746453,528,1,1,1,1,1,1,1,1,1,1
7274530,2015-01-01 00:30:00+00:00,2015-01-01 00:30:00+00:00,2015-01-01 00:30:00+00:00,2c10e49166893d11,"Brown & Brown, Inc. Announces Sale of Certain ...",3,1,MKW,"{'CMSS', 'MINS', 'INSR', 'BSUP', 'LEN', 'INDS'...","{'CNR', 'MKW'}",4447,1,,False,27,751,{'BRO.N'},Brown & Brown Inc,1,1.0,-1,0.552344,0.248,0.199656,734,0,0,0,0,0,0,0,0,0,0


In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from nltk.corpus import stopwords

column_market = ['returnsClosePrevMktres1','returnsOpenPrevMktres1','returnsClosePrevMktres10', 'returnsOpenPrevMktres10']
column_raw = ['returnsClosePrevRaw1', 'returnsOpenPrevRaw1','returnsClosePrevRaw10', 'returnsOpenPrevRaw10']

news_train_df.headline = np.where(news_train_df.headline.isnull(), "", news_train_df.headline)

vectorizer = CountVectorizer(max_features=1000, stop_words={"english"})

X = vectorizer.fit_transform(news_train_df['headline'].values)
tf_transformer = TfidfTransformer(use_idf=False).fit(X)
X_train_tf = tf_transformer.transform(X)
X_train_vals = X_train_tf.mean(axis=1)

del vectorizer
del X
del X_train_tf

#mean tf-idf score for news article.
d = pd.DataFrame(data=X_train_vals)
news_train_df['tf_score'] = d


In [13]:
# add new features

market_train_df['rolling_average_close_mean'] = market_train_df.groupby('assetCode')['close'].transform('mean')
market_train_df['rolling_average_vol_mean'] = market_train_df.groupby('assetCode')['volume'].transform('mean')
market_train_df['rolling_average_close_std'] = market_train_df.groupby('assetCode')['close'].transform('std')
market_train_df['rolling_average_vol_std'] = market_train_df.groupby('assetCode')['volume'].transform('std')

In [14]:
#some more refined instruments
market_train_df['moving_average_7_day'] = market_train_df.groupby('assetCode')['close'].transform(lambda x: x.rolling(window=7).mean())

# exponential weighted moving average

ewma = pd.Series.ewm
market_train_df['ewma'] =  market_train_df.groupby('assetCode')['close'].transform(lambda x : ewma(x, span=30).mean())
market_train_df['moving_average_7_day'] = market_train_df['moving_average_7_day'].fillna(0)
market_train_df['ewma'] = market_train_df['ewma'].fillna(0)

In [15]:
for i in range(len(column_raw)):
    market_train_df[column_market[i]] = market_train_df[column_market[i]].fillna(market_train_df[column_raw[i]])

column_return = column_market + column_raw + ['returnsOpenNextMktres10']

# remove outliers data from market train dataset
for column in column_return:
    market_train_df = market_train_df.loc[market_train_df[column]>=-2]
    market_train_df = market_train_df.loc[market_train_df[column]<=2]

In [16]:
# remove unusual assets
market_train_df = market_train_df[~market_train_df['assetCode'].isin(['PGN.N','EBRYY.OB'])]

In [17]:
# Function to remove outliers from news train
def remove_outliers(data_frame, column_list, low=0.02, high=0.98):
    for column in column_list:
        this_column = data_frame[column]
        quant_df = this_column.quantile([low,high])
        low_limit = quant_df[low]
        high_limit = quant_df[high]
        data_frame[column] = data_frame[column].clip(lower=low_limit, upper=high_limit)
    return data_frame

In [18]:
news_columns_numerical = ['takeSequence', 'bodySize', 'sentenceCount', 'wordCount', 'sentimentWordCount', 'firstMentionSentence','noveltyCount12H',\
                  'noveltyCount24H', 'noveltyCount3D', 'noveltyCount5D', 'noveltyCount7D', 'volumeCounts12H', 'volumeCounts24H',\
                  'volumeCounts3D','volumeCounts5D','volumeCounts7D']

In [19]:
news_train_df = remove_outliers(news_train_df, news_columns_numerical)

In [20]:
asset_code_dict = {k: v for v, k in enumerate(market_train_df['assetCode'].unique())}
drop_columns = [col for col in news_train_df.columns if col not in ['sourceTimestamp', 'urgency', 'takeSequence', 'bodySize', 'companyCount', 
               'sentenceCount', 'firstMentionSentence', 'relevance','firstCreated', 'assetCodes']]
news_columns = ['firstCreated','relevance','sentimentClass','sentimentNegative','sentimentNeutral',
               'sentimentPositive','noveltyCount24H','noveltyCount7D','volumeCounts24H','volumeCounts7D','assetCodes','sourceTimestamp',
               'assetName','audiences', 'urgency', 'takeSequence', 'bodySize', 'companyCount', 
               'sentenceCount', 'firstMentionSentence','time', 'tf_score']

In [21]:
def data_prep(market_df,news_df):
    market_df['time'] = pd.to_datetime(market_df['time'], errors='coerce')
    news_df['sourceTimestamp'] = pd.to_datetime(news_df['sourceTimestamp'], errors='coerce')
    news_df['firstCreated'] = pd.to_datetime(news_df['firstCreated'], errors='coerce')
    market_df['date'] = market_df.time.dt.date
    market_df['close_to_open'] = market_df['close'] / market_df['open']
    market_df.drop(['time'], axis=1, inplace=True)
    
    news_df = news_df[news_columns]
    news_df['sourceTimestamp']= news_df.sourceTimestamp.dt.hour
    news_df['firstCreated'] = news_df.firstCreated.dt.date
    news_df['assetCodesLen'] = news_df['assetCodes'].map(lambda x: len(eval(x)))
    news_df['assetCodes'] = news_df['assetCodes'].map(lambda x: list(eval(x))[0])
    news_df['asset_sentiment_count'] = news_df.groupby(['assetName', 'sentimentClass'])['time'].transform('count')
    news_df['len_audiences'] = news_train_df['audiences'].map(lambda x: len(eval(x)))
    kcol = ['firstCreated', 'assetCodes']
    news_df = news_df.groupby(kcol, as_index=False).mean()
    market_df = pd.merge(market_df, news_df, how='left', left_on=['date', 'assetCode'], 
                            right_on=['firstCreated', 'assetCodes'])
    del news_df
#     market_df['assetCodeT'] = market_df['assetCode'].map(asset_code_dict)
    market_df = market_df.drop(columns = ['firstCreated','assetCodes','assetName']).fillna(0) 
#     print(market_df.count)
    return market_df

In [22]:
market_train_df = data_prep(market_train_df, news_train_df)
market_train_df = market_train_df.loc[market_train_df['date']>=start_date]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try usin

In [23]:
market_train_df.describe().round(3)

Unnamed: 0,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe,close_to_open,assetName_mean_open,assetName_mean_close,rolling_average_close_mean,rolling_average_vol_mean,rolling_average_close_std,rolling_average_vol_std,moving_average_7_day,ewma,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,noveltyCount24H,noveltyCount7D,volumeCounts24H,volumeCounts7D,sourceTimestamp,urgency,takeSequence,bodySize,companyCount,sentenceCount,firstMentionSentence,tf_score,assetCodesLen,asset_sentiment_count,len_audiences
count,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0,892025.0
mean,2498488.0,48.487,48.478,0.0,0.0,0.0,0.0,0.004,0.004,-0.001,-0.001,-0.001,0.582,1.0,40.572,40.574,48.488,2498731.0,6.968,1653296.0,47.711,48.437,0.157,0.039,0.053,0.07,0.082,0.122,0.248,0.714,2.895,2.961,0.501,0.335,843.302,1.226,4.976,0.914,0.0,0.282,89.903,0.777
std,5594024.0,57.967,57.962,0.025,0.024,0.022,0.023,0.077,0.076,0.069,0.069,0.07,0.493,0.02,37.68,37.681,56.729,4602212.0,9.682,2726267.0,57.82,57.556,0.344,0.328,0.149,0.163,0.19,0.552,2.005,3.067,11.278,6.175,1.036,1.06,2818.028,5.037,15.273,3.851,0.0,0.645,334.819,2.041
min,0.0,0.461,0.462,-0.978,-0.862,-1.236,-0.773,-0.977,-0.829,-1.649,-1.225,-1.067,0.0,0.508,2.067,2.063,1.121,20116.52,0.0,0.0,0.0,0.642,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,502345.0,19.08,19.08,-0.01,-0.01,-0.008,-0.009,-0.033,-0.033,-0.03,-0.031,-0.03,0.0,0.991,20.552,20.546,19.894,636158.8,2.623,437264.4,18.38,19.213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1026280.0,35.05,35.05,0.0,0.0,0.0,0.0,0.003,0.003,0.0,0.0,0.0,1.0,1.0,33.35,33.312,35.588,1156671.0,4.842,851825.2,34.481,35.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,2365369.0,60.64,60.61,0.011,0.011,0.008,0.009,0.038,0.038,0.029,0.029,0.029,1.0,1.009,50.678,50.704,60.582,2441640.0,8.276,1726491.0,60.021,60.538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,431332600.0,1578.13,1584.44,1.921,1.291,1.898,1.208,1.849,1.728,1.861,1.871,1.952,1.0,1.951,753.938,753.69,1283.297,96454470.0,190.417,45074210.0,1534.836,1511.469,1.0,1.0,0.82,0.928,0.857,14.0,76.0,54.0,186.0,23.0,3.0,15.0,36663.62,43.0,148.0,38.0,0.0,7.0,7027.0,25.0


In [24]:
cat_cols = ['assetCodeT']
num_cols = ['volume', 'close', 'open', 'returnsClosePrevRaw1', 'returnsOpenPrevRaw1', 'returnsClosePrevMktres1', 'returnsOpenPrevMktres1', 'returnsClosePrevRaw10', 'returnsOpenPrevRaw10', 
               'returnsClosePrevMktres10', 'returnsOpenPrevMktres10', 'close_to_open', 'rolling_average_close_mean', 'rolling_average_vol_mean', 'rolling_average_close_std', 'ewma', 'rolling_average_close_std', 'sourceTimestamp', 'urgency', 'companyCount', 'takeSequence', 'bodySize', 'sentenceCount',
               'moving_average_7_day','relevance', 'sentimentClass', 'sentimentNegative', 'sentimentNeutral', 'sentimentPositive',
               'noveltyCount24H','noveltyCount7D','volumeCounts24H','volumeCounts7D','assetCodesLen', 'asset_sentiment_count', 'len_audiences', 'tf_score']
scale_cols = ['volume', 'close', 'open','assetName_mean_open', 'assetName_mean_close', 'rolling_average_close_mean', 'rolling_average_vol_mean', 'rolling_average_close_std', 'rolling_average_vol_std', 'moving_average_7_day', 'ewma']
scale_cols_min_max=['noveltyCount24H', 'noveltyCount7D', 'volumeCounts24H',
       'volumeCounts7D', 'sourceTimestamp', 'takeSequence',
       'bodySize', 'companyCount', 'sentenceCount', 'firstMentionSentence', 'assetCodesLen', 'asset_sentiment_count', 'len_audiences']

In [25]:
from sklearn.model_selection import train_test_split
train_indices, val_indices = train_test_split(market_train_df.index.values,
                                              test_size=0.25,
                                              random_state=42,
                                              shuffle=False)

In [26]:
# market_train_df.columns

In [27]:
def encode(encoder, x):
    len_encoder = len(encoder)
    try:
        id = encoder[x]
    except KeyError:
        id = len_encoder
    return id

encoders = [{} for cat in cat_cols]

market_train_df['assetCodeT'] = market_train_df['assetCode'].astype(str).apply(lambda x: encode(encoders[0], x))
embed_sizes = [len(encoder) + 1 for encoder in encoders] #+1 for possible unknown assets

In [28]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
 
market_train_df[num_cols] = market_train_df[num_cols].fillna(0)
# print('scaling numerical columns')

#col_mean = market_train[col].mean()
#market_train[col].fillna(col_mean, inplace=True)
scaler = StandardScaler()
scalerMinMax= MinMaxScaler()
market_train_df[scale_cols] = scaler.fit_transform(market_train_df[scale_cols])
market_train_df[scale_cols_min_max] = scalerMinMax.fit_transform(market_train_df[scale_cols_min_max])


In [29]:
market_train_df.head()

Unnamed: 0,assetCode,volume,close,open,returnsClosePrevRaw1,returnsOpenPrevRaw1,returnsClosePrevMktres1,returnsOpenPrevMktres1,returnsClosePrevRaw10,returnsOpenPrevRaw10,returnsClosePrevMktres10,returnsOpenPrevMktres10,returnsOpenNextMktres10,universe,close_to_open,assetName_mean_open,assetName_mean_close,rolling_average_close_mean,rolling_average_vol_mean,rolling_average_close_std,rolling_average_vol_std,moving_average_7_day,ewma,date,relevance,sentimentClass,sentimentNegative,sentimentNeutral,sentimentPositive,noveltyCount24H,noveltyCount7D,volumeCounts24H,volumeCounts7D,sourceTimestamp,urgency,takeSequence,bodySize,companyCount,sentenceCount,firstMentionSentence,tf_score,assetCodesLen,asset_sentiment_count,len_audiences,assetCodeT
0,A.N,-0.172986,-0.136753,-0.125906,-0.006856,-0.002638,-0.006383,0.01427,0.022619,0.076704,-0.01511,0.000847,-0.024209,1.0,0.984944,-0.052661,-0.052197,-0.123425,-0.038399,-0.348995,-0.189136,-0.825158,-0.136864,2015-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,AAL.O,1.476258,0.09355,0.100103,0.005221,0.00705,0.005505,0.016444,0.104713,0.124741,0.096103,0.110196,-0.039886,1.0,0.993183,-0.003938,-0.003738,-0.120487,1.708247,-0.110472,2.13589,-0.825158,0.095085,2015-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,AAN.N,-0.285901,-0.30823,-0.304816,0.001636,0.0013,0.001943,0.005414,0.086586,0.10351,0.039357,0.020739,0.004615,0.0,0.993833,-0.3336,-0.333419,-0.350614,-0.386616,-0.147188,-0.380579,-0.825158,-0.309565,2015-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,AAP.N,-0.35547,1.898883,1.938712,-0.00452,0.001307,-0.004355,0.007037,-0.005831,0.023083,0.016424,0.065064,-0.042488,0.0,0.985763,1.159847,1.16048,1.958073,-0.315394,0.636465,-0.263731,-0.825158,1.913318,2015-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,AAPL.O,9.064345,1.049609,1.085398,-0.009513,-0.012675,-0.009066,0.001857,-0.000731,0.039862,-0.030449,-0.0213,0.010835,1.0,0.981506,5.882467,5.878932,1.125243,9.267164,0.403576,6.867965,-0.825158,1.057975,2015-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [31]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Concatenate, Flatten, BatchNormalization, Dropout
from keras.losses import binary_crossentropy
import keras

categorical_inputs = []
for cat in cat_cols:
    categorical_inputs.append(Input(shape=[1], name=cat))

categorical_embeddings = []
for i, cat in enumerate(cat_cols):
    categorical_embeddings.append(Embedding(embed_sizes[i], 10)(categorical_inputs[i]))

#categorical_logits = Concatenate()([Flatten()(cat_emb) for cat_emb in categorical_embeddings])
categorical_logits = Flatten()(categorical_embeddings[0])
categorical_logits = Dense(32,activation='relu')(categorical_logits)
categorical_logits = Dropout(0.5)(categorical_logits)
categorical_logits = BatchNormalization()(categorical_logits)
categorical_logits = Dense(32,activation='relu')(categorical_logits)

numerical_inputs = Input(shape=(len(num_cols),), name='num')
numerical_logits = numerical_inputs
numerical_logits = BatchNormalization()(numerical_logits)
numerical_logits = Dense(128,activation='relu')(numerical_logits)

numerical_logits = Dropout(0.5)(numerical_logits)
numerical_logits = BatchNormalization()(numerical_logits)
numerical_logits = Dense(128,activation='relu')(numerical_logits)
numerical_logits = Dense(64,activation='relu')(numerical_logits)

logits = Concatenate()([numerical_logits,categorical_logits])
logits = Dense(64,activation='relu')(logits)
out = Dense(1, activation='sigmoid')(logits)

METRICS = [
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

model = Model(inputs = categorical_inputs + [numerical_inputs], outputs=out)
model.compile(optimizer='adam',loss=binary_crossentropy, metrics=METRICS)

In [32]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
num (InputLayer)                [(None, 37)]         0                                            
__________________________________________________________________________________________________
assetCodeT (InputLayer)         [(None, 1)]          0                                            
__________________________________________________________________________________________________
batch_normalization_4 (BatchNor (None, 37)           148         num[0][0]                        
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 10)        10          assetCodeT[0][0]                 
_______________________________________________________________________________________

In [33]:
# from keras.utils import plot_model
# plot_model(model, to_file='model.png')

In [34]:
def get_input(market_train, indices):
    X_num = market_train.loc[indices, num_cols].values
    X = {'num':X_num}
    for cat in cat_cols:
        X[cat] = market_train.loc[indices, cat_cols].values
    y = (market_train.loc[indices,'returnsOpenNextMktres10'] >= 0).values

    return X,y

X_train,y_train= get_input(market_train_df, train_indices)
X_valid,y_valid= get_input(market_train_df, val_indices)

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

num_epochs=20

check_point = ModelCheckpoint('model.hdf5',verbose=True, save_best_only=True)
early_stop = EarlyStopping(monitor='val_auc', patience=5,verbose=True)
history=model.fit(X_train,y_train.astype(int),
          validation_data=(X_valid,y_valid.astype(int)),
          epochs=num_epochs,
          verbose=True,
          callbacks=[early_stop,check_point]) 

In [84]:
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
def plot_loss(history, label, n):
  # Use a log scale to show the wide range of values.
  plt.semilogy(history.epoch,  history.history['loss'],
               color=colors[n], label='Train '+label)
  plt.semilogy(history.epoch,  history.history['val_loss'],
          color=colors[n], label='Val '+label,
          linestyle="--")
  plt.xlabel('Epoch')
  plt.ylabel('Loss')
  
  plt.legend()

In [None]:
plot_loss(history, "Loss", 0)

In [86]:
def plot_metrics(history):
  metrics =  ['loss', 'auc', 'precision', 'recall']
  plt.figure(figsize=(15, 15))
  for n, metric in enumerate(metrics):
    name = metric.replace("_"," ").capitalize()
    plt.subplot(2,2,n+1)
    plt.plot(history.epoch,  history.history[metric], color=colors[0], label='Train')
    plt.plot(history.epoch, history.history['val_'+metric],
             color=colors[0], linestyle="--", label='Val')
    plt.xlabel('Epoch')
    plt.ylabel(name)
    if metric == 'loss':
      plt.ylim([0, plt.ylim()[1]])
    elif metric == 'auc':
      plt.ylim([0.8,1])
    else:
      plt.ylim([0,1])

    plt.legend()


In [None]:
plot_metrics(history)