In [1]:
# What version of Python do you have?
import sys

import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf
import seaborn as sns
import math
import matplotlib.pyplot as plt
import numpy as np

print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tensorflow.keras.__version__}")
print()
print(f"Python {sys.version}")
print(f"Pandas {pd.__version__}")
print(f"Scikit-Learn {sk.__version__}")
print("GPU is", "available" if tf.test.is_gpu_available() else "NOT AVAILABLE")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Tensor Flow Version: 1.13.1
Keras Version: 2.2.4-tf

Python 3.6.10 |Anaconda, Inc.| (default, Jan  7 2020, 15:01:53) 
[GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)]
Pandas 1.0.1
Scikit-Learn 0.22.1
GPU is NOT AVAILABLE


In [2]:
stock_data_df = pd.read_csv('GOOG.csv')

In [3]:
stock_data_df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2019-02-06,1139.569946,1147.0,1112.77002,1115.22998,1115.22998,2105600
1,2019-02-07,1104.160034,1104.839966,1086.0,1098.709961,1098.709961,2044800
2,2019-02-08,1087.0,1098.910034,1086.550049,1095.060059,1095.060059,1075800
3,2019-02-11,1096.949951,1105.944946,1092.859985,1095.01001,1095.01001,1065200
4,2019-02-12,1106.800049,1125.295044,1105.849976,1121.369995,1121.369995,1609100


In [4]:
dates = stock_data_df.loc[:, 'Date'].to_list()
close_prices = stock_data_df.loc[:, 'Adj Close'].to_list()
volumes = stock_data_df.loc[:, 'Volume'].to_list()

- stock price feature fields:
    - Date
    - Past N price
    - Past N - 1 price
    - ...
    - Past 1 price
    - Label

In [5]:
time_window = 5

In [6]:
def stock_price_feature_label_generator(time_window = 3):
    if time_window <= 0: 
        raise ValueError("argument time_window has to be at least 1")
    # use "time_window" previous days to predict the movement of the next day
    label = [-1] + [1 if close_prices[i] > close_prices[i - 1] else 0 for i in range(1, len(close_prices))]
    result = pd.DataFrame(dates[time_window: ], columns = ['Date'])
    result.insert(loc=result.shape[1], column="Label", value=label[time_window: ])
    for i in range(time_window):
        result.insert(loc=result.shape[1], column="Past " + str(time_window - i) + " Day Adj. Price", value=np.divide(close_prices[i: len(close_prices) - time_window + i], volumes[i: len(volumes) - time_window + i]))
#     for i in range(time_window):
#         result.insert(loc=result.shape[1], column="Past " + str(time_window - i) + " Day Price", value=close_prices[i: len(close_prices) - time_window + i])
#     for i in range(time_window):
#         result.insert(loc=result.shape[1], column="Past " + str(time_window - i) + " Volume", value=volumes[i: len(volumes) - time_window + i])
    return result

In [7]:
stock_price_feature = stock_price_feature_label_generator(time_window)

In [8]:
stock_price_feature.head()

Unnamed: 0,Date,Label,Past 5 Day Adj. Price,Past 4 Day Adj. Price,Past 3 Day Adj. Price,Past 2 Day Adj. Price,Past 1 Day Adj. Price
0,2019-02-13,0,0.00053,0.000537,0.001018,0.001028,0.000697
1,2019-02-14,1,0.000537,0.001018,0.001028,0.000697,0.001067
2,2019-02-15,0,0.001018,0.001028,0.000697,0.001067,0.001184
3,2019-02-19,1,0.001028,0.000697,0.001067,0.001184,0.000768
4,2019-02-20,0,0.000697,0.001067,0.001184,0.000768,0.001069


In [9]:
only_date_and_label = stock_price_feature_label_generator(1).drop(columns=['Past 1 Day Adj. Price'])

In [10]:
only_date_and_label.head()

Unnamed: 0,Date,Label
0,2019-02-07,0
1,2019-02-08,0
2,2019-02-11,0
3,2019-02-12,1
4,2019-02-13,0


In [11]:
aggregated_vector = pd.read_csv("aggregated_vector.csv")

In [12]:
aggregated_vector.shape

(252, 769)

In [13]:
only_date_and_label.shape

(252, 2)

In [14]:
merged_feature = only_date_and_label.merge(aggregated_vector, left_on="Date", right_on="Date")

In [15]:
# merged_feature = merged_feature.drop(columns=['date'])

In [16]:
merged_feature

Unnamed: 0,Date,Label,1,2,3,4,5,6,7,8,...,759,760,761,762,763,764,765,766,767,768
0,2019-02-07,0,17.211354,-14.679751,27.988252,3.515003,26.570214,-13.947233,-11.697830,48.063252,...,-10.356726,-4.289625,-10.100713,-15.202136,20.286930,6.326298,-13.710997,-37.457773,16.636177,4.240560
1,2019-02-08,0,11.186388,-14.228990,26.126609,4.564929,24.467116,-12.933068,-6.613235,39.696121,...,-12.043655,-7.866660,-0.414103,-12.288814,17.319789,6.287600,-11.422505,-32.802382,12.908284,-3.924432
2,2019-02-11,0,13.118442,-12.224762,26.989365,2.719244,20.817989,-9.563910,-7.055755,31.220197,...,-9.838323,-7.374060,-3.426144,-12.011912,11.048870,3.953444,-14.388462,-29.295217,4.891344,2.466519
3,2019-02-12,1,9.961360,-12.430817,24.998682,2.622114,21.161296,-8.490503,-8.924555,37.294316,...,-8.735921,-7.951745,-4.345756,-12.481351,14.280758,5.225553,-14.239932,-32.048363,11.930510,4.896486
4,2019-02-13,0,15.897544,-14.129895,25.914932,1.158380,22.502810,-9.342260,-9.297494,40.477387,...,-9.066148,-10.759548,-0.583340,-12.176388,13.879866,9.654405,-16.496565,-34.907676,15.768451,0.615699
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
169,2019-10-09,1,10.326575,-9.379911,20.348158,6.365201,14.038260,-12.503391,-7.831584,27.102595,...,-10.757020,-6.306535,-0.500497,-8.352474,4.358695,13.351989,-12.610681,-26.521644,8.661474,3.669245
170,2019-10-10,1,10.507777,-4.282798,14.231531,-0.014126,18.146721,-9.677600,-6.485498,32.389499,...,-4.980697,-7.328956,-2.007536,-12.737934,8.755188,6.677572,-8.784149,-26.220269,5.960826,8.265365
171,2019-10-11,1,7.815836,-5.748727,15.217927,1.607790,16.241403,-7.085594,-1.437758,25.141365,...,-7.311884,-5.409687,-3.316850,-6.228812,2.697748,4.877625,-8.658932,-24.295523,4.227979,2.667156
172,2019-10-14,1,9.693059,-15.852689,23.414626,6.257771,20.459513,-8.905883,-9.631196,29.036693,...,-10.496261,-4.735993,1.823090,-7.482430,11.344104,7.978308,-8.683424,-25.375676,10.909450,-1.212275


In [17]:
vector_by_day = merged_feature.iloc[:, 2:].to_numpy()

In [18]:
feature_by_date = pd.DataFrame(columns=["Date"] + [i for i in range(1, 769)])

for i in range(len(vector_by_day)-1, time_window, -1):
    prev_N_days_sum_vector = np.array([0 for i in range(768)])
    
    # sum up
    for delta in range(1, 1 + time_window):
        prev_N_days_sum_vector = np.add(prev_N_days_sum_vector, vector_by_day[i - delta])
    
    # normalize
#     prev_N_days_sum_vector = prev_N_days_sum_vector / np.linalg.norm(prev_N_days_sum_vector)
    
    # add date
    feature_with_date = np.concatenate((np.array([merged_feature.iloc[i, 0]]), prev_N_days_sum_vector))
    new_row_df = pd.DataFrame(feature_with_date).transpose()
    new_row_df = new_row_df.rename(columns={0: "Date"})
    
    # insert row
    feature_by_date = pd.concat([new_row_df, feature_by_date], ignore_index=True)

In [19]:
feature_by_date

Unnamed: 0,Date,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,2019-02-15,59.70356893259891,-61.52532985795775,128.7524380393326,10.72620532260901,109.21523956473315,-49.89833701467155,-39.82203888299963,180.08162943532486,-50.77982291487656,...,-48.49003637824885,-42.930358123754246,-9.714612862462761,-58.870272789654486,64.43961479611207,31.399635857760977,-72.25065956612326,-158.8536510689989,54.68656310783487,4.726658546628409
1,2019-02-19,60.4778927495503,-52.24951422552871,121.53873941357688,5.808996263491121,98.41117123315354,-45.194109954939265,-33.7944772277507,168.3838743763407,-46.987997783646634,...,-42.16977991730018,-42.012798018439796,-11.335003256241826,-56.31169646926924,51.57124365238615,27.39545836485984,-72.19214251884779,-149.63098246910235,48.17101311944086,10.426026679658813
2,2019-02-20,60.97424933078876,-46.6641100782843,118.41118007371784,4.6800204194215915,95.7414108318797,-48.26249963901273,-34.2602082426425,176.36028472313956,-48.02134841682994,...,-44.15507097295864,-41.034137623725584,-9.490360804361668,-58.51788060966808,47.85059298911703,30.43350367050189,-69.97398591468271,-153.51281473384864,55.48327333203771,14.696228843848282
3,2019-02-21,63.530796485443275,-37.4238982795237,118.15739838616622,3.9052243598531704,93.45190073286645,-50.35254969318056,-26.419113454995514,177.56158748802676,-46.12076283627734,...,-41.88707961601753,-47.05707299832517,-5.958960415953859,-57.79352150020145,42.217428238257746,31.967181858307,-67.88528670292645,-151.28978327325055,54.30655085313564,16.412012077954742
4,2019-02-22,59.47994298254118,-43.58393627779006,141.60884666515935,10.809945462065638,105.74973955766474,-54.99128954118892,-31.707383059359955,193.7185441520624,-52.84393366610128,...,-50.10348664787909,-40.8266447923718,-5.830571997146379,-72.17521463518523,54.61309616410935,34.038297958304696,-71.86497417636674,-166.679824628392,57.103674032998754,18.376393728729763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,2019-10-09,59.956002785080535,-49.47679803866909,110.41262836770791,17.26455686494384,86.51051834235754,-49.797326138514386,-19.896521547818494,167.50174615627787,-57.69468908090879,...,-50.18798171575315,-30.484846958282585,-11.126528636115424,-49.932728407895254,45.45102228468164,36.964729221244376,-70.00491078468808,-142.6537581088549,30.720913444446758,9.136352568937141
164,2019-10-10,60.11086475937982,-49.51666470652834,110.57681140229384,20.096209072165976,86.43531160669716,-54.62307545756528,-23.427617073519514,161.39390280795035,-51.25643824474618,...,-49.962692210369994,-29.728404147747312,-11.52494459975622,-47.845426498892635,40.70545694133452,43.24729340253488,-68.08398793519684,-142.97757398015676,29.982986831914346,10.229696350510302
165,2019-10-11,55.654250795385536,-40.604333723849614,97.282508277611,14.910596670447966,83.99477787232614,-52.062623624682985,-24.615666206099757,156.65023876096936,-42.46887153233308,...,-40.86306293253013,-29.270668461433793,-6.425855490837108,-48.77000608416861,37.24881817969715,44.11918796086655,-58.488026064444654,-132.3865208716972,32.92951777859496,19.08106026118631
166,2019-10-14,50.855605645743246,-35.53489529378136,91.80198251043775,11.433813454731453,81.33719329960056,-49.50576193053476,-22.083609430256136,151.1216731439632,-40.60974056548231,...,-38.92066667042992,-30.483461231398103,-8.404530546067955,-46.69126327019633,32.514805689982694,40.21299660944998,-56.263475041192905,-129.89435207815342,33.239069245510855,21.928938968419224


- fusion with prev price and volume

In [20]:
fused_feature = stock_price_feature.merge(feature_by_date, left_on="Date", right_on="Date")

- check imbalance

In [21]:
np.count_nonzero(fused_feature.Label.to_list()) / len(fused_feature)

0.5416666666666666

In [22]:
fused_feature.head()

Unnamed: 0,Date,Label,Past 5 Day Adj. Price,Past 4 Day Adj. Price,Past 3 Day Adj. Price,Past 2 Day Adj. Price,Past 1 Day Adj. Price,1,2,3,...,759,760,761,762,763,764,765,766,767,768
0,2019-02-15,0,0.001018,0.001028,0.000697,0.001067,0.001184,59.70356893259891,-61.52532985795775,128.7524380393326,...,-48.49003637824885,-42.930358123754246,-9.71461286246276,-58.870272789654486,64.43961479611207,31.399635857760977,-72.25065956612326,-158.8536510689989,54.68656310783487,4.726658546628409
1,2019-02-19,1,0.001028,0.000697,0.001067,0.001184,0.000768,60.4778927495503,-52.24951422552871,121.53873941357688,...,-42.16977991730018,-42.0127980184398,-11.335003256241826,-56.31169646926924,51.57124365238615,27.39545836485984,-72.19214251884779,-149.63098246910235,48.17101311944086,10.426026679658811
2,2019-02-20,0,0.000697,0.001067,0.001184,0.000768,0.001069,60.97424933078876,-46.6641100782843,118.41118007371784,...,-44.15507097295864,-41.034137623725584,-9.490360804361668,-58.51788060966808,47.85059298911703,30.43350367050189,-69.97398591468271,-153.51281473384864,55.48327333203771,14.696228843848282
3,2019-02-21,0,0.001067,0.001184,0.000768,0.001069,0.001024,63.53079648544328,-37.4238982795237,118.15739838616622,...,-41.88707961601753,-47.05707299832517,-5.958960415953859,-57.79352150020145,42.21742823825775,31.967181858307,-67.88528670292645,-151.28978327325055,54.30655085313564,16.412012077954742
4,2019-02-22,1,0.001184,0.000768,0.001069,0.001024,0.000775,59.47994298254118,-43.58393627779006,141.60884666515935,...,-50.10348664787909,-40.8266447923718,-5.830571997146379,-72.17521463518523,54.61309616410935,34.038297958304696,-71.86497417636674,-166.679824628392,57.103674032998754,18.376393728729763


- import ML lib

In [23]:
# from bert_serving.client import BertClient
import math
from string import punctuation
import matplotlib.pyplot as plt
import re
import os
# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# import warnings
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.externals import joblib
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV,train_test_split
from sklearn.svm import SVC
import xgboost as xgb



In [24]:
X = fused_feature.iloc[:, 2:].to_numpy()
y = fused_feature.Label.to_list()

- train test split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
'''def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['label'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['label'], eval_metric='auc')
    
    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy_score(dtrain['label'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % roc_auc_score(dtrain['label'], dtrain_predprob))
    
#     feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
#     feat_imp.plot(kind='bar', title='Feature Importances')
#     plt.ylabel('Feature Importance Score')'''

'def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):\n    \n    if useTrainCV:\n        xgb_param = alg.get_xgb_params()\n        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[\'label\'].values)\n        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()[\'n_estimators\'], nfold=cv_folds,\n            metrics=\'auc\', early_stopping_rounds=early_stopping_rounds, verbose_eval=False)\n        alg.set_params(n_estimators=cvresult.shape[0])\n    \n    # Fit the algorithm on the data\n    alg.fit(dtrain[predictors], dtrain[\'label\'], eval_metric=\'auc\')\n    \n    # Predict training set:\n    dtrain_predictions = alg.predict(dtrain[predictors])\n    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]\n    \n    # Print model report:\n    print("\nModel Report")\n    print("Accuracy : %.4g" % accuracy_score(dtrain[\'label\'].values, dtrain_predictions))\n    print("AUC Score (Train): %f" % roc_auc_score(

In [27]:
xgb_model = xgb.XGBClassifier(learning_rate=0.03,
                                n_estimators=600,
                                max_depth=7,
                                min_child_weight=5,
                                gamma=0,
                                subsample=0.8,
                                colsample_bytree=0.8,
                                objective='binary:logistic',
                                nthread=4,
                                scale_pos_weight=1,
                                seed=27)
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.8, gamma=0,
              learning_rate=0.03, max_delta_step=0, max_depth=7,
              min_child_weight=5, missing=None, n_estimators=600, n_jobs=1,
              nthread=4, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=27,
              silent=None, subsample=0.8, verbosity=1)

In [28]:
y_pred = xgb_model.predict(X_test)
y_pred_prob = xgb_model.predict_proba(X_test)[:,1]

In [29]:
print("\nModel Report")
print("Accuracy : %.4g" % accuracy_score(y_test, y_pred))
print("AUC Score (Train): %f" % roc_auc_score(y_test, y_pred_prob))


Model Report
Accuracy : 0.5179
AUC Score (Train): 0.544061
