In [None]:
!nvidia-smi

Mon Sep 14 14:38:42 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8     9W /  70W |      0MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [1]:
import ta
import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

import tensorflow as tf
print(f'TensorFlow version: {tf.__version__}')

TensorFlow version: 2.3.0


In [None]:
!wget https://raw.githubusercontent.com/dksifoua/Stock-Market-Prediction/master/data/2019_AAPL_1min.csv

--2020-09-14 14:31:21--  https://raw.githubusercontent.com/dksifoua/Stock-Market-Prediction/master/data/2019_AAPL_1min.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6092763 (5.8M) [text/plain]
Saving to: ‘2019_AAPL_1min.csv.1’


2020-09-14 14:31:22 (34.4 MB/s) - ‘2019_AAPL_1min.csv.1’ saved [6092763/6092763]



In [None]:
df = pd.read_csv('./2019_AAPL_1min.csv', header=0, index_col=0)
df.index = pd.to_datetime(df.index).tz_localize(None).to_period('T')
df = df.drop(['open', 'high', 'low'], axis=1)
print(df.shape)
df.head()

(101081, 2)


Unnamed: 0_level_0,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-02 14:30,154.78,3223.0
2019-01-02 14:31,155.325,1674.0
2019-01-02 14:32,154.85,3153.0
2019-01-02 14:33,154.6,5104.0
2019-01-02 14:34,154.76,2948.0


# Data processing

**Add targets**

The target to be predicted in the $i^{th}$ day is calculated as follows:

$$target_i = sign(P_{i+d} - P_i)$$

Where $d$ is the number of minutes/days after which the prediction is to be made

In [None]:
df['label'] = df.close.shift(-1) - df.close
df.label = df.label.apply(lambda x: 0 if x < 0 else 1)
df.head()

Unnamed: 0_level_0,close,volume,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-02 14:30,154.78,3223.0,1
2019-01-02 14:31,155.325,1674.0,0
2019-01-02 14:32,154.85,3153.0,0
2019-01-02 14:33,154.6,5104.0,1
2019-01-02 14:34,154.76,2948.0,0


**Technical indicators**

In [None]:
# Momentum indicators
df['roc'] = ta.momentum.roc(close=df.close) # Rate of Change (ROC)
df['rsi'] = ta.momentum.rsi(close=df.close) # Relative Strength Index (RSI)
df['tsi'] = ta.momentum.tsi(close=df.close) # True strength index (TSI)

# Volatility indicators
bb_indicator = ta.volatility.BollingerBands(close=df.close)
df['bb_bbhi'] = bb_indicator.bollinger_hband_indicator() # Bollinger Band high indicator
df['bb_bbli'] = bb_indicator.bollinger_lband_indicator() # Bollinger Band low indicator

# Trend indicators
aroon_indicator = ta.trend.AroonIndicator(close=df.close)
macd_indicator = ta.trend.MACD(close=df.close)
kst_indicator = ta.trend.KSTIndicator(close=df.close)
df['aroon_down'] = aroon_indicator.aroon_down() # Aroon Down Channel
df['aroon'] = aroon_indicator.aroon_indicator() # Aroon Indicator
df['aroon_up'] = aroon_indicator.aroon_up() # Aroon Up Channel
df['macd_line'] = macd_indicator.macd() # MACD Line
df['macd_hist'] = macd_indicator.macd_diff() # MACD Histogram
df['macd_signal'] = macd_indicator.macd_signal() # MACD Signal Line
df['kst'] = kst_indicator.kst() # Know Sure Thing (KST)
df['kst_diff'] = kst_indicator.kst_diff() # Diff Know Sure Thing (KST)
df['kst_signal'] = kst_indicator.kst_sig() # Signal Line Know Sure Thing (KST)
df['dpo'] = ta.trend.dpo(close=df.close) # Detrended Price Oscillator (DPO)
df['trix'] = ta.trend.trix(close=df.close) # Trix (TRIX)
df['sma_10'] = ta.trend.sma_indicator(close=df.close, n=10) # SMA n=10
df['sma_20'] = ta.trend.sma_indicator(close=df.close, n=20) # SMA n=20
df['sma_30'] = ta.trend.sma_indicator(close=df.close, n=30) # SMA n=30
df['sma_60'] = ta.trend.sma_indicator(close=df.close, n=60) # SMA n=60
df['ema_10'] = ta.trend.sma_indicator(close=df.close, n=10) # EMA n=10
df['ema_20'] = ta.trend.sma_indicator(close=df.close, n=20) # EMA n=20
df['ema_30'] = ta.trend.sma_indicator(close=df.close, n=30) # EMA n=30
df['ema_60'] = ta.trend.sma_indicator(close=df.close, n=60) # EMA n=60

# Volume indicators
df['obv'] = ta.volume.on_balance_volume(close=df.close, volume=df.volume) # On Balance Volume (OBV)
df['vpt'] = ta.volume.volume_price_trend(close=df.close, volume=df.volume) # Volume-price trend (VPT)
df['fi'] = ta.volume.force_index(close=df.close, volume=df.volume) # Force Index (FI)
df['nvi'] = ta.volume.negative_volume_index(close=df.close, volume=df.volume) # Negative Volume Index (NVI)

df.tail()

Unnamed: 0_level_0,close,volume,label,roc,rsi,tsi,bb_bbhi,bb_bbli,aroon_down,aroon,aroon_up,macd_line,macd_hist,macd_signal,kst,kst_diff,kst_signal,dpo,trix,sma_10,sma_20,sma_30,sma_60,ema_10,ema_20,ema_30,ema_60,obv,vpt,fi,nvi
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
2019-12-31 20:55,293.59,6260.0,0,0.080108,66.452076,12.884774,1.0,0.0,20.0,80.0,100.0,0.065703,0.004158,0.061544,0.65673,-0.103265,0.759995,0.24075,0.002978,293.249,293.23425,293.192,292.980833,293.249,293.23425,293.192,292.980833,8353065.0,7.131644,271.749266,1194.559824
2019-12-31 20:56,293.445,7623.0,0,-0.010222,59.379185,14.00982,0.0,0.0,16.0,80.0,96.0,0.075545,0.0112,0.064344,0.648568,-0.088936,0.737504,0.18625,0.003056,293.2515,293.25375,293.201,292.999083,293.2515,293.25375,293.201,292.999083,8345442.0,0.715985,75.022942,1194.559824
2019-12-31 20:57,293.37,5036.0,1,-0.023855,56.055755,13.635937,0.0,0.0,12.0,80.0,92.0,0.076412,0.009654,0.066758,0.619311,-0.091696,0.711008,0.15375,0.003134,293.2525,293.26625,293.206667,293.017167,293.2525,293.26625,293.206667,293.017167,8340406.0,-5.052017,10.348236,1194.254513
2019-12-31 20:58,293.45,5485.0,1,0.010224,58.710399,14.289597,0.0,0.0,8.0,80.0,88.0,0.082602,0.012676,0.069927,0.626898,-0.057242,0.68414,0.0745,0.003255,293.2705,293.2855,293.214,293.036583,293.2705,293.2855,293.214,293.036583,8345891.0,0.208598,71.555631,1194.254513
2019-12-31 20:59,293.62,4139.0,1,0.088628,63.725194,16.667267,0.0,0.0,4.0,96.0,100.0,0.100072,0.024116,0.075956,0.663379,0.000775,0.662605,-0.04275,0.003506,293.31,293.31275,293.229,293.058167,293.31,293.31275,293.229,293.058167,8350030.0,3.893507,161.851969,1194.946362


**Datetime cyclical encoding**

$$x_{sin} = \sin(\frac{2*\pi*x}{max(x)})$$

$$x_{cos} = \cos(\frac{2*\pi*x}{max(x)})$$

In [None]:
df['datetime'] = df.index.to_timestamp()
df['min_sin'] = np.sin(2 * np.pi * df.datetime.dt.minute / 60)
df['min_cos'] = np.cos(2 * np.pi * df.datetime.dt.minute / 60)
df['hour_sin'] = np.sin(2 * np.pi * df.datetime.dt.hour / 60)
df['hour_cos'] = np.cos(2 * np.pi * df.datetime.dt.hour / 60)
df['day_sin'] = np.sin(2 * np.pi * df.datetime.dt.day / 30)
df['day_cos'] = np.cos(2 * np.pi * df.datetime.dt.day / 30)
df['month_sin'] = np.sin(2 * np.pi * df.datetime.dt.month / 12)
df['month_cos'] = np.cos(2 * np.pi * df.datetime.dt.month / 12)

df = df.drop(['datetime'], axis=1)
print(df.shape)
df.tail()

(101081, 39)


Unnamed: 0_level_0,close,volume,label,roc,rsi,tsi,bb_bbhi,bb_bbli,aroon_down,aroon,aroon_up,macd_line,macd_hist,macd_signal,kst,kst_diff,kst_signal,dpo,trix,sma_10,sma_20,sma_30,sma_60,ema_10,ema_20,ema_30,ema_60,obv,vpt,fi,nvi,min_sin,min_cos,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
2019-12-31 20:55,293.59,6260.0,0,0.080108,66.452076,12.884774,1.0,0.0,20.0,80.0,100.0,0.065703,0.004158,0.061544,0.65673,-0.103265,0.759995,0.24075,0.002978,293.249,293.23425,293.192,292.980833,293.249,293.23425,293.192,292.980833,8353065.0,7.131644,271.749266,1194.559824,-0.5,0.866025,0.866025,-0.5,0.207912,0.978148,-2.449294e-16,1.0
2019-12-31 20:56,293.445,7623.0,0,-0.010222,59.379185,14.00982,0.0,0.0,16.0,80.0,96.0,0.075545,0.0112,0.064344,0.648568,-0.088936,0.737504,0.18625,0.003056,293.2515,293.25375,293.201,292.999083,293.2515,293.25375,293.201,292.999083,8345442.0,0.715985,75.022942,1194.559824,-0.406737,0.913545,0.866025,-0.5,0.207912,0.978148,-2.449294e-16,1.0
2019-12-31 20:57,293.37,5036.0,1,-0.023855,56.055755,13.635937,0.0,0.0,12.0,80.0,92.0,0.076412,0.009654,0.066758,0.619311,-0.091696,0.711008,0.15375,0.003134,293.2525,293.26625,293.206667,293.017167,293.2525,293.26625,293.206667,293.017167,8340406.0,-5.052017,10.348236,1194.254513,-0.309017,0.951057,0.866025,-0.5,0.207912,0.978148,-2.449294e-16,1.0
2019-12-31 20:58,293.45,5485.0,1,0.010224,58.710399,14.289597,0.0,0.0,8.0,80.0,88.0,0.082602,0.012676,0.069927,0.626898,-0.057242,0.68414,0.0745,0.003255,293.2705,293.2855,293.214,293.036583,293.2705,293.2855,293.214,293.036583,8345891.0,0.208598,71.555631,1194.254513,-0.207912,0.978148,0.866025,-0.5,0.207912,0.978148,-2.449294e-16,1.0
2019-12-31 20:59,293.62,4139.0,1,0.088628,63.725194,16.667267,0.0,0.0,4.0,96.0,100.0,0.100072,0.024116,0.075956,0.663379,0.000775,0.662605,-0.04275,0.003506,293.31,293.31275,293.229,293.058167,293.31,293.31275,293.229,293.058167,8350030.0,3.893507,161.851969,1194.946362,-0.104528,0.994522,0.866025,-0.5,0.207912,0.978148,-2.449294e-16,1.0


In [None]:
df_na = df.dropna(axis=0)
print(df_na.shape)
df_na.head()

(101022, 39)


Unnamed: 0_level_0,close,volume,label,roc,rsi,tsi,bb_bbhi,bb_bbli,aroon_down,aroon,aroon_up,macd_line,macd_hist,macd_signal,kst,kst_diff,kst_signal,dpo,trix,sma_10,sma_20,sma_30,sma_60,ema_10,ema_20,ema_30,ema_60,obv,vpt,fi,nvi,min_sin,min_cos,hour_sin,hour_cos,day_sin,day_cos,month_sin,month_cos
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1
2019-01-02 15:29,155.765,1538.0,1,0.077099,55.238751,29.6705,0.0,0.0,4.0,80.0,84.0,0.176555,-0.012739,0.189295,4.072616,-0.366435,4.439051,-0.036,0.018094,155.826,155.726,155.498833,155.187667,155.826,155.726,155.498833,155.187667,12576.0,-0.489252,-24.613028,1012.224809,0.1045285,-0.994522,1.0,2.832769e-16,0.406737,0.913545,0.5,0.866025
2019-01-02 15:30,155.935,1765.0,0,0.157364,59.659759,28.481802,0.0,0.0,12.0,68.0,80.0,0.174976,-0.011455,0.186431,4.052878,-0.344026,4.396904,-0.05,0.017853,155.8505,155.74,155.5365,155.206917,155.8505,155.74,155.5365,155.206917,14341.0,0.200315,21.767405,1012.224809,5.665539e-16,-1.0,1.0,2.832769e-16,0.406737,0.913545,0.5,0.866025
2019-01-02 15:31,155.74,767.0,1,0.032115,53.172296,25.29616,0.0,0.0,8.0,68.0,76.0,0.156189,-0.024194,0.180382,3.948436,-0.377154,4.325591,-0.064,0.017344,155.8555,155.754,155.569833,155.213833,155.8555,155.754,155.569833,155.213833,13574.0,0.96715,-2.708653,1010.959,-0.1045285,-0.994522,1.0,2.832769e-16,0.406737,0.913545,0.5,0.866025
2019-01-02 15:32,155.82,1805.0,0,0.083499,55.318926,23.380278,0.0,0.0,4.0,68.0,72.0,0.146071,-0.027449,0.17352,3.782034,-0.453795,4.235828,-0.07475,0.016742,155.855,155.76475,155.606667,155.23,155.855,155.76475,155.606667,155.23,15379.0,-0.031963,18.306869,1010.959,-0.2079117,-0.978148,1.0,2.832769e-16,0.406737,0.913545,0.5,0.866025
2019-01-02 15:33,155.695,740.0,1,0.003212,51.357405,20.51627,0.0,0.0,12.0,56.0,68.0,0.126508,-0.037609,0.164118,3.544114,-0.576064,4.120178,0.05925,0.015916,155.856,155.76575,155.639333,155.24825,155.856,155.76575,155.639333,155.24825,14639.0,0.333553,2.477316,1010.148001,-0.309017,-0.951057,1.0,2.832769e-16,0.406737,0.913545,0.5,0.866025


In [None]:
labels = df_na.label
df_na = df_na.drop(['label'], axis=1)

In [None]:
df_na.columns

Index(['close', 'volume', 'roc', 'rsi', 'tsi', 'bb_bbhi', 'bb_bbli',
       'aroon_down', 'aroon', 'aroon_up', 'macd_line', 'macd_hist',
       'macd_signal', 'kst', 'kst_diff', 'kst_signal', 'dpo', 'trix', 'sma_10',
       'sma_20', 'sma_30', 'sma_60', 'ema_10', 'ema_20', 'ema_30', 'ema_60',
       'obv', 'vpt', 'fi', 'nvi', 'min_sin', 'min_cos', 'hour_sin', 'hour_cos',
       'day_sin', 'day_cos', 'month_sin', 'month_cos'],
      dtype='object')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df_na.values, labels.values, test_size=0.05, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((95970, 38), (5052, 38), (95970,), (5052,))

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [None]:
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
pca = PCA(n_components=0.8, random_state=42)
pca.fit(X_train_scaled)

PCA(copy=True, iterated_power='auto', n_components=0.8, random_state=42,
    svd_solver='auto', tol=0.0, whiten=False)

In [None]:
X_train_pca = pca.transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [None]:
X_train_pca.shape, X_test_pca.shape

((95970, 10), (5052, 10))

In [None]:
model = LogisticRegression()
model.fit(X_train_pca, y_train)
y_pred = model.predict_proba(X_test_pca)
acc = accuracy_score(y_test, y_pred.argmax(axis=1))
f1 = f1_score(y_test, y_pred.argmax(axis=1))
roc = roc_auc_score(y_test, y_pred[:, 1])
print(f'LogisticRegression: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')

LogisticRegression: acc=56.57% - f1=71.40% - roc=54.00%


In [None]:
model = DecisionTreeClassifier()
model.fit(X_train_pca, y_train)
y_pred = model.predict_proba(X_test_pca)
acc = accuracy_score(y_test, y_pred.argmax(axis=1))
f1 = f1_score(y_test, y_pred.argmax(axis=1))
roc = roc_auc_score(y_test, y_pred[:, 1])
print(f'LogisticRegression: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')

LogisticRegression: acc=53.42% - f1=59.31% - roc=52.43%


In [None]:
model = RandomForestClassifier()
model.fit(X_train_pca, y_train)
y_pred = model.predict_proba(X_test_pca)
acc = accuracy_score(y_test, y_pred.argmax(axis=1))
f1 = f1_score(y_test, y_pred.argmax(axis=1))
roc = roc_auc_score(y_test, y_pred[:, 1])
print(f'RandomForestClassifier: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')

RandomForest: acc=55.19% - f1=62.85% - roc=56.71%


In [None]:
model = GradientBoostingClassifier()
model.fit(X_train_pca, y_train)
y_pred = model.predict_proba(X_test_pca)
acc = accuracy_score(y_test, y_pred.argmax(axis=1))
f1 = f1_score(y_test, y_pred.argmax(axis=1))
roc = roc_auc_score(y_test, y_pred[:, 1])
print(f'GradientBoostingClassifier: acc={acc*100:.2f}% - f1={f1*100:.2f}% - roc={roc*100:.2f}%')

GradientBoostingClassifier: acc=56.77% - f1=71.75% - roc=56.97%


/bin/bash: conda: command not found
