In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#from matplotlib.finance import candlestick_ohlc
#import matplotlib.dates as mdates
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

%matplotlib inline
plt.rcParams["figure.figsize"] = (14, 10)
plt.rcParams["font.size"] = 14

In [2]:
df_train = pd.read_csv("train.csv", index_col=0)
df_test = pd.read_csv("test.csv", index_col=0)

In [3]:
df_train.head()

Unnamed: 0_level_0,Open Price,Close Price,High Price,Low Price,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
02-Jan-2009,902.99,931.8,934.73,899.35,4048270080
05-Jan-2009,929.17,927.45,936.63,919.53,5413910016
06-Jan-2009,931.17,934.7,943.85,927.28,5392620032
07-Jan-2009,927.45,906.65,927.45,902.37,4704940032
08-Jan-2009,905.73,909.73,910.0,896.81,4991549952


## Check if any null data in the dataset

In [4]:
df_train.isnull().sum()

Open Price     0
Close Price    0
High Price     0
Low Price      0
Volume         0
dtype: int64

## preprocessing
calculate the difference between the prices, and calculate the price is higher or lower than the day before that day

In [5]:
df_train["return"] = df_train["High Price"] - df_train["Low Price"]
df_train["close to open"] = np.abs(df_train["Close Price"] - df_train["Open Price"])
df_train["close to high"] = np.abs(df_train["Close Price"] - df_train["High Price"])
df_train["close to low"] = np.abs(df_train["Close Price"] - df_train["Low Price"])

df_test["return"] = df_test["High Price"] - df_test["Low Price"]
df_test["close to open"] = np.abs(df_test["Close Price"] - df_test["Open Price"])
df_test["close to high"] = np.abs(df_test["Close Price"] - df_test["High Price"])
df_test["close to low"] = np.abs(df_test["Close Price"] - df_test["Low Price"])

def result_calc(x):
    if x > 0:
        return 1
    elif x < 0:
        return 0
    elif x == 0:
        return 2

df_train["result"] = df_train["Close Price"] - df_train["Close Price"].shift(1)
df_train["result"] = df_train["result"].apply(lambda x: result_calc(x))

df_test["result"] = df_test["Close Price"] - df_test["Close Price"].shift(1)
df_test["result"] = df_test["result"].apply(lambda x: result_calc(x))

df_train.head()

Unnamed: 0_level_0,Open Price,Close Price,High Price,Low Price,Volume,return,close to open,close to high,close to low,result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
02-Jan-2009,902.99,931.8,934.73,899.35,4048270080,35.38,28.81,2.93,32.45,
05-Jan-2009,929.17,927.45,936.63,919.53,5413910016,17.1,1.72,9.18,7.92,0.0
06-Jan-2009,931.17,934.7,943.85,927.28,5392620032,16.57,3.53,9.15,7.42,1.0
07-Jan-2009,927.45,906.65,927.45,902.37,4704940032,25.08,20.8,20.8,4.28,0.0
08-Jan-2009,905.73,909.73,910.0,896.81,4991549952,13.19,4.0,0.27,12.92,1.0


### Because the result of the first row is null, drop it

In [6]:
df_train["result"].isnull().sum()

1

In [7]:
df_train = df_train.dropna()
df_test = df_test.dropna()
df_train.head()

Unnamed: 0_level_0,Open Price,Close Price,High Price,Low Price,Volume,return,close to open,close to high,close to low,result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
05-Jan-2009,929.17,927.45,936.63,919.53,5413910016,17.1,1.72,9.18,7.92,0.0
06-Jan-2009,931.17,934.7,943.85,927.28,5392620032,16.57,3.53,9.15,7.42,1.0
07-Jan-2009,927.45,906.65,927.45,902.37,4704940032,25.08,20.8,20.8,4.28,0.0
08-Jan-2009,905.73,909.73,910.0,896.81,4991549952,13.19,4.0,0.27,12.92,1.0
09-Jan-2009,909.91,890.35,911.93,888.31,4716499968,23.62,19.56,21.58,2.04,0.0


## Normalize the elements

In [8]:
scaler = StandardScaler()
scaler.fit(df_train[["Open Price", "Close Price", "High Price", "Low Price", "Volume", "return", "close to open", "close to high", "close to low"]])
df_train[["Open Price", "Close Price", "High Price", "Low Price", "Volume", "return", "close to open", "close to high", "close to low"]] = scaler.transform(df_train[["Open Price", "Close Price", "High Price", "Low Price", "Volume", "return", "close to open", "close to high", "close to low"]])
df_train.head()


Unnamed: 0_level_0,Open Price,Close Price,High Price,Low Price,Volume,return,close to open,close to high,close to low,result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
05-Jan-2009,-1.499495,-1.50445,-1.502622,-1.500502,1.824404,0.010751,-0.836174,0.168867,-0.171102,0.0
06-Jan-2009,-1.495371,-1.489501,-1.487722,-1.484533,1.808651,-0.042903,-0.644738,0.165531,-0.231612,1.0
07-Jan-2009,-1.503041,-1.547337,-1.521567,-1.53586,1.299815,0.818598,1.181833,1.461159,-0.611611,0.0
08-Jan-2009,-1.547821,-1.540987,-1.557579,-1.547317,1.511887,-0.385074,-0.595028,-0.822038,0.433992,1.0
09-Jan-2009,-1.539203,-1.580946,-1.553596,-1.564831,1.308369,0.670796,1.050684,1.547905,-0.882693,0.0


In [9]:
scaler.fit(df_test[["Open Price", "Close Price", "High Price", "Low Price", "Volume", "return", "close to open", "close to high", "close to low"]])
df_test[["Open Price", "Close Price", "High Price", "Low Price", "Volume", "return", "close to open", "close to high", "close to low"]] = scaler.transform(df_test[["Open Price", "Close Price", "High Price", "Low Price", "Volume", "return", "close to open", "close to high", "close to low"]])

df_test.head()

Unnamed: 0_level_0,Open Price,Close Price,High Price,Low Price,Volume,return,close to open,close to high,close to low,result
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
03-Jan-2018,-0.508581,-0.331044,-0.523646,-0.305917,-0.142641,-0.689624,-0.135579,-0.75311,-0.035561,1.0
04-Jan-2018,-0.291669,-0.221964,-0.363034,-0.10582,-0.120207,-0.963246,-0.686423,-0.558909,-0.643437,1.0
05-Jan-2018,-0.170175,-0.030748,-0.210605,-0.022681,-0.521355,-0.735513,-0.312916,-0.802268,-0.039078,1.0
08-Jan-2018,-0.055553,0.014761,-0.156135,0.068256,-0.574383,-0.933653,-0.66759,-0.777933,-0.339206,1.0
09-Jan-2018,0.03016,0.050489,-0.041704,0.164641,-0.436682,-0.917785,-0.923918,-0.434795,-0.730779,1.0


## Model fitting and evaluation

In [10]:
y = df_train["result"]
y_test = df_test["result"]
df_train = df_train.drop("result", 1)
df_test = df_test.drop("result", 1)

df_train.head()

Unnamed: 0_level_0,Open Price,Close Price,High Price,Low Price,Volume,return,close to open,close to high,close to low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
05-Jan-2009,-1.499495,-1.50445,-1.502622,-1.500502,1.824404,0.010751,-0.836174,0.168867,-0.171102
06-Jan-2009,-1.495371,-1.489501,-1.487722,-1.484533,1.808651,-0.042903,-0.644738,0.165531,-0.231612
07-Jan-2009,-1.503041,-1.547337,-1.521567,-1.53586,1.299815,0.818598,1.181833,1.461159,-0.611611
08-Jan-2009,-1.547821,-1.540987,-1.557579,-1.547317,1.511887,-0.385074,-0.595028,-0.822038,0.433992
09-Jan-2009,-1.539203,-1.580946,-1.553596,-1.564831,1.308369,0.670796,1.050684,1.547905,-0.882693


## Using GridSearchCV for tuning hyperparameters of the SVM model

In [31]:
param_grid = {"C": np.arange(1, 11), "gamma": np.arange(0, 6)}
grid = GridSearchCV(SVC(kernel="linear"), param_grid=param_grid, verbose=3)
grid.fit(df_train, y)
grid.predict(df_test)
print(grid.best_score_)
print(grid.best_params_)

Fitting 3 folds for each of 60 candidates, totalling 180 fits
[CV] C=1, gamma=0 ....................................................
[CV] ........... C=1, gamma=0, score=0.8079470198675497, total=   0.1s
[CV] C=1, gamma=0 ....................................................
[CV] ........... C=1, gamma=0, score=0.8249336870026526, total=   0.0s
[CV] C=1, gamma=0 ....................................................
[CV] ........... C=1, gamma=0, score=0.7811671087533156, total=   0.0s
[CV] C=1, gamma=1 ....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV] ........... C=1, gamma=1, score=0.8079470198675497, total=   0.0s
[CV] C=1, gamma=1 ....................................................
[CV] ........... C=1, gamma=1, score=0.8249336870026526, total=   0.0s
[CV] C=1, gamma=1 ....................................................
[CV] ........... C=1, gamma=1, score=0.7811671087533156, total=   0.0s
[CV] C=1, gamma=2 ....................................................
[CV] ........... C=1, gamma=2, score=0.8079470198675497, total=   0.0s
[CV] C=1, gamma=2 ....................................................
[CV] ........... C=1, gamma=2, score=0.8249336870026526, total=   0.0s
[CV] C=1, gamma=2 ....................................................
[CV] ........... C=1, gamma=2, score=0.7811671087533156, total=   0.0s
[CV] C=1, gamma=3 ....................................................
[CV] ........... C=1, gamma=3, score=0.8079470198675497, total=   0.0s
[CV] C=1, gamma=3 ....................................................
[CV] .

[CV] ........... C=4, gamma=3, score=0.8079470198675497, total=   0.1s
[CV] C=4, gamma=3 ....................................................
[CV] ........... C=4, gamma=3, score=0.8395225464190982, total=   0.1s
[CV] C=4, gamma=3 ....................................................
[CV] ........... C=4, gamma=3, score=0.7122015915119363, total=   0.1s
[CV] C=4, gamma=4 ....................................................
[CV] ........... C=4, gamma=4, score=0.8079470198675497, total=   0.1s
[CV] C=4, gamma=4 ....................................................
[CV] ........... C=4, gamma=4, score=0.8395225464190982, total=   0.1s
[CV] C=4, gamma=4 ....................................................
[CV] ........... C=4, gamma=4, score=0.7122015915119363, total=   0.1s
[CV] C=4, gamma=5 ....................................................
[CV] ........... C=4, gamma=5, score=0.8079470198675497, total=   0.1s
[CV] C=4, gamma=5 ....................................................
[CV] .

[CV] ........... C=7, gamma=4, score=0.6870026525198939, total=   0.1s
[CV] C=7, gamma=5 ....................................................
[CV] ........... C=7, gamma=5, score=0.8211920529801324, total=   0.1s
[CV] C=7, gamma=5 ....................................................
[CV] ........... C=7, gamma=5, score=0.8580901856763926, total=   0.1s
[CV] C=7, gamma=5 ....................................................
[CV] ........... C=7, gamma=5, score=0.6870026525198939, total=   0.1s
[CV] C=8, gamma=0 ....................................................
[CV] ............ C=8, gamma=0, score=0.823841059602649, total=   0.1s
[CV] C=8, gamma=0 ....................................................
[CV] ........... C=8, gamma=0, score=0.8620689655172413, total=   0.1s
[CV] C=8, gamma=0 ....................................................
[CV] ........... C=8, gamma=0, score=0.6737400530503979, total=   0.1s
[CV] C=8, gamma=1 ....................................................
[CV] .

[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   15.8s finished


In [45]:
svm = SVC(C=2, kernel="linear")
svm.fit(df_train, y)
y_predict = svm.predict(df_test)

In [46]:
accuracy_score(y_predict, y_test)

0.8247011952191236

## accuracy test with train_test_split

In [49]:
x1, x2, y1, y2 = train_test_split(df_train, y, train_size=0.7)
a = SVC(C=2, kernel="linear")
a.fit(x1, y1)
yp = a.predict(x2)
accuracy_score(yp, y2)



0.8085419734904271