In [3]:
import warnings
warnings.filterwarnings('ignore')
import glob
import os
import datetime
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import *
from sklearn import svm
import seaborn as sns; sns.set()

In [4]:
!git clone https://github.com/quant4junior/algoTrade

Cloning into 'algoTrade'...
remote: Enumerating objects: 3971, done.[K
remote: Counting objects: 100% (3971/3971), done.[K
remote: Compressing objects: 100% (3901/3901), done.[K
remote: Total 3971 (delta 124), reused 3898 (delta 64), pack-reused 0[K
Receiving objects: 100% (3971/3971), 24.52 MiB | 12.83 MiB/s, done.
Resolving deltas: 100% (124/124), done.


In [31]:
df = pd.read_csv('./algoTrade/ch06/ETFs_main.csv')
df

Unnamed: 0,Dates,CLOSE_SPY,OPEN,HIGH,LOW,VOLUME,CLOSE_GLD,CLOSE_FXY,CLOSE_T10Y2Y,CLOSE_TED,CLOSE_USO,CLOSE_UUP,CLOSE_VIX,CLOSE_VWO
0,2007-02-20,146.04,145.56,146.200,144.0,56909500.0,65.31,83.51,2.3263,0.31,48.67,25.07,10.24,40.055
1,2007-02-21,145.98,145.61,146.070,145.0,63971500.0,67.28,82.90,2.3653,0.32,49.86,25.12,10.20,39.975
2,2007-02-22,145.87,146.05,146.420,145.0,79067398.0,67.15,82.46,2.3871,0.31,50.33,25.12,10.18,40.220
3,2007-02-23,145.30,145.74,145.790,145.0,71962797.0,67.72,82.78,2.3809,0.31,50.46,25.04,10.58,40.035
4,2007-02-26,145.17,145.83,145.950,145.0,69320062.0,68.10,83.08,2.3795,0.31,50.90,25.04,11.15,39.960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2766,2018-12-20,247.17,249.86,251.620,245.0,252053406.0,119.24,85.87,1.7807,0.48,9.72,25.77,28.38,38.180
2767,2018-12-21,240.70,246.74,249.710,240.0,255345594.0,118.72,85.87,1.7651,0.48,9.57,25.94,30.11,37.870
2768,2018-12-24,234.34,239.04,240.836,234.0,147311594.0,120.02,86.55,1.7505,0.40,9.29,25.55,36.07,37.320
2769,2018-12-27,248.07,242.57,248.290,239.0,186267297.0,120.57,86.00,1.7581,0.44,9.62,25.57,29.96,37.900


In [32]:
def moving_average(df, n):
    MA = pd.Series(df['CLOSE_SPY'].rolling(n, min_periods=n).mean(), name = 'MA_' + str(n))
    df = df.join(MA)
    return df

def volume_moving_average(df, n):
    VMA = pd.Series(df['VOLUME'].rolling(n, min_periods=n).mean(), name = 'VMA_' + str(n))
    df = df.join(VMA)
    return df

def relative_strength_index(df, n):
    i = 0
    UpI = [0]
    DoI = [0]
    while i + 1 <= df.index[-1]:
        UpMove = df.loc[i + 1, 'HIGH'] - df.loc[i, 'HIGH']
        DoMove = df.loc[i, 'LOW'] - df.loc[i + 1, 'LOW']
        if UpMove > DoMove and UpMove > 0:
            UpD = UpMove
        else:
            UpD = 0
        UpI.append(UpD)
        if DoMove > UpMove and DoMove > 0:
            DoD = DoMove
        else:
            DoD = 0
        DoI.append(DoD)
        i = i + 1
    UpI = pd.Series(UpI)
    DoI = pd.Series(DoI)
    PosDI = pd.Series(UpI.ewm(span=n, min_periods=n).mean())
    NegDI = pd.Series(DoI.ewm(span=n, min_periods=n).mean())
    RSI = pd.Series(PosDI / (PosDI + NegDI), name = 'RSI_' + str(n))
    df = df.join(RSI)
    return df

In [33]:
df = moving_average(df, 45)
df = volume_moving_average(df, 45)
df = relative_strength_index(df, 14)

In [34]:
df = df.set_index('Dates')
df = df.dropna()
len(df)

2727

In [35]:
df['target'] = df['CLOSE_SPY'].pct_change()

In [36]:
df['target'] = np.where(df['target'] > 0, 1, -1)
df['target'].value_counts()

 1    1471
-1    1256
Name: target, dtype: int64

In [37]:
df['target'] = df['target'].shift(-1)

In [38]:
df = df.dropna()
len(df)

2726

In [40]:
df['target'] = df['target'].astype(np.int64)
y_var = df['target'].loc['2017-01-01':]
x_var = df.drop(['target', 'OPEN', 'HIGH', 'LOW', 'VOLUME', 'CLOSE_SPY'], axis=1).loc['2017-01-01':, :]

In [41]:
len(x_var)

460

In [42]:
up = df[df['target'] == 1].target.count()
total=df.target.count()
print('up/down ratio: {0:.2f}'.format((up/total)))

up/down ratio: 0.54


In [43]:
X_train, X_test, y_train, y_test = train_test_split(x_var,
                                                   y_var,
                                                   test_size=0.3,
                                                   shuffle = False,
                                                   random_state=3)

train_count = y_train.count()
test_count = y_test.count()

print('train set label ratio')
print(y_train.value_counts()/train_count)
print('test set label ratio')
print(y_test.value_counts()/test_count)

train set label ratio
 1    0.559006
-1    0.440994
Name: target, dtype: float64
test set label ratio
 1    0.514493
-1    0.485507
Name: target, dtype: float64


In [44]:
def get_confusion_matrix(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    roc_score = roc_auc_score(y_test, pred)
    print('confusion matrix')
    print('accuracy:{0:.4f}, precision: {1:.4f}, recall:{2:.4f}, F1:{3:.4f}, ROC AUC score:{4:.4f}'.format(accuracy, precision, recall, f1, roc_score))

In [45]:
xgb_dis = XGBClassifier(n_estimators=400, learning_rate = 0.1, max_depth = 3)
xgb_dis.fit(X_train, y_train)
xgb_pred = xgb_dis.predict(X_test)
print(xgb_dis.score(X_train, y_train))
get_confusion_matrix(y_test, xgb_pred)

1.0
confusion matrix
accuracy:0.4348, precision: 0.4054, recall:0.2113, F1:0.2778, ROC AUC score:0.4415


In [46]:
n_estimators = range(10, 200, 10)
params = {
    'bootstrap' : [True],
    'n_estimators' : n_estimators,
    'max_depth':[4,6,8,10,12],
    'min_samples_leaf': [2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8, 10],
    'max_features':[4]
}

In [47]:
my_cv = TimeSeriesSplit(n_splits=5).split(X_train)

In [57]:
clf = GridSearchCV(RandomForestClassifier(), params, cv=my_cv, n_jobs=-1)

In [49]:
import time
start_time = time.time()
clf.fit(X_train, y_train)
print(time.time() - start_time, " seconds consumed")

1257.2996459007263  seconds consumed


In [50]:
print('best parameter:\n', clf.best_params_)
print('best prediction:{0:.4f}'.format(clf.best_score_))

best parameter:
 {'bootstrap': True, 'max_depth': 10, 'max_features': 4, 'min_samples_leaf': 2, 'min_samples_split': 4, 'n_estimators': 40}
best prediction:0.5774


In [51]:
pred_con = clf.predict(X_test)
accuracy_con = accuracy_score(y_test, pred_con)
print('accuracy:{0:.4f}'.format(accuracy_con))
get_confusion_matrix(y_test, pred_con)

accuracy:0.4710
confusion matrix
accuracy:0.4710, precision: 0.4706, recall:0.2254, F1:0.3048, ROC AUC score:0.4783


# 수익률 0.05 이상일 때

In [60]:
df = pd.read_csv('./algoTrade/ch06/ETFs_main.csv')
df = moving_average(df, 45)
df = volume_moving_average(df, 45)
df = relative_strength_index(df, 14)
df = df.set_index('Dates')
df = df.dropna()
df['target'] = df['CLOSE_SPY'].pct_change()

In [61]:
df['target'] = np.where(df['target'] > 0.0005, 1, -1)
df['target'].value_counts()

 1    1375
-1    1352
Name: target, dtype: int64

In [62]:
df['target'] = df['target'].shift(-1)
df = df.dropna()
len(df)

2726

In [64]:
df['target'] = df['target'].astype(np.int64)
y_var = df['target'].loc['2017-01-01':]
# 예측에 사용하지 않는 변수 삭제
x_var = df.drop(['target', 'OPEN','HIGH','LOW', 'VOLUME','CLOSE_SPY'], axis=1).loc['2017-01-01':, :] 
up=df[df['target']==1].target.count()
total=df.target.count()
print('up/down ratio: {0:.2f}'.format((up/total)))

up/down ratio: 0.50


In [65]:
#훈련 세트와 테스트 세트를 나눈다.
#shuffle=False을 설정해 기간이 섞이지 않도록 만든다.
X_train, X_test, y_train, y_test = train_test_split(x_var, 
                                                    y_var, 
                                                    test_size=0.3, 
                                                    shuffle=False, 
                                                    random_state=3)

#훈련 세트와 테스트 세트에 포함된 양성 샘플(up) 비율을 확인한다.
train_count = y_train.count()
test_count = y_test.count()

print('train set label ratio')
print(y_train.value_counts()/train_count)
print('test set label ratio')
print(y_test.value_counts()/test_count)

train set label ratio
 1    0.512422
-1    0.487578
Name: target, dtype: float64
test set label ratio
-1    0.528986
 1    0.471014
Name: target, dtype: float64


In [66]:
xgb_dis = XGBClassifier(n_estimators=400, learning_rate=0.1, max_depth=3)
xgb_dis.fit(X_train, y_train)
xgb_pred = xgb_dis.predict(X_test)
print(xgb_dis.score(X_train, y_train))
get_confusion_matrix(y_test, xgb_pred)

1.0
confusion matrix
accuracy:0.5290, precision: 0.5000, recall:0.2615, F1:0.3434, ROC AUC score:0.5143


In [67]:
n_estimators = range(10,200,10)

params = {
    'bootstrap': [True],
    'n_estimators':n_estimators,
    'max_depth':[4,6,8,10,12],
    'min_samples_leaf': [2, 3, 4, 5],
    'min_samples_split': [2, 4, 6, 8, 10],
    'max_features':[4]
}

In [68]:
my_cv = TimeSeriesSplit(n_splits=5).split(X_train)

In [69]:
clf = GridSearchCV(RandomForestClassifier(), params, cv=my_cv, n_jobs=-1)

In [70]:
start_time = time.time()
clf.fit(X_train, y_train)
print(time.time() - start_time," seconds consumed")

1252.4088146686554  seconds consumed


In [71]:
print('best parameter:\n', clf.best_params_)

best parameter:
 {'bootstrap': True, 'max_depth': 10, 'max_features': 4, 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 10}


In [72]:
print('best prediction:{0:.4f}'.format(clf.best_score_))

best prediction:0.5849


In [73]:
pred_con = clf.predict(X_test)
accuracy_con = accuracy_score(y_test, pred_con)
print('accuracy:{0:.4f}'.format(accuracy_con))
get_confusion_matrix(y_test, pred_con)

accuracy:0.5507
confusion matrix
accuracy:0.5507, precision: 0.5172, recall:0.6923, F1:0.5921, ROC AUC score:0.5585
