In [1]:
import os
import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.externals import joblib
import pickle

# 1. データ準備
## 日経225連動型上場投資信託 1321

In [2]:
# Remember the current working directory (the equivalent of `pwd`);
# later cells build their data and output paths from this value.
init_directory = os.getcwd()

In [3]:
# Build the list of price files for ETF 1321.
# os.path.join keeps the path portable across operating systems; the chdir is
# kept so the working directory after this cell is the same as before.
os.chdir(os.path.join(init_directory, 'data', '1321'))
# os.listdir() returns names in arbitrary order, so sort them for a
# reproducible load order and skip anything that is not a regular file.
file_list = sorted(name for name in os.listdir() if os.path.isfile(name))

# Read every file and combine them into one DataFrame with a single concat
# (concatenating inside the loop re-copies all earlier rows on every pass).
frames = [
    pd.read_csv(name, encoding='shift-jis', skiprows=1, index_col='日付', parse_dates=True)
    for name in file_list
]
# Sort by date: the train/test split further down slices rows by position,
# so the rows must be in chronological order regardless of file order.
data1321 = pd.concat(frames).sort_index() if frames else pd.DataFrame([])

data1321.tail()

Unnamed: 0_level_0,始値,高値,安値,終値,出来高,終値調整値
日付,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-09-03,23290,23310,23180,23190,64120,23190
2018-09-04,23230,23240,23100,23190,98112,23190
2018-09-05,23150,23180,23060,23080,125696,23080
2018-09-06,22960,23040,22920,22960,250909,22960
2018-09-10,22740,22880,22730,22870,180635,22870


## 外国為替

In [4]:
# Load the exchange-rate table into a DataFrame indexed by date.
# os.path.join keeps the path portable; the chdir is kept so the relative
# file names below resolve exactly as before.
os.chdir(os.path.join(init_directory, 'data', 'rate_exchange'))
exchange_all = pd.read_csv('quote.csv', encoding='shift-jis', index_col='日付', parse_dates=True)

# Read the names of the unwanted columns, one per line.
# `with` guarantees the file is closed, and the builtin name `list` is no
# longer overwritten for the rest of the kernel session.
with open('droplist.txt', 'r') as f:
    stripped_lines = [line.rstrip('\r\n') for line in f]
# Blank lines (e.g. a trailing newline at the end of the file) are not column names.
droplist = [name for name in stripped_lines if name]

# Remove the unwanted columns from the exchange-rate data.
exchange = exchange_all.drop(columns=droplist)

exchange.tail()

Unnamed: 0_level_0,USD,GBP,EUR,CAD,CHF,SEK,DKK,NOK,AUD,NZD,...,INR,PHP,SGD,THB,KWD,SAR,AED,MXN,IDR(100).1,TWD
日付,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-09-04,111.09,142.88,128.86,84.76,114.53,12.2,17.29,13.28,79.97,73.25,...,1.56,2.08,80.89,3.39,367.36,29.62,30.25,5.78,0.75,3.61
2018-09-05,111.52,143.35,129.16,84.62,114.51,12.27,17.33,13.29,80.06,72.92,...,1.56,2.08,81.03,3.4,367.81,29.73,30.37,5.75,0.75,3.62
2018-09-06,111.29,143.79,129.54,84.46,114.73,12.3,17.37,13.26,80.06,73.51,...,1.55,2.07,80.91,3.4,367.78,29.67,30.3,5.76,0.75,3.61
2018-09-07,110.49,142.76,128.35,83.96,114.45,12.13,17.21,13.12,79.39,72.66,...,1.54,2.05,80.37,3.37,365.14,29.46,30.08,5.75,0.74,3.59
2018-09-10,110.97,143.53,128.28,84.23,114.52,12.27,17.2,13.15,78.93,72.46,...,1.55,2.07,80.47,3.38,366.0,29.59,30.22,5.75,0.75,3.6


## 結合（ターゲットデータの作成）

In [5]:
# Join prices and exchange rates on their date index, keeping every date
# that appears on either side.
target = pd.merge(data1321, exchange, how='outer', left_index=True, right_index=True)

# Remove rows that contain any NaN, then remove columns that contain any NaN.
rows_without_nan = target.dropna(axis=0)
target2 = rows_without_nan.dropna(axis=1)
target2.tail()

Unnamed: 0_level_0,始値,高値,安値,終値,出来高,終値調整値,USD,GBP,EUR,CAD,...,INR,PHP,SGD,THB,KWD,SAR,AED,MXN,IDR(100).1,TWD
日付,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-09-03,23290.0,23310.0,23180.0,23190.0,64120.0,23190.0,110.99,143.48,128.67,84.94,...,1.56,2.08,80.81,3.38,367.03,29.59,30.22,5.79,0.75,3.61
2018-09-04,23230.0,23240.0,23100.0,23190.0,98112.0,23190.0,111.09,142.88,128.86,84.76,...,1.56,2.08,80.89,3.39,367.36,29.62,30.25,5.78,0.75,3.61
2018-09-05,23150.0,23180.0,23060.0,23080.0,125696.0,23080.0,111.52,143.35,129.16,84.62,...,1.56,2.08,81.03,3.4,367.81,29.73,30.37,5.75,0.75,3.62
2018-09-06,22960.0,23040.0,22920.0,22960.0,250909.0,22960.0,111.29,143.79,129.54,84.46,...,1.55,2.07,80.91,3.4,367.78,29.67,30.3,5.76,0.75,3.61
2018-09-10,22740.0,22880.0,22730.0,22870.0,180635.0,22870.0,110.97,143.53,128.28,84.23,...,1.55,2.07,80.47,3.38,366.0,29.59,30.22,5.75,0.75,3.6


## 訓練データ・テストデータの準備

In [6]:
# Total number of rows in the combined data set.
all_data_set = len(target2)

# Number of rows held out for testing.
tests = 100

# Training features: every row before the last (tests + 1) rows.
train_stop = all_data_set - tests - 1
x_train_set = target2.iloc[:train_stop]
x_train_set.tail()

Unnamed: 0_level_0,始値,高値,安値,終値,出来高,終値調整値,USD,GBP,EUR,CAD,...,INR,PHP,SGD,THB,KWD,SAR,AED,MXN,IDR(100).1,TWD
日付,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-04-09,22300.0,22520.0,22300.0,22450.0,175185.0,22450.0,106.89,150.64,131.16,83.69,...,1.65,2.05,81.32,3.42,356.54,28.5,29.11,5.85,0.78,3.65
2018-04-10,22370.0,22720.0,22300.0,22570.0,306090.0,22570.0,106.76,150.94,131.56,84.04,...,1.64,2.05,81.43,3.42,356.1,28.47,29.07,5.82,0.78,3.65
2018-04-11,22640.0,22640.0,22460.0,22460.0,189354.0,22460.0,107.1,151.87,132.36,84.99,...,1.65,2.06,81.78,3.43,357.48,28.56,29.16,5.87,0.78,3.67
2018-04-12,22460.0,22490.0,22350.0,22410.0,180788.0,22410.0,106.91,151.76,132.28,85.07,...,1.64,2.06,81.74,3.44,356.96,28.51,29.11,5.87,0.78,3.66
2018-04-13,22580.0,22690.0,22520.0,22580.0,229998.0,22580.0,107.41,152.98,132.36,85.29,...,1.65,2.06,81.91,3.45,358.63,28.64,29.25,5.91,0.78,3.67


In [7]:
# Test features: the `tests` rows that precede the final row of target2
# (the final row itself is excluded by the stop bound).
test_start = all_data_set - tests - 1
test_stop = all_data_set - 1
x_test_set = target2.iloc[test_start:test_stop]
x_test_set.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100 entries, 2018-04-16 to 2018-09-06
Data columns (total 29 columns):
始値            100 non-null float64
高値            100 non-null float64
安値            100 non-null float64
終値            100 non-null float64
出来高           100 non-null float64
終値調整値         100 non-null float64
USD           100 non-null float64
GBP           100 non-null float64
EUR           100 non-null float64
CAD           100 non-null float64
CHF           100 non-null float64
SEK           100 non-null float64
DKK           100 non-null float64
NOK           100 non-null float64
AUD           100 non-null float64
NZD           100 non-null float64
ZAR           100 non-null float64
BHD           100 non-null float64
HKD           100 non-null float64
INR           100 non-null float64
PHP           100 non-null float64
SGD           100 non-null float64
THB           100 non-null float64
KWD           100 non-null float64
SAR           100 non-null float64
A

In [8]:
# Prepare the answer labels (Y).
# Keep the open/close price columns and add their difference (close - open).
dif_set = target2[['始値', '終値']].assign(dif=target2['終値'] - target2['始値'])

# Label is 1 when the difference is positive, otherwise -1.
# Row 0 is skipped, so y_set[k] describes row k + 1 of target2.
y_set = np.where(dif_set['dif'].iloc[1:] > 0, 1, -1).tolist()

In [9]:
# Show how many labels were produced.
len(y_set)

4034

In [10]:
# Training labels: the leading entries of y_set, up to the same position
# at which the training features stop.
label_split = all_data_set - tests - 1
y_train_set = y_set[:label_split]
len(y_train_set)

3934

In [11]:
# Test labels: every entry of y_set from the train/test split position onward.
label_split = all_data_set - tests - 1
y_test_set = y_set[label_split:]
len(y_test_set)

100

# 2. トレーニング＆テスト

In [12]:
def traningModel(x_train_set, y_train_set, n_estimators=None, random_state=0):
    """Fit a random-forest classifier on the given training data.

    Parameters
    ----------
    x_train_set : array-like
        Training features; passed unchanged to ``RandomForestClassifier.fit``.
    y_train_set : array-like
        Training labels, one per row of ``x_train_set``.
    n_estimators : int, optional
        Number of trees in the forest. When omitted, the historical behaviour
        is kept: one tree per training row (``len(x_train_set)``).
    random_state : int, optional
        Seed forwarded to the classifier. Defaults to 0, as before.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    if n_estimators is None:
        # Backward-compatible default: the tree count equals the number of training rows.
        n_estimators = len(x_train_set)
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    clf.fit(x_train_set, y_train_set)
    return clf

In [13]:
# Train the random-forest model on the training features and labels.
rf = traningModel(x_train_set, y_train_set)

In [14]:
# Get the model's predictions for the test features.
y_predict = rf.predict(x_test_set)

In [19]:
def rate_check(y_test_set, y_predict):
    """Print and return the percentage of predictions that equal the true labels.

    Parameters
    ----------
    y_test_set : sized iterable
        True labels.
    y_predict : sized iterable
        Predicted labels, in the same order as ``y_test_set``.

    Returns
    -------
    float
        Share of matching positions, as a percentage between 0 and 100.

    Raises
    ------
    ValueError
        If ``y_test_set`` is empty (the rate would be undefined) or the two
        inputs differ in length (they cannot be compared position by position).
    """
    total = len(y_test_set)
    if total == 0:
        raise ValueError("y_test_set is empty; the match rate is undefined")
    if len(y_predict) != total:
        raise ValueError(
            "length mismatch: {0} labels vs {1} predictions".format(total, len(y_predict))
        )

    # Iterate in parallel rather than indexing with [s], so containers whose
    # [] operator is label-based are still compared by position.
    count = sum(1 for actual, predicted in zip(y_test_set, y_predict) if actual == predicted)

    rate = count / total * 100

    print("rate： ", rate)
    return rate

In [20]:
# Report the percentage of test labels that the model predicted correctly.
rate_check(y_test_set, y_predict)

rate：  52.0


52.0

In [30]:
# Save the fitted model with joblib under a timestamped file name.
# os.path.join keeps the path portable across operating systems.
output_dir = os.path.join(init_directory, 'output')
# Create the folder on first use instead of failing when it is missing.
os.makedirs(output_dir, exist_ok=True)
# The chdir is kept: the pickle cell that follows writes relative to the cwd.
os.chdir(output_dir)
now = datetime.datetime.now()
filename = 'spf_{0:%y%m%d-%H%M%S}.pkl'.format(now)

joblib.dump(rf, filename)

['spf_180911-134608.pkl']

In [33]:
# Also save the model with the standard pickle module (protocol 2),
# reusing the timestamp stored in `now`.
filename = 'pickle2_{0:%y%m%d-%H%M%S}.pickle'.format(now)
# `with` closes the file handle once the dump has finished (or failed).
with open(filename, 'wb') as model_file:
    pickle.dump(rf, model_file, protocol=2)

rate：  52.0


52.0