<a href="https://colab.research.google.com/github/dAn-solution/competition/blob/main/Probstock005.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 米国株式市場 将来株価予測 ProbSpace コンペティション
### ARIMAモデル　Public Score: 0.04938　Private Score:0.05368 

- 期間：2021.9.21 〜 2021.12.12
- 課題：2011/11/13～2019/11/17週の計419週間の米国株データから、2019/11/24週の終値を予測する。銘柄数は3,278。
- 評価方法：評価関数RMSLE(Root Mean Squared Logarithmic Error)
   $$
   \sqrt{\frac{1}{n} \sum_{i=1}^{n}(\log(Pred_i + 1) - \log(Act_i + 1))^2}
   $$
- データ：train_data.csv, company_list.csv, submission_template.csv

- はじめは、時系列データであることからARモデル・ARIMAモデルの構築を試みた
- しかしPublic Scoreが0.04938と悪く、次のLightGBMへ

In [None]:
# Mount Google Drive at /content/drive; the data-loading cell reads its CSV files from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import datetime
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

import statsmodels.api as sm
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.stattools import adfuller

  import pandas.util.testing as tm


In [None]:
# Load the submission template and the price history; both files are read as cp932.
submission_df = pd.read_csv(
    "/content/drive/MyDrive/Probdata/stock/submission_template.csv",
    encoding="cp932",
)
df = pd.read_csv(
    "/content/drive/MyDrive/Probdata/stock/train_data.csv",
    encoding="cp932",
)

In [None]:
# Use the parsed 'Date' column as the index, remove it from the columns,
# and discard the final row.
# NOTE(review): the final row is presumably the week to be predicted (no prices) - confirm.
date_index = pd.to_datetime(df['Date'])
df = df.drop(columns='Date').set_index(date_index).iloc[:-1]

### KPSS検定、ADF検定

In [None]:
# Hold out the most recent row as test data; every earlier row is training data.
test_df = df.iloc[-1:]
train_df = df.iloc[:-1]

# Column labels of the training frame (one column per ticker).
train_col = train_df.columns

### KPSS ADF 検定による階差の決定

In [None]:
# Run the KPSS and ADF tests on every ticker at differencing orders 0, 1 and 2.
# kpss_adf_dict maps (ticker, order) ->
#     [KPSS p-value, ADF test statistic, ADF p-value, ADF 5% critical value]
warnings.filterwarnings('ignore')  # suppress warnings (e.g. ConvergenceWarning)

kpss_adf_dict = {}
for col in train_col:
    ts = train_df[col]
    for num in range(3):
        # KPSS test. The statistic is bound to `kpss_stat` (not `stats`) so the
        # `scipy.stats` module imported at the top of the notebook is not overwritten.
        kpss_stat, p_value, kpss_lags, kpss_crit = sm.tsa.kpss(ts, lags=1)
        # ADF test
        results = sm.tsa.stattools.adfuller(ts, maxlag=1)
        kpss_adf_dict[(col, num)] = [p_value, results[0], results[1], results[4]['5%']]
        # Difference once more (lag 1) so the next pass tests the next differencing
        # order. Using periods=num+1 here would apply a lag-2 difference on the second
        # pass, which is not the second-order difference an ARIMA model with d=2 uses.
        ts = ts.diff().dropna()

In [None]:
# Label every (ticker, differencing order) pair from the two p-values at the 0.05 level:
#   'A': ADF p-value <  0.05 and KPSS p-value >= 0.05
#   'B': ADF p-value >= 0.05 and KPSS p-value <  0.05
#   'C': both p-values <  0.05
#   'D': both p-values >= 0.05
# test_dict maps ticker -> [label at order 0, label at order 1, label at order 2].
label_by_outcome = {
    (True, False): 'A',
    (False, True): 'B',
    (True, True): 'C',
    (False, False): 'D',
}

test_dict = {}
for col in train_col:
    labels = []
    for num in range(3):
        entry = kpss_adf_dict[(col, num)]
        kpss_p, adf_p = entry[0], entry[2]
        labels.append(label_by_outcome[(bool(adf_p < 0.05), bool(kpss_p < 0.05))])
    # Going through a NumPy array keeps the elements as NumPy strings.
    test_dict[col] = list(np.array(labels))

In [None]:
# Choose the differencing order for every ticker.
# arima_para_dict maps ticker -> [p (AR order), d (differencing order), q (MA order), answer].
# d is the first order labelled 'A'; failing that the first 'C', then the first 'D',
# and finally the first 'B'. p, q and the answer start at 0.
arima_para_dict = {}

for col in train_col:
    labels = test_dict[col]
    for wanted in ('A', 'C', 'D'):
        if wanted in labels:
            break
    else:
        wanted = 'B'
    arima_para_dict[col] = [0, labels.index(wanted), 0, 0]

### ARIMA

In [None]:
# ARIMA order selection: for every ticker, fit each (p, q) combination in the ranges
# below and keep the combination with the lowest AIC. d is not searched; it stays at
# the differencing order already stored in arima_para_dict[col][1].
# NOTE(review): `.fit(disp=False)` is the legacy statsmodels ARIMA signature (the
# notebook imports statsmodels.tsa.arima_model) - confirm the installed version.
warnings.filterwarnings('ignore')  # suppress warnings (e.g. ConvergenceWarning)

# Parameter ranges
# order(p, d, q)
min_p = 1; max_p = 2
min_d = 0; max_d = 2  # not used by the search: d is fixed per ticker
min_q = 0; max_q = 2

# Tickers for which no candidate order could be fitted. Their entry in
# arima_para_dict is left untouched, i.e. p = 0 and q = 0.
grid_search_failed = []

for col in train_col:
    ts = train_df[col]  # time series of this ticker
    d = arima_para_dict[col][1]
    records = []
    for p in range(min_p, max_p + 1):
        for q in range(min_q, max_q + 1):
            try:
                # `disp=False` silences the optimizer output (the former `dist=False`
                # was a typo that was silently ignored).
                arima = sm.tsa.ARIMA(ts, order=(p, d, q)).fit(disp=False)
            except Exception:
                # Skip candidate orders that cannot be fitted.
                continue
            records.append({"p": p, "d": d, "q": q, "aic": arima.aic})

    # Built in one go instead of writing cell by cell through chained indexing,
    # which can silently write to a temporary copy.
    test_results = pd.DataFrame(records, columns=["p", "d", "q", "aic"])
    valid_results = test_results.dropna(subset=["aic"])
    if valid_results.empty:
        grid_search_failed.append(col)
        continue

    # idxmin returns a single row even when several orders tie on AIC (the first wins).
    best = valid_results.loc[valid_results["aic"].astype(float).idxmin()]
    arima_para_dict[col][0] = int(best["p"])
    arima_para_dict[col][1] = int(best["d"])
    arima_para_dict[col][2] = int(best["q"])

if grid_search_failed:
    print(f"No ARIMA order could be fitted for {len(grid_search_failed)} ticker(s): {grid_search_failed}")

In [None]:
# # Save arima_para_dict (disabled; uncomment to checkpoint the selected orders to Drive)
# import pickle

# with open("/content/drive/MyDrive/Probdata/stock/arima_para_dict_1.pkl","wb") as f:
#     pickle.dump(arima_para_dict, f)

In [None]:
# # Load arima_para_dict back (disabled; uncomment to restore a saved checkpoint)
# # NOTE: unpickling can execute arbitrary code - only load files you created yourself.
# import pickle

# with open("/content/drive/MyDrive/Probdata/stock/arima_para_dict_1.pkl", 'rb') as f:
#     arima_para_dict = pickle.load(f)

### ARIMAモデル訓練

In [None]:
# Fit the selected ARIMA(p, d, q) on the full history of every ticker and store the
# forecast for 2019-11-24 in arima_para_dict[col][3].
# The stored value is whatever `predict` returns; the submission cell adds observed
# prices back to it when d > 0, so for d > 0 it is treated as a differenced value.
warnings.filterwarnings('ignore')  # suppress warnings (e.g. ConvergenceWarning)

# Tickers whose final fit raised; they receive the fallback forecast below.
arima_fit_failed = []

for col in train_col:
    ts = df[col]
    p = arima_para_dict[col][0]
    d = arima_para_dict[col][1]
    q = arima_para_dict[col][2]
    try:
        # `disp=False` silences the optimizer output (the former `dist=False` was a
        # typo that was silently ignored).
        arima_train = sm.tsa.ARIMA(ts, order=(p, d, q), missing='drop').fit(disp=False)
    except Exception:
        # Do NOT fall through to predict(): `arima_train` would still be the model of
        # the previous ticker (or undefined for the first one), so another stock's
        # forecast would be stored for this ticker.
        arima_fit_failed.append(col)
        if d == 0:
            # Stored value is a price level: fall back to the last observed price.
            observed = ts.dropna()
            arima_para_dict[col][3] = observed.iloc[-1] if len(observed) else np.nan
        else:
            # Stored value is a difference: fall back to "no change".
            arima_para_dict[col][3] = 0.0
    else:
        arima_predict = arima_train.predict('2019-11-24')
        # np.asarray makes the positional access explicit; `[0]` on a date-indexed
        # Series relies on deprecated positional fallback.
        arima_para_dict[col][3] = np.asarray(arima_predict)[0]

if arima_fit_failed:
    print(f"ARIMA fit failed for {len(arima_fit_failed)} ticker(s), fallback forecast used: {arima_fit_failed}")

### 提出ファイル作成

In [None]:
# Turn each ticker's stored forecast into a price level.
# For d > 0 the stored forecast is a d-th difference, so the last observed prices are
# added back. `.iloc` is used because `series[-1]` on a date-indexed Series relies on
# deprecated positional fallback.
submission_dict = {}
for col in train_col:
    d = arima_para_dict[col][1]
    forecast = arima_para_dict[col][3]
    prices = df[col]
    if d == 0:
        submission_dict[col] = forecast
    elif d == 1:
        # y[t+1] = y[t] + dy[t+1]
        submission_dict[col] = float(prices.iloc[-1]) + forecast
    else:
        # d == 2:  d2y[t+1] = y[t+1] - 2*y[t] + y[t-1]
        #     =>   y[t+1]   = 2*y[t] - y[t-1] + d2y[t+1]
        # (the previous `y[t-1] + forecast` was not the inverse of a second difference)
        submission_dict[col] = 2 * float(prices.iloc[-1]) - float(prices.iloc[-2]) + forecast

In [None]:
# Fill the 'y' column of the submission template with the forecast of the ticker
# named in each row's 'id' column. A ticker missing from submission_dict raises KeyError.
submission_df = submission_df.fillna(0)
submission_df['y'] = [submission_dict[ticker] for ticker in submission_df['id']]

# Replace non-positive and missing forecasts with 0 in the 'y' column only.
# `submission_df.where(submission_df['y'] > 0, 0)` would apply the condition to every
# column and overwrite 'id' with 0 on those rows.
submission_df['y'] = submission_df['y'].clip(lower=0).fillna(0)

In [None]:
# Write the submission file to Drive without the row index.
submission_df.to_csv(
    path_or_buf='/content/drive/MyDrive/Probdata/stock/submission.csv',
    index=False,
)

In [None]:
# Display the final submission table.
submission_df

Unnamed: 0,id,y
0,VGSH,60.449711
1,JEF,20.519051
2,IVZ,17.054987
3,KTCC,5.513375
4,FBZ,14.764878
...,...,...
3273,TYG,16.946909
3274,VIRC,3.756956
3275,BIS,17.245222
3276,WOOD,64.560033
