# シミュレーション実験

## 前処理(予測まで)

In [248]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import japanize_matplotlib
from sklearn.metrics import accuracy_score
import lightgbm as lgb

In [249]:
def root_mean_squared_error(y_pred, y_test):
    """Return the square root of the mean squared error of two value sequences.

    Both arguments are forwarded, in this order, to ``mean_squared_error``.
    """
    mse = mean_squared_error(y_pred, y_test)
    return mse ** (1 / 2)

In [250]:
# Load the raw price/quantity table.
df_raw = pd.read_csv("../data/kk.csv")
# Use only the overall rows, i.e. those whose "産地名" is NaN.
# isna() tests for missing values directly instead of comparing object identity
# with the first row, which silently depended on row 0 being NaN.
is_nan = df_raw["産地名"].isna()
df = df_raw[is_nan].copy()
# Columns that are not used in the rest of the notebook.
del_columns = ["産地名", "産地コード", "品目名", "品目コード", "対前日比（数量）", "対前日比（価格）"]
df = df.drop(del_columns, axis=1).copy()


In [251]:
# Weekday handling: replace each weekday label by its position in `dow`
# (月 -> 0, ..., 日 -> 6). list.index raises ValueError for any value that is
# not one of these labels, so this cell cannot be run twice on the same frame.
dow = list("月火水木金土日")
df["曜日"] = list(map(dow.index, df["曜日"]))


In [252]:
# Date handling: timestamp -> number of elapsed days since the first row.
# The 年/月/日 columns themselves are kept; later cells read them as features.
date = (
    df["年"].astype(str) + "-" + df["月"].astype(str) + "-" + df["日"].astype(str)
).tolist()
timestamps = pd.Series(pd.to_datetime(date), index=df.index)
# Take the origin by position: the frame still carries the CSV's row labels
# after the filtering above, so a label lookup such as [0] only works while
# label 0 happens to survive.
df["days"] = (timestamps - timestamps.iloc[0]).dt.days


In [253]:
# Anomalous day (imperial enthronement, per the original note):
# index label(s) of the row(s) carrying the maximum price.
peak_price = df["価格"].max()
outlier = df.index[(df["価格"] == peak_price).to_numpy()]
# Dropping the row is currently disabled:
# df = df.drop(outlier, axis=0).copy()

In [254]:
# Weight prices by quantity
def ma_weighted(price, df):
    """Quantity-weighted mean price over one rolling window.

    Intended as the callback of ``Series.rolling(...).apply(..., raw=False)``.

    Args:
        price (pd.Series): one window of the price column. Only its index is
            used, to look the same rows up in ``df``.
        df (pd.DataFrame): source frame providing the '価格' (price) and
            '数量' (quantity) columns for the labels in ``price.index``.

    Returns:
        float: sum(quantity * price) / sum(quantity) over the window, or NaN
        when the total quantity of the window is zero.
    """
    window = df.loc[price.index, ['価格', '数量']]

    # Compute the denominator once instead of once per row.
    total_num = window['数量'].sum()
    if total_num == 0:
        return np.nan

    # skipna=False: a missing price or quantity inside the window makes the
    # whole average NaN rather than being silently ignored.
    weighted_sum = (window['価格'] * window['数量']).sum(skipna=False)
    return weighted_sum / total_num


In [255]:
# Quantity-weighted moving averages of the price over 3-, 5- and 7-row windows.
ma_w3, ma_w5, ma_w7 = (
    df["価格"].rolling(window).apply(ma_weighted, args=(df,), raw=False)
    for window in (3, 5, 7)
)


In [256]:
# Train/test split.
# Original note: 7 years for training, 3 years for testing
# (rows whose index label is 9426 or greater form the test part).
start_test_idx = 9426

# The leading rows of the 5-row average are NaN (incomplete window); drop them.
ma_w5 = ma_w5.dropna()
in_test = ma_w5.index >= start_test_idx
df_train = ma_w5[~in_test].copy()
df_test = ma_w5[in_test].copy()


In [257]:
# Calendar columns (year, month, day, weekday) for the rows of each split.
date_train_raw, date_test_raw = (
    df.loc[part.index, ["年", "月", "日", "曜日"]].copy()
    for part in (df_train, df_test)
)

In [258]:
# Build the supervised datasets.
# data = [dataset for the 4-ahead target, 5-ahead target, ..., 10-ahead target]

# Shortest lead from the input to the target = transport days + [1, 2, 3, ...]
span = 7       # how many horizons to predict
n_input = 7    # how many consecutive rows are used as input
move_days = 3  # transport days
data = []
for sp in range(1, span + 1):  # the target is at least move_days + 1 rows ahead

    last_input = n_input - 1           # offset of the last row of an input window
    lead = last_input + move_days + sp  # offset of the target from the window start

    # A sample is valid only if its target row still lies inside the split.
    n_train = df_train.shape[0] - lead
    n_test = df_test.shape[0] - lead

    # Positional indices: input window, target row, and the row whose calendar
    # columns are attached (the last row of the input window).
    X_train_idx = [start + np.arange(n_input) for start in range(n_train)]
    y_train_idx = [start + lead for start in range(n_train)]
    date_train_idx = [start + last_input for start in range(n_train)]

    X_test_idx = [start + np.arange(n_input) for start in range(n_test)]
    y_test_idx = [start + lead for start in range(n_test)]
    date_test_idx = [start + last_input for start in range(n_test)]

    date_train = date_train_raw.iloc[date_train_idx].reset_index(drop=True)
    date_test = date_test_raw.iloc[date_test_idx].reset_index(drop=True)

    # Lagged values as columns 0..n_input-1, followed by the calendar columns.
    X_train = pd.concat(
        [
            pd.DataFrame(
                np.array([df_train.iloc[rows].values for rows in X_train_idx]),
                columns=np.arange(n_input),
            ),
            date_train,
        ],
        axis=1,
    )
    X_test = pd.concat(
        [
            pd.DataFrame(
                np.array([df_test.iloc[rows].values for rows in X_test_idx]),
                columns=np.arange(n_input),
            ),
            date_test,
        ],
        axis=1,
    )

    y_train = pd.Series(np.array([df_train.iloc[pos] for pos in y_train_idx]))
    y_test = pd.Series(np.array([df_test.iloc[pos] for pos in y_test_idx]))

    data.append((X_train, X_test, y_train, y_test))

In [259]:
# y_preds = [predictions for the 4-ahead target, 5-ahead, ..., 10-ahead]
y_preds = []
for i, (X_train, X_test, y_train, y_test) in enumerate(data):
    # One regressor per horizon, fitted from scratch with a fixed seed.
    model = lgb.LGBMRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Root mean squared error on the test part.
    score = root_mean_squared_error(y_pred, y_test)
    print(score)

    # Mean absolute percentage error, expressed as a fraction.
    ape = sum(abs((y_test - y_pred) / y_test))
    mape = ape / len(y_test)
    print(mape)

    y_preds.append(y_pred)

152.87335371530077
0.04265687606131471
167.42563249986372
0.04898505031284628
178.33708145843627
0.056432169833098976
187.275739116027
0.06050149909514425
192.52051982586158
0.06181689322490505
198.87448514492291
0.06552261407165835
208.60556557978728
0.06955631357819869


In [260]:
# Align the number of test samples across horizons: cut everything to the
# length of the shortest prediction array.
y_preds_min_len = min(len(pred) for pred in y_preds)
y_preds_fix = [pred[:y_preds_min_len] for pred in y_preds]

# NOTE(review): the training parts are truncated to the same length as well —
# confirm that this is intended.
head = slice(None, y_preds_min_len)
data_fix = []
for d in data:
    X_train_fix, X_test_fix, y_train_fix, y_test_fix = (part.iloc[head] for part in d)
    data_fix.append((X_train_fix, X_test_fix, y_train_fix, y_test_fix))

In [261]:
# y_preds_fix = np.array(y_preds_fix).T

In [262]:
# y_preds_max = [np.argmax(ypf) for ypf in y_preds_fix]

In [263]:
# y_tests_fix = np.array([data[0][3][i:i+7].values for i in range(data[0][3].__len__()-6)])

In [264]:
# y_tests_max = [np.argmax(ytf) for ytf in y_tests_fix]

In [265]:
# Drop the first 4 rows so the raw price series has the same size as the
# moving average.
df_for_compare = df["価格"].iloc[4:]

is_c_test = df_for_compare.index >= start_test_idx
df_c_train = df_for_compare[~is_c_test].copy()
df_c_test = df_for_compare[is_c_test].copy()

# Calendar columns for the rows of each split.
date_c_train_raw, date_c_test_raw = (
    df.loc[part.index, ["年", "月", "日", "曜日"]].copy()
    for part in (df_c_train, df_c_test)
)

data_c = []
for sp in range(1, span + 1):  # the target is at least move_days + 1 rows ahead

    last_input = n_input - 1           # offset of the last row of an input window
    lead = last_input + move_days + sp  # offset of the target from the window start

    # A sample is valid only if its target row still lies inside the split.
    n_c_train = df_c_train.shape[0] - lead
    n_c_test = df_c_test.shape[0] - lead

    # Positional indices: input window, target row, and the row whose calendar
    # columns are attached (the last row of the input window).
    X_c_train_idx = [start + np.arange(n_input) for start in range(n_c_train)]
    y_c_train_idx = [start + lead for start in range(n_c_train)]
    date_c_train_idx = [start + last_input for start in range(n_c_train)]

    X_c_test_idx = [start + np.arange(n_input) for start in range(n_c_test)]
    y_c_test_idx = [start + lead for start in range(n_c_test)]
    date_c_test_idx = [start + last_input for start in range(n_c_test)]

    date_c_train = date_c_train_raw.iloc[date_c_train_idx].reset_index(drop=True)
    date_c_test = date_c_test_raw.iloc[date_c_test_idx].reset_index(drop=True)

    # Lagged raw prices as columns 0..n_input-1, followed by the calendar columns.
    X_c_train = pd.concat(
        [
            pd.DataFrame(
                np.array([df_c_train.iloc[rows].values for rows in X_c_train_idx]),
                columns=np.arange(n_input),
            ),
            date_c_train,
        ],
        axis=1,
    )
    X_c_test = pd.concat(
        [
            pd.DataFrame(
                np.array([df_c_test.iloc[rows].values for rows in X_c_test_idx]),
                columns=np.arange(n_input),
            ),
            date_c_test,
        ],
        axis=1,
    )

    y_c_train = pd.Series(np.array([df_c_train.iloc[pos] for pos in y_c_train_idx]))
    y_c_test = pd.Series(np.array([df_c_test.iloc[pos] for pos in y_c_test_idx]))

    data_c.append((X_c_train, X_c_test, y_c_train, y_c_test))

## 収穫量データの読み込み

In [266]:
# Load the harvest table; the first CSV column is used as the index.
df_harvest = pd.read_csv("../data/harvest.csv", index_col=0)

In [267]:
# Inspect the last rows (up to 20) of the preprocessed frame.
df.tail(20)

Unnamed: 0,年,月,日,曜日,数量,価格,days
13398,2021,9,13,0,2588,2612,3539
13402,2021,9,14,1,2243,2531,3540
13406,2021,9,16,3,3172,2606,3542
13411,2021,9,17,4,1604,2672,3543
13415,2021,9,18,5,2382,2568,3544
13420,2021,9,21,1,5067,2626,3547
13424,2021,9,22,2,2477,2675,3548
13428,2021,9,24,4,4532,2637,3550
13433,2021,9,25,5,2871,2683,3551
13437,2021,9,27,0,4105,2742,3553


# 価格データの整形
シミュレーションのためにテストデータ、予測結果に日時を付与

In [268]:
# Test-side feature frame of the first horizon, after length alignment.
data_fix[0][1]

Unnamed: 0,0,1,2,3,4,5,6,年,月,日,曜日
0,1733.399918,1628.760410,1569.194895,1487.608268,1472.025809,1402.767145,1404.725781,2019,1,14,0
1,1628.760410,1569.194895,1487.608268,1472.025809,1402.767145,1404.725781,1389.238495,2019,1,15,1
2,1569.194895,1487.608268,1472.025809,1402.767145,1404.725781,1389.238495,1395.272783,2019,1,16,2
3,1487.608268,1472.025809,1402.767145,1404.725781,1389.238495,1395.272783,1404.539995,2019,1,17,3
4,1472.025809,1402.767145,1404.725781,1389.238495,1395.272783,1404.539995,1416.526499,2019,1,18,4
...,...,...,...,...,...,...,...,...,...,...,...
710,2542.462414,2544.971150,2545.938107,2583.590880,2589.820467,2594.543748,2602.437863,2021,9,21,1
711,2544.971150,2545.938107,2583.590880,2589.820467,2594.543748,2602.437863,2625.562032,2021,9,22,2
712,2545.938107,2583.590880,2589.820467,2594.543748,2602.437863,2625.562032,2632.652534,2021,9,24,4
713,2583.590880,2589.820467,2594.543748,2602.437863,2625.562032,2632.652534,2637.351838,2021,9,25,5


In [269]:
df_test_raw = df.loc[start_test_idx:].copy()


def _ymd_to_datetime(frame):
    """Assemble the 年/月/日 columns of ``frame`` into a datetime Series indexed 0..n-1."""
    ymd = frame[["年", "月", "日"]].astype(int)
    # pd.to_datetime assembles dates from columns named year/month/day.
    ymd.columns = ["year", "month", "day"]
    return pd.to_datetime(ymd).reset_index(drop=True)


# Dates of every row in the test period.
datelist = _ymd_to_datetime(df_test_raw)

# Date of the last input row each prediction is based on.
# Aligned with the first-horizon model (data[0]), which has the most valid samples.
datelist_input = _ymd_to_datetime(data[0][1])

# Dates of the ground-truth targets, one Series per horizon. The offsets are
# derived from move_days and span (4..10 for move_days=3, span=7) so they
# stay in step with the dataset construction.
datelist_output = [
    datelist.iloc[(n_input - 1) + n:]
    for n in range(move_days + 1, move_days + span + 1)
]

In [270]:
# Dates of all rows in the test period.
datelist

0     2019-01-05
1     2019-01-07
2     2019-01-08
3     2019-01-10
4     2019-01-11
         ...    
726   2021-10-05
727   2021-10-07
728   2021-10-08
729   2021-10-09
730   2021-10-11
Length: 731, dtype: datetime64[ns]

In [271]:
# Dates built from the 年/月/日 columns of the first horizon's test features.
datelist_input

0     2019-01-14
1     2019-01-15
2     2019-01-16
3     2019-01-17
4     2019-01-18
         ...    
716   2021-09-30
717   2021-10-01
718   2021-10-02
719   2021-10-04
720   2021-10-05
Length: 721, dtype: datetime64[ns]

In [276]:
# Target dates of the last horizon (list position 6).
datelist_output[6]

16    2019-01-26
17    2019-01-28
18    2019-01-29
19    2019-01-31
20    2019-02-01
         ...    
726   2021-10-05
727   2021-10-07
728   2021-10-08
729   2021-10-09
730   2021-10-11
Length: 715, dtype: datetime64[ns]

In [273]:
# Pair each horizon's target dates with its predictions.
# NOTE(review): pd.Series([a, b]) builds a two-element object Series that holds
# the date Series and the prediction array side by side. If a date-indexed
# Series was intended, this would be
# pd.Series(y_preds[i], index=datelist_output[i]) — confirm against the consumers.
y_preds_with_date = [pd.Series([datelist_output[i], y_preds[i]]) for i in range(span)]

In [274]:
# Number of forecast horizons.
span

7

## 理論値データの作成
y_c_test_maxに合わせて出荷

## 固定日データの作成
区間の1日目に合わせて出荷

## 提案手法データの作成
y_pred_maxに合わせて出荷

In [275]:
# Inspect the first rows (up to 30) of the preprocessed frame.
df.head(30)

Unnamed: 0,年,月,日,曜日,数量,価格,days
0,2012,1,5,3,840793,1525,0
4,2012,1,6,4,528012,1377,1
8,2012,1,7,5,317676,1292,2
12,2012,1,9,0,667,1466,4
15,2012,1,10,1,549611,1234,5
19,2012,1,11,2,243756,1212,6
23,2012,1,12,3,250483,1215,7
27,2012,1,13,4,230214,1198,8
31,2012,1,14,5,257579,1195,9
35,2012,1,16,0,323331,1192,11
