# 売買シミュレーションに備えた実験
- いちごの流通期間は 2~3日
- 4〜10日後の価格を予測することで、流通期間（最大3日）を除いた7日間の中から販売日を決定可能にする

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import japanize_matplotlib
from sklearn.metrics import accuracy_score
import lightgbm as lgb

In [2]:
def root_mean_squared_error(y_pred, y_test):
    """Return the root mean squared error between two value sequences.

    The result is the square root of ``mean_squared_error(y_pred, y_test)``.
    """
    mse = mean_squared_error(y_pred, y_test)
    return mse ** 0.5

In [3]:
df_raw = pd.read_csv("../data/kk.csv")
# Use only the "overall" rows, i.e. the rows whose origin name (産地名) is NaN.
# isna() tests for missing values directly instead of comparing object identity
# with the value in row 0, so the filter no longer depends on row 0 being NaN
# and avoids a Python-level loop over every row.
is_nan = df_raw["産地名"].isna()
df = df_raw[is_nan].copy()
# Columns removed before modelling: origin/item identifiers and the
# day-over-day ratio columns for quantity and price.
del_columns = ["産地名", "産地コード", "品目名", "品目コード", "対前日比（数量）", "対前日比（価格）"]
df = df.drop(del_columns, axis=1).copy()


In [4]:
# Weekday handling: replace each weekday name by its position in `dow`
# (月 -> 0, 火 -> 1, ..., 日 -> 6).
dow = ["月", "火", "水", "木", "金", "土", "日"]
df["曜日"] = list(map(dow.index, df["曜日"]))


In [5]:
# Date handling: add "days", the number of days elapsed since the first row's date.
# The year/month/day columns are intentionally kept in df; they are used as
# calendar features when the training data is assembled further down.
date = pd.to_datetime(
    df[["年", "月", "日"]].rename(columns={"年": "year", "月": "month", "日": "day"})
)
# The reference date is taken by position (.iloc[0]) rather than by label 0,
# because df is a filtered frame and is not guaranteed to contain label 0.
# Working on the standalone `date` series also avoids adding and then dropping
# a temporary "date" column.
df["days"] = (date - date.iloc[0]).dt.days


In [6]:
# Anomalous day (noted by the author as the imperial enthronement):
# index label(s) of the row(s) whose price equals the maximum price.
max_price = df["価格"].max()
outlier = df.loc[df["価格"] == max_price].index
# The outlier is only located here; dropping it is left disabled:
# df = df.drop(outlier, axis=0).copy()

In [7]:
# 数量で重み付け
def ma_weighted(price, df):
    """Return the quantity-weighted mean price over one rolling window.

    Intended as the callback of ``Series.rolling(...).apply(..., raw=False)``.
    Only the index of ``price`` is used: it selects the matching rows of
    ``df``, from which both the prices and the weights are read.

    Args:
        price: Window of the price series as a pandas Series (``raw=False``),
            whose index labels exist in ``df``.
        df: Source DataFrame providing the "価格" (price) and "数量"
            (quantity) columns.

    Returns:
        ``sum(quantity * price) / sum(quantity)`` for the window. NaN is
        returned when the total quantity is zero (which includes an empty
        window), since the weighted mean is undefined in that case.
    """
    window_price = df.loc[price.index, "価格"]
    window_qty = df.loc[price.index, "数量"]

    # The total weight is computed once instead of once per element.
    total_qty = window_qty.sum()
    if total_qty == 0:
        return float("nan")

    # skipna=False: a missing price or quantity makes the whole window NaN
    # instead of being silently dropped from the numerator only.
    weighted_total = (window_price * window_qty).sum(skipna=False)
    return weighted_total / total_qty


In [8]:
# Quantity-weighted moving averages of the price over 3-, 5- and 7-row windows.
ma_w3, ma_w5, ma_w7 = (
    df["価格"].rolling(width).apply(ma_weighted, args=(df,), raw=False)
    for width in (3, 5, 7)
)


In [9]:
# Train/test split of the 5-row weighted moving average.
# 7 years go to train and 3 years to test (index label 9426 and above).
ma_w5 = ma_w5.dropna()
in_test = ma_w5.index >= 9426
df_train = ma_w5[~in_test].copy()
df_test = ma_w5[in_test].copy()


In [10]:
# Calendar features (year, month, day, weekday) for the rows of each split.
calendar_cols = ["年", "月", "日", "曜日"]
date_train_raw = df.loc[df_train.index, calendar_cols].copy()
date_test_raw = df.loc[df_test.index, calendar_cols].copy()

# 7 日分の予測

In [11]:
# Build the training/test datasets.
# data = [dataset for the 4-ahead forecast, 5-ahead, ..., 10-ahead];
# each entry is the tuple (X_train, X_test, y_train, y_test).

span = 10  # not referenced in this cell


def _windowed_xy(series, dates, horizon, window=7):
    """Turn a series into (features, target) pairs with a sliding window.

    Sample ``i`` uses the values at positions ``i .. i + window - 1`` as lag
    features (columns ``0 .. window - 1``), plus the calendar columns of
    ``dates`` at the last position of that window. Its target is the value
    ``horizon`` positions after the end of the window.
    """
    last = window - 1
    n_samples = series.shape[0] - horizon - last
    values = series.to_numpy()

    lagged = np.array([values[start:start + window] for start in range(n_samples)])
    # Copy so the target does not share memory with the source series.
    target = values[last + horizon:last + horizon + n_samples].copy()
    calendar = dates.iloc[last:last + n_samples].reset_index(drop=True)

    features = pd.concat(
        [pd.DataFrame(lagged, columns=np.arange(window)), calendar],
        axis=1,
    )
    return features, pd.Series(target)


data = []
for sp in range(4, 11):
    X_train, y_train = _windowed_xy(df_train, date_train_raw, sp)
    X_test, y_test = _windowed_xy(df_test, date_test_raw, sp)
    data.append((X_train, X_test, y_train, y_test))

In [12]:
# y_preds = [predictions of the 4-ahead model, 5-ahead model, ..., 10-ahead model]
y_preds = []
for X_train, X_test, y_train, y_test in data:
    # One LightGBM regressor is fitted per forecast horizon.
    model = lgb.LGBMRegressor(random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Root mean squared error on the test split.
    score = root_mean_squared_error(y_pred, y_test)
    print(score)

    # Mean absolute percentage error, printed as a fraction (not multiplied by 100).
    abs_pct_errors = abs((y_test - y_pred) / y_test)
    mape = sum(abs_pct_errors) / y_test.shape[0]
    print(mape)

    y_preds.append(y_pred)

152.87335371530077
0.04265687606131471
167.42563249986372
0.04898505031284628
178.33708145843627
0.056432169833098976
187.275739116027
0.06050149909514425
192.52051982586158
0.06181689322490505
198.87448514492291
0.06552261407165835
208.60556557978728
0.06955631357819869


## 最大値と予測した日が実際に最大だった割合（正解：移動平均）

In [111]:
# Trim every horizon's predictions to a common length so they can be stacked.
# Longer horizons yield fewer test samples, so the shortest prediction array
# sets the length (715 for the data used in this notebook). Deriving it from
# y_preds keeps the cell valid when the test period changes.
n_common = min((len(p) for p in y_preds), default=0)
y_preds_fix = [p[:n_common] for p in y_preds]


In [112]:
# Stack the per-horizon predictions and transpose: one row per test window,
# one column per forecast horizon.
y_preds_fix = np.transpose(np.array(y_preds_fix))

In [113]:
# For each test window, the column (horizon) with the highest predicted value.
y_preds_max = list(map(np.argmax, y_preds_fix))

In [114]:
# Actual values for the same 7 horizons: consecutive 7-element slices of the
# first (4-ahead) test target series, one slice per test window.
y_test_first = data[0][3]
y_tests_fix = np.array(
    [y_test_first.iloc[lo:lo + 7].values for lo in range(len(y_test_first) - 6)]
)

In [115]:
# For each test window, the column (horizon) with the highest actual value.
y_tests_max = list(map(np.argmax, y_tests_fix))

In [116]:
# Display both shapes to confirm predictions and actuals line up.
(y_preds_fix.shape, y_tests_fix.shape)

((715, 7), (715, 7))

In [120]:
# Share of test windows where the horizon predicted to be the maximum is also
# the horizon of the actual maximum (ground truth: moving average).
acc_roll = accuracy_score(y_true=y_tests_max, y_pred=y_preds_max)

## 元データとの比較

In [118]:
# Drop the first 4 rows so the raw price series has the same size as the moving average.
df_for_compare = df["価格"].iloc[4:]

in_c_test = df_for_compare.index >= 9426
df_c_train = df_for_compare[~in_c_test].copy()
df_c_test = df_for_compare[in_c_test].copy()

calendar_c_cols = ["年", "月", "日", "曜日"]
date_c_train_raw = df.loc[df_c_train.index, calendar_c_cols].copy()
date_c_test_raw = df.loc[df_c_test.index, calendar_c_cols].copy()


def _windowed_xy_c(series, dates, horizon, window=7):
    """Turn a series into (features, target) pairs with a sliding window.

    Sample ``i`` uses the values at positions ``i .. i + window - 1`` as lag
    features (columns ``0 .. window - 1``), plus the calendar columns of
    ``dates`` at the last position of that window. Its target is the value
    ``horizon`` positions after the end of the window.
    """
    last = window - 1
    n_samples = series.shape[0] - horizon - last
    values = series.to_numpy()

    lagged = np.array([values[start:start + window] for start in range(n_samples)])
    # Copy so the target does not share memory with the source series.
    target = values[last + horizon:last + horizon + n_samples].copy()
    calendar = dates.iloc[last:last + n_samples].reset_index(drop=True)

    features = pd.concat(
        [pd.DataFrame(lagged, columns=np.arange(window)), calendar],
        axis=1,
    )
    return features, pd.Series(target)


# data_c mirrors the moving-average datasets, built from the raw prices:
# one (X_c_train, X_c_test, y_c_train, y_c_test) tuple per horizon 4..10.
data_c = []
for sp in range(4, 11):
    X_c_train, y_c_train = _windowed_xy_c(df_c_train, date_c_train_raw, sp)
    X_c_test, y_c_test = _windowed_xy_c(df_c_test, date_c_test_raw, sp)
    data_c.append((X_c_train, X_c_test, y_c_train, y_c_test))

# Actual raw prices for the 7 horizons of each test window, and the horizon
# at which each window's raw price is highest.
y_c_test_first = data_c[0][3]
y_c_tests_fix = np.array(
    [y_c_test_first.iloc[lo:lo + 7].values for lo in range(len(y_c_test_first) - 6)]
)
y_c_tests_max = list(map(np.argmax, y_c_tests_fix))

In [121]:
# Same hit rate as acc_roll, but with the raw (unsmoothed) prices as ground truth.
acc = accuracy_score(y_true=y_c_tests_max, y_pred=y_preds_max)

In [122]:
# Report both hit rates and the gap between them.
for label, value in (
    ("acc roll: ", acc_roll),
    ("acc: ", acc),
    ("diff: ", acc_roll - acc),
):
    print(label, value)

acc roll:  0.4881118881118881
acc:  0.3118881118881119
diff:  0.17622377622377622


約 18 ポイント（0.488 → 0.312）の精度低下

→ 移動平均なので大幅な損失にはつながらない見込み