# Part3 実践例 Chap.8 回帰問題のコンペ

## 8.1 MLB Player Digital Engagement Forecasting コンペの概要

## 8.2 分析のステップ

- 1 ベースライン作成
  - モデル: LightGBM
  - バリデーション設計: 時系列を加味したtrain/validの分割
  - 評価指数: MCMAE
- 2 特徴量エンジニアリング
  - ほかのカラム・テーブルも活用して特徴量を生成
  - ラグ特徴量
- 3 モデルチューニング
  - ニューラルネットワークの適用


- MCMAE: Mean Columns-wise Mean Absolute Error)
- MAE: Mean Absolute Error

## 8.3 ベースライン作成

### 8.3.1 分析設計

### 8.3.2 データ前処理

In [1]:
# ライブラリのインポート
import numpy as np
import pandas as pd
import gc
import pickle
import os
import datetime as dt

# plot
import matplotlib.pyplot as plt

# LightGBM
import lightgbm as lgb

from sklearn.metrics import mean_absolute_error

import warnings

warnings.simplefilter("ignore")

# 表示桁数の指定
pd.options.display.float_format = "{:10.4f}".format
pd.set_option("display.max_columns", None)

#### train_updated.csvの読み込みと加工

In [2]:
# train_updated.csvファイルの読み込み
train = pd.read_csv("./mlb-player-digital-engagement-forecasting/train_updated.csv")
print(train.shape)
train.head()

(1308, 12)


Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20180101,"[{""engagementMetricsDate"":""2018-01-02"",""player...",,"[{""playerId"":400121,""gameDate"":""2018-01-01"",""t...",,,"[{""transactionId"":340732,""playerId"":547348,""pl...",,,,"[{""date"":""2018-01-01"",""playerId"":545361,""playe...","[{""date"":""2018-01-01"",""teamId"":147,""teamName"":..."
1,20180102,"[{""engagementMetricsDate"":""2018-01-03"",""player...",,"[{""playerId"":134181,""gameDate"":""2018-01-02"",""t...",,,"[{""transactionId"":339458,""playerId"":621173,""pl...",,,,,
2,20180103,"[{""engagementMetricsDate"":""2018-01-04"",""player...",,"[{""playerId"":425492,""gameDate"":""2018-01-03"",""t...",,,"[{""transactionId"":347527,""playerId"":572389,""pl...",,,,,
3,20180104,"[{""engagementMetricsDate"":""2018-01-05"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-04"",""t...",,,"[{""transactionId"":339549,""playerId"":545343,""pl...",,,,,
4,20180105,"[{""engagementMetricsDate"":""2018-01-06"",""player...",,"[{""playerId"":282332,""gameDate"":""2018-01-05"",""t...",,,"[{""transactionId"":341195,""playerId"":628336,""pl...",,,,,


In [3]:
# 処理速度を上げるためにデータを絞り込む
# train: 2020-04~2021-03
# test: 2021-04
train = train.loc[train["date"] >= 20200401, :].reset_index(drop=True)
print(train.shape)

(487, 12)


In [4]:
# json形式のカラムを表形式に変換する
def unpack_json(json_str):
    return np.nan if pd.isna(json_str) else pd.read_json(json_str)


def extract_data(input_df, col="events", show=False):
    output_df = pd.DataFrame()
    for i in np.arange(len(input_df)):
        if show:
            print("\r{}/{}".format(i + 1, len(input_df)), end="")
        try:
            output_df = pd.concat([output_df, unpack_json(input_df[col].iloc[i])], axis=0, ignore_index=True)
        except:
            pass
    if show:
        print("")
    if show:
        print(output_df.shape)
    if show:
        display(output_df.head())
    return output_df

In [5]:
# train_updated.csvから「nextDayPlayerEngagement」を取り出して表形式に変換
df_engagement = extract_data(train, col="nextDayPlayerEngagement", show=True)
# 20s

487/487
(1003707, 6)


Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967


In [6]:
# 結合キーであるdate_playerIdの作成
df_engagement["date_playerId"] = (
    df_engagement["engagementMetricsDate"].str.replace("-", "") + "_" + df_engagement["playerId"].astype(str)
)
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193


In [7]:
# 推論実施日のカラム作成（推論実施日＝推論対象日の前日）
df_engagement["date"] = pd.to_datetime(df_engagement["engagementMetricsDate"], format="%Y-%m-%d") + dt.timedelta(
    days=-1
)

# 推論実施日から「曜日」と「年月」の特徴量作成
df_engagement["dayofweek"] = df_engagement["date"].dt.dayofweek
df_engagement["yearmonth"] = df_engagement["date"].astype(str).apply(lambda x: x[:7])
df_engagement.head()

Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId,date,dayofweek,yearmonth
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794,2020-04-01,2,2020-04
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704,2020-04-01,2,2020-04
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702,2020-04-01,2,2020-04
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231,2020-04-01,2,2020-04
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193,2020-04-01,2,2020-04


#### player.csvの読み込みと加工

In [8]:
# player.csvの読み込みと加工
df_players = pd.read_csv("./mlb-player-digital-engagement-forecasting/players.csv")
print(df_players.shape)
print(df_players["playerId"].aggregate("nunique"))
df_players.head()

(2061, 12)
2061


Unnamed: 0,playerId,playerName,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,665482,Gilberto Celestino,1999-02-13,2021-06-02,Santo Domingo,,Dominican Republic,72,170,8,Outfielder,False
1,593590,Webster Rivas,1990-08-08,2021-05-28,Nagua,,Dominican Republic,73,219,3,First Base,True
2,661269,Vladimir Gutierrez,1995-09-18,2021-05-28,Havana,,Cuba,73,190,1,Pitcher,True
3,669212,Eli Morgan,1996-05-13,2021-05-28,Rancho Palos Verdes,CA,USA,70,190,1,Pitcher,True
4,666201,Alek Manoah,1998-01-09,2021-05-27,Homestead,FL,USA,78,260,1,Pitcher,True


In [18]:
# 評価対象の人数確認
# True == 1
# df_players["playerForTestSetAndFuturePreds"] = np.where(df_players["playerForTestSetAndFuturePreds"] == True, 1, 0)
df_players["playerForTestSetAndFuturePreds"] = np.where(df_players["playerForTestSetAndFuturePreds"] == 1, 1, 0)
print(df_players["playerForTestSetAndFuturePreds"].sum())
print(df_players["playerForTestSetAndFuturePreds"].mean())

1187
0.5759340126152354


### 8.3.3 データセット作成

In [19]:
# テーブルの結合
df_train = pd.merge(df_engagement, df_players, on=["playerId"], how="left")
print(df_train.shape)
display(df_train.head())

(1003707, 21)


Unnamed: 0,engagementMetricsDate,playerId,target1,target2,target3,target4,date_playerId,date,dayofweek,yearmonth,playerName,DOB,mlbDebutDate,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,2020-04-02,425794,5.1249,9.434,0.1179,6.1947,20200402_425794,2020-04-01,2,2020-04,Adam Wainwright,1981-08-30,2005-09-11,Brunswick,GA,USA,79,230,1,Pitcher,1
1,2020-04-02,571704,0.0389,8.1761,0.0105,2.1304,20200402_571704,2020-04-01,2,2020-04,Ken Giles,1990-09-20,2014-06-12,Albuquerque,NM,USA,75,210,1,Pitcher,0
2,2020-04-02,506702,0.0106,5.0314,0.0082,0.885,20200402_506702,2020-04-01,2,2020-04,Sandy Leon,1989-03-13,2012-05-14,Maracaibo,,Venezuela,70,235,2,Catcher,1
3,2020-04-02,607231,0.0247,2.8302,0.0222,0.59,20200402_607231,2020-04-01,2,2020-04,John Gant,1992-08-06,2016-04-06,Savannah,GA,USA,76,200,1,Pitcher,1
4,2020-04-02,543193,0.0071,1.1006,0.0012,0.1967,20200402_543193,2020-04-01,2,2020-04,Drew Gagnon,1990-06-26,2018-07-10,Columbia,CA,USA,76,215,1,Pitcher,0


In [20]:
# 学習用データセットの作成
x_train = df_train[
    [
        "playerId",
        "dayofweek",
        "birthCity",
        "birthStateProvince",
        "birthCountry",
        "heightInches",
        "weight",
        "primaryPositionCode",
        "primaryPositionName",
        "playerForTestSetAndFuturePreds",
    ]
]
y_train = df_train[["target1", "target2", "target3", "target4"]]
id_train = df_train[
    ["engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth", "playerForTestSetAndFuturePreds"]
]
print(x_train.shape, y_train.shape, id_train.shape)
x_train.head()

(1003707, 10) (1003707, 4) (1003707, 6)


Unnamed: 0,playerId,dayofweek,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,425794,2,Brunswick,GA,USA,79,230,1,Pitcher,1
1,571704,2,Albuquerque,NM,USA,75,210,1,Pitcher,0
2,506702,2,Maracaibo,,Venezuela,70,235,2,Catcher,1
3,607231,2,Savannah,GA,USA,76,200,1,Pitcher,1
4,543193,2,Columbia,CA,USA,76,215,1,Pitcher,0


In [12]:
y_train.head()

Unnamed: 0,target1,target2,target3,target4
0,5.1249,9.434,0.1179,6.1947
1,0.0389,8.1761,0.0105,2.1304
2,0.0106,5.0314,0.0082,0.885
3,0.0247,2.8302,0.0222,0.59
4,0.0071,1.1006,0.0012,0.1967


In [13]:
id_train.head()

Unnamed: 0,engagementMetricsDate,playerId,date_playerId,date,yearmonth,playerForTestSetAndFuturePreds
0,2020-04-02,425794,20200402_425794,2020-04-01,2020-04,1
1,2020-04-02,571704,20200402_571704,2020-04-01,2020-04,0
2,2020-04-02,506702,20200402_506702,2020-04-01,2020-04,1
3,2020-04-02,607231,20200402_607231,2020-04-01,2020-04,1
4,2020-04-02,543193,20200402_543193,2020-04-01,2020-04,0


In [21]:
# カテゴリ変数をcategory方に変換
for col in [
    "playerId",
    "dayofweek",
    "birthCity",
    "birthStateProvince",
    "birthCountry",
    "primaryPositionCode",
    "primaryPositionName",
]:
    x_train[col] = x_train[col].astype("category")

print(x_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003707 entries, 0 to 1003706
Data columns (total 10 columns):
 #   Column                          Non-Null Count    Dtype   
---  ------                          --------------    -----   
 0   playerId                        1003707 non-null  category
 1   dayofweek                       1003707 non-null  category
 2   birthCity                       1003707 non-null  category
 3   birthStateProvince              738292 non-null   category
 4   birthCountry                    1003707 non-null  category
 5   heightInches                    1003707 non-null  int64   
 6   weight                          1003707 non-null  int64   
 7   primaryPositionCode             1003707 non-null  category
 8   primaryPositionName             1003707 non-null  category
 9   playerForTestSetAndFuturePreds  1003707 non-null  int64   
dtypes: category(7), int64(3)
memory usage: 31.7 MB
None


### 8.3.4 バリデーション設計 P.287

- デジタルエンゲージメント（目的変数）: target1, target2, target3, target4
- twitter情報: playerTwitterFollowers, teemTwitterFollowers

In [22]:
# 学習データと検証データの期間を設定
list_cv_month = [
    [
        ["2020-05", "2020-06", "2020-07", "2020-08", "2020-09", "2020-10", "2020-11", "2020-12", "2021-01", "2021-02", "2021-03", "2021-04",],
        ["2021-05"],
    ],
    [
        ["2020-06", "2020-07", "2020-08", "2020-09", "2020-10", "2020-11", "2020-12", "2021-01", "2021-02", "2021-03", "2021-04", "2021-05",],
        ["2021-06"],
    ],
    [
        ["2020-07", "2020-08", "2020-09", "2020-10", "2020-11", "2020-12", "2021-01", "2021-02", "2021-03", "2021-04", "2021-05", "2021-06",],
        ["2021-07"],
    ],
]

In [23]:
# 学習データと検証データの index リストを作成
cv = list()
for month_tr, month_va in list_cv_month:
    cv.append(
        [
            id_train.index[id_train["yearmonth"].isin(month_tr)],
            id_train.index[id_train["yearmonth"].isin(month_va) & (id_train["playerForTestSetAndFuturePreds"] == 1)],
        ]
    )
# fold0のindexのリスト
cv[0]

[Index([ 61830,  61831,  61832,  61833,  61834,  61835,  61836,  61837,  61838,
         61839,
        ...
        814085, 814086, 814087, 814088, 814089, 814090, 814091, 814092, 814093,
        814094],
       dtype='int64', length=752265),
 Index([814095, 814096, 814100, 814101, 814102, 814104, 814105, 814106, 814107,
        814109,
        ...
        877931, 877934, 877950, 877951, 877957, 877958, 877969, 877972, 877974,
        877975],
       dtype='int64', length=36797)]

In [32]:
len(cv[0][0])

752265

In [33]:
len(cv[0][1])

36797

### 8.3.5 モデル学習

1-A targetごとの処理（target1~4）
    1-B foldごとの処理（fold0~2）
        (1)学習データと検証データに分離
        (2)モデル学習
        (3)モデル評価
        (4)説明変数の重要度取得
2 モデル評価（全target/foldのサマリ）
3 推論値の取得（全target/foldのサマリ）
4 説明変数の重要度取得（全target/foldのサマリ）

#### `target1`の`fold1`の学習の流れ

1. 学習データと検証データに分離
2. モデル学習
3. モデル評価
4. 説明変数の重要度取得

#### (1)学習データと検証データに分離

In [24]:
# (1)学習データと検証データに分離

# 目的変数は「target1」で，foldは「fold0」の場合とする
target = "target1"
nfold = 0

# trainとvalidのindex取得
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

# trainデータとvalidデータに分離
x_tr, y_tr, id_tr = x_train.loc[idx_tr, :], y_train.loc[idx_tr, target], id_train.loc[idx_tr, :]
x_va, y_va, id_va = x_train.loc[idx_va, :], y_train.loc[idx_va, target], id_train.loc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

(752265, 10) (752265,) (752265, 6)
(36797, 10) (36797,) (36797, 6)


#### (2)モデル学習

In [25]:
# (2)モデル学習
# ハイパーパラメータの設定
params = {
    "boosting_type": "gbdt",
    "objective": "regression_l1",
    "metric": "mean_absolute_error",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "subsample": 0.7,
    "subsample_freq": 1,
    "feature_fraction": 0.8,
    "min_data_in_leaf": 50,
    "min_sum_hessian_in_leaf": 50,
    "n_estimators": 1000,
    "seed": 123,
    "importance_type": "gain",
}

# モデルの学習
model = lgb.LGBMRegressor(**params)
model.fit(
    x_tr,
    y_tr,
    eval_set=[(x_tr, y_tr), (x_va, y_va)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100, verbose=True),
        lgb.log_evaluation(100),
    ],
)

# モデルの保存
with open("model_lgb_target1_fold0.h5", "wb") as f:
    pickle.dump(model, f, protocol=4)
# 11s

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006705 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3300
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 10
[LightGBM] [Info] Start training from score 0.001289
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.50831	valid_1's l1: 1.29786
[200]	training's l1: 0.508183	valid_1's l1: 1.29768
[300]	training's l1: 0.508143	valid_1's l1: 1.29767
Early stopping, best iteration is:
[258]	training's l1: 0.508161	valid_1's l1: 1.29766


#### (3)モデル評価

In [26]:
# (3)モデル評価
# validデータの推論値取得
y_va_pred = model.predict(x_va)

# 全target/foldの推論値を格納する変数の作成
df_valid_pred = pd.DataFrame()

# 推論値を格納
tmp_pred = pd.concat(
    [
        id_va,
        pd.DataFrame({"target": target, "nfold": 0, "true": y_va, "pred": y_va_pred}),
    ],
    axis=1,
)
df_valid_pred = pd.concat([df_valid_pred, tmp_pred], axis=0, ignore_index=True)

# 全target/foldの評価値を入れる変数の作成
metrics = []

# 評価値の算出
metric_va = mean_absolute_error(y_va, y_va_pred)
# 評価値を格納
metrics.append([target, nfold, metric_va])
metrics



[['target1', 0, 1.2976578174338422]]

#### (4)説明変数の重要度取得

In [27]:
# (4)説明変数の重要度取得
# 重要度の取得
tmp_imp = pd.DataFrame({
    "col": x_tr.columns,
    "imp": model.feature_importances_,
    "target": "target1",
    "nfold": nfold
})
# 確認（重要度の上位10個）
display(tmp_imp.sort_values("imp", ascending=False))

# 全target/foldの重要度を格納するデータフレームの作成
df_imp = pd.DataFrame()
# imp_foldをdf_impに結合
df_imp = pd.concat([df_imp, tmp_imp], axis=0, ignore_index=True)

Unnamed: 0,col,imp,target,nfold
0,playerId,13595482.8115,target1,0
9,playerForTestSetAndFuturePreds,2314285.0327,target1,0
2,birthCity,2249420.1773,target1,0
7,primaryPositionCode,523633.5634,target1,0
8,primaryPositionName,91211.0063,target1,0
1,dayofweek,89016.5762,target1,0
3,birthStateProvince,35673.0473,target1,0
6,weight,30337.572,target1,0
5,heightInches,20493.2084,target1,0
4,birthCountry,4882.033,target1,0


#### 2 モデル評価（全target/foldのサマリ）

In [19]:
metrics

[['target1', 0, 1.2976578174338422]]

In [28]:
# 2 モデル評価（全target/foldのサマリ）
# リスト型をデータフレームに変換
df_metrics = pd.DataFrame(metrics, columns=["target", "nfold", "mae"])
display(df_metrics.head())

# 評価値
print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

display(pd.pivot_table(df_metrics, index="nfold", columns="target", values="mae", aggfunc=np.mean, margins=True))

Unnamed: 0,target,nfold,mae
0,target1,0,1.2977


MCMAE: 1.2977


target,target1,All
nfold,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1.2977,1.2977
All,1.2977,1.2977


#### 3 推論値の取得（全target/foldのサマリ）

In [29]:
# 3 推論値の取得（全target/foldのサマリ）
# 検証データの推論値の形式変換
df_valid_pred_all = pd.pivot_table(
    df_valid_pred,
    index=["engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth", "playerForTestSetAndFuturePreds"],
    columns=["target", "nfold"],
    values=["true", "pred"],
    aggfunc=np.sum,
)
df_valid_pred_all.columns = ["{}_fold{}_{}".format(j, k, i) for i, j, k in df_valid_pred_all.columns]
df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)
df_valid_pred_all.head()

Unnamed: 0,engagementMetricsDate,playerId,date_playerId,date,yearmonth,playerForTestSetAndFuturePreds,target1_fold0_pred,target1_fold0_true
0,2021-05-02,405395,20210502_405395,2021-05-01,2021-05,1,0.6049,0.1518
1,2021-05-02,408234,20210502_408234,2021-05-01,2021-05,1,0.3317,0.2365
2,2021-05-02,424144,20210502_424144,2021-05-01,2021-05,1,0.002,0.0016
3,2021-05-02,425772,20210502_425772,2021-05-01,2021-05,1,0.0065,0.0035
4,2021-05-02,425784,20210502_425784,2021-05-01,2021-05,1,0.0008,0.0001


#### 4 説明変数の重要度取得（全target/foldのサマリ）

In [30]:
# 4 説明変数の重要度取得（全target/foldのサマリ）
df_imp.groupby(["col"])["imp"].aggregate(["mean", "std"]).sort_values("mean", ascending=False)

Unnamed: 0_level_0,mean,std
col,Unnamed: 1_level_1,Unnamed: 2_level_1
playerId,13595482.8115,
playerForTestSetAndFuturePreds,2314285.0327,
birthCity,2249420.1773,
primaryPositionCode,523633.5634,
primaryPositionName,91211.0063,
dayofweek,89016.5762,
birthStateProvince,35673.0473,
weight,30337.572,
heightInches,20493.2084,
birthCountry,4882.033,


#### 学習用関数の作成

In [31]:

# 学習用関数の作成
"""
1-A targetごとの処理（target1~4）
    1-B foldごとの処理（fold0~2）
        (1)学習データと検証データに分離
        (2)モデル学習
        (3)モデル評価
        (4)説明変数の重要度取得
2 モデル評価（全target/foldのサマリ）
3 推論値の取得（全target/foldのサマリ）
4 説明変数の重要度取得（全target/foldのサマリ）
"""
def train_lgb(
    input_x,
    input_y,
    input_id,
    params,
    list_nfold=[0, 1, 2],
    model_train="train",
):
    # 推論値を格納するデータフレームの作成
    df_valid_pred = pd.DataFrame()
    # 評価値を格納するリストの作成
    metrics = list()
    # 重要度を格納するデータフレームの作成
    df_imp = pd.DataFrame()

    # validation
    cv = list()
    for month_tr, month_va in list_cv_month:
        cv.append([
            input_id.index[input_id["yearmonth"].isin(month_tr)],
            input_id.index[input_id["yearmonth"].isin(month_va) & (input_id["playerForTestSetAndFuturePreds"] == 1)],
        ])

    # モデル学習（target/foldごとに学習）
    for nfold in list_nfold:
        for i, target in enumerate(["target1", "target2", "target3", "target4"]):
            print("-"*20, target, ", fold: ", nfold, "-"*20)
            # train と valid に分離
            idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
            x_tr, y_tr, id_tr = input_x.loc[idx_tr, :], input_y.loc[idx_tr, target], input_id.loc[idx_tr, :]
            x_va, y_va, id_va = input_x.loc[idx_va, :], input_y.loc[idx_va, target], input_id.loc[idx_va, :]
            print("x_va, y_va, id_va")
            print(x_tr.shape, y_tr.shape, id_tr.shape)
            print(x_va.shape, y_va.shape, id_va.shape)

            # 保存するモデルのファイル名
            filepath = f"model_lgb_{target}_fold{nfold}.h5"

            # モデル学習
            if model_train == "train":
                print("training start.")
                model = lgb.LGBMRegressor(**params)
                model.fit(
                    x_tr,
                    y_tr,
                    eval_set=[(x_tr, y_tr), (x_va, y_va)],
                    callbacks=[
                        lgb.early_stopping(stopping_rounds=100, verbose=True),
                        lgb.log_evaluation(100),
                    ],
                )
                # 学習モデルの保存
                with open(filepath, "wb") as f:
                    pickle.dump(model, f, protocol=4)
            else:
                print("model load.")
                with open(filepath, "rb") as f:
                    model = pickle.load(f)
                print("Done.")

            # validの推論値取得
            y_va_pred = model.predict(x_va)
            tmp_pred = pd.concat([
                id_va,
                pd.DataFrame({
                    "target": target,
                    "nfold": nfold,
                    "true": y_va,
                    "pred": y_va_pred,
                })
            ], axis=1)
            df_valid_pred = pd.concat([df_valid_pred, tmp_pred], axis=0, ignore_index=True)

            # 評価値の算出
            metric_va = mean_absolute_error(y_va, y_va_pred)
            metrics.append([target, nfold, metric_va])

            # 重要度の取得
            tmp_imp = pd.DataFrame({
                "col": x_tr.columns,
                "imp": model.feature_importances_,
                "target": target,
                "nfold": nfold
            })
            df_imp = pd.concat([df_imp, tmp_imp], axis=0, ignore_index=True)

    print("-"*10, "result", "-"*10)
    # 評価値の取得 MCMAE
    df_metrics = pd.DataFrame(metrics, columns=["target", "nfold", "mae"])
    print(f"MCMAE:  {df_metrics["mae"].mean():.4f}")

    # validの推論値
    df_valid_pred_all = pd.pivot_table(
        df_valid_pred,
        index=["engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth", "playerForTestSetAndFuturePreds"],
        columns=["target", "nfold"],
        values=["true", "pred"],
        aggfunc=np.sum
    )
    df_valid_pred_all.columns = [
        "{}_fold{}_{}".format(j, k, i) for i, j, k in df_valid_pred_all.columns
    ]
    df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)

    return df_valid_pred_all, df_metrics, df_imp

#### モデル学習の実行

In [32]:
# モデル学習の実行
params = {
    "boosting_type": "gbdt",
    "objective": "regression_l1",
    "metric": "mean_absolute_error",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "subsample": 0.7,
    "subsample_freq": 1,
    "feature_fraction": 0.8,
    "min_data_in_leaf": 50,
    "min_sum_hessian_in_leaf": 50,
    "n_estimators": 1000,
    "seed": 123,
    "importance_type": "gain",
}

df_valid_pred, df_metrics, df_imp = train_lgb(
    input_x=x_train,
    input_y=y_train,
    input_id=id_train,
    params=params,
    list_nfold=[0, 1, 2],
    model_train="train"
)
#2m30s

-------------------- target1 , fold:  0 --------------------
x_va, y_va, id_va
(752265, 10) (752265,) (752265, 6)
(36797, 10) (36797,) (36797, 6)
training start.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003966 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3300
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 10
[LightGBM] [Info] Start training from score 0.001289
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.50831	valid_1's l1: 1.29786
[200]	training's l1: 0.508183	valid_1's l1: 1.29768
[300]	training's l1: 0.508143	valid_1's l1: 1.29767
Early stopping, best iteration is:
[258]	training's l1: 0.508161	valid_1's l1: 1.29766
-------------------- target2 , fold:  0 --------------------
x_va, y_va, id_va
(752265, 10) (752265,) (752265, 6)
(36797, 10) (

#### 評価値（MCMAE）の確認

In [33]:
# 評価値（MCMAE）の確認
print(f"MCMAE: {df_metrics["mae"].mean():.4f}")
display(pd.pivot_table(df_metrics, index="nfold", columns="target", values="mae", aggfunc=np.mean, margins=True))

MCMAE: 1.3504


target,target1,target2,target3,target4,All
nfold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.2977,2.4447,0.878,1.2451,1.4664
1,1.1953,2.1539,0.8317,1.6406,1.4554
2,1.1133,1.7903,0.7606,0.8542,1.1296
All,1.2021,2.1297,0.8234,1.2466,1.3504


#### 説明変数の重要度の確認

In [34]:
# 説明変数の重要度の確認
df_imp.groupby(["col"])["imp"].aggregate(["mean", "std"]).sort_values("mean", ascending=False)

Unnamed: 0_level_0,mean,std
col,Unnamed: 1_level_1,Unnamed: 2_level_1
playerId,4976980.5395,6102371.1341
playerForTestSetAndFuturePreds,1115074.5464,1091298.8384
birthCity,741457.7171,1058010.6632
primaryPositionCode,110792.2785,167555.0234
dayofweek,78879.047,140714.7657
primaryPositionName,33697.431,38239.7619
weight,20633.3897,31551.4583
heightInches,19410.1825,34119.9997
birthStateProvince,7714.1863,13322.8164
birthCountry,2933.4399,5114.668


### 8.3.6 モデル推論

#### Code Competition の推論スクリプトの構成と仕組み

- 推論データセットの作成
- モデル推論
- 提出用フォーマットへの変換

##### Part1: スロン用データセットの作成

In [None]:
# 推論時に受け取るデータのフォーマット確認(1)（サブミット時にはコメントアウト）
# import mlb

# env = mlb.make_env()
# iter_test = env.iter_test()
# for (test_df, prediction_df) in iter_test:
#     # forループで受け取るデータの確認
#     display(test_df.head())
#     display(prediction_df.head())
#     break

In [35]:
# 推論時に受け取るデータのフォーマット確認(2)（サブミット時にはコメントアウト）

# forループで受け取るtest_dfのデータサンプル
test_df = pd.read_csv("./mlb-player-digital-engagement-forecasting/example_test.csv")
display(test_df.head())

# forループで受け取るprediction_dfのサンプルデータ
prediction_df = pd.read_csv("./mlb-player-digital-engagement-forecasting/example_sample_submission.csv")
display(prediction_df.head())

Unnamed: 0,date,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
0,20210426,"[{""gamePk"":634374,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-26"",""t...","[{""home"":1,""gamePk"":634377,""gameDate"":""2021-04...","[{""home"":1,""teamId"":139,""gamePk"":634343,""gameD...","[{""transactionId"":480386,""playerId"":543685,""pl...","[{""season"":2021,""gameDate"":""2021-04-26"",""divis...",,"[{""gamePk"":634433,""gameDate"":""2021-04-26"",""gam...",,
1,20210427,"[{""gamePk"":634318,""gameType"":""R"",""season"":2021...","[{""playerId"":443558,""gameDate"":""2021-04-27"",""t...","[{""home"":1,""gamePk"":634320,""gameDate"":""2021-04...","[{""home"":1,""teamId"":117,""gamePk"":634333,""gameD...","[{""transactionId"":480456,""playerId"":642162,""pl...","[{""season"":2021,""gameDate"":""2021-04-27"",""divis...",,"[{""gamePk"":634332,""gameDate"":""2021-04-27"",""gam...",,
2,20210428,"[{""gamePk"":634309,""gameType"":""R"",""season"":2021...","[{""playerId"":429722,""gameDate"":""2021-04-28"",""t...","[{""home"":1,""gamePk"":634310,""gameDate"":""2021-04...","[{""home"":0,""teamId"":111,""gamePk"":634310,""gameD...","[{""transactionId"":480728,""playerId"":545358,""pl...","[{""season"":2021,""gameDate"":""2021-04-28"",""divis...",,"[{""gamePk"":634317,""gameDate"":""2021-04-28"",""gam...",,
3,20210429,"[{""gamePk"":634330,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-29"",""t...","[{""home"":1,""gamePk"":634330,""gameDate"":""2021-04...","[{""home"":0,""teamId"":119,""gamePk"":634346,""gameD...","[{""transactionId"":480993,""playerId"":606965,""pl...","[{""season"":2021,""gameDate"":""2021-04-29"",""divis...",,"[{""gamePk"":634346,""gameDate"":""2021-04-29"",""gam...",,
4,20210430,"[{""gamePk"":634287,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-30"",""t...","[{""home"":1,""gamePk"":634305,""gameDate"":""2021-04...","[{""home"":1,""teamId"":135,""gamePk"":634303,""gameD...",,"[{""season"":2021,""gameDate"":""2021-04-30"",""divis...","[{""awardId"":""NLRRELMON"",""awardName"":""NL Reliev...","[{""gamePk"":634327,""gameDate"":""2021-04-30"",""gam...",,


Unnamed: 0,date,date_playerId,target1,target2,target3,target4
0,20210426,20210427_656669,0,0,0,0
1,20210426,20210427_543475,0,0,0,0
2,20210426,20210427_592866,0,0,0,0
3,20210426,20210427_452678,0,0,0,0
4,20210426,20210427_570257,0,0,0,0


In [36]:
# 推論時に受け取るデータの疑似生成（2021/4/26分）

# test_df の疑似生成（4/26に受け取るデータを想定）
test_df = train.loc[train["date"] == 20210426, :]
display(test_df.head())

# prediction_df の疑似生成（4/26に受け取るデータを想定）
prediction_df = df_engagement.loc[df_engagement["date"] == "2021-04-26", ["date", "date_playerId"]].reset_index(drop=True)
prediction_df["date"] = prediction_df["date"].apply(lambda x: int(str(x).replace("-", "")[:8]))
for col in ["target1", "target2", "target3", "target4"]:
    prediction_df[col] = 0
display(prediction_df.head())

Unnamed: 0,date,nextDayPlayerEngagement,games,rosters,playerBoxScores,teamBoxScores,transactions,standings,awards,events,playerTwitterFollowers,teamTwitterFollowers
390,20210426,"[{""engagementMetricsDate"":""2021-04-27"",""player...","[{""gamePk"":634374,""gameType"":""R"",""season"":2021...","[{""playerId"":405395,""gameDate"":""2021-04-26"",""t...","[{""home"":1,""gamePk"":634377,""gameDate"":""2021-04...","[{""home"":1,""teamId"":139,""gamePk"":634343,""gameD...","[{""transactionId"":480386,""playerId"":543685,""pl...","[{""season"":2021,""gameDate"":""2021-04-26"",""divis...",,"[{""gamePk"":634433,""gameDate"":""2021-04-26"",""gam...",,


Unnamed: 0,date,date_playerId,target1,target2,target3,target4
0,20210426,20210427_656669,0,0,0,0
1,20210426,20210427_543475,0,0,0,0
2,20210426,20210427_623465,0,0,0,0
3,20210426,20210427_595032,0,0,0,0
4,20210426,20210427_592866,0,0,0,0


In [37]:
# 推論用データセット作成の関数

def make_dataset_for_prediction(input_test, input_prediction):
    test = input_test.copy()
    prediction = input_prediction.copy()

    # dateを日付型に変換
    prediction["date"] = pd.to_datetime(prediction["date"], format="%Y%m%d")
    # 推論対象日（engagementMetricsDate）と選手ID（playerId）のカラムを作成
    prediction["engagementMetricsDate"] = prediction["date_playerId"].apply(lambda x: x[:8])
    prediction["engagementMetricsDate"] = pd.to_datetime(prediction["engagementMetricsDate"], format="%Y%m%d")
    prediction["playerId"] = prediction["date_playerId"].apply(lambda x: int(x[9:]))

    # 日付から曜日と年月を作成
    prediction["dayofweek"] = prediction["date"].dt.dayofweek
    prediction["yearmonth"] = prediction["date"].astype(str).apply(lambda x: x[:7])

    # テーブルの結合
    df_test = pd.merge(prediction, df_players, on=["playerId"], how="left")

    # 説明変数の作成
    x_test = df_test[[
        "playerId",
        "dayofweek",
        "birthCity",
        "birthStateProvince",
        "birthCountry",
        "heightInches",
        "weight",
        "primaryPositionCode",
        "primaryPositionName",
        "playerForTestSetAndFuturePreds",
    ]]
    id_test = df_test[[
        "engagementMetricsDate",
        "playerId",
        "date_playerId",
        "date",
        "yearmonth",
        "playerForTestSetAndFuturePreds"
    ]]

    # カテゴリ変数をcategory型に変換
    for col in [
        "playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry", "primaryPositionCode", "primaryPositionName"
    ]:
        x_test[col] = x_test[col].astype("category")

    return x_test, id_test

In [38]:
# 推論用データセット作成の実行
x_test, id_test = make_dataset_for_prediction(test_df, prediction_df)
display(x_test.head())
display(id_test.head())

Unnamed: 0,playerId,dayofweek,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds
0,656669,0,Visalia,CA,USA,73,195,8,Outfielder,1
1,543475,0,Hartsville,SC,USA,77,230,1,Pitcher,1
2,623465,0,Salisbury,MD,USA,74,215,1,Pitcher,0
3,595032,0,Ranburne,AL,USA,76,220,1,Pitcher,0
4,592866,0,San Diego,CA,USA,75,235,1,Pitcher,1


Unnamed: 0,engagementMetricsDate,playerId,date_playerId,date,yearmonth,playerForTestSetAndFuturePreds
0,2021-04-27,656669,20210427_656669,2021-04-26,2021-04,1
1,2021-04-27,543475,20210427_543475,2021-04-26,2021-04,1
2,2021-04-27,623465,20210427_623465,2021-04-26,2021-04,0
3,2021-04-27,595032,20210427_595032,2021-04-26,2021-04,0
4,2021-04-27,592866,20210427_592866,2021-04-26,2021-04,1


##### Part2: モデル推論

- 1-A targetごとの処理（target1, 2, 3, 4）
  - 1-B foldごとの処理（fold0, fold1, fold2）
    - (1) モデルの読み込み
    - (2) モデルを用いた推論
- 2 推論値の取得（全foldのサマリ）

- (1) モデルの読み込み

In [39]:
# (1) モデルの読み込み
with open("model_lgb_target1_fold0.h5", "rb") as f:
    model = pickle.load(f)

- (2) モデルを用いた推論

In [40]:
# (2) モデルを用いた推論
pred = model.predict(x_test)

df_test_pred = id_test.copy()
df_test_pred["target1_fold0"] = pred



In [41]:
# 推論値の計算
# target1の推論値: 各foldの平均値
df_test_pred["target1"] = df_test_pred[df_test_pred.columns[df_test_pred.columns.str.contains("target1")]].mean(axis=1)

# target2~4も同様の計算をする

print(df_test_pred.shape)
display(df_test_pred.head())

(2061, 8)


Unnamed: 0,engagementMetricsDate,playerId,date_playerId,date,yearmonth,playerForTestSetAndFuturePreds,target1_fold0,target1
0,2021-04-27,656669,20210427_656669,2021-04-26,2021-04,1,0.029,0.029
1,2021-04-27,543475,20210427_543475,2021-04-26,2021-04,1,0.0033,0.0033
2,2021-04-27,623465,20210427_623465,2021-04-26,2021-04,0,0.0001,0.0001
3,2021-04-27,595032,20210427_595032,2021-04-26,2021-04,0,-0.0,-0.0
4,2021-04-27,592866,20210427_592866,2021-04-26,2021-04,1,0.052,0.052


In [42]:
# 推論処理の関数

def predict_lgb(
    input_x,
    input_id,
    list_nfold=[0, 1, 2]
):
    list_target = ["target1", "target2", "target3", "target4"]
    df_test_pred = input_id.copy()

    for target in list_target:
        for nfold in list_nfold:
            # モデルの読み込み
            with open(f"model_lgb_{target}_fold{nfold}.h5", "rb") as f:
                model = pickle.load(f)

            # 推論
            pred = model.predict(input_x)
            # 予測値の格納
            df_test_pred[f"{target}_fold{nfold}"] = pred

    # 推論値の取得: 各foldの平均値
    for target in list_target:
        df_test_pred[target] = df_test_pred[df_test_pred.columns[df_test_pred.columns.str.contains(target)]].mean(axis=1)

    return df_test_pred

In [43]:
# モデル推論の実行
df_test_pred = predict_lgb(x_test, id_test)
display(df_test_pred.head())



Unnamed: 0,engagementMetricsDate,playerId,date_playerId,date,yearmonth,playerForTestSetAndFuturePreds,target1_fold0,target1_fold1,target1_fold2,target2_fold0,target2_fold1,target2_fold2,target3_fold0,target3_fold1,target3_fold2,target4_fold0,target4_fold1,target4_fold2,target1,target2,target3,target4
0,2021-04-27,656669,20210427_656669,2021-04-26,2021-04,1,0.029,0.0329,0.0304,1.376,1.1927,0.9529,0.0043,0.005,0.006,0.1918,0.2319,0.3213,0.0308,1.1739,0.0051,0.2483
1,2021-04-27,543475,20210427_543475,2021-04-26,2021-04,1,0.0033,0.0034,0.0033,1.0971,0.968,0.8118,0.0047,0.0053,0.0052,0.2145,0.2534,0.3002,0.0033,0.959,0.0051,0.256
2,2021-04-27,623465,20210427_623465,2021-04-26,2021-04,0,0.0001,0.0001,0.0002,0.3058,0.2606,0.2231,0.004,0.0034,0.0025,0.1053,0.1188,0.1562,0.0001,0.2632,0.0033,0.1268
3,2021-04-27,595032,20210427_595032,2021-04-26,2021-04,0,-0.0,0.0,0.0001,0.0311,0.077,0.1273,0.0008,0.0006,0.0004,0.0753,0.0849,0.127,0.0,0.0784,0.0006,0.0957
4,2021-04-27,592866,20210427_592866,2021-04-26,2021-04,1,0.052,0.0523,0.0188,1.4585,1.232,0.9757,0.0081,0.0091,0.0115,0.5356,0.6168,0.5319,0.041,1.2221,0.0096,0.5614


##### Part3: 提出用フォーマットへの変換

In [44]:
# 提出用フォーマットへの変換
df_submit = df_test_pred[["date_playerId", "target1", "target2", "target3", "target4"]]
display(df_submit.head())

Unnamed: 0,date_playerId,target1,target2,target3,target4
0,20210427_656669,0.0308,1.1739,0.0051,0.2483
1,20210427_543475,0.0033,0.959,0.0051,0.256
2,20210427_623465,0.0001,0.2632,0.0033,0.1268
3,20210427_595032,0.0,0.0784,0.0006,0.0957
4,20210427_592866,0.041,1.2221,0.0096,0.5614


## 8.4 特徴量エンジニアリング

- train_updated.csvの「rosters」カラムのデータ項目
- target1~4のラグ特徴量（1ヶ月前）

### 8.4.1 データ前処理

In [45]:
# train_updated.csv から rosters カラムのデータ取り出し
df_rosters = extract_data(train, col="rosters")
# 15s

In [46]:
# rostersデータ前処理加工
# dateカラムの作成・加工
df_rosters = df_rosters.rename(columns={"gameDate": "date"})
df_rosters["date"] = pd.to_datetime(df_rosters["date"], format="%Y-%m-%d")

# 追加する絡むリストの作成（dateとplayersIdは結合キー）
col_rosters = ["teamId", "statusCode", "status"]

display(df_rosters.head())

Unnamed: 0,playerId,date,teamId,statusCode,status
0,430935,2020-04-01,144,A,Active
1,435062,2020-04-01,120,A,Active
2,444489,2020-04-01,158,A,Active
3,445276,2020-04-01,119,A,Active
4,446308,2020-04-01,138,A,Active


In [47]:
# targetの統計量の計算
df_agg_target = df_train.groupby(["yearmonth", "playerId"])[["target1", "target2", "target3", "target4"]].aggregate(["mean", "median", "std", "min", "max"])
df_agg_target.columns = ["{}_{}".format(i, j) for i, j in df_agg_target.columns]
df_agg_target = df_agg_target.reset_index(drop=False)
display(df_agg_target.head())

Unnamed: 0,yearmonth,playerId,target1_mean,target1_median,target1_std,target1_min,target1_max,target2_mean,target2_median,target2_std,target2_min,target2_max,target3_mean,target3_median,target3_std,target3_min,target3_max,target4_mean,target4_median,target4_std,target4_min,target4_max
0,2020-04,112526,0.8834,0.0647,2.9618,0.0224,15.978,10.811,10.4352,5.3041,2.1242,21.7257,0.2894,0.1752,0.3478,0.0216,1.6761,21.1961,20.7913,12.6768,0.6305,51.3299
1,2020-04,134181,2.9999,0.2175,10.9845,0.0645,58.4642,14.7861,11.9902,13.2362,2.0241,76.4113,10.6877,0.9546,24.8149,0.0348,100.0,12.0298,11.6739,6.2926,0.5478,24.3902
2,2020-04,279571,0.0003,0.0,0.0006,0.0,0.0016,0.397,0.3435,0.2787,0.0,0.9174,0.0004,0.0,0.0013,0.0,0.006,0.2895,0.2481,0.1986,0.0097,0.7
3,2020-04,282332,0.1413,0.0748,0.1702,0.0223,0.7391,7.8652,7.7711,4.0453,1.7227,20.75,0.3794,0.3382,0.2484,0.0501,0.9882,11.354,10.0147,6.1022,0.5633,23.4455
4,2020-04,400085,1.9515,0.6949,3.3399,0.0947,17.0843,30.0941,27.2808,16.4382,6.4386,89.1042,13.3777,1.8486,26.4342,0.2183,100.0,50.7711,47.0509,29.4601,2.5769,100.0


In [48]:
# ラグ特徴量の作成

# 年月でソート（1ヶ月シフトさせるため、時系列順に並べる）
df_agg_target = df_agg_target.sort_values("yearmonth").reset_index(drop=True)

# yearmonthを1ヶ月シフトして過去にする
df_agg_target["yearmonth"] = df_agg_target.groupby(["playerId"])["yearmonth"].shift(-1)
# yearmonthの欠損値を「2021-8」で埋める
df_agg_target["yearmonth"] = df_agg_target["yearmonth"].fillna("2021-08")

# 集計値がラグ特徴量とわかるようにカラムの名称を変更
df_agg_target.columns = [col + "_lag1month" if col not in ["playerId", "yearmonth"] else col for col in df_agg_target.columns]

# 追加したカラムリストの作成
col_agg_target = list(df_agg_target.columns[df_agg_target.columns.str.contains("lag1month")])

display(df_agg_target.head())

Unnamed: 0,yearmonth,playerId,target1_mean_lag1month,target1_median_lag1month,target1_std_lag1month,target1_min_lag1month,target1_max_lag1month,target2_mean_lag1month,target2_median_lag1month,target2_std_lag1month,target2_min_lag1month,target2_max_lag1month,target3_mean_lag1month,target3_median_lag1month,target3_std_lag1month,target3_min_lag1month,target3_max_lag1month,target4_mean_lag1month,target4_median_lag1month,target4_std_lag1month,target4_min_lag1month,target4_max_lag1month
0,2020-05,112526,0.8834,0.0647,2.9618,0.0224,15.978,10.811,10.4352,5.3041,2.1242,21.7257,0.2894,0.1752,0.3478,0.0216,1.6761,21.1961,20.7913,12.6768,0.6305,51.3299
1,2020-05,628318,0.0003,0.0,0.0016,0.0,0.0088,0.3717,0.3519,0.2857,0.0,1.0582,0.0,0.0,0.0,0.0,0.0,0.4519,0.4173,0.2852,0.0126,1.176
2,2020-05,628317,0.0747,0.0327,0.1005,0.0139,0.4201,10.7568,9.6495,4.7834,3.5948,23.2609,0.0816,0.0746,0.0462,0.0116,0.1811,3.2524,2.9701,1.861,0.1119,6.8816
3,2020-05,627894,0.0004,0.0,0.0008,0.0,0.0037,1.2347,1.1066,0.6663,0.2956,2.4691,0.002,0.0,0.0035,0.0,0.0157,0.3802,0.3303,0.2352,0.0165,0.9146
4,2020-05,627500,0.0004,0.0,0.0019,0.0,0.0104,0.294,0.1969,0.3396,0.0,1.1472,0.0,0.0,0.0001,0.0,0.0005,0.2036,0.1609,0.1362,0.0117,0.5662


### 8.4.2 データセット作成

In [49]:
# 学習用データセットの作成

# データを結合
df_train = pd.merge(df_engagement, df_players, on=["playerId"], how="left")
df_train = pd.merge(df_train, df_rosters, on=["date", "playerId"], how="left")
df_train = pd.merge(df_train, df_agg_target, on=["playerId", "yearmonth"], how="left")

# 説明変数と目的変数の作成
x_train = df_train[
    [
        "playerId",
        "dayofweek",
        "birthCity",
        "birthStateProvince",
        "birthCountry",
        "heightInches",
        "weight",
        "primaryPositionCode",
        "primaryPositionName",
        "playerForTestSetAndFuturePreds",
    ] + col_rosters + col_agg_target
]
y_train = df_train[["target1", "target2", "target3", "target4"]]
id_train = df_train[
    [
        "engagementMetricsDate","playerId","date_playerId","date","yearmonth","playerForTestSetAndFuturePreds"
    ]
]

# カテゴリ変数をcategory型に変換
for col in [
    "playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry", "primaryPositionCode", "primaryPositionName"
] + col_rosters:
    x_train[col] = x_train[col].astype("category")

print(x_train.shape, y_train.shape, id_train.shape)

(1003707, 33) (1003707, 4) (1003707, 6)


In [52]:
display(x_train.head())

Unnamed: 0,playerId,dayofweek,birthCity,birthStateProvince,birthCountry,heightInches,weight,primaryPositionCode,primaryPositionName,playerForTestSetAndFuturePreds,teamId,statusCode,status,target1_mean_lag1month,target1_median_lag1month,target1_std_lag1month,target1_min_lag1month,target1_max_lag1month,target2_mean_lag1month,target2_median_lag1month,target2_std_lag1month,target2_min_lag1month,target2_max_lag1month,target3_mean_lag1month,target3_median_lag1month,target3_std_lag1month,target3_min_lag1month,target3_max_lag1month,target4_mean_lag1month,target4_median_lag1month,target4_std_lag1month,target4_min_lag1month,target4_max_lag1month
0,425794,2,Brunswick,GA,USA,79,230,1,Pitcher,1,138.0,A,Active,,,,,,,,,,,,,,,,,,,,
1,571704,2,Albuquerque,NM,USA,75,210,1,Pitcher,0,141.0,A,Active,,,,,,,,,,,,,,,,,,,,
2,506702,2,Maracaibo,,Venezuela,70,235,2,Catcher,1,114.0,A,Active,,,,,,,,,,,,,,,,,,,,
3,607231,2,Savannah,GA,USA,76,200,1,Pitcher,1,138.0,A,Active,,,,,,,,,,,,,,,,,,,,
4,543193,2,Columbia,CA,USA,76,215,1,Pitcher,0,,,,,,,,,,,,,,,,,,,,,,,


In [53]:
display(y_train.head())

Unnamed: 0,target1,target2,target3,target4
0,5.1249,9.434,0.1179,6.1947
1,0.0389,8.1761,0.0105,2.1304
2,0.0106,5.0314,0.0082,0.885
3,0.0247,2.8302,0.0222,0.59
4,0.0071,1.1006,0.0012,0.1967


In [54]:
display(id_train.head())

Unnamed: 0,engagementMetricsDate,playerId,date_playerId,date,yearmonth,playerForTestSetAndFuturePreds
0,2020-04-02,425794,20200402_425794,2020-04-01,2020-04,1
1,2020-04-02,571704,20200402_571704,2020-04-01,2020-04,0
2,2020-04-02,506702,20200402_506702,2020-04-01,2020-04,1
3,2020-04-02,607231,20200402_607231,2020-04-01,2020-04,1
4,2020-04-02,543193,20200402_543193,2020-04-01,2020-04,0


### 8.4.3 モデル学習

In [50]:
# モデル学習
params = {
    "boosting_type": "gbdt",
    "objective": "regression_l1",
    "metric": "mean_absolute_error",
    "learning_rate": 0.05,
    "num_leaves": 32,
    "subsample": 0.7,
    "subsample_freq": 1,
    "feature_fraction": 0.8,
    "min_data_in_leaf": 50,
    "min_sum_hessian_in_leaf": 50,
    "n_estimators": 10000,
    "random_state": 123,
    "importance_type": "gain",
}

df_valid_pred, df_metrics, df_imp = train_lgb(
    x_train,
    y_train,
    id_train,
    params,
    list_nfold=[0, 1, 2],
    model_train="train",
)
# 5m

-------------------- target1 , fold:  0 --------------------
x_va, y_va, id_va
(752265, 33) (752265,) (752265, 6)
(36797, 33) (36797,) (36797, 6)
training start.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027543 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8454
[LightGBM] [Info] Number of data points in the train set: 752265, number of used features: 33
[LightGBM] [Info] Start training from score 0.001289
Training until validation scores don't improve for 100 rounds
[100]	training's l1: 0.504414	valid_1's l1: 1.28763
[200]	training's l1: 0.504261	valid_1's l1: 1.28723
[300]	training's l1: 0.504065	valid_1's l1: 1.28668
Early stopping, best iteration is:
[267]	training's l1: 0.504073	valid_1's l1: 1.28667
-------------------- target2 , fold:  0 --------------------
x_va, y_va, id_va
(752265, 33) (752265,) (752265, 6)
(36797, 33) 

In [51]:
# 評価値の取得
print(f"MCMAE: {df_metrics["mae"].mean()}")
display(pd.pivot_table(df_metrics, index="nfold", columns="target", values="mae", aggfunc=np.mean, margins=True))

MCMAE: 1.2753868811822466


target,target1,target2,target3,target4,All
nfold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.2867,2.1865,0.8731,1.206,1.3881
1,1.1815,1.8953,0.825,1.5443,1.3615
2,1.0987,1.5734,0.7524,0.8818,1.0766
All,1.1889,1.8851,0.8168,1.2107,1.2754


In [53]:
# 説明変数の重要度の確認
df_imp.groupby(["col"])["imp"].aggregate(["mean", "std"]).sort_values("mean", ascending=False)[:10]

Unnamed: 0_level_0,mean,std
col,Unnamed: 1_level_1,Unnamed: 2_level_1
playerId,2086493.5507,1576369.7445
target3_mean_lag1month,1819243.0614,3762719.5721
target1_mean_lag1month,1573639.0108,2796189.7431
target1_median_lag1month,850869.666,1323185.2808
target1_std_lag1month,637682.1988,1242996.3031
target3_std_lag1month,588140.8248,1341843.2396
target4_mean_lag1month,501917.9009,1120624.2126
birthCity,429196.4021,315338.6709
target2_mean_lag1month,405228.8385,542622.0635
target2_std_lag1month,397383.6493,650700.5265


### 8.4.4 モデル推論

In [54]:
# 推論用データセット作成の関数

def make_dataset_for_predict(input_x, input_prediction):
    test = input_x.copy()
    prediction = input_prediction.copy()

    # 日付型に変換
    prediction["date"] = pd.to_datetime(prediction["date"], format="%Y%m%d")
    # engagementMetricsDateとplayerIdを取り出す
    prediction["engagementMetricsDate"] = prediction["date_plyerId"].apply(lambda x: x[:8])
    prediction["engagementMetricsDate"] = pd.to_datetime(prediction["engagementMetricsDate"], format="%Y%m%d")
    prediction["playerId"] = prediction["date_playerId"].apply(lambda x: x[9:])

    # dateから特徴量を作成
    prediction["dayofweek"] = prediction["date"].dt.dayofweek
    prediction["yearmonth"] = prediction["date"].astype(str).apply(lambda x: x[:7])

    # dateカラムの作成・加工
    df_rosters = extract_data(test, col="rosters")
    df_rosters = df_rosters.rename(columns={"gameDate": "date"})
    df_rosters["date"] = pd.to_datetime(df_rosters["date"], format="%Y-%m-%d")

    # テーブルの結合
    df_test = pd.merge(prediction, df_players, on=["playerId"], how="left")
    df_test = pd.merge(df_test, df_rosters, on=["date", "playerId"], how="left")
    df_test = pd.merge(df_test, df_agg_target, on=["playerId", "yearmonth"], how="left")

    # 説明変数の作成
    x_test = df_test[
        [
            "playerId",
            "dayofweek",
            "birthCity",
            "birthStateProvince",
            "birthCountry",
            "heightInches",
            "weight",
            "primaryPositionCode",
            "primaryPositionName",
            "playerForTestSetAndFuturePreds",
        ] + col_rosters + col_agg_target
    ]
    id_test = df_test[
        ["engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth", "playerForTestSetAndFuturePreds"]
    ]

    # カテゴリ変数をcategory型に変更
    for col in [
        "playerId", "dayofweek", "birthCity", "birthStateProvince", "birthCountry", "primaryPositionCode", "primaryPositionName"
    ] + col_rosters:
        x_test[col] = x_test[col].astype("category")

    return x_test, id_test

In [None]:
# 推論処理の実行（ベースラインと同じ）

In [None]:
# 提出データの作成

## 8.5 モデルチューニング

In [55]:
# 目的変数間の相関係数の算出
df_engagement[["target1", "target2", "target3", "target4"]].corr()

Unnamed: 0,target1,target2,target3,target4
target1,1.0,0.3529,0.3833,0.3252
target2,0.3529,1.0,0.366,0.4988
target3,0.3833,0.366,1.0,0.3229
target4,0.3252,0.4988,0.3229,1.0


In [57]:
# ライブラリのインポート

from sklearn.preprocessing import LabelEncoder

# import tensorflow
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout, BatchNormalization, Activation, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LearningRateScheduler
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.layers import Embedding, Flatten

In [69]:
def seed_everything(seed):
    import random

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
    tf.compat.v1.keras.backend.set_session(sess)

### 8.5.1 データセット作成

In [58]:
col_rosters

['teamId', 'statusCode', 'status']

In [59]:
col_agg_target

['target1_mean_lag1month',
 'target1_median_lag1month',
 'target1_std_lag1month',
 'target1_min_lag1month',
 'target1_max_lag1month',
 'target2_mean_lag1month',
 'target2_median_lag1month',
 'target2_std_lag1month',
 'target2_min_lag1month',
 'target2_max_lag1month',
 'target3_mean_lag1month',
 'target3_median_lag1month',
 'target3_std_lag1month',
 'target3_min_lag1month',
 'target3_max_lag1month',
 'target4_mean_lag1month',
 'target4_median_lag1month',
 'target4_std_lag1month',
 'target4_min_lag1month',
 'target4_max_lag1month']

In [60]:
# 学習用データセットの前処理

# 説明変数と目的変数の作成
x_train = df_train[
    [
        "playerId",
        "dayofweek",
        "birthCity",
        "birthStateProvince",
        "birthCountry",
        "heightInches",
        "weight",
        "primaryPositionCode",
        "primaryPositionName",
        "playerForTestSetAndFuturePreds",
    ] + col_rosters + col_agg_target
]
y_train = df_train[["target1", "target2", "target3", "target4"]]
id_train = df_train[
    ["engagementMetricsDate", "playerId", "date_playerId", "date", "yearmonth", "playerForTestSetAndFuturePreds"]
]

print(x_train.shape, y_train.shape, id_train.shape)

(1003707, 33) (1003707, 4) (1003707, 6)


In [61]:
# 数値とカテゴリ値のカラムリストを作成
col_num = ["heightInches", "weight", "playerForTestSetAndFuturePreds"] + col_agg_target
col_cat = [
    "playerId",
    "dayofweek",
    "birthCity",
    "birthStateProvince",
    "birthCountry",
    "primaryPositionCode",
    "primaryPositionName",
] + col_rosters
print(len(col_num), len(col_cat))

23 10


#### 数値データの欠損値補間と正規化

- 欠損値は0で埋める
- 正規化は0~1の範囲になるように変換する

In [62]:
# 数値データの欠損値補間・正規化
dict_num = dict()
for col in col_num:
    print(col)
    # 欠損値補間: 0で埋める
    value_fillna = 0
    x_train[col] = x_train[col].fillna(value_fillna)

    # 正規化（0~1の範囲になるように変換）
    value_min = x_train[col].min()
    value_max = x_train[col].max()
    x_train[col] = (x_train[col] - value_min) / (value_max - value_min)

    # testデータにも適用できるように保存
    dict_num[col] = dict()
    dict_num[col]["fillna"] = value_fillna
    dict_num[col]["min"] = value_min
    dict_num[col]["max"] = value_max

print("Done.")

heightInches
weight
playerForTestSetAndFuturePreds
target1_mean_lag1month
target1_median_lag1month
target1_std_lag1month
target1_min_lag1month
target1_max_lag1month
target2_mean_lag1month
target2_median_lag1month
target2_std_lag1month
target2_min_lag1month
target2_max_lag1month
target3_mean_lag1month
target3_median_lag1month
target3_std_lag1month
target3_min_lag1month
target3_max_lag1month
target4_mean_lag1month
target4_median_lag1month
target4_std_lag1month
target4_min_lag1month
target4_max_lag1month
Done.


#### カテゴリ値データの欠損値補間と数値化

- 欠損値は「unknown」で埋める
- カテゴリの値を0から始まる整数にマッピングさせて数値に変換する

In [64]:
# カテゴリ値の欠損値補間・数値化
dict_cat = dict()
for col in col_cat:
    print(col)
    # 欠損値補間: unknownで埋める
    value_fillna = "unknown"
    x_train[col] = x_train[col].fillna(value_fillna)

    # str型に変換
    x_train[col] = x_train[col].astype(str)

    # ラベルエンコーダー: 0からはじまる整数に変換
    le = LabelEncoder()
    le.fit(x_train[col])
    list_label = sorted(list(set(le.classes_) | set("unknown")))
    map_label = {j:i for i, j in enumerate(list_label)}
    x_train[col] = x_train[col].map(map_label)

    # testデータにも適用できるように保存
    dict_cat[col] = dict()
    dict_cat[col]["fillna"] = value_fillna
    dict_cat[col]["map_label"] = map_label
    dict_cat[col]["num_label"] = len(list_label)

print("Done.")

playerId
dayofweek
birthCity
birthStateProvince
birthCountry
primaryPositionCode
primaryPositionName
teamId
statusCode
status
Done.


In [None]:
# 推論用データの欠損値補間・正規化/数値化の関数化

def transform_data(input_x):
    output_x = input_x.copy()

    # 数値データの欠損値補間・正規化
    for col in col_num:
        # 欠損値補間: 0で埋める
        value_fillna = dict_num[col]["fillna"]
        output_x[col] = output_x[col].fillna(value_fillna)

        # 正規化（0~1の範囲になるように変換）
        value_min = dict_num[col]["min"]
        value_max = dict_num[col]["max"]
        output_x[col] = (output_x[col] - value_min) / (value_max - value_min)

    # カテゴリ値データの欠損値補間: 数値化
    for col in col_cat:
        # 欠損値補間: unknownで埋める
        value_fillna = dict_cat[col]["fillna"]
        output_x[col] = output_x[col].fillna(value_fillna)

        # str型に変換
        output_x[col] = output_x[col].astype(str)

        # ラベルエンコーダー: 0からはじまる整数に変換
        map_label = dict_cat[col]["map_label"]
        output_x[col] = output_x[col].map(map_label)
        # 対応するものがない場合はunknownで埋める
        output_x[col] = output_x[col].fillna(map_label["unknown"])

    return output_x

### 8.5.2 モデル学習

- 埋め込み層ありのネットワークモデル

In [65]:
# ニューラルネットワークのモデル定義

def create_model(
    col_num=["heightInches", "weight"],
    col_cat=["playerId", "teamId", "dayofweek"],
    show=False,
):
    input_num = Input(shape=(len(col_num),))
    input_cat = Input(shape=(len(col_cat),))

    # numeric
    x_num = input_num

    # category
    for i, col in enumerate(col_cat):
        tmp_cat = input_cat[:, i]
        input_dim = dict_cat[col]["num_label"]
        output_dim = int(input_dim/2)
        tmp_cat = Embedding(input_dim=input_dim, output_dim=output_dim)(tmp_cat)
        tmp_cat = Dropout(0.2)(tmp_cat)
        tmp_cat = Flatten()(tmp_cat)
        if i == 0:
            x_cat = tmp_cat
        else:
            x_cat = Concatenate()([x_cat, tmp_cat])

    # concat
    x = Concatenate()([x_num, x_cat])
    x = Dense(128, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(0.1)(x)
    output = Dense(4, activation="linear")(x)

    model = Model(inputs=[input_num, input_cat], outputs=output)
    model.compile(optimizer="Adam", loss="mae", metrics=["mae"])

    if show:
        print(model.summary())
    else:
        return model

In [66]:
# モデル構造の確認
create_model(
    col_num=col_num,
    col_cat=col_cat,
    show=True
)

None


In [71]:
# LightGBM学習用の関数をニューラルネットワーク用にカスタマイズ
# データフレームをarray型に変換
# 出力が4次元（target1~4）なので、pd_pred_validをその形式に合わせる

def train_tf(
    input_x,
    input_y,
    input_id,
    list_nfold=[0, 1, 2],
    mode_train="train",
    batch_size=1024,
    epochs=100,
):
    # 推論値を格納する変数の作成
    df_valid_pred = pd.DataFrame()
    # 評価値をいれる変数の作成
    metrics = list()

    # validation
    cv = list()
    for month_tr, month_va in list_cv_month:
        cv.append([
            input_id.index[input_id["yearmonth"].isin(month_tr)],
            input_id.index[input_id["yearmonth"].isin(month_va) & (input_id["playerForTestSetAndFuturePreds"] == 1)]
        ])

    # モデル学習 (foldごとに学習)
    for nfold in list_nfold:
        print("-" * 20, "fold:", nfold, "-" * 20)
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

        x_num_tr, x_cat_tr, y_tr = (
            input_x.loc[idx_tr, col_num].values,
            input_x.loc[idx_tr, col_cat].values,
            input_y.loc[idx_tr, :].values,
        )
        x_num_va, x_cat_va, y_va = (
            input_x.loc[idx_va, col_num].values,
            input_x.loc[idx_va, col_cat].values,
            input_y.loc[idx_va, :].values,
        )
        print(x_num_tr.shape, x_cat_tr.shape, y_tr.shape)
        print(x_num_va.shape, x_cat_va.shape, y_va.shape)

        filepath = "model_tf_fold{}.h5".format(nfold)

        if mode_train == "train":
            print("training start.")
            seed_everything(seed=123)
            model = create_model(col_num=col_num, col_cat=col_cat, show=False)
            model.fit(
                x=[x_num_tr, x_cat_tr],
                y=y_tr,
                validation_data=([x_num_va, x_cat_va], y_va),
                batch_size=batch_size,
                epochs=epochs,
                callbacks=[
                    ModelCheckpoint(
                        filepath=filepath,
                        monitor="val_loss",
                        mode="min",
                        verbose=1,
                        save_best_only=True,
                        save_weights_only=True,
                    ),
                    EarlyStopping(
                        monitor="val_loss", mode="min", min_delta=0, patience=10, verbose=1, restore_best_weights=True
                    ),
                    ReduceLROnPlateau(monitor="val_loss", mode="min", factor=0.1, patience=5, verbose=1),
                ],
                verbose=1,
            )
        else:
            print("model load.")
            model = create_model(col_num=col_num, col_cat=col_cat, show=False)
            model.load_weights(filepath)
            print("Done.")

        # validの推論値取得
        y_va_pred = model.predict([x_num_va, x_cat_va])
        tmp_pred = pd.concat(
            [
                id_va,
                pd.DataFrame(y_va, columns=["target1_true", "target2_true", "target3_true", "target4_true"]),
                pd.DataFrame(y_va_pred, columns=["target1_pred", "target2_pred", "target3_pred", "target4_pred"]),
            ],
            axis=1,
        )
        tmp_pred["nfold"] = nfold
        df_valid_pred = pd.concat([df_valid_pred, tmp_pred], axis=0, ignore_index=True)

        # 評価値の算出
        metrics.append(["target1", nfold, np.mean(np.abs(y_va[:, 0] - y_va_pred[:, 0]))])
        metrics.append(["target2", nfold, np.mean(np.abs(y_va[:, 1] - y_va_pred[:, 1]))])
        metrics.append(["target3", nfold, np.mean(np.abs(y_va[:, 2] - y_va_pred[:, 2]))])
        metrics.append(["target4", nfold, np.mean(np.abs(y_va[:, 3] - y_va_pred[:, 3]))])

    print("-" * 10, "result", "-" * 10)
    # 評価値
    df_metrics = pd.DataFrame(metrics, columns=["target", "nfold", "mae"])
    print("MCMAE: {:.4f}".format(df_metrics["mae"].mean()))

    # validの推論値
    df_valid_pred_all = pd.pivot_table(
        df_valid_pred,
        index=[
            "engagementMetricsDate",
            "playerId",
            "date_playerId",
            "date",
            "yearmonth",
            "playerForTestSetAndFuturePreds",
        ],
        columns=["nfold"],
        values=list(df_valid_pred.columns[df_valid_pred.columns.str.contains("target")]),
        aggfunc=np.sum,
    )
    df_valid_pred_all.columns = [
        "{}_fold{}_{}".format(i.split("_")[0], j, i.split("_")[1]) for i, j in df_valid_pred_all.columns
    ]
    df_valid_pred_all = df_valid_pred_all.reset_index(drop=False)

    return df_valid_pred_all, df_metrics

In [72]:
# 学習の実行
df_valid_pred, df_metrics = train_tf(
    x_train,
    y_train,
    id_train,
    list_nfold=[0, 1, 2],
    mode_train="train",
    batch_size=1024,
    epochs=1000,
)

-------------------- fold: 0 --------------------
(752265, 23) (752265, 10) (752265, 4)
(36797, 23) (36797, 10) (36797, 4)
training start.


AttributeError: module 'keras._tf_keras.keras.backend' has no attribute 'set_session'