# ボートレーサーごとの統計量をdfに追加　その3
- 各枠からの1着率, 2着率, 3着率をそれぞれ計算し、特徴量として追加
- その枠でのスタートタイムの平均値を算出し、特徴量として追加

## 1. 統計解析用のdfをロード
- `race_df`に格納
- 2_データ前処理.ipynbを参照

In [5]:
import pandas as pd
# Load the pre-formatted race data used for the statistics below.
# NOTE: read_pickle unpickles arbitrary objects, so only load files that
# were produced by this project's own preprocessing step.
race_df = pd.read_pickle('../../data/arrangedData/data_formatted_for_statistic.pkl')
# Discard the stored index and renumber the rows from 0.
race_df = race_df.reset_index(drop=True)
# Display the contents for a quick visual check.
race_df

Unnamed: 0,date,venue,raceNumber,枠,racer_id,racer_class,racer_name,num_false_start,num_late_start,motorNo,...,進入コース,start_time,exhibition_cource,exhibition_flying,exhibition_late,exhibition_st,weather_晴,weather_曇り,weather_雨,weather_雪
0,2020-08-01,びわこ,1.0,1,3783,1,瓜生 正義,0,0,46,...,1.0,0.06,1.0,False,False,0.13,1,0,0,0
1,2020-08-01,びわこ,1.0,2,4580,1,長谷川　雅和,0,0,54,...,2.0,0.13,2.0,False,False,0.06,1,0,0,0
2,2020-08-01,びわこ,1.0,3,4261,1,岡 祐臣,0,0,41,...,3.0,0.18,3.0,False,False,0.24,1,0,0,0
3,2020-08-01,びわこ,1.0,4,4064,1,原田 篤志,0,0,60,...,4.0,0.27,4.0,True,False,0.07,1,0,0,0
4,2020-08-01,びわこ,1.0,5,4239,1,竹田 辰也,0,0,51,...,5.0,0.24,5.0,True,False,0.04,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227569,2021-04-30,鳴　門,12.0,2,3070,2,山室 展弘,0,0,12,...,2.0,0.14,2.0,False,False,0.18,1,0,0,0
227570,2021-04-30,鳴　門,12.0,3,3919,1,村上 純,1,0,11,...,3.0,0.18,3.0,False,False,0.11,1,0,0,0
227571,2021-04-30,鳴　門,12.0,4,3516,3,吉原 聖人,0,0,14,...,4.0,0.19,4.0,False,False,0.10,1,0,0,0
227572,2021-04-30,鳴　門,12.0,5,3333,1,丸尾 義孝,1,0,25,...,5.0,0.26,5.0,False,False,0.07,1,0,0,0


## 2. これまでのレース結果から統計量を算出
- 各選手について、枠ごとの1着率・2着率・3着率を算出し、それぞれ列に格納
- 選手ごと、枠ごと、日付ごとに、その前日までのレースを対象に統計量を算出

In [None]:
from tqdm.notebook import tqdm

def calc_rank_i_ratio(data, rank_i):
    """Return the share of non-missing entries of ``data`` equal to ``rank_i``.

    Args:
        data: A pandas Series; in this notebook it is applied per group to
            the finishing-position column.
        rank_i: The finishing position to count.

    Returns:
        The number of entries equal to ``rank_i`` divided by the number of
        non-missing entries (``Series.count`` ignores missing values).
    """
    matches = (data == rank_i).sum()
    valid_entries = data.count()
    return matches / valid_entries

# Build, for every race date, each racer's per-lane statistics using only
# races held strictly before that date, so the features for a given day
# never include that day's own results.
#
# Result: `stocastic_df` with one row per (racer_id, lane, date) and the
# columns 1着率 / 2着率 / 3着率 (finish rates) and average_start_time.
group_keys = ["racer_id", "枠"]
rate_columns = ["{0}着率".format(rank) for rank in range(1, 4)]

# The earliest date has no preceding races to summarise, so it is skipped.
date_list = sorted(race_df["date"].unique())
date_list = date_list[1:]

# Per-row indicators are computed once up front. Summing them per group
# gives the counts needed for the rates, which avoids calling a Python
# function for every group, every rank and every date.
indicator_df = race_df[["date"] + group_keys + ["start_time"]].copy()
finish_order = race_df["着順"]
# 1 where a finishing position is recorded; the group sum is the denominator
# (missing finishing positions are excluded, as Series.count would do).
indicator_df["finish_count"] = finish_order.notna().astype("int64")
for rank, rate_column in enumerate(rate_columns, start=1):
    # 1 where the racer finished in this position; the group sum is the numerator.
    indicator_df[rate_column] = (finish_order == rank).astype("int64")

stocastic_df_list = []

for date in tqdm(date_list):
    # The date filter does not depend on the rank, so it is applied once per date.
    past_rows = indicator_df[indicator_df["date"] < date]
    grouped = past_rows.groupby(group_keys)

    # Finish rates: per-group count of each rank divided by the per-group
    # count of recorded finishes (0 / 0 yields NaN).
    totals = grouped[["finish_count"] + rate_columns].sum()
    stocastic_df_date = totals[rate_columns].div(totals["finish_count"], axis=0)

    # Mean start time per racer and lane (missing values are skipped by mean()).
    stocastic_df_date["average_start_time"] = grouped["start_time"].mean()

    # Tag the rows with the date these statistics are valid for.
    stocastic_df_date["date"] = date
    stocastic_df_list.append(stocastic_df_date)

# Stack the per-date frames vertically.
stocastic_df = pd.concat(stocastic_df_list)
# Turn the (racer_id, lane) index back into ordinary columns.
stocastic_df.reset_index(inplace=True)

stocastic_df

  0%|          | 0/268 [00:00<?, ?it/s]

## 3. `race_df`に2で算出した統計量をマージ

In [3]:
# Attach the per-date, per-lane, per-racer statistics to every race row.
# A left join keeps all rows of race_df; rows without matching statistics
# get NaN in the added columns.
merge_key_list = ["date", "枠", "racer_id"]
race_df_w_stocastic = race_df.merge(stocastic_df, on=merge_key_list, how="left")
race_df_w_stocastic

Unnamed: 0,date,venue,raceNumber,枠,racer_id,racer_class,racer_name,num_false_start,num_late_start,motorNo,...,exhibition_flying,exhibition_late,exhibition_st,weather_曇り,weather_雨,weather_雪,1着率,2着率,3着率,average_start_time
0,2020-10-01,三　国,1.0,1,4961,2,西橋 奈未,1,0,34,...,False,False,0.07,0,0,0,,,,
1,2020-10-01,三　国,1.0,2,4071,3,古賀 千晶,1,0,52,...,False,False,0.21,0,0,0,,,,
2,2020-10-01,三　国,1.0,3,4849,4,森田 太陽,0,0,26,...,False,False,0.30,0,0,0,,,,
3,2020-10-01,三　国,1.0,4,4746,3,大豆生田　蒼,0,0,53,...,False,False,0.24,0,0,0,,,,
4,2020-10-01,三　国,1.0,5,4987,3,島倉 都,0,0,35,...,False,False,0.19,0,0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
158977,2021-04-10,蒲　郡,12.0,2,4066,2,東本 勝利,0,0,20,...,False,False,0.04,0,0,0,0.117647,0.352941,0.176471,0.161176
158978,2021-04-10,蒲　郡,12.0,3,4611,2,今井 美亜,0,0,68,...,True,False,0.06,0,0,0,0.190476,0.142857,0.142857,0.147143
158979,2021-04-10,蒲　郡,12.0,4,3761,2,山本 光雄,0,0,27,...,True,False,0.01,0,0,0,0.052632,0.210526,0.157895,0.170526
158980,2021-04-10,蒲　郡,12.0,5,3994,2,茶谷 桜,0,0,52,...,False,False,0.02,0,0,0,0.000000,0.000000,0.176471,0.181765


## 4. 3.で作った統計量入りのdfをpickleファイルにして保存

In [4]:
# Persist the merged frame (race rows plus statistics) as a pickle file.
pd.to_pickle(race_df_w_stocastic, '../../data/arrangedData/race_df_w_stocastic_3.pkl')