# ボートレーサーごとの統計量をdfに追加　その3
- 各枠からの1着率, 2着率, 3着率をそれぞれ計算し、特徴量として追加
- その枠でのスタートタイムの平均値を算出し、特徴量として追加

## 1. 統計解析用のdfをロード
- `race_df`に格納
- 2_データ前処理.ipynbを参照

In [1]:
import pandas as pd
# Load the pre-formatted race table used for the statistics below.
race_df = pd.read_pickle('../../data/arrangedData/data_formatted_for_statistic.pkl')
# Discard the stored index and renumber the rows from 0.
race_df = race_df.reset_index(drop=True)
# Show the contents (notebook cell output).
race_df

Unnamed: 0,date,venue,raceNumber,枠,racer_id,racer_class,racer_name,num_false_start,num_late_start,motorNo,...,exhibition_ST,flying,late,着順,タイム,進入コース,start_time,weather_曇り,weather_雨,weather_雪
0,2020-10-01,三　国,1.0,1,4961,2,西橋 奈未,1,0,34,...,0.07,0.0,0,1,"1'47""9",1.0,0.21,0,0,0
1,2020-10-01,三　国,1.0,2,4071,3,古賀 千晶,1,0,52,...,0.21,0.0,0,3,"1'52""1",2.0,0.25,0,0,0
2,2020-10-01,三　国,1.0,3,4849,4,森田 太陽,0,0,26,...,0.30,0.0,0,6,,3.0,0.21,0,0,0
3,2020-10-01,三　国,1.0,4,4746,3,大豆生田　蒼,0,0,53,...,0.24,0.0,0,2,"1'49""1",4.0,0.23,0,0,0
4,2020-10-01,三　国,1.0,5,4987,3,島倉 都,0,0,35,...,0.19,0.0,0,4,"1'53""7",5.0,0.26,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155239,2021-04-06,蒲　郡,12.0,2,3177,3,宮本 紀美,0,0,24,...,0.11,0.0,0,4,"1'53""4",2.0,0.14,0,1,0
155240,2021-04-06,蒲　郡,12.0,3,4758,2,富樫 麗加,1,0,59,...,0.09,1.0,0,2,"1'48""9",3.0,0.09,0,1,0
155241,2021-04-06,蒲　郡,12.0,4,4611,2,今井 美亜,0,0,68,...,0.04,0.0,0,3,"1'51""8",4.0,0.11,0,1,0
155242,2021-04-06,蒲　郡,12.0,5,3994,2,茶谷 桜,0,0,52,...,0.07,0.0,0,5,,5.0,0.14,0,1,0


## 2. これまでのレース結果から統計量を算出
- 各選手について、各枠からの1着率・2着率・3着率を算出し、それぞれ列に格納
- 選手ごと、枠ごと、日付ごとに、その前日までのレースを対象に統計量を算出

In [2]:
from tqdm.notebook import tqdm

def calc_rank_i_ratio(data, rank_i):
    """Return the fraction of entries in ``data`` that equal ``rank_i``.

    Args:
        data: Finishing positions. Must support ``==`` comparison, ``.sum()``
            and an argument-less ``.count()``; in this notebook it is the
            ``着順`` Series of one groupby group.
        rank_i: The finishing position whose share is wanted.

    Returns:
        Number of entries equal to ``rank_i`` divided by ``data.count()``.
        For a pandas Series ``count()`` ignores missing values, so they do
        not enter the denominator.
    """
    hits = (data == rank_i).sum()
    valid_entries = data.count()
    return hits / valid_entries

# Build, for every date, the per-(racer_id, 枠) statistics computed from all
# races held strictly before that date, then stack the per-date tables.
#
# Result: `stocastic_df` with columns
#   racer_id, 枠, 1着率, 2着率, 3着率, average_start_time, date

# The earliest date is dropped: there are no earlier races to summarise.
date_list = sorted(race_df["date"].unique())
date_list = date_list[1:]

stocastic_df_list = []

for date in tqdm(date_list):

    # Races strictly before `date`, so a day's statistics never include that
    # day's own results. This filter does not depend on the finishing
    # position, so it is evaluated once per date and shared by every
    # aggregation below.
    race_df_before_the_date = race_df[race_df["date"] < date]
    grouped_by_racer_and_frame = race_df_before_the_date.groupby(["racer_id", "枠"])

    # One single-column frame per finishing position (1st, 2nd, 3rd), each
    # indexed by the (racer_id, 枠) MultiIndex produced by the groupby.
    rank_i_ratio_df_list = [
        grouped_by_racer_and_frame["着順"]
        .apply(calc_rank_i_ratio, rank_i=i)
        .rename("{0}着率".format(i))
        .to_frame()
        for i in range(1, 4)
    ]

    # Put the three ratio columns side by side.
    rank_i_ratio_df_date = pd.concat(rank_i_ratio_df_list, axis=1)

    # Mean start time per racer and 枠 over the same set of past races.
    average_start_time_df = (
        grouped_by_racer_and_frame["start_time"]
        .mean()
        .rename("average_start_time")
        .to_frame()
    )

    # All statistics for this date, aligned on the (racer_id, 枠) index.
    stocastic_df_date = rank_i_ratio_df_date.join(average_start_time_df)

    # Tag the rows with the date they apply to; it is a merge key later on.
    stocastic_df_date["date"] = date
    stocastic_df_list.append(stocastic_df_date)

# Stack every date's table and turn racer_id / 枠 back into ordinary columns.
stocastic_df = pd.concat(stocastic_df_list).reset_index()

stocastic_df

  0%|          | 0/183 [00:00<?, ?it/s]

Unnamed: 0,racer_id,枠,1着率,2着率,3着率,average_start_time,date
0,2014,3,0.0,0.0,0.0,0.150000,2020-10-02
1,2014,5,0.0,0.0,1.0,0.130000,2020-10-02
2,2787,2,0.0,0.0,1.0,0.180000,2020-10-02
3,2787,3,0.0,0.0,0.0,0.120000,2020-10-02
4,2903,1,1.0,0.0,0.0,0.160000,2020-10-02
...,...,...,...,...,...,...,...
1557488,5174,5,0.0,0.0,0.0,0.153333,2021-04-06
1557489,5174,6,0.0,0.0,0.0,0.175238,2021-04-06
1557490,5175,4,0.0,0.0,0.0,0.300000,2021-04-06
1557491,5175,5,0.0,0.0,0.0,0.224444,2021-04-06


## 3. `race_df`に2で算出した統計量をマージ

In [3]:
# Attach the statistics to the race rows that share the same date, 枠 and
# racer_id. The left join keeps every row of race_df; rows without matching
# statistics get NaN in the added columns.
merge_key_list = ["date", "枠", "racer_id"]
race_df_w_stocastic = race_df.merge(stocastic_df, on=merge_key_list, how="left")
# Show the merged table (notebook cell output).
race_df_w_stocastic

Unnamed: 0,date,venue,raceNumber,枠,racer_id,racer_class,racer_name,num_false_start,num_late_start,motorNo,...,タイム,進入コース,start_time,weather_曇り,weather_雨,weather_雪,1着率,2着率,3着率,average_start_time
0,2020-10-01,三　国,1.0,1,4961,2,西橋 奈未,1,0,34,...,"1'47""9",1.0,0.21,0,0,0,,,,
1,2020-10-01,三　国,1.0,2,4071,3,古賀 千晶,1,0,52,...,"1'52""1",2.0,0.25,0,0,0,,,,
2,2020-10-01,三　国,1.0,3,4849,4,森田 太陽,0,0,26,...,,3.0,0.21,0,0,0,,,,
3,2020-10-01,三　国,1.0,4,4746,3,大豆生田　蒼,0,0,53,...,"1'49""1",4.0,0.23,0,0,0,,,,
4,2020-10-01,三　国,1.0,5,4987,3,島倉 都,0,0,35,...,"1'53""7",5.0,0.26,0,0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155239,2021-04-06,蒲　郡,12.0,2,3177,3,宮本 紀美,0,0,24,...,"1'53""4",2.0,0.14,0,1,0,0.000000,0.142857,0.285714,0.182143
155240,2021-04-06,蒲　郡,12.0,3,4758,2,富樫 麗加,1,0,59,...,"1'48""9",3.0,0.09,0,1,0,0.058824,0.294118,0.294118,0.105294
155241,2021-04-06,蒲　郡,12.0,4,4611,2,今井 美亜,0,0,68,...,"1'51""8",4.0,0.11,0,1,0,0.125000,0.250000,0.000000,0.163125
155242,2021-04-06,蒲　郡,12.0,5,3994,2,茶谷 桜,0,0,52,...,,5.0,0.14,0,1,0,0.000000,0.000000,0.187500,0.184375


## 4. 3.で作った統計量入りのdfをpickleファイルにして保存

In [4]:
# Save the merged table (race rows plus the added statistics columns) as a pickle file.
race_df_w_stocastic.to_pickle('../../data/arrangedData/race_df_w_stocastic_3.pkl')