# ボートレーサーごとの統計量をdfに追加　その3
- 各枠からの1着率, 2着率, 3着率をそれぞれ計算し、特徴量として追加
- その枠でのスタートタイムの平均値を算出し、特徴量として追加

## 1. 統計解析用のdfをロード
- `race_df`に格納
- 2_データ前処理.ipynbを参照

In [1]:
import pandas as pd
# Load the table prepared for statistical analysis and replace its index with
# a fresh 0..n-1 range (the previous index is discarded, not kept as a column).
race_df = (
    pd.read_pickle('../../data/arrangedData/data_formatted_for_statistic.pkl')
    .reset_index(drop=True)
)
# Display the loaded frame to check its contents.
race_df

Unnamed: 0,date,venue,raceNumber,枠,racer_id,racer_class,racer_name,num_false_start,num_late_start,motorNo,...,exhibition_cource,exhibition_ST,flying,late,着順,タイム,start_time,weather_曇り,weather_雨,weather_雪
0,2020-11-01,下　関,1.0,1,4060,2,島田 一生,0,0,35,...,1,0.05,0.0,0,1,"1'48""4",0.11,1,0,0
1,2020-11-01,下　関,1.0,2,3680,3,森林 太,0,0,37,...,2,0.11,0.0,0,2,"1'50""8",0.17,1,0,0
2,2020-11-01,下　関,1.0,3,3123,3,中西 宏文,0,0,66,...,3,0.18,0.0,0,5,,0.09,1,0,0
3,2020-11-01,下　関,1.0,4,5001,3,坂本 雅佳,0,0,20,...,4,0.01,0.0,0,4,"1'53""9",0.14,1,0,0
4,2020-11-01,下　関,1.0,5,4219,3,本岡 勝利,0,0,60,...,5,0.21,0.0,0,3,"1'52""1",0.17,1,0,0


## 2. これまでのレース結果から統計量を算出
- 各選手について、その枠からの1着率・2着率・3着率を算出し、それぞれ列に格納
- 選手ごと、枠ごと、日付ごとに、その前日までのレースを対象に統計量を算出

In [2]:
from tqdm.notebook import tqdm

def calc_rank_i_ratio(data, rank_i):
    """Return the share of non-null entries of ``data`` that equal ``rank_i``.

    Args:
        data: pandas Series of finishing positions. Null entries never match
            ``rank_i`` and are left out of the denominator.
        rank_i: Finishing position whose rate is wanted.

    Returns:
        Count of entries equal to ``rank_i`` divided by the count of
        non-null entries.
    """
    matching_entries = data.eq(rank_i).sum()
    valid_entries = data.notna().sum()
    return matching_entries / valid_entries

# Build, for every race date, the per-(racer_id, 枠) statistics computed from
# races held strictly before that date, then stack all dates into one frame.

# Race dates in ascending order. The earliest date is dropped because no
# earlier races exist from which to compute statistics for it.
date_list = sorted(race_df["date"].unique())
date_list = date_list[1:]

stocastic_df_list = []

for date in tqdm(date_list):
    # Filter and group once per date; every statistic below shares this
    # grouping. Only rows dated strictly before `date` are included, so a
    # race never contributes to the features attached to its own day.
    race_df_before_the_date = race_df[race_df["date"] < date]
    grouped_by_racer_and_frame = race_df_before_the_date.groupby(["racer_id", "枠"])

    # One named Series per rank (1st, 2nd, 3rd), each indexed by the
    # (racer_id, 枠) MultiIndex produced by the groupby.
    rank_i_ratio_list = [
        grouped_by_racer_and_frame["着順"]
        .apply(calc_rank_i_ratio, rank_i=i)
        .rename("{0}着率".format(i))
        for i in range(1, 4)
    ]

    # Place the three rate columns side by side.
    rank_i_ratio_df_date = pd.concat(rank_i_ratio_list, axis=1)

    # Mean start time per (racer_id, 枠) over the same earlier races.
    average_start_time = (
        grouped_by_racer_and_frame["start_time"].mean().rename("average_start_time")
    )

    # All statistics for this date, aligned on the shared (racer_id, 枠) index.
    stocastic_df_date = rank_i_ratio_df_date.join(average_start_time)

    # Tag the rows with the date they apply to; it is used later as a merge key.
    stocastic_df_date["date"] = date
    stocastic_df_list.append(stocastic_df_date)

# Stack the per-date frames vertically.
stocastic_df = pd.concat(stocastic_df_list)
# Turn the (racer_id, 枠) index back into ordinary columns.
stocastic_df.reset_index(inplace=True)

stocastic_df

  0%|          | 0/144 [00:00<?, ?it/s]

Unnamed: 0,racer_id,枠,1着率,2着率,3着率,average_start_time,date
0,2014,3,0.0,0.0,0.0,0.200000,2020-11-02
1,2014,6,0.0,0.0,0.0,0.170000,2020-11-02
2,2841,3,0.0,0.0,0.0,0.130000,2020-11-02
3,2841,6,0.0,0.0,0.0,0.330000,2020-11-02
4,2844,5,0.0,0.0,0.0,0.210000,2020-11-02
...,...,...,...,...,...,...,...
1209635,5174,5,0.0,0.0,0.0,0.159333,2021-03-29
1209636,5174,6,0.0,0.0,0.0,0.175238,2021-03-29
1209637,5175,4,0.0,0.0,0.0,0.270000,2021-03-29
1209638,5175,5,0.0,0.0,0.0,0.189375,2021-03-29


## 3. `race_df`に2で算出した統計量をマージ

In [3]:
# Attach the statistics to each race row by matching on date, 枠 and racer.
# A left join keeps every row of race_df; rows without a matching statistics
# row get NaN in the added columns.
merge_key_list = ["date", "枠", "racer_id"]
race_df_w_stocastic = race_df.merge(stocastic_df, how="left", on=merge_key_list)
race_df_w_stocastic

Unnamed: 0,date,venue,raceNumber,枠,racer_id,racer_class,racer_name,num_false_start,num_late_start,motorNo,...,着順,タイム,start_time,weather_曇り,weather_雨,weather_雪,1着率,2着率,3着率,average_start_time
0,2020-11-01,下　関,1.0,1,4060,2,島田 一生,0,0,35,...,1,"1'48""4",0.11,1,0,0,,,,
1,2020-11-01,下　関,1.0,2,3680,3,森林 太,0,0,37,...,2,"1'50""8",0.17,1,0,0,,,,
2,2020-11-01,下　関,1.0,3,3123,3,中西 宏文,0,0,66,...,5,,0.09,1,0,0,,,,
3,2020-11-01,下　関,1.0,4,5001,3,坂本 雅佳,0,0,20,...,4,"1'53""9",0.14,1,0,0,,,,
4,2020-11-01,下　関,1.0,5,4219,3,本岡 勝利,0,0,60,...,3,"1'52""1",0.17,1,0,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124957,2021-03-29,蒲　郡,11.0,2,3596,3,河上 年昭,0,0,67,...,4,"1'53""1",0.23,0,0,0,0.000000,0.466667,0.066667,0.172667
124958,2021-03-29,蒲　郡,11.0,3,4512,1,高野 哲史,0,0,19,...,1,"1'49""1",0.17,0,0,0,0.076923,0.307692,0.153846,0.199231
124959,2021-03-29,蒲　郡,11.0,4,4118,1,宇佐見 淳,0,0,39,...,6,,0.16,0,0,0,0.100000,0.150000,0.300000,0.146000
124960,2021-03-29,蒲　郡,11.0,5,4016,2,西川　新太郎,1,0,38,...,3,"1'52""4",0.12,0,0,0,0.000000,0.000000,0.142857,0.200000


## 4. 3.で作った統計量入りのdfをpickleファイルにして保存

In [4]:
# Persist the race table with the added statistics columns as a pickle file.
race_df_w_stocastic.to_pickle(path='../../data/arrangedData/race_df_w_stocastic_3.pkl')