In [44]:
import glob
import pandas as pd

#### レース情報をまとめたpickleファイルを読み込んで一つにまとめる

In [45]:
# Collect every crawled pickle file (one file per day of race information).
# glob.glob() returns paths in arbitrary, filesystem-dependent order, so sort
# them to make the row order of the concatenated frame reproducible.
all_files = sorted(glob.glob('./crawledData/*.pkl'))
if not all_files:
    # Fail early with a clear message instead of pandas' generic
    # "No objects to concatenate" error from pd.concat([]).
    raise FileNotFoundError("No pickle files found matching './crawledData/*.pkl'")

# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by a trusted source (presumably this project's own crawler).
# List whose elements are the per-file DataFrames of race information.
race_df_list = [pd.read_pickle(file) for file in all_files]
# Concatenate them into a single DataFrame.
race_df = pd.concat(race_df_list)

#### 解析に使いやすいようにデータを整形する

In [50]:
# Build the model-ready frame from the raw concatenated race data.
# Work on a copy so that `race_df` itself is left untouched and this cell can
# be re-run safely.
race_df_arranged = race_df.copy()

# --- Convert column dtypes -------------------------------------------------
# Plain casts, applied in one pass.
dtype_by_column = {
    "boatNo": "int",
    "motorNo": "int",
    "racer_id": "int",
    "ボート2連率": "float64",
    "ボート3連率": "float64",
    "モーター2連率": "float64",
    "モーター3連率": "float64",
    "exhibitionTime": "float64",
    "exhibition_ST": "float64",
    "exhibition_cource": "int",
    "temperature": "float64",
    "tilt": "float64",
    "water_temperature": "float64",
    "wave_height": "int",
    "weight": "float64",
    "wind_speed": "int",
}
race_df_arranged = race_df_arranged.astype(dtype_by_column)

# Keep only the second character of the value and parse it as an integer.
# NOTE(review): presumably the raw values look like "F0" / "L1"; a count of 10
# or more would be truncated to its first digit -- confirm against the crawler.
race_df_arranged["num_false_start"] = race_df_arranged["num_false_start"].map(lambda x: int(str(x)[1]))
race_df_arranged["num_late_start"] = race_df_arranged["num_late_start"].map(lambda x: int(str(x)[1]))

# Encode the racer class as an ordinal; classes not listed here become NaN.
race_df_arranged["racer_class"] = race_df_arranged["racer_class"].map({"A1": 1, "A2": 2, "B1": 3, "B2": 4})

# --- Convert weather into dummy variables ------------------------------------
dummy_df_weather = pd.get_dummies(race_df_arranged["weather"], prefix='weather', drop_first=True)
race_df_arranged = pd.concat([race_df_arranged, dummy_df_weather], axis=1)

# --- Remove races without a complete finishing order -------------------------
# When a boat capsized etc., "着順" contains non-digit characters. Remove the
# whole race in that case, not only the affected racer's row.
# (The original code dropped an undefined `remove_index_list`, which raised
# NameError on a fresh kernel; the filter is now built explicitly.)
race_key_columns = ["date", "venue", "raceNumber"]
# A raw string is required: "\D" in a normal string is an invalid escape sequence.
has_non_digit_rank = race_df_arranged["着順"].astype(str).str.contains(r"\D").tolist()
# The key fields may live in the (Multi)Index, so flatten before reading them.
# reset_index() keeps the row order, so the keys line up positionally.
race_keys = list(
    race_df_arranged.reset_index()[race_key_columns].itertuples(index=False, name=None)
)
# (date, venue, raceNumber) of every race that has at least one non-numeric rank.
remove_race_set = {key for key, is_bad in zip(race_keys, has_non_digit_rank) if is_bad}
keep_row = [key not in remove_race_set for key in race_keys]
# .copy() so the column assignments below act on an independent frame.
race_df_arranged = race_df_arranged.loc[keep_row].copy()
# Every remaining rank is purely numeric, so it can be cast to integer.
race_df_arranged["着順"] = race_df_arranged["着順"].astype("int")

# A MultiIndex is awkward to work with, so turn every index level into a column.
race_df_arranged = race_df_arranged.reset_index()

# Drop the columns that cannot be fed to a model as they are.
race_df_arranged = race_df_arranged.drop(
    columns=["boatNo", "motorNo", "racer_id", "racer_name", "weather", "タイム"]
)

# Sort by date, venue, race number and lane ("枠").
race_df_arranged["raceNumber"] = race_df_arranged["raceNumber"].astype("float")
race_df_arranged = race_df_arranged.sort_values(["date", "venue", "raceNumber", "枠"])

# Save as a pickle file (the ./arrangedData directory must already exist).
race_df_arranged.to_pickle('./arrangedData/data_formatted_1.pkl')

  raw_cell, store_history, silent, shell_futures)


In [51]:
# Preview only the first rows instead of dumping the whole frame into the output.
race_df_arranged.head(10)

Unnamed: 0,date,venue,raceNumber,枠,num_false_start,num_late_start,racer_class,ボート2連率,ボート3連率,モーター2連率,...,late,temperature,tilt,water_temperature,wave_height,weight,wind_speed,着順,weather_曇り,weather_雨
3414,2021-03-05,びわこ,1.0,1,0,0,1,29.14,46.36,19.20,...,0,10.0,-0.5,10.0,6,52.0,5,1,0,1
3415,2021-03-05,びわこ,1.0,2,0,0,3,26.42,45.91,42.59,...,0,10.0,-0.5,10.0,6,52.0,5,4,0,1
3416,2021-03-05,びわこ,1.0,3,0,0,3,34.23,50.34,32.67,...,0,10.0,-0.5,10.0,6,45.5,5,6,0,1
3417,2021-03-05,びわこ,1.0,4,1,0,3,44.57,60.57,32.99,...,0,10.0,-0.5,10.0,6,52.0,5,5,0,1
3418,2021-03-05,びわこ,1.0,5,0,0,3,28.10,42.48,45.05,...,0,10.0,0.0,10.0,6,52.3,5,3,0,1
3419,2021-03-05,びわこ,1.0,6,0,0,3,38.89,53.09,37.09,...,0,10.0,0.0,10.0,6,52.0,5,2,0,1
3420,2021-03-05,びわこ,2.0,1,1,0,2,33.53,46.71,41.10,...,0,10.0,-0.5,10.0,6,58.5,6,1,1,0
3421,2021-03-05,びわこ,2.0,2,0,0,3,32.65,48.98,30.32,...,0,10.0,-0.5,10.0,6,47.1,6,6,1,0
3422,2021-03-05,びわこ,2.0,3,0,0,3,32.67,44.67,31.21,...,0,10.0,0.0,10.0,6,52.5,6,4,1,0
3423,2021-03-05,びわこ,2.0,4,0,0,3,27.33,46.58,33.69,...,0,10.0,-0.5,10.0,6,52.0,6,2,1,0
