# ボートレースのレース情報をクロールしpickleファイルに保存

In [2]:
from datetime import datetime
from datetime import timedelta
from http.client import RemoteDisconnected
from bs4 import BeautifulSoup
import urllib.request
import time
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm

## 1. モジュールをロード
### 1.1 web pageから情報を取ってきてpandas dfに格納するモジュール
#### 1.1.1 出走表のページからスクレイプするモジュール
- 例：https://boatrace.jp/owpc/pc/race/racelist?rno=11&jcd=15&hd=20210224

In [17]:
def scrape_racelist(soup, rno, jcd, hd):
    """
    Scrape the entry list ("racelist") page of a single race.

    One record is produced per ``tbody.is-fs12`` element of the second
    ``table1`` inside ``contentsFrame1_inner``; the lane number ("枠") is
    the 1-based position of that element.  All scraped values are kept as
    strings cut out of the cell text with fixed-position slices.

    :param soup: parsed racelist page (BeautifulSoup object)
    :param rno: race number followed by "R", e.g. "8R"
    :param jcd: venue name, stored unchanged in the "venue" column
    :param hd: race date as a "yyyy/mm/dd" string
    :return: pandas DataFrame with one row per boat
    """
    entry_table = soup.find(class_="contentsFrame1_inner").find_all(class_="table1")[1]
    boat_blocks = entry_table.find_all("tbody", {"class": "is-fs12"})

    records = []

    for lane, block in enumerate(boat_blocks, 1):
        # Lines of the small-font cell: line 1 carries the racer id,
        # line 2 the racer class.
        profile_lines = block.find(class_="is-fs11").text.split("\n")
        racer_id = profile_lines[1][-6:-2]
        racer_class = profile_lines[2][-2:]

        # Racer name; [1:-1] drops the surrounding newline characters.
        racer_name = block.find(class_="is-fs18 is-fBold").text[1:-1]

        # Every "is-lineH2" cell of this boat, looked up once and reused.
        stat_cells = block.find_all("td", {"class": "is-lineH2"})
        start_lines = stat_cells[0].text.split("\n")
        num_false_start = start_lines[1][-3:-1]
        num_late_start = start_lines[2][-3:-1]

        motor_lines = stat_cells[3].text.split("\n")
        boat_lines = stat_cells[4].text.split("\n")

        records.append({
            "date": "-".join([hd[0:4], hd[5:7], hd[8:10]]),
            "venue": jcd,
            "raceNumber": rno[:-1],
            "枠": lane,
            "racer_id": racer_id,
            "racer_class": racer_class,
            "racer_name": racer_name,
            "num_false_start": num_false_start,
            "num_late_start": num_late_start,
            # motor data
            "motorNo": motor_lines[1][-4:-1],
            "モーター2連率": motor_lines[2][-7:-1],
            "モーター3連率": motor_lines[3][-7:-1],
            # boat data
            "boatNo": boat_lines[1][-4:-1],
            "ボート2連率": boat_lines[2][-7:-1],
            "ボート3連率": boat_lines[3][-7:-1],
        })

    # Convert the list of per-boat dicts into a DataFrame.
    racelist_df = pd.DataFrame.from_dict(records)

    # Short pause after each scraped page.
    time.sleep(0.1)

    return racelist_df

#### 1.1.2 直前情報のページからスクレイプするモジュール
- 例：https://boatrace.jp/owpc/pc/race/beforeinfo?rno=11&jcd=15&hd=20210224

In [37]:
def scrape_beforeinfo(soup, rno, jcd, hd):
    """
    Scrape the pre-race ("beforeinfo") page: weather, racer weight,
    exhibition time, tilt, and the exhibition-run course / start time.

    The weather panel is the same for every boat of the race, so it is read
    once and copied into each per-boat record.

    :param soup: parsed beforeinfo page (BeautifulSoup object)
    :param rno: race number followed by "R", e.g. "8R"
    :param jcd: venue name, stored unchanged in the "venue" column
    :param hd: race date as a "yyyy/mm/dd" string
    :return: pandas DataFrame with one row per boat; the per-boat table and
        the exhibition-start table are inner-joined on the lane number "枠"

    # TODO: propeller
    # TODO: parts replacement
    # TODO: previous-race results
    # TODO: adjustment weight (kg)
    # TODO: wind direction

    """
    content = soup.find(class_="contentsFrame1_inner")

    # Looked up first so that a page without this table still fails with
    # the IndexError raised by the [1] lookup.
    table = content.find_all(class_="table1")[1]
    rows = table.find_all("tbody", {"class": "is-fs12"})

    # Water-surface / weather information (page-level, hence outside the
    # per-boat loop).  The trailing 1-2 characters of each value are cut
    # off — presumably the unit suffix; verify against the page.
    weather_panel = content.find(class_="weather1")
    weather_data = weather_panel.find_all(class_="weather1_bodyUnitLabelData")
    weather_string = weather_panel.find_all(class_="weather1_bodyUnitLabelTitle")
    weather = {
        "temperature": weather_data[0].text[:-1],
        "weather": weather_string[1].text,
        "wind_speed": weather_data[1].text[:-1],
        "water_temperature": weather_data[2].text[:-1],
        "wave_height": weather_data[3].text[:-2],
    }

    race_result_dict_list = []

    for i, row in enumerate(rows, 1):
        # Cells spanning four table rows; index 3 is the exhibition time
        # and index 4 the tilt angle.
        tall_cells = row.find_all("td", {"rowspan": "4"})

        race_result_dict_list.append({
            "date": "-".join([hd[0:4], hd[5:7], hd[8:10]]),
            "venue": jcd,
            "raceNumber": rno[:-1],
            # lane number
            "枠": i,
            **weather,
            # Racer weight (kg).  NOTE: it is sometimes not listed on the
            # page, in which case this lookup raises.
            "weight": row.find("td", {"rowspan": "2"}).text[:-2],
            "exhibitionTime": tall_cells[3].text,
            "tilt": tall_cells[4].text,
        })

    # Convert the list of per-boat dicts into a DataFrame.
    beforeinfo_df_1 = pd.DataFrame.from_dict(race_result_dict_list)

    # Course and start time of the exhibition run; the 1-based position of
    # each <tr> is recorded as the course.
    race_result_dict_list_2 = []
    exhibition_start_rows = soup.find(class_="is-w238").find(class_="is-p10-0").find_all("tr")
    boat_number_class = re.compile("table1_boatImage1Number")

    for i, exhibition_start_row in enumerate(exhibition_start_rows, 1):
        race_result_dict_list_2.append({
            "exhibition_cource": i,
            "枠": int(exhibition_start_row.find(class_=boat_number_class).text),
            "exhibition_start_time": exhibition_start_row.find(class_="table1_boatImage1Time").text,
        })

    beforeinfo_df_2 = pd.DataFrame.from_dict(race_result_dict_list_2)

    # Merge the two frames using the lane number as the key.
    beforeinfo_df = pd.merge(beforeinfo_df_1, beforeinfo_df_2, on="枠")

    # Short pause after each scraped page.
    time.sleep(0.1)

    return beforeinfo_df

#### 1.1.3 レース結果のページからスクレイプするモジュール
- 例：https://boatrace.jp/owpc/pc/race/raceresult?rno=11&jcd=15&hd=20210224

In [18]:
def scrape_raceresult(soup, rno, jcd, hd):
    """
    Scrape the race result page: finishing order and race time per boat,
    plus the actual entry course and start time.

    :param soup: parsed raceresult page (BeautifulSoup object)
    :param rno: race number followed by "R", e.g. "8R"
    :param jcd: venue name, stored unchanged in the "venue" column
    :param hd: race date as a "yyyy/mm/dd" string
    :return: pandas DataFrame with one row per boat of the result table;
        course / start-time columns are left-joined onto it, and
        "start_time" is NaN when the cell holds no parseable number
    """
    race_key = {"date": "-".join([hd[0:4], hd[5:7], hd[8:10]]),
                "venue": jcd,
                "raceNumber": rno[:-1]
                }
    index_columns = ["date", "venue", "raceNumber", "枠"]

    tables = soup.find(class_="contentsFrame1_inner").find_all(class_="table1")

    # --- finishing order / time -------------------------------------------
    race_result_dict_list = []
    rows = tables[1].find_all("tbody")

    for row in rows:
        cells = row.find_all("td")
        race_result_dict = dict(race_key)
        race_result_dict["着順"] = cells[0].text
        # The lane number must be an int; otherwise it is treated as a
        # different row when the frames are concatenated.
        race_result_dict["枠"] = int(cells[1].text)
        race_result_dict["タイム"] = cells[3].text

        race_result_dict_list.append(race_result_dict)

    raceresult_df_1 = pd.DataFrame.from_dict(race_result_dict_list)
    raceresult_df_1 = raceresult_df_1.set_index(index_columns)

    # --- entry course / start time ----------------------------------------
    race_result_dict_list_2 = []

    table_2 = tables[2]
    rows_start_time = table_2.find_all(class_="table1_boatImage1TimeInner")
    rows_cource = table_2.find_all(class_=re.compile("table1_boatImage1Number is-type"))

    # The 1-based position (at most 6) is recorded as the entry course.
    for i, row_start_time, row_cource in zip(range(1, 7), rows_start_time, rows_cource):
        race_result_dict_2 = dict(race_key)
        race_result_dict_2["進入コース"] = i
        race_result_dict_2["枠"] = int(row_cource.text)

        # A false or late start leaves non-numeric text (ValueError); an
        # empty cell leaves nothing to index (IndexError).  Both become NaN.
        # The IndexError must not escape: callers that treat IndexError as
        # "race not held" would otherwise discard the whole race.
        try:
            race_result_dict_2["start_time"] = float(row_start_time.text.split()[0])
        except (ValueError, IndexError):
            race_result_dict_2["start_time"] = np.nan

        race_result_dict_list_2.append(race_result_dict_2)

    raceresult_df_2 = pd.DataFrame.from_dict(race_result_dict_list_2)
    raceresult_df_2 = raceresult_df_2.set_index(index_columns)

    raceresult_df = pd.merge(raceresult_df_1, raceresult_df_2, how="left", left_index=True, right_index=True)

    raceresult_df.reset_index(inplace=True)

    # Short pause after each scraped page.
    time.sleep(0.1)

    return raceresult_df

### 1.2 そのほかcrawl, scrapeに必要なモジュール

In [15]:
def make_url(crawl_key, rno, jcd, hd):
    """
    Build the boatrace.jp URL of one page of one race.

    :param crawl_key: page to crawl: "odds3t" (odds), "racelist" (entry list),
        "beforeinfo" (pre-race information) or "raceresult" (race result)
    :param rno: race number as a str, a number 1-12 followed by "R" (e.g. "8R")
    :param jcd: venue name, e.g. "桐　生" or "びわこ"
    :param hd: holding day (race date) as a "yyyy/mm/dd" str, e.g. "2019/03/28"
    :return: URL of the requested page, to be passed to the crawler
    :raises KeyError: if ``jcd`` is not one of the 24 known venue names
    """
    # Venue names in official code order: position n (1-based) maps to the
    # zero-padded two-digit code "nn".
    venues = (
        "桐　生", "戸　田", "江戸川", "平和島", "多摩川", "浜名湖",
        "蒲　郡", "常　滑", "　津　", "三　国", "びわこ", "住之江",
        "尼　崎", "鳴　門", "丸　亀", "児　島", "宮　島", "徳　山",
        "下　関", "若　松", "芦　屋", "福　岡", "唐　津", "大　村",
    )
    venue_codes = {name: str(number).zfill(2) for number, name in enumerate(venues, start=1)}

    # "8R" -> "8"
    race_no = rno[:-1]
    # "yyyy/mm/dd" -> "yyyymmdd"
    day = hd[0:4] + hd[5:7] + hd[8:10]

    page = "http://boatrace.jp/owpc/pc/race/" + crawl_key
    query = "?rno=" + race_no + "&jcd=" + venue_codes[jcd] + "&hd=" + day

    return page + query


def html_parser(site_url):
    """
    Download ``site_url`` and parse it with BeautifulSoup (lxml parser).

    :param site_url: URL of the page to fetch
    :return: BeautifulSoup object of the page, or None when the server
        dropped or reset the connection (a message is printed in that case)
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=site_url, headers=headers)

    try:
        # The context manager closes the HTTP response (and its socket) even
        # when reading fails, so connections are not leaked across the many
        # requests of a crawl.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')

    # While building the database, a race whose request is disconnected is
    # skipped: None is returned for it.
    # RemoteDisconnected is a subclass of ConnectionResetError, so it has to
    # be caught first to keep its own message.
    except RemoteDisconnected:
        print("remote disconnected error !")
        return None

    except ConnectionResetError:
        print("Connection Reset error !")
        return None

    return BeautifulSoup(html, 'lxml')

def get_extractor(crawl_key):
    """
    Return the scraper function that matches the page being crawled.

    :param crawl_key: "racelist", "beforeinfo" or "raceresult"
    :return: the scrape_* function registered for ``crawl_key``
    :raises KeyError: if ``crawl_key`` is not one of the registered keys
    """
    scrapers_by_page = dict(
        racelist=scrape_racelist,
        beforeinfo=scrape_beforeinfo,
        raceresult=scrape_raceresult,
    )

    scraper = scrapers_by_page[crawl_key]
    return scraper

## 2. 実行
- 最初の以下の行にてクロールを行う日付を指定

　　　　　　　　`　hd_list = ["2021/02/0" + str(day) for day in range(1,10)]`
- クロール元：ボートレース 公式サイト（https://boatrace.jp/owpc/pc/race/racelist?rno=12&jcd=01&hd=20210325など）
- 保存先：`../../data/crawledData/` 以下。日にちごとにpickleファイルを作成し保存

In [None]:
# Inclusive crawl window.  The dates are generated with calendar arithmetic
# so that days which do not exist (e.g. 2021/04/31) are never requested.
first_day = datetime(2021, 4, 1)
last_day = datetime(2021, 4, 30)
hd_list = [(first_day + timedelta(days=offset)).strftime("%Y/%m/%d")
           for offset in range((last_day - first_day).days + 1)]

crawl_key_list = ["racelist", "beforeinfo", "raceresult"]
jcd_list =  ["桐　生", "戸　田", "江戸川", "平和島", "多摩川", "浜名湖", "蒲　郡", "常　滑",
                "　津　", "三　国", "びわこ", "住之江", "尼　崎", "鳴　門", "丸　亀", "児　島",
                "宮　島", "徳　山", "下　関", "若　松", "芦　屋", "福　岡", "唐　津", "大　村"
            ]

for hd in hd_list:
    print("{0} のデータをクロール中".format(hd))

    # Data is collected and saved to a file one day at a time.
    today_race_df_list = []

    for jcd in tqdm(jcd_list):
        for i in range(1, 13):
            rno = str(i) + "R"

            # try-except used to skip venues that hold no race on this day
            # (the scrapers raise IndexError when the expected tables are
            # missing from the page).
            try:
                # Crawl every page type and collect the frames that together
                # describe this one race.
                race_info_df_list = []
                fetch_failed = False

                for crawl_key in crawl_key_list:
                    raceResult_url = make_url(crawl_key, rno, jcd, hd)

                    # parse
                    soup = html_parser(raceResult_url)
                    if soup is None:
                        # html_parser returns None (after printing the
                        # reason) when the connection was dropped.  Passing
                        # None on would raise an uncaught AttributeError and
                        # abort the whole crawl, so skip this race instead.
                        fetch_failed = True
                        break

                    # choose the extractor
                    the_extractor = get_extractor(crawl_key)

                    # scrape the target page
                    race_information_df = the_extractor(soup, rno, jcd, hd)
                    race_information_df = race_information_df.set_index(["date", "venue", "raceNumber", "枠"])

                    race_info_df_list.append(race_information_df)

                if fetch_failed:
                    continue

                this_race_df = pd.concat(race_info_df_list, axis=1)
                # Add this race to the list of today's races.
                today_race_df_list.append(this_race_df)

            except IndexError:
                # print(hd + " " + jcd + rno +"データなし")
                pass

    # pd.concat raises ValueError on an empty list, so a day on which nothing
    # could be collected is reported and skipped rather than crashing the run.
    if not today_race_df_list:
        print("{0} はデータなしのため保存をスキップ".format(hd))
        continue

    # Combine today's per-race frames into a single DataFrame.
    today_race_df = pd.concat(today_race_df_list, axis = 0)

    # Save as a pickle file.
    today_race_df.to_pickle('../../data/crawledData/{0}.pkl'.format("".join(hd.split("/"))))

2021/04/01 のデータをクロール中


  0%|          | 0/24 [00:00<?, ?it/s]

2021/04/02 のデータをクロール中


  0%|          | 0/24 [00:00<?, ?it/s]

2021/04/03 のデータをクロール中


  0%|          | 0/24 [00:00<?, ?it/s]

2021/04/04 のデータをクロール中


  0%|          | 0/24 [00:00<?, ?it/s]

2021/04/05 のデータをクロール中


  0%|          | 0/24 [00:00<?, ?it/s]

2021/04/06 のデータをクロール中


  0%|          | 0/24 [00:00<?, ?it/s]

2021/04/07 のデータをクロール中


  0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
# Display the second per-race DataFrame held in today_race_df_list, as left by the crawl loop.
today_race_df_list[1]

In [13]:
# For checking the contents of a saved file
df = pd.read_pickle('../../data/crawledData/20210401.pkl')
# Select all boats of one race via the leading index levels (date, venue, raceNumber).
df.loc[("2021-04-01", "大　村", "9"), :]

  return self._getitem_tuple(key)


Unnamed: 0_level_0,racer_id,racer_class,racer_name,num_false_start,num_late_start,motorNo,モーター2連率,モーター3連率,boatNo,ボート2連率,...,exhibitionTime,tilt,exhibition_cource,exhibition_ST,flying,late,着順,タイム,進入コース,start_time
枠,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4295,A1,小坂 尚哉,F0,L0,62,38.95,54.65,38,26.97,...,6.95,-0.5,1,0.27,0,0,２,"1'49""9",1,0.11
2,4961,A2,西橋 奈未,F1,L0,21,31.14,46.11,55,25.61,...,6.95,-0.5,4,0.07,0,0,６,,3,0.28
3,3711,A2,江本 真治,F0,L0,67,29.88,51.83,40,32.6,...,7.02,-0.5,2,0.23,0,0,５,,4,0.2
4,3257,A1,田頭 実,F1,L0,59,28.18,43.65,52,29.89,...,6.9,-0.5,3,0.17,0,0,１,"1'49""3",2,0.08
5,4657,B1,江崎 一雄,F1,L0,39,35.26,54.21,53,34.64,...,6.92,-0.5,5,0.11,0,0,３,"1'52""2",5,0.17
6,4553,A2,坪口 竜也,F0,L0,12,46.34,61.59,32,27.22,...,6.91,0.0,6,0.07,0,0,４,"1'53""3",6,0.22


In [38]:
# For checking how the crawler behaves on a single page of a single race

crawl_key = "beforeinfo"
jcd =  "大　村"
hd = "2021/04/01"
rno = "9R"

raceResult_url = make_url(crawl_key, rno, jcd, hd)
print(raceResult_url)

# parse
soup = html_parser(raceResult_url)

# choose the extractor
the_extractor = get_extractor(crawl_key)

# scrape the target page
race_information_df = the_extractor(soup, rno, jcd, hd)
race_information_df = race_information_df.set_index(["date", "venue", "raceNumber", "枠"])
race_information_df

http://boatrace.jp/owpc/pc/race/beforeinfo?rno=9&jcd=24&hd=20210401


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,temperature,weather,wind_speed,water_temperature,wave_height,weight,exhibitionTime,tilt,exhibition_cource,exhibition_start_time
date,venue,raceNumber,枠,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2021-04-01,大　村,9,1,21.0,曇り,0,15.0,0,52.0,6.95,-0.5,1,0.27
2021-04-01,大　村,9,2,21.0,曇り,0,15.0,0,47.0,6.95,-0.5,3,0.23
2021-04-01,大　村,9,3,21.0,曇り,0,15.0,0,51.5,7.02,-0.5,4,0.17
2021-04-01,大　村,9,4,21.0,曇り,0,15.0,0,50.5,6.9,-0.5,2,0.07
2021-04-01,大　村,9,5,21.0,曇り,0,15.0,0,51.5,6.92,-0.5,5,0.11
2021-04-01,大　村,9,6,21.0,曇り,0,15.0,0,52.0,6.91,0.0,6,0.07
