In [148]:
import requests
from bs4 import BeautifulSoup
import time

import pandas as pd
import numpy as np

# Missing data

In [None]:
# Load the integrated historical dataset and preview its first rows.
df_origin = pd.read_csv(
    filepath_or_buffer='../data/historical_data/integrated_data.csv',
)
df_origin.head(n=5)

In [34]:
# Keep only the rows of df_origin whose 'racedate' value is null.
missing_df = df_origin.loc[df_origin['racedate'].isna()]

# Get race id

In [35]:
def encode_raceid(return_date_only: bool = False):  # the returned encoder yields str | pd.Timestamp
    """
    Build a row encoder meant to be used with `DataFrame.apply(..., axis=1)`.

    The returned callable reads `id$Year` and `id$MonthDay` (month * 100 + day)
    from a row. With `return_date_only=True` it returns the race date as parsed
    by `pd.to_datetime`; otherwise it additionally reads `id$JyoCD` and
    `id$RaceNum` and returns the race id string in the format
    `yyyy-mm-dd racenum:racetrackid`.
    """
    def two_chars(value) -> str:
        # Prefix a single '0' when the textual form is shorter than two characters.
        text = str(value)
        return text if len(text) >= 2 else '0' + text

    def sub(row: pd.Series):
        # Date part: the year is used as-is, month and day are split out of MonthDay.
        year_text = str(row['id$Year'])
        month_text = two_chars(row['id$MonthDay'] // 100)
        day_text = two_chars(row['id$MonthDay'] % 100)
        date_text = '-'.join((year_text, month_text, day_text))

        if return_date_only:
            return pd.to_datetime(date_text)

        # Race part: race number first, then the racetrack code.
        track_text = two_chars(row['id$JyoCD'])
        race_text = two_chars(row['id$RaceNum'])
        return f'{date_text} {race_text}:{track_text}'

    return sub

In [82]:
# Load the SE table and keep only the identifier columns.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
SE_df = pd.read_pickle('../data/historical_data/SE.pickle').loc[:, [
    'id$Year',
    'id$MonthDay',
    'id$JyoCD',
    'id$Kaiji',
    'id$Nichiji',
    'id$RaceNum',
]]
# Derive the race id of every row with the row encoder.
SE_df = SE_df.assign(raceid=SE_df.apply(encode_raceid(), axis=1))

In [83]:
# Store the racetrack code as text: every value is passed through str().
SE_df['id$JyoCD'] = SE_df['id$JyoCD'].map(str)

In [84]:
# Collapse SE_df to one row per race id, taking the minimum of each identifier column.
SE_df = SE_df.groupby('raceid').agg(dict.fromkeys(
    ['id$Year', 'id$MonthDay', 'id$JyoCD', 'id$Kaiji', 'id$Nichiji', 'id$RaceNum'],
    'min',
))
SE_df.head()

Unnamed: 0_level_0,id$Year,id$MonthDay,id$JyoCD,id$Kaiji,id$Nichiji,id$RaceNum
raceid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-01-05 01:06,2008,105,6,1,1,1
2008-01-05 01:08,2008,105,8,1,1,1
2008-01-05 02:06,2008,105,6,1,1,2
2008-01-05 02:08,2008,105,8,1,1,2
2008-01-05 03:06,2008,105,6,1,1,3


In [94]:
# Distinct race ids among the rows with a missing race date, in order of first appearance.
missing_raceid = missing_df.loc[:, 'raceid'].unique()

# Crawl

In [5]:
# Endpoint used for the race lookups, followed by one example set of request parameters.
baseurl = 'https://jra.jp/JRAEN/AP/kaisai/running'
params = dict(
    raceYmd='20180526',  # YYYYmmdd
    raceJoCd='05',       # Track code table 2001
    raceYear='',         # empty string
    raceKai='02',
    raceHi='11',
    raceNo='01',         # race no
)

In [128]:
# Column-oriented accumulator for the crawled values: one independent list per output column.
result = {column: [] for column in ('raceid', 'Kyori', 'Track_type', 'Weather')}
result

{'raceid': [], 'Kyori': [], 'Track_type': [], 'Weather': []}

In [161]:
def process(raceinfo, raceid, result):
    """
    Parse one comma-split race-info record and append it to `result`.

    Parameters
    ----------
    raceinfo : sequence of str
        Fields of the race-info text. Index 2 is read as the distance
        (e.g. ' 1200m'), index 3 as the track description and index 4 as
        the weather.
    raceid
        Identifier stored next to the parsed values.
    result : dict
        Accumulator mutated in place; it must hold lists under the keys
        'raceid', 'Kyori', 'Track_type' and 'Weather'.

    Raises
    ------
    ValueError
        If the distance field is not numeric once surrounding whitespace and
        a trailing 'm' unit are removed. Nothing is appended in that case.
    """
    # Remove whitespace first, then the unit letter only if it is really there.
    # Blindly dropping the last character would turn ' 1200' into 120.0 and
    # would choke on ' 1200m ' (trailing whitespace).
    distance_text = raceinfo[2].strip().rstrip('mM')
    kyori = float(distance_text)

    # strip() also removes stray tabs / line breaks, not just spaces.
    track = raceinfo[3].strip()
    we = raceinfo[4].strip()

    # Everything is parsed before the first append, so a parse failure
    # never leaves the column lists with unequal lengths.
    result['raceid'].append(raceid)
    result['Kyori'].append(kyori)
    result['Track_type'].append(track)
    result['Weather'].append(we)

In [None]:
# First crawl pass: request the page of every race id in `missing_raceid`,
# parse its race-info cell and append the values to `result`.
# Races whose page does not contain a usable race-info cell are collected in
# `empty_data`.
# NOTE: `result` is appended to, so re-running this cell without re-creating
# `result` duplicates rows.
empty_data = []
for i, race in enumerate(missing_raceid):
    print(i)  # progress indicator
    race_row = SE_df.loc[race]  # one lookup instead of one per field

    params = {
        # YYYYmmdd: MonthDay is zero-padded to four characters (e.g. 105 -> '0105')
        'raceYmd': str(race_row['id$Year']) + str(race_row['id$MonthDay']).rjust(4, '0'),
        'raceJoCd': str(race_row['id$JyoCD']).rjust(2, '0'),  # Track code table 2001
        'raceYear': '',  # empty string
        'raceKai': str(race_row['id$Kaiji']).rjust(2, '0'),
        'raceHi': str(race_row['id$Nichiji']).rjust(2, '0'),
        'raceNo': str(race_row['id$RaceNum']).rjust(2, '0'),  # race no
    }

    time.sleep(0.5)  # throttle: pause before every request
    # Without a timeout a stalled connection would block the crawl forever.
    page = requests.post(baseurl, params=params, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # The race info is read from the first cell of the third table on the page.
    tables = soup.find_all('table')
    if len(tables) < 3:
        empty_data.append(race)
        continue
    cells = tables[2].find_all('td')
    if not cells:
        # A table without cells is treated like a missing table instead of
        # aborting the whole crawl with an IndexError.
        empty_data.append(race)
        continue
    raceinfo = cells[0].text.strip('\r\n\t ').split(',')
    if len(raceinfo) < 5:
        empty_data.append(race)
        continue

    process(raceinfo, race, result)

In [None]:
# Second crawl pass: retry the race ids collected in `empty_data`. Compared
# with the first pass the request omits 'raceYear' and sends an empty
# 'abortRaceFlg'. Races that still yield no usable race-info cell are
# collected in `empty_data2`.
# NOTE: `result` is appended to, so re-running this cell duplicates rows.
empty_data2 = []
for i, race in enumerate(empty_data):
    print(i)  # progress indicator
    race_row = SE_df.loc[race]  # one lookup instead of one per field

    params = {
        # YYYYmmdd: MonthDay is zero-padded to four characters (e.g. 105 -> '0105')
        'raceYmd': str(race_row['id$Year']) + str(race_row['id$MonthDay']).rjust(4, '0'),
        'raceJoCd': str(race_row['id$JyoCD']).rjust(2, '0'),  # Track code table 2001
        'raceKai': str(race_row['id$Kaiji']).rjust(2, '0'),
        'raceHi': str(race_row['id$Nichiji']).rjust(2, '0'),
        'raceNo': str(race_row['id$RaceNum']).rjust(2, '0'),  # race no
        'abortRaceFlg': '',
    }

    time.sleep(0.5)  # throttle: pause before every request
    # Without a timeout a stalled connection would block the crawl forever.
    page = requests.post(baseurl, params=params, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # The race info is read from the first cell of the third table on the page.
    tables = soup.find_all('table')
    if len(tables) < 3:
        empty_data2.append(race)
        continue
    cells = tables[2].find_all('td')
    if not cells:
        # A table without cells is treated like a missing table instead of
        # aborting the whole crawl with an IndexError.
        empty_data2.append(race)
        continue
    raceinfo = cells[0].text.strip('\r\n\t ').split(',')
    if len(raceinfo) < 5:
        empty_data2.append(race)
        continue

    process(raceinfo, race, result)

In [163]:
# Race ids that still yielded no usable race info after the second crawl pass.
empty_data2

['2018-07-01 05:02']

In [165]:
# Append one hard-coded record for race '2018-07-01 05:02' to the accumulator.
# NOTE: this appends on every execution, so run the cell only once per `result`.
manual_row = {
    'raceid': '2018-07-01 05:02',
    'Kyori': 1200.,
    'Track_type': 'Turf Yielding',
    'Weather': 'Rainy',
}
for column, value in manual_row.items():
    result[column].append(value)

In [166]:
# Turn the accumulator into a DataFrame, write it to CSV without the index
# column, and preview the first rows.
extended_data = pd.DataFrame(data=result)
extended_data.to_csv(
    '../data/historical_data/extended_data.csv',
    index=False,
)
extended_data.head(n=5)

Unnamed: 0,raceid,Kyori,Track_type,Weather
0,2018-05-26 01:05,1600.0,Dirt Standard,Cloudy
1,2018-05-26 01:08,1800.0,Dirt Standard,Cloudy
2,2018-05-26 02:05,1400.0,Turf Firm,Cloudy
3,2018-05-26 02:08,1400.0,Dirt Standard,Cloudy
4,2018-05-26 03:05,1800.0,Turf Firm,Cloudy


In [167]:
# Preview the last rows of the crawled data.
extended_data.tail()

Unnamed: 0,raceid,Kyori,Track_type,Weather
355,2018-07-01 03:03,1700.0,Dirt Standard,Fine
356,2018-07-01 04:03,2750.0,Jump Firm,Fine
357,2018-07-01 06:02,2000.0,Turf Yielding,Drizzle
358,2018-07-01 12:03,1150.0,Dirt Standard,Fine
359,2018-07-01 05:02,1200.0,Turf Yielding,Rainy


# Check

In [170]:
# Spot-check five random rows. The fixed seed keeps the selection identical
# across runs, so any follow-up check of a sampled race id stays reproducible.
extended_data.sample(5, random_state=0)

Unnamed: 0,raceid,Kyori,Track_type,Weather
251,2018-06-24 11:09,2200.0,Turf Good,Fine
274,2018-06-30 07:07,2200.0,Turf Firm,Cloudy
294,2018-07-01 03:07,1800.0,Dirt Standard,Fine
130,2018-06-16 02:09,1800.0,Dirt Standard,Fine
117,2018-06-10 08:09,1200.0,Turf Firm,Cloudy


In [198]:
# Show the identifier fields stored in SE_df for one race id.
SE_df.loc['2018-06-10 08:09']

id$Year        2018
id$MonthDay     610
id$JyoCD          9
id$Kaiji          3
id$Nichiji        4
id$RaceNum        8
Name: 2018-06-10 08:09, dtype: object

In [199]:
# Manual spot check: request a single race with explicitly written parameters
# and display the comma-split race-info fields of the response.
params = {
    'raceYmd': '20180610',  # YYYYmmdd
    'raceJoCd': '09',       # Track code table 2001
    'raceKai': '03',
    'raceHi': '04',
    'raceNo': '08',         # race no
    'raceYear': '',
    # 'abortRaceFlg' is not sent in this check
}
# Without a timeout a stalled connection would block the cell forever.
page = requests.post(baseurl, params=params, timeout=30)
soup = BeautifulSoup(page.text, 'html.parser')

# The race info is read from the first cell of the third table on the page.
tables = soup.find_all('table')
raceinfo = tables[2].find_all('td')[0]
raceinfo = raceinfo.text.strip('\r\n\t ').split(',')
raceinfo

['June 10',
 ' 2018',
 ' 1200m',
 ' Turf Firm',
 ' Cloudy',
 '\r\n        Post time 13:50']