In [56]:
import sys

print("{}{: >25}{}{: >10}{}".format('|','Variable Name','|','Memory','|'))
print(" ------------------------------------ ")
for var_name in dir():
    if not var_name.startswith("_") and sys.getsizeof(eval(var_name)) > 1000000: #ここだけアレンジ
        print("{}{: >25}{}{: >10}{}".format('|',var_name,'|',sys.getsizeof(eval(var_name)),'|'))

|            Variable Name|    Memory|
 ------------------------------------ 


# make race_id_list

In [15]:
import codecs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
%matplotlib inline

plt.style.use("ggplot")
import pandas as pd
import time
import datetime
from tqdm.notebook import tqdm
import requests
from bs4 import BeautifulSoup
import re
from urllib.request import urlopen
import xml.etree.ElementTree as et
from lxml import etree
from dateutil import parser
from selenium.webdriver import Chrome, ChromeOptions
import chromedriver_binary
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options


def update_data(old,new):
    filtered_old = old[~old.index.isin(new.index)]
    return pd.concat([filtered_old,new])

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

In [3]:
race_id_list = []
for ids in (2023060105,2023060106,2023070105,2023070106,2023100101,2023100102):
    for r in range(1, 13, 1):
        race_id = (str(ids) + str(r).zfill(2))
        race_id_list.append(race_id)

In [3]:
race_id_list = []
for year in range(2012,2014,1):
    for place in range(1, 11, 1):
        for kai in range(1, 13, 1):
            for day in range(1, 13, 1):
                for r in range(1, 13, 1):
                    race_id = (
                        str(year)
                        + str(place).zfill(2)
                        + str(kai).zfill(2)
                        + str(day).zfill(2)
                        + str(r).zfill(2)
                    )
                    race_id_list.append(race_id)

# definition

In [16]:
old = '0331'
new = '0407'

def scrape_kaisai_date(from_: str, to_: str):
    """
    yyyy-mmの形式でfrom_とto_を指定すると、間のレース開催日一覧が返ってくる関数。
    to_の月は含まないので注意。
    """
    print('getting race date from {} to {}'.format(from_, to_))
    # 間の年月一覧を作成
    date_range = pd.date_range(start=from_, end=to_, freq="W")
    # 開催日一覧を入れるリスト
    kaisai_date_list = []
    for year, month in tqdm(zip(date_range.year, date_range.month), total=len(date_range)):
        #取得したdate_rangeから、スクレイピング対象urlを作成する。
        #urlは例えば、https://race.netkeiba.com/top/calendar.html?year=2022&month=7 のような構造になっている。
        query = [
            'year=' + str(year),
            'month=' + str(month),
        ]
        url = 'https://race.netkeiba.com/top/calendar.html' + '?' + '&'.join(query)
        html = urlopen(url).read()
        time.sleep(0.1)
        soup = BeautifulSoup(html, "html.parser")
        a_list = soup.find('table', class_='Calendar_Table').find_all('a')
        for a in a_list:
            kaisai_date_list.append(re.findall('(?<=kaisai_date=)\d+', a['href'])[0])
        
        from_ = from_.replace('-', '')
        to_ = to_.replace('-', '')
        kaisai_date_list = [date for date in kaisai_date_list if from_ <= date <= to_]
    return kaisai_date_list
    
def scrape_race_id_list(kaisai_date_list: list, from_shutuba=False, waiting_time=10):
    """
    開催日をyyyymmddの文字列形式でリストで入れると、レースid一覧が返ってくる関数。
    レース前日準備のためrace_idを取得する際には、from_shutuba=Trueにする。
    ChromeDriverは要素を取得し終わらないうちに先に進んでしまうことがあるので、その場合の待機時間をwaiting_timeで指定。
    """
    race_id_list = []
    options = ChromeOptions()
    driver = Chrome(options=options)
    #画面サイズをなるべく小さくし、余計な画像などを読み込まないようにする
    driver.set_window_size(8, 8)
    print('getting race_id_list')
    for kaisai_date in tqdm(kaisai_date_list):
        try:
            query = [
                'kaisai_date=' + str(kaisai_date)
            ]
            url = 'https://race.netkeiba.com/top/race_list.html' + '?' + '&'.join(query)
            print('scraping: {}'.format(url))
            driver.get(url)
            try:
                # 取得し終わらないうちに先に進んでしまうのを防ぐ
                time.sleep(1)
                a_list = driver.find_element(By.CLASS_NAME, 'RaceList_Box').find_elements(By.TAG_NAME, 'a')
            except:
                #それでも取得できなかったらもう10秒待つ
                print('waiting more {} seconds'.format(waiting_time))
                time.sleep(waiting_time)
                a_list = driver.find_element(By.CLASS_NAME, 'RaceList_Box').find_elements(By.TAG_NAME, 'a')
            for a in a_list:
                if from_shutuba:
                    race_id = re.findall('(?<=shutuba.html\?race_id=)\d+', a.get_attribute('href'))
                else:
                    race_id = re.findall('(?<=result.html\?race_id=)\d+', a.get_attribute('href'))
                if len(race_id) > 0:
                    race_id_list.append(race_id[0])
        except Exception as e:
            print(e)
            break
    driver.close()
    return race_id_list

def scrape_race_results(race_id_list, pre_race_results={}):
    race_results = pre_race_results
    for race_id in tqdm(race_id_list):
        if race_id in race_results.keys():
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            df = pd.read_html(url)[0]

            # horse_idとjockey_idをスクレイピング
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")
            
            # horse_id
            horse_id_list = []
            horse_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all("a", attrs={"href": re.compile("^/horse")})
            for a in horse_a_list:
                horse_id = re.findall(r"\d+", a["href"])
                horse_id_list.append(horse_id[0])
            
            # jockey_id
            jockey_id_list = []
            jockey_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all("a", attrs={"href": re.compile("^/jockey")})
            for a in jockey_a_list:
                jockey_id = re.findall(r"\d+", a["href"])
                jockey_id_list.append(jockey_id[0])
                
            # trainer_id
            trainer_id_list = []
            trainer_a_list = soup.find("table", attrs={"summary": "レース結果"}).find_all("a", attrs={"href": re.compile("^/trainer")})
            for a in trainer_a_list:
                trainer_id = re.findall(r"\d+", a["href"])
                trainer_id_list.append(trainer_id[0])

            df["horse_id"] = horse_id_list
            df["jockey_id"] = jockey_id_list
            df["trainer_id"] = trainer_id_list
            race_results[race_id] = df
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_results

def scrape_race_info(race_id_list,pre_race_infos= {}):
    race_infos = pre_race_infos
    for race_id in tqdm(race_id_list):
        if race_id in race_infos.keys():
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            html = requests.get(url)
            html.encoding = "EUC-JP"
            soup = BeautifulSoup(html.text, "html.parser")

            texts = (
                soup.find("div", attrs={"class": "data_intro"}).find_all("p")[0].text
                +soup.find("div", attrs={"class": "data_intro"}).find_all("p")[1].text
            )
            info = re.findall(r'\w+', texts) #Qiitaでバックスラッシュを打つとバグるので大文字にしてあります。
            info_dict = {}
            for text in info:
                if text in ["芝", "ダート"]:
                    info_dict["race_type"] = text
                if "障" in text:
                    info_dict["race_type"] = "障害"
                if "m" in text:
                    info_dict["course_len"] = int(re.findall(r"\d+", text)[-1]) #0 ここも同様に大文字にしてます。
                if text in ["良", "稍重", "重", "不良"]:
                    info_dict["ground_state"] = text
                if text in ["曇", "晴", "雨", "小雨", "小雪", "雪"]:info_dict["weather"] = text
                if "年" in text:
                    info_dict["date"] = text
            race_infos[race_id] = info_dict
            time.sleep(0.1)
        except IndexError:
            continue
        except Exception as e:
            print(e)
            break
    return race_infos

def scrape_horse_birth(horse_id_list, pre={}):
    horse_birth = pre
    for horse_id in tqdm(horse_id_list):
        if horse_id in horse_birth.keys():
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            df = pd.read_html(url)[1]
            horse_birth[horse_id] = df[1][0]
            time.sleep(0.01)
        except IndexError:
            continue
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            break
        except:
            break
    return horse_birth

def scrape_horse_results(horse_id_list, pre={}):
    horse_results = pre
    session =  requests.Session()
    login_data = {'login_id':'hajimefutaki7946@gmaik.com','pswd':'4Wr2m3Tj3DCMmr5'}
    session.post('https://regist.netkeiba.com/account/?pid=login&action=auth',login_data)
    for horse_id in tqdm(horse_id_list):
        if horse_id in horse_results.keys():
            continue
        try:
            url = 'https://db.netkeiba.com/horse/' + horse_id
            html = session.get(url)
            df = pd.read_html(html.content)[3]
            #df = pd.read_html(url)[3]
            if df.columns[0]=='受賞歴':
                df = pd.read_html(html.content)[4]
            horse_results[horse_id] = df
            time.sleep(0.01)
        except IndexError:
            continue
        except Exception as e:
            import traceback
            traceback.print_exc()
            print(e)
            break
        except:
            break
    return horse_results

def scrape_return_tables(race_id_list, pre_return_tables={}):
    return_tables = pre_return_tables
    for race_id in tqdm(race_id_list):
        if race_id in return_tables.keys():
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            f = urlopen(url)
            html = f.read()
            html = html.replace(b'<br />', b'br')
            dfs = pd.read_html(html)
            return_tables[race_id] = pd.concat([dfs[1], dfs[2]])
            time.sleep(0.1)
        except IndexError:
            continue
        except:
            break
    return return_tables

def scrape_jockey_results(jockey_id_list, pre={}):
    jockey_results = pre
    for jockey_id in tqdm(jockey_id_list):
        if jockey_id in jockey_results.keys():
            continue
        for no in range(1,3,1):
            try:
                url = 'https://db.netkeiba.com/?pid=jockey_detail&id=' + jockey_id + '&page=' + str(no)
                df = pd.read_html(url)[0]
                jockey_results[jockey_id,no] = df
                time.sleep(0.01)
            except IndexError:
                #print('e1')
                continue
            except Exception as e:
                #print('e2')
                continue
    return jockey_results

def scrape_trainer_results(trainer_id_list, pre={}):
    trainer_results = pre
    for trainer_id in tqdm(trainer_id_list):
        if trainer_id in trainer_results.keys():
            continue
        for no in range(1,3,1):
            try:
                url = 'https://db.netkeiba.com/?pid=trainer_detail&id=' + trainer_id + '&page=' + str(no)
                df = pd.read_html(url)[0]
                trainer_results[trainer_id,no] = df
                time.sleep(0.01)
            except IndexError:
                #print('e1')
                continue
            except Exception as e:
                #print('e2')
                continue
    return trainer_results

def scrape_race_card_table(race_id_list):
    data = pd.DataFrame()
    for race_id in tqdm(race_id_list):
        url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + race_id
        df = pd.read_html(url)[0]
        df = df.T.reset_index(level=0,drop=True).T
        
        html = requests.get(url)
        html.encoding = 'EUC-JP'
        soup = BeautifulSoup(html.text,'html.parser')
        
        texts = soup.find('div',attrs={'class':'RaceData01'}).text
        texts = re.findall(r'\w+',texts)
        df['発走時刻'] = [texts[0]+':'+texts[1][:2]] * len(df)
        for text in texts:
            if 'm' in text:
                df['course_len'] = [str(re.findall(r'\d+',text)[0])] * len(df)
            #if text in ['曇','晴','雨','小雨','小雪','雪']:
            #    df['weather'] = [text] * len(df)
            #if text in ['良', '稍重', '重']:
            #    df['ground_state'] = [text] * len(df)
            #if '不' in text:
            #    df['ground_state'] = ['不良'] * len(df)
            #if '稍' in text:
            #    df['ground_state'] = ['稍重'] * len(df)
            if '芝' in text:
                df['R_type'] = ['芝'] * len(df)
            if '障' in text:
                df['R_type'] = ['障害'] * len(df)
            if 'ダ' in text:
                df['R_type'] = ['ダート'] * len(df)
                
        race_num = soup.find('span',attrs={'class':'RaceNum'}).text
        df['R'] = re.findall(r'\w+',race_num) * len(df)
        
        race_name = soup.find('div',attrs={'class':'RaceName'}).text
        df['race_name'] = re.findall(r'\w+',race_name) * len(df)
        
        date = soup.find('dd',attrs={'class':'Active'}).text
        df['date'] = re.findall(r'\d+/\d+',date)[0]
        
        race_infos2 = soup.find('div',attrs={'class':'RaceData02'}).text
        df['place'] = re.findall(r'\w+',race_infos2)[1]
        df['race_genre'] = re.findall(r'\w+',race_infos2)[3]
        df['race_grade'] = re.findall(r'\w+',race_infos2)[4]
        #df['頭数'] = re.findall(r'\w+',race_infos2)[8]
        
        horse_id_list = []
        horse_td_list = soup.find_all('td',attrs={'class':'HorseInfo'})
        for td in horse_td_list:
            horse_id = re.findall(r'\d+',td.find('a')['href'])[0]
            horse_id_list.append(horse_id)
        
        jockey_id_list = []
        jockey_td_list = soup.find_all('td',attrs={'class':'Jockey'})
        for td in jockey_td_list:
            jockey_id = re.findall(r'\d+',td.find('a')['href'])[0]
            jockey_id_list.append(jockey_id)

        trainer_id_list = []
        trainer_td_list = soup.find_all('td',attrs={'class':'Trainer'})
        for td in trainer_td_list:
            trainer_id = re.findall(r'\d+',td.find('a')['href'])[0]
            trainer_id_list.append(trainer_id)
        
        df["horse_id"] = horse_id_list
        df['jockey_id'] = jockey_id_list
        df["trainer_id"] = trainer_id_list
        df['place_id'] = df['place'].map(place_dict)
        df['course_len_id'] = df['course_len']
        df['R_type_id'] = df['R_type'].map(R_type_dict)
        df['course_id'] = df['place_id']+df['R_type_id']+df['course_len_id']
        df = df[['date','R','race_name','race_genre','race_grade','発走時刻','馬番','枠','horse_id','馬名','性齢','斤量','jockey_id','騎手','course_id','place','course_len','R_type']]
        
        df.index = [race_id]*len(df)
        df['date'] = df.index.str[:4] +'/'+ df['date']
        df['date'] = pd.to_datetime(df['date'])
        data = data.append(df)
        
        time.sleep(0.1)
        
    return data

place_dict = {'札幌':'01','函館':'02','福島':'03','新潟':'04','東京':'05','中山':'06','中京':'07','京都':'08','阪神':'09','小倉':'10'}
R_type_dict = {'芝':'01','ダート':'00','ダ':'00','障':'02','障害':'02'}

# scrape_race_id_list

In [17]:
now = datetime.datetime.now()
one_week_after = now + datetime.timedelta(weeks=1)
one_week_before = now + datetime.timedelta(weeks=-1)

to_ = str(now.date())
#to_ = str(one_week_after.date())
from_ = str(one_week_before.date())
kaisai_date_list = sorted(list(set(scrape_kaisai_date(from_, to_))))
with open('kaisai_date_list.txt', 'w') as f:
    for item in kaisai_date_list:
        f.write(str(item) + '\n')

race_id_list = list(set(scrape_race_id_list(kaisai_date_list)))
print(len(race_id_list))

getting race date from 2023-03-31 to 2023-04-07


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))


getting race_id_list


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2.0), HTML(value='')))

scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230401
scraping: https://race.netkeiba.com/top/race_list.html?kaisai_date=20230402

48


# scrape_race_result

In [18]:
race_results = scrape_race_results(race_id_list)

for key in race_results:
    race_results[key].index = [key] * len(race_results[key])

race_results = pd.concat([race_results[key] for key in race_results], sort=False)
race_infos = scrape_race_info(race_id_list)
race_infos = pd.concat([pd.DataFrame(race_infos[key], index=[key]) for key in race_infos])

for key in race_infos:
    race_infos[key].index = [key] * len(race_infos[key])

race_results = race_results.merge(race_infos, left_index=True, right_index=True, how='left')
race_results.to_pickle('race_results_new.pickle')
race_results_old = pd.read_pickle('race_results_2012-2023' + old + '.pickle')
race_results_new = pd.read_pickle('race_results_new.pickle')
race_results = update_data(race_results_old,race_results_new)
race_results.to_pickle('race_results_2012-2023' + new + '.pickle')
len(race_results.index.unique()) - len(race_id_list) - len(race_results_old.index.unique())

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=48.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=48.0), HTML(value='')))




0

# scrape horse_birth

In [19]:
horse_id_list = race_results_new['horse_id'].unique()
len(horse_id_list)

horse_birth = scrape_horse_birth(horse_id_list)

horse_birth = pd.DataFrame(horse_birth,index=['birth']).T
horse_birth['birth'] = pd.to_datetime(horse_birth['birth'], format="%Y年%m月%d日")
horse_birth.to_pickle('horse_birth_new.pickle')
horse_birth_old = pd.read_pickle('horse_birth_2012-2023' + old + '.pickle')
horse_birth_new = pd.read_pickle('horse_birth_new.pickle')
horse_birth = update_data(horse_birth_old,horse_birth_new)
horse_birth.to_pickle('horse_birth_2012-2023' + new + '.pickle')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=639.0), HTML(value='')))




# scrape horse_results

In [5]:
race_results_new = pd.read_pickle('race_results_new.pickle')
horse_id_list = race_results_new['horse_id'].unique()
len(horse_id_list)
horse_results = scrape_horse_results(horse_id_list)

for key in horse_results:
    horse_results[key].index = [key] * len(horse_results[key])
    
df = pd.concat([horse_results[key] for key in horse_results])
df.to_pickle('horse_results_new.pickle')
horse_results_old = pd.read_pickle('horse_results_2012-2023' + old + '.pickle')
horse_results_new = pd.read_pickle('horse_results_new.pickle')

horse_results = update_data(horse_results_old,horse_results_new)
horse_results.to_pickle('horse_results_2012-2023' + new + '.pickle')
len(horse_results)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=639.0), HTML(value='')))




1271260

# scrape return_tables

In [6]:
return_tables = scrape_return_tables(race_id_list)

for key in return_tables:
    return_tables[key].index = [key] * len(return_tables[key])
    
return_tables = pd.concat([return_tables[key] for key in return_tables])
return_tables.to_pickle('return_tables_new.pickle')
return_tables_old = pd.read_pickle('return_tables_2012-2023' + old + '.pickle')
return_tables_new = pd.read_pickle('return_tables_new.pickle')

return_tables = update_data(return_tables_old,return_tables_new)
return_tables.to_pickle('return_tables_2012-2023' + new + '.pickle')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=48.0), HTML(value='')))




# scrape jockey_results

In [7]:
race_results = pd.read_pickle('race_results_2012-2023' + new + '.pickle')
jockey_id_list_all = race_results['jockey_id'].unique()
len(jockey_id_list_all)
jockey_results = scrape_jockey_results(jockey_id_list_all)
jrbu = jockey_results.copy()
for key in jockey_results:
    jockey_results[key].index = [key] * len(jockey_results[key])

jockey_results = pd.concat([jockey_results[key] for key in jockey_results], sort=False)
jockey_results['id'] = jockey_results.index
jockey_results['id'] = jockey_results[['id']].astype(str)
jockey_results['id'] = jockey_results['id'].map(lambda x: str(x)[2:7])
jockey_results.index.name = 'jockey_id'
jockey_results = jockey_results.set_index('id')
jockey_results.index.name = 'jockey_id'
jockey_results.to_pickle('jockey_results_new.pickle')
jockey_results_old = pd.read_pickle('jockey_results_2012-2023' + old + '.pickle')
jockey_results_new = pd.read_pickle('jockey_results_new.pickle')

jockey_results = pd.concat([jockey_results_old,jockey_results_new])
jockey_results = jockey_results.drop_duplicates()
jockey_results = jockey_results.sort_values(['jockey_id','日付'], ascending=[True, False])
jockey_results.to_pickle('jockey_results_2012-2023' + new + '.pickle')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=438.0), HTML(value='')))




# scrape trainer_results

In [8]:
race_results = pd.read_pickle('race_results_2012-2023' + new + '.pickle')
trainer_id_list_all = race_results['trainer_id'].unique()
len(trainer_id_list_all)
trainer_results = scrape_trainer_results(trainer_id_list_all)

for key in trainer_results:
    trainer_results[key].index = [key] * len(trainer_results[key])

trainer_results = pd.concat([trainer_results[key] for key in trainer_results], sort=False)
trainer_results['id'] = trainer_results.index
trainer_results['id'] = trainer_results[['id']].astype(str)
trainer_results['id'] = trainer_results['id'].map(lambda x: str(x)[2:7])
trainer_results.index.name = 'trainer_id'
trainer_results = trainer_results.set_index('id')
trainer_results.index.name = 'trainer_id'
trainer_results.to_pickle('trainer_results_new.pickle')
trainer_results_old = pd.read_pickle('trainer_results_2012-2023' + old + '.pickle')
trainer_results_new = pd.read_pickle('trainer_results_new.pickle')

trainer_results = pd.concat([trainer_results_old,trainer_results_new])
trainer_results = trainer_results.drop_duplicates()
trainer_results = trainer_results.sort_values(['trainer_id','日付'], ascending=[True, False])
trainer_results.to_pickle('trainer_results_2012-2023' + new + '.pickle')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=492.0), HTML(value='')))




# scrape corner_ranks

In [9]:
def scrape_corner_rank(race_id_list, pre_corner_ranks={}):
    corner_ranks = pre_corner_ranks
    for race_id in tqdm(race_id_list):
        if race_id in corner_ranks.keys():
            continue
        try:
            url = "https://db.netkeiba.com/race/" + race_id
            f = urlopen(url)
            html = f.read()
            html = html.replace(b'<br />', b'br')
            dfs = pd.read_html(html)
            corner_ranks[race_id] = dfs[4]
            time.sleep(0.1)
        except IndexError:
            continue
        except:
            break
    return corner_ranks

def convert_comma(s: str) -> str:
    result = ""
    bracket_level = 0
    for c in s:
        if c == "(":
            result += c
            bracket_level += 1
        elif c == ")":
            result += c
            bracket_level -= 1
        elif c == "," and bracket_level > 0:
            result += ":"
        else:
            result += c
    return result

def parse_string(s: str) -> pd.DataFrame:
    # 結果を保存するリスト
    df_list = []
    x = 0
    y = 0
    
    # 文字列を 1 文字ずつ取り出す
    i = 0
    while i < len(s):
        c = s[i]
        if c == "*":
            y += 1
        elif c == "(":
            x = 0
            y -= 1
        elif c == ")":
            x = 0
            y -= 1
        elif c == ":":
            x += 1
        elif c == ",":
            y -= 1.5
            x = 0
        elif c == "-":
            y -= 3
        elif c == "=":
            y -= 5
        elif c.isdigit():
            # 数字の場合は、数字を連続して取り出す
            value = 0
            while i < len(s) and s[i].isdigit():
                value = value * 10 + int(s[i])
                i += 1
            # 結果を保存するリストに追加する
            df_list.append({"number": value,"x": x, "y": y})
            continue
        i += 1
            
    # リストを DataFrame に変換して返す
    return pd.DataFrame(df_list)

In [10]:
corner_ranks = scrape_corner_rank(race_id_list)

for key in corner_ranks:
    corner_ranks[key].index = [key] * len(corner_ranks[key])
    
corner_ranks = pd.concat([corner_ranks[key] for key in corner_ranks])
corner_ranks.columns = ['corner', 'rank']
corner_ranks['rank'] = corner_ranks['rank'].astype(str).transform(convert_comma)
#corner_ranks = corner_ranks[corner_ranks[0]=='4コーナー'].drop(0,axis=1)
corner_ranks.to_pickle('corner_ranks_new.pickle')
df = pd.read_pickle('corner_ranks_2012-2023'+ old +'.pickle')
df = pd.concat([df, corner_ranks],axis=0)
df.to_pickle('corner_ranks_2012-2023'+ new +'.pickle')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=48.0), HTML(value='')))




In [11]:
df = df[df['corner']=='4コーナー']

In [12]:
df_parse = pd.DataFrame()
for i in tqdm(range(df.index.nunique())):
    test = []
    test = parse_string(df['rank'][i])
    test.index = [df.index[i]] * len(test)
    df_parse = pd.concat([df_parse,test])

df_parse['race_id'] = df_parse.index
df_parse.to_pickle('corner_ranks_parse_2012-2023'+ new +'.pickle')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=38637.0), HTML(value='')))




In [13]:
hr_master = pd.read_pickle('horse_results_2012-2023'+new+'.pickle')
hr = hr_master.copy()

def extract_number(string):
    # Use the search function to find the first occurrence of a sequence of one or more digits
    result = re.search(r'\d+$', string)
    # If a match was found, extract the matched string and return it as an integer
    if result:
        return int(result.group())
    # If no match was found, return None
    return None
#地方競争はここで除外
hr['place'] = hr['開催'].str.extract(r'(\D+)')[0].map(place_dict).fillna('11')
hr['events'] = hr['開催'].str.extract(r'(^\d+)')
hr.dropna(subset=['R'], inplace=True)
hr['R'] = hr['R'].astype(int)
hr['R'] = hr['R'].apply(lambda x: str(x).zfill(2))
hr['weeks'] = hr['開催'].apply(extract_number)
hr.dropna(subset=['weeks'], inplace=True)
hr['weeks'] = hr['weeks'].astype(int)
hr['weeks'] = hr['weeks'].apply(lambda x: str(x).zfill(2))
hr['race_id'] = hr['日付'].str[:4] + hr['place'] +hr['events'].str.zfill(2) + hr['weeks'] + hr['R']
hr.drop(['place','events','weeks'], axis=1, inplace=True)
hr['number'] = hr['馬番']

In [14]:
hr_test = hr.merge(df_parse, on=['race_id','number'],how="left")
hr_test.index = hr.index
hr_test.drop(['number'], axis=1, inplace=True)
hr_test.to_pickle('horse_results_corner_rankd_merge_2012-2023'+new+'.pickle')

# scrape race_card_table

In [32]:
scrape_race_card_table(race_id_list).to_pickle('race_card_table.pickle')
df = pd.read_pickle('race_card_tables.pickle')
df

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=72.0), HTML(value='')))




IndexError: list index out of range

In [None]:
del df, horse_birth,horse_birth_old,\
    jockey_results,jockey_results_new,jockey_results_old,\
    race_results,race_results_new,race_results_old,\
    return_tables,return_tables_old,\
    trainer_results,trainer_results_new,trainer_results_old,

In [251]:
hr_test.loc['2020103656']

Unnamed: 0,日付,開催,天気,R,レース名,映像,頭数,枠番,馬番,オッズ,人気,着順,騎手,斤量,距離,馬場,馬場指数,タイム,着差,ﾀｲﾑ指数,通過,ペース,上り,馬体重,厩舎ｺﾒﾝﾄ,備考,勝ち馬(2着馬),賞金,race_id,number,x,y
2020103656,2022/12/11,6阪神4,晴,11,阪神ジュベナイルF(G1),,18.0,5.0,9,2.6,1.0,1,川田将雅,54.0,芝1600,良,-13.0,1:33.1,-0.4,100.0,8-8,33.7-36.1,35.5,462(-6),,,(シンリョクカ),6633.0,202209060411,9,1.0,-4.0
2020103656,2022/10/29,4東京8,晴,11,アルテミスS(G3),,10.0,3.0,3,1.4,1.0,2,川田将雅,54.0,芝1600,良,-21.0,1:33.9,0.1,88.0,6-6,35.8-33.8,33.3,468(+4),,,ラヴェル,1210.2,202205040811,3,1.0,-5.0
2020103656,2022/07/30,2新潟1,晴,5,2歳新馬,,12.0,2.0,2,2.1,1.0,1,川田将雅,54.0,芝1600,良,-13.0,1:35.8,-0.5,75.0,7-7,38.2-32.0,31.4,464(0),,出遅れ,(クルゼイロドスル),700.0,202204020105,2,1.0,-5.5


In [253]:
len(hr_test)-len(hr)

0

In [3]:
hr_master = pd.read_pickle('test_horse_results_2012-20221223.pickle')

# practice

In [36]:
url = 'https://race.netkeiba.com/race/shutuba.html?race_id=' + '202306010211'
df = pd.read_html(url)[0]
df = df.T.reset_index(level=0,drop=True).T

html = requests.get(url)
html.encoding = 'EUC-JP'
soup = BeautifulSoup(html.text,'html.parser')
date = soup.find('dd',attrs={'class':'Active'}).text
df['date'] = re.findall(r'\d+/\d+',date)[0]
df


Unnamed: 0,枠,馬番,印,馬名,性齢,斤量,騎手,厩舎,馬体重(増減),Unnamed: 9_level_1,人気,登録,メモ,date
0,1,1,,ウイングレイテスト,牡6,57,松岡,美浦畠山,512(+2),---.-,**,,,1/7
1,2,2,,エイシンチラー,牝5,55,Ｍデムーロ,美浦田中剛,482(+2),---.-,**,,,1/7
2,3,3,,グラティアス,牡5,57,ムルザバエ,美浦宮田,506(+10),---.-,**,,,1/7
3,4,4,,サクラトゥジュール,牡6,57,田辺,美浦堀,526(+2),---.-,**,,,1/7
4,4,5,,ワールドバローズ,牡5,57,石川,栗東石坂,460(0),---.-,**,,,1/7
5,5,6,,インテンスライト,牡7,57,菊沢,美浦菊沢,502(+12),---.-,**,,,1/7
6,5,7,,ココロノトウダイ,牡6,58,バシュロ,美浦手塚,540(+2),---.-,**,,,1/7
7,6,8,,シュヴァリエローズ,牡5,57,三浦,栗東清水久,454(-8),---.-,**,,,1/7
8,6,9,,ノルカソルカ,牡6,57,石橋脩,栗東藤岡,502(+4),---.-,**,,,1/7
9,7,10,,キングエルメス,牡4,56,内田博,栗東矢作,496(0),---.-,**,,,1/7


In [39]:
df['date'] = df.index.str[:4] +'/'+ df['date']
#df['date'] = df['date'].str.replace('年','-').str.replace('月','-').str.replace('日','')

AttributeError: Can only use .str accessor with string values!