In [3]:
import pandas as pd
from bs4 import BeautifulSoup
from io import StringIO
import requests
from tqdm import tqdm
import time
import os
import re
import datetime

In [4]:
def getTables(response):
    # HTTPステータスコードが200（成功）の場合のみ処理を続行
    if response.getcode() == 200:
        # HTMLをパースしてBeautifulSoupオブジェクトを作成
        bs = BeautifulSoup(response, 'html.parser')
        bs = bs.decode('UTF-8')
        html_string = str(bs)
        html_io = StringIO(html_string)

        # テーブルデータを抽出
        tables = pd.read_html(html_io)
        df = tables[0]
        print(df)
        return df

    else:
        print(f"HTTPステータスコード {response.getcode()}: ページの取得に失敗しました")
        

In [5]:
#開催年
years = [str(i).zfill(4) for i in range(2021, 2022)]
#開催場所 01:札幌, 02:函館, 03:福島, 04:新潟, 05:東京, 06:中山, 07:中京, 08:京都, 09::阪神, 10:小倉
places = [str(i).zfill(2) for i in range (5, 6)]
#開催回
times = [str(i).zfill(2) for i in range(1, 2)]
#開催日
days = [str(i).zfill(2) for i in range(1, 2)]
#レースNo
races = [str(i).zfill(2) for i in range(1, 13)]

raceIdList = []
for y in years:
    for p in places:
        for t in times:
            for d in days:
                for r in races:
                    raceIdList.append(y + p + t + d + r)
                    

In [6]:

url = 'https://db.netkeiba.com/race/'
colName = ['raceId', 'htmlBytes']
df = pd.DataFrame(columns=colName)
escapeList = []

# pickleファイルが存在するか確認し、データを読み込む
if os.path.isfile('race_html.pkl'):
    df = pd.read_pickle('race_html.pkl')
    escapeList = df['raceId'].to_list()

# ページネーション用の競走IDリストを生成する関数
def addEscapeList(id: str, ll: list):
    idAry = [id[0:4], id[4:6], id[6:8], id[8:10], id[10:12]]
    for r in range(1, 13):
        idAry[4] = str(r).zfill(2)
        ll.append(''.join(idAry))
    if idAry[3] == '01':
        for d in range(2, 9):
            idAry[3] = str(d).zfill(2)
            ll = addEscapeList(''.join(idAry), ll)
    if idAry[2] == '01':
        for t in range(2, 9):
            idAry[2] = str(t).zfill(2)
            ll = addEscapeList(''.join(idAry), ll)

    return ll

# raceIdListに実際の競走IDのリストがあると仮定します
# raceIdList = ...

for raceId in tqdm(raceIdList):
    try:
        if raceId in escapeList:
            continue
        response = url + raceId
        html = requests.get(response)
        #html.encoding = 'utf-8'
        soup = BeautifulSoup(html.content, 'html.parser')

        if 'レース結果' in soup.text:
            tmpDf = pd.DataFrame([[raceId, html.content]], columns=colName)  # 'colmuns'の誤りを修正
            df = pd.concat([df, tmpDf], axis=0, ignore_index=True)
        else:
            escapeList = addEscapeList(raceId, escapeList)  # 変数名を修正

        time.sleep(3)
    except Exception as e:  # 例外をキャッチしてログに記録
        print(f'例外が発生しました: {str(e)}')

# DataFrameをpickleファイルに保存
df.to_pickle('race_html.pkl')


100%|██████████| 12/12 [00:00<?, ?it/s]


In [11]:
htmlDf = pd.read_pickle('race_html.pkl')

raceInfoList = []

#raceResultDf = pd.DataFrame()
for idx, dat in tqdm(htmlDf.iterrows(), total=len(htmlDf)):
    raceId = dat['raceId']
    htmlBytes = dat['htmlBytes']

    soup = BeautifulSoup(htmlBytes.decode('euc-jp','ignore'), 'html.parser')
    mainrace_data = soup.find('div', class_='mainrace_data')
    rowdata = {}
    rowdata['raceId'] = raceId
    rowdata['レース名'] = mainrace_data.find('h1').text
    rowdata['R'] = mainrace_data.find('dt').text.replace('\n','').replace(' ','').replace('R', '')
    spantexts = mainrace_data.find('span').text.replace('\xa0','').replace(' ','').split('/')
    
    rowdata['コース種'] = '障害' if '障' in spantexts[0] else 'ダート' if 'ダ' in spantexts[0] else '芝'
    
    rowdata['コース回り'] = '右' if '右' in spantexts[0] else '左' if '左' in spantexts[0] else '障害'

    rowdata['距離'] = int(re.findall('\d+',spantexts[0])[0])

    rowdata['天気'] = spantexts[1][3:]

    rowdata['馬場'] = spantexts[2].split(':')[1]
    rowdata['発走'] = spantexts[3][3:]

    smalltxt = mainrace_data.find('p', class_='smalltxt').text.replace('\xa0',' ').replace('  ',' ').split(' ')
    dt = datetime.datetime.strptime(smalltxt[0], '%Y年%m月%d日')
    rowdata['日付'] = dt.strftime('%Y/%m/%d')

    placeDict = {
        '01':'札幌', '02':'函館', '03':'福島', '04':'新潟', '05':'東京', '06':'中山', '07':'中京', '08':'京都', '09':'阪神', '10':'小倉'
    }
    rowdata['開催場所'] = placeDict[raceId[4:6]]

    if 'G1' in rowdata['レース名']:
        raceGrade = 'G1'
    elif 'G2' in rowdata['レース名']:
        raceGrade = 'G2'
    elif 'G3' in rowdata['レース名']:
        raceGrade = 'G3'
    elif '未勝利' in smalltxt[2]:
        raceGrade = '未勝利'
    elif '新馬' in smalltxt[2]:
        raceGrade = '新馬'
    elif '1勝クラス' in smalltxt[2] or '500万' in smalltxt[2]:
        raceGrade = '1勝クラス'
    elif '2勝' in smalltxt[2] or '1000万' in smalltxt[2]:
        raceGrade = '2勝クラス'
    elif '3勝' in smalltxt[2] or '1600万' in smalltxt[2]:   
        raceGrade = '3勝クラス'
    else:
        raceGrade = 'オープン'  

    if '牡・牝' in smalltxt[3]:
        restriction = '牡・牝'
    elif '牝' in smalltxt[3]:
        restriction = '牝'
    elif '牡' in smalltxt[3]:
        restriction = '牡'
    else:
        restriction = '無'

    rowdata['制限'] = restriction

    if 'ハンデ' in smalltxt[3]:
        handicap = 'ハンデ'
    elif '別定' in smalltxt[3]:
        handicap = '別定'
    else:
        handicap = '定量'
    
    rowdata['ハンデ'] = handicap

    raceInfoList.append(rowdata)
raceInfoDf = pd.DataFrame(raceInfoList)
raceInfoDf.to_pickle('race_info.pkl')

  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:00<00:00, 17.97it/s]


In [14]:
raceResultDf = pd.read_pickle('race_info.pkl')
raceResultDf

Unnamed: 0,raceId,レース名,R,コース種,コース回り,距離,天気,馬場,発走,日付,開催場所,制限,ハンデ
0,202105010101,3歳未勝利,1,ダート,左,1400,晴,重,10:10,2021/01/30,東京,牝,定量
1,202105010102,3歳新馬,2,ダート,左,1400,晴,重,10:40,2021/01/30,東京,無,定量
2,202105010103,3歳未勝利,3,ダート,左,1600,晴,重,11:10,2021/01/30,東京,無,定量
3,202105010104,3歳1勝クラス,4,ダート,左,1600,晴,重,11:40,2021/01/30,東京,無,定量
4,202105010105,3歳未勝利,5,芝,左,1800,晴,良,12:30,2021/01/30,東京,無,定量
5,202105010106,3歳新馬,6,芝,左,1600,晴,良,13:00,2021/01/30,東京,無,定量
6,202105010107,4歳以上1勝クラス,7,ダート,左,1400,晴,重,13:30,2021/01/30,東京,無,定量
7,202105010108,4歳以上2勝クラス,8,ダート,左,2100,晴,重,14:01,2021/01/30,東京,無,定量
8,202105010109,銀蹄ステークス,9,ダート,左,1400,晴,重,14:35,2021/01/30,東京,無,ハンデ
9,202105010110,クロッカスステークス(L),10,芝,左,1400,晴,良,15:10,2021/01/30,東京,無,別定


In [None]:
table = soup.find_all('table')[0]

columns = []

for head in table.find_all('th'):
    columns.append(head.text)

columns = ['raceId'] + columns + ['horseId', 'jockeyId', 'trainerId']
df = pd.DataFrame(columns=columns)

for i, row in enumerate(table.find_all('tr')):
    if i == 0:
        continue
    items = [raceId]

    cells = row.find_all('td')

    for cell in cells:
        items.append(cell.text.replace('\n',''))

    items.append(str(cells[3]).split('/horse/')[1].split('/')[0])
    items.append(str(cells[6]).split('/recent/')[1].split('/')[0])
    items.append(str(cells[18]).split('/recent')[1].split('/')[0])

    df.loc[i] = items
    break

raceResultDf = pd.concat([raceResultDf, df],axis=0)


In [12]:
htmlBytes

b'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" lang="ja" xml:lang="ja" id="html">\n<head>\n\n<title>4\xba\xd0\xb0\xca\xbe\xe51\xbe\xa1\xa5\xaf\xa5\xe9\xa5\xb9\xa1\xc32021\xc7\xaf1\xb7\xee30\xc6\xfc | \xb6\xa5\xc7\xcf\xa5\xc7\xa1\xbc\xa5\xbf\xa5\xd9\xa1\xbc\xa5\xb9 - netkeiba.com</title>\n\n\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n<meta http-equiv="content-language" content="ja" />\n<meta http-equiv="content-type" content="text/html; charset=euc-jp" />\n<meta http-equiv="content-script-type" content="text/javascript" />\n<meta http-equiv="content-style-type" content="text/css" />\n<meta name="description" content="netkeiba.com\xa4\xac\xb8\xd8\xa4\xeb\xb9\xf1\xc6\xe2\xba\xc7\xc2\xe7\xb5\xe9\xa4\xce\xb6\xa5\xc7\xcf\xa5\xc7\xa1\xbc\xa5\xbf\xa5\xd9\xa1\xbc\xa5\xb9\xa4\xc7\xa4\xb9\xa1\xa350\xcb\xfc\xc6\xac\xb0\xca\xbe\xe5\xa4\xce\xb6\xa5\xc1\xf6\xc

In [None]:
df.to_csv('data.csv',encoding='utf-8')
df.columns
print(df)