In [1]:
import glob
import os
import time 
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from Config import params_config

In [2]:
parameters = params_config.parameters
parameters

{'TARGET_URL': 'https://race.netkeiba.com/?pid=schedule&select=schedule&year={YEAR}',
 'YEAR_RANGE': range(2002, 2020),
 'DRIVER_DIR': './chromedriver',
 'PAGE_LOAD_TIMEOUT': 10}

In [3]:
def initialize_chromdriver(parameters):
    driver = webdriver.Chrome(executable_path=parameters['DRIVER_DIR'])
    driver.set_page_load_timeout(parameters['PAGE_LOAD_TIMEOUT'])
    driver.maximize_window()
    
    return driver

In [4]:
def load_target_url_page(driver, parameters, year):
    target_url = parameters['TARGET_URL'].format(YEAR=year)
    
    try:
        driver.get(target_url)
        print('We can load the URL:', driver.current_url)
    except:
        print("The page load was time out")

In [5]:
def close_chrome_window(driver):
    driver.quit()

In [6]:
def back_chrome_window(driver):
    driver.back()

In [29]:
driver = initialize_chromdriver(parameters)

In [30]:
#  Year毎にfor roop
year = parameters['YEAR_RANGE'][17]
load_target_url_page(driver, parameters, year)

We can load the URL: https://race.netkeiba.com/?pid=schedule&select=schedule&year=2019


### Get race list from schedule pages

In [9]:
def get_race_info_from_schedule_page(driver):
    # find table tag info 
    race_schedule_block_elem = driver.find_element_by_class_name('race_schedule_block')
    race_schedule_block_elem_list = race_schedule_block_elem.find_element_by_tag_name('tbody').find_elements_by_tag_name('tr')
    
    # define table header
    race_schedule_block_header = race_schedule_block_elem_list[0].find_elements_by_tag_name('th')
    race_schedule_block_header = [race_schedule_block_header[i].text for i in range(len(race_schedule_block_header))]
    
    # make race info data frame
    race_schedule_block_info_list = []
#     for i in range(1, len(race_schedule_block_elem_list)):
    for i in range(1, len(race_schedule_block_elem_list)-120):
        race_schedule_block_body = race_schedule_block_elem_list[i].find_elements_by_tag_name('td')
        race_schedule_block_body = [race_schedule_block_body[i].text for i in range(len(race_schedule_block_body))]
        race_schedule_block_info_list.append(race_schedule_block_body)
    
    return pd.DataFrame(race_schedule_block_info_list, columns=race_schedule_block_header)

In [10]:
race_schedule_block_df = get_race_info_from_schedule_page(driver)
race_schedule_block_df

Unnamed: 0,日付,レース名,格,場,距離,条件,重量
0,2019/01/05(土),中山金杯,G3,中山,芝2000m,4歳上,ハンデ
1,2019/01/05(土),京都金杯,G3,京都,芝1600m,4歳上,ハンデ
2,2019/01/06(日),シンザン記念,G3,京都,芝1600m,3歳,別定
3,2019/01/12(土),フェアリーS,G3,中山,芝1600m,3歳牝,別定
4,2019/01/13(日),日経新春杯,G2,京都,芝2400m,4歳上,ハンデ
5,2019/01/14(月),京成杯,G3,中山,芝2000m,3歳,別定
6,2019/01/20(日),アメリカJCC,G2,中山,芝2200m,4歳上,別定
7,2019/01/20(日),東海S,G2,中京,ダ1800m,4歳上,別定
8,2019/01/26(土),愛知杯,G3,中京,芝2000m,4歳上牝,ハンデ


### Click and get each race information

In [72]:
def make_web_driver_click(driver, parameters, xpath, verbose=True):
    try:
        WebDriverWait(driver, parameters['PAGE_LOAD_TIMEOUT']).until(EC.element_to_be_clickable((By.XPATH, xpath)))
        driver.find_element_by_xpath(xpath).click()
        if verbose:
            print('We can load the XPATH and now locate in:', driver.current_url)
        
    except:
        print('The page load was time out')

In [26]:
def click_into_top_page_of_each_race(driver, parameters, race_idx):
    xpath_to_each_race = '//*[@id="main"]/div/table/tbody/tr[{TR_IDX}]/td[2]/a'.format(TR_IDX=race_idx)
    make_web_driver_click(driver, parameters, xpath_to_each_race)

In [65]:
def click_into_race_table_page_of_each_race(driver, parameters):
    xpath_to_each_race_table = '//*[@id="main"]/div[1]/div/div[2]/div[2]/div/ul[1]/li[2]/a/span'
    make_web_driver_click(driver, parameters, xpath_to_each_race_table, verbose=False) 
    
    xpath_to_each_race_table = '//*[@id="race_main"]/div[1]/ul[2]/li[1]/a'
    make_web_driver_click(driver, parameters, xpath_to_each_race_table) 

In [73]:
race_idx = 9 # 2 to race_schedule_block_df.shape[0]+1
print('We get the information of race:', list(race_schedule_block_df.loc[race_idx-2, :]))

click_into_top_page_of_each_race(driver, parameters, race_idx)

We get the information of race: ['2019/01/20(日)', '東海S', 'G2', '中京', 'ダ1800m', '4歳上', '別定']
We can load the XPATH and now locate in: https://race.netkeiba.com/?pid=special&id=0057


In [74]:
click_into_race_table_page_of_each_race(driver, parameters)

We can load the XPATH and now locate in: https://race.netkeiba.com/?pid=race_old&id=c201907010211


In [154]:
def get_race_table(driver):
    xpath_to_race_table = '//*[@id="shutuba"]/diary_snap/table/tbody'
    race_table_elem = driver.find_element_by_xpath(xpath_to_race_table)

    race_table_header_elem = race_table_elem.find_element_by_xpath('tr').find_elements_by_xpath('th')
    race_table_header = [race_table_header_elem[i].text.replace('\n','') for i in range(len(race_table_header_elem))]

    race_table_body_elem = race_table_elem.find_elements_by_xpath('tr')[3:]
    race_table_body_list = []
    for i in range(len(race_table_body_elem)):
        race_table_body_elem_list = race_table_body_elem[i].find_elements_by_xpath('td')
        race_table_body = [race_table_body_elem_list[j].text.replace('\n','') for j in range(len(race_table_body_elem_list))]
        race_table_body_list.append(race_table_body[:-2])
        
    return pd.DataFrame(race_table_body_list, columns=race_table_header[:-1])

In [153]:
get_race_table(driver)

Unnamed: 0,枠番,馬番,あなたの印※,馬名,性齢,負担重量,騎手,厩舎,馬体重,単勝オッズ,人気
0,1,1,…◎○▲△☆消,グレンツェント,牡6,56.0,ミナリク,加藤征,484(+6),28.3,5
1,2,2,…◎○▲△☆消,アスカノロマン,牡8,56.0,太宰,川村,526(+2),85.5,8
2,3,3,…◎○▲△☆消,チュウワウィザード,牡4,56.0,川田,大久保,480(+3),4.8,2
3,4,4,…◎○▲△☆消,インティ,牡5,56.0,武豊,野中,514(-2),1.5,1
4,4,5,…◎○▲△☆消,スマハマ,牡4,55.0,藤岡佑,高橋亮,538(+4),7.9,4
5,5,6,…◎○▲△☆消,アングライフェン,牡7,56.0,鮫島駿,安田隆,496(+18),73.6,7
6,5,7,…◎○▲△☆消,アンジュデジール,牝5,55.0,横山典,昆,488(+2),7.5,3
7,6,8,…◎○▲△☆消,モルトベーネ,牡7,56.0,藤岡康,松永昌,478(+12),184.0,11
8,6,9,…◎○▲△☆消,シャイニービーム,牡7,56.0,藤懸,羽月,458(0),333.8,12
9,7,10,…◎○▲△☆消,コスモカナディアン,牡6,56.0,丸山,金成,500(+6),32.6,6


In [138]:
race_table_body

['2',
 '2',
 '…◎○▲△☆消',
 'アスカノロマン',
 '牡8',
 '56.0',
 '太宰',
 '川村',
 '526(+2)',
 '85.5',
 '8',
 '',
 ' ']

In [68]:
back_chrome_window(driver)

In [69]:
# close_chrome_window(driver)