In [1]:
import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import datetime
import itertools
import os
import re
import sys
import time

import pandas as pd
import urllib3
from selenium.common.exceptions import InvalidSessionIdException, TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options

In [3]:
# Configuration shared by the scraping functions defined in the cells below.
parameters = {

    # parameters about scraping
    # URL template; {YEAR} and {PLACE} are filled in with str.format in get_race_date_df.
    'URL_ABOUT_KEIBA_YAHOO': 'https://keiba.yahoo.co.jp/schedule/list/{YEAR}/?place={PLACE}',
    # Wider ranges, currently disabled in favour of the single year/place pair below.
#     'YEAR_RANGE': range(1986, 2020),
#     'PLACE_RANGE': range(1, 10),
    # Years and place ids iterated by get_race_date_df (range end is exclusive).
    'YEAR_RANGE': range(1986, 1987),
    'PLACE_RANGE': range(1, 2),
    # Passed to Chrome(executable_path=...) in initialize_chrome_driver.
    'DRIVER_DIR': '../chromedriver',
    # Passed to driver.set_page_load_timeout in initialize_chrome_driver.
    'PAGE_LOAD_TIMEOUT': 10,
}

## Initialize ChromeDriver

In [4]:
def initialize_chrome_driver(parameters):
    """Start a Chrome session configured from ``parameters`` and return it.

    Args:
        parameters: mapping providing ``'DRIVER_DIR'`` (handed to Chrome as
            ``executable_path``) and ``'PAGE_LOAD_TIMEOUT'`` (handed to
            ``set_page_load_timeout``).

    Returns:
        The Chrome driver, with its page-load timeout set and its window
        maximized.
    """
    opts = Options()
    opts.add_argument('--dns-prefetch-disable')

    browser = Chrome(
        chrome_options=opts,
        executable_path=parameters['DRIVER_DIR'],
    )
    browser.set_page_load_timeout(parameters['PAGE_LOAD_TIMEOUT'])
    browser.maximize_window()
    return browser

def _load_target_url_page(target_url):
    try:
        driver.get(target_url)
        print('We could load the URL:', driver.current_url)
    except (TimeoutException, urllib3.exceptions.MaxRetryError, InvalidSessionIdException) as e:
        print('We could not load the URL because of: ', e)
        driver.refresh()

In [5]:
# Shared browser session; _load_target_url_page reads this module-level `driver` directly.
driver = initialize_chrome_driver(parameters)

## Get race date info

In [6]:
def _extract_race_date_info(year, place, race_date_and_place_text):
    race_text_1st = re.split('\n', race_date_and_place_text)[0]
    race_text_2nd = re.split('\n', race_date_and_place_text)[1]
    
    race_year = year
    race_month = int(re.split('月|日', race_text_1st)[0])
    race_date = int(re.split('月|日', race_text_1st)[1])
    race_place = place
    race_kai = int(re.split('回|日', race_text_2nd)[0])
    race_nichi = int(re.sub("\\D", "", re.split('回|日', race_text_2nd)[1]))
    
    return [race_year, race_place, race_kai, race_nichi, race_month, race_date]

In [7]:
def _get_round_list(driver, xpath_to_date):
    url_to_round_list = driver.find_element_by_xpath(xpath_to_date+'/a').get_attribute("href")
    _load_target_url_page(url_to_round_list)
    length_of_round = len(driver.find_elements_by_class_name('scheRNo'))
    return list(range(1, length_of_round+1))

In [8]:
def _make_each_year_place_race_date_list(driver, target_url, year, place):
    each_year_place_race_date_list = []
    idx = 2
    while True :
        try:
            _load_target_url_page(target_url)
            xpath_to_date = '//*[@id="wrap"]/div[1]/div[1]/table/tbody/tr[{IDX}]/td[1]'.format(IDX=idx)
            race_date_and_place_text = driver.find_element_by_xpath(xpath_to_date).text
            race_date_info_list = _extract_race_date_info(year, place, race_date_and_place_text)

            round_list = _get_round_list(driver, xpath_to_date)
            each_year_place_race_date_list += [list(itertools.chain.from_iterable([race_date_info_list, [i]])) for i in round_list]

            idx += 2
            time.sleep(1)

        except NoSuchElementException:
            print('break')
            break
    return each_year_place_race_date_list

In [9]:
def get_race_date_df(driver, parameters):
    """Scrape every configured (year, place) schedule into one DataFrame.

    Rows from all schedule pages are gathered in a plain list and the frame is
    built once at the end. Compared with concatenating a frame per page this
    avoids quadratic copying, still returns a correctly labelled (empty) frame
    when nothing was scraped, and gives the result a unique 0..N-1 index
    instead of index labels that restart for every page.

    Args:
        driver: browser session forwarded to the per-page scraper.
        parameters: mapping providing ``'YEAR_RANGE'``, ``'PLACE_RANGE'`` and
            the ``'URL_ABOUT_KEIBA_YAHOO'`` template with ``{YEAR}`` and
            ``{PLACE}`` placeholders.

    Returns:
        DataFrame with columns ``race_year, race_place_id, race_kai,
        race_nichi, race_round, race_month, race_date`` (in that order), one
        row per race round.
    """
    # Order in which _make_each_year_place_race_date_list emits the fields.
    scraped_columns = ['race_year', 'race_place_id', 'race_kai', 'race_nichi', 'race_month', 'race_date', 'race_round']
    # Order exposed to callers.
    output_columns = ['race_year', 'race_place_id', 'race_kai', 'race_nichi', 'race_round', 'race_month', 'race_date']

    rows = []
    for year in parameters['YEAR_RANGE']:
        for place in parameters['PLACE_RANGE']:
            target_url = parameters['URL_ABOUT_KEIBA_YAHOO'].format(YEAR=year, PLACE=place)
            rows.extend(_make_each_year_place_race_date_list(driver, target_url, year, place))

    # Passing columns explicitly keeps the schema even when `rows` is empty.
    race_date_df = pd.DataFrame(rows, columns=scraped_columns)
    return race_date_df.loc[:, output_columns]

In [10]:
# Run the scrape for every (year, place) pair in `parameters`; each page load is logged below.
race_date_df = get_race_date_df(driver, parameters)

We could load the URL: https://keiba.yahoo.co.jp/schedule/list/1986/?place=1
We could load the URL: https://keiba.yahoo.co.jp/race/list/86010101/
We could load the URL: https://keiba.yahoo.co.jp/schedule/list/1986/?place=1
We could load the URL: https://keiba.yahoo.co.jp/race/list/86010102/
We could load the URL: https://keiba.yahoo.co.jp/schedule/list/1986/?place=1
We could load the URL: https://keiba.yahoo.co.jp/race/list/86010103/
We could load the URL: https://keiba.yahoo.co.jp/schedule/list/1986/?place=1
We could load the URL: https://keiba.yahoo.co.jp/race/list/86010104/
We could load the URL: https://keiba.yahoo.co.jp/schedule/list/1986/?place=1
We could load the URL: https://keiba.yahoo.co.jp/race/list/86010105/
We could load the URL: https://keiba.yahoo.co.jp/schedule/list/1986/?place=1
We could load the URL: https://keiba.yahoo.co.jp/race/list/86010106/
We could load the URL: https://keiba.yahoo.co.jp/schedule/list/1986/?place=1
We could load the URL: https://keiba.yahoo.co.j

In [11]:
# Display the scraped race-date table.
race_date_df

Unnamed: 0,race_year,race_place_id,race_kai,race_nichi,race_round,race_month,race_date
0,1986,1,1,1,1,6,7
1,1986,1,1,1,2,6,7
2,1986,1,1,1,3,6,7
3,1986,1,1,1,4,6,7
4,1986,1,1,1,5,6,7
5,1986,1,1,1,6,6,7
6,1986,1,1,1,7,6,7
7,1986,1,1,1,8,6,7
8,1986,1,1,1,9,6,7
9,1986,1,1,1,10,6,7


In [12]:
# race_date_df.to_csv('race_date_info.csv', index=False)