In [1]:
import re
import glob
import os
import time 
import datetime
import pandas as pd
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException

import warnings
warnings.filterwarnings('ignore')

from Config import params_config, db_config
from Utils.bulk_insert import BulkInsert

In [33]:
# Pull the scraper settings and the DB connection parameters from the project Config package,
# then open the MySQL connection that the helper functions in this notebook use via `con`.
parameters = params_config.parameters
db_params = db_config.db_params
con = pymysql.connect(**db_params)

In [3]:
def initialize_chrome_driver(parameters):
    """Start a Chrome WebDriver session configured from ``parameters``.

    Args:
        parameters: Mapping that provides ``'DRIVER_DIR'`` (passed to Chrome as
            ``executable_path``) and ``'PAGE_LOAD_TIMEOUT'`` (passed to
            ``set_page_load_timeout``).

    Returns:
        The Chrome driver, with its page-load timeout set and its window maximized.
    """
    chrome = webdriver.Chrome(executable_path=parameters['DRIVER_DIR'])
    page_load_timeout = parameters['PAGE_LOAD_TIMEOUT']
    chrome.set_page_load_timeout(page_load_timeout)
    chrome.maximize_window()
    return chrome


def get_nearest_future_holidays_list(target_datetime=None):
    """Return the nearest weekend as ``[saturday, sunday]`` strings in ``YYYYMMDD`` form.

    If the reference day is a Sunday, the weekend it belongs to is returned
    (the previous day and the day itself). Otherwise the first Saturday on or
    after the reference day is used, together with the following Sunday.

    Args:
        target_datetime: Optional ``datetime.date`` / ``datetime.datetime`` used as
            the reference day. Defaults to ``datetime.date.today()``.

    Returns:
        A two-element list ``[saturday_str, sunday_str]``.
    """
    # The reference day is no longer pinned to a fixed debugging date; pass an
    # explicit date when a reproducible result is needed.
    if target_datetime is None:
        target_datetime = datetime.date.today()

    weekday = target_datetime.weekday()  # Monday == 0 ... Saturday == 5, Sunday == 6
    if weekday == 6:
        saturday = target_datetime - datetime.timedelta(days=1)
    else:
        saturday = target_datetime + datetime.timedelta(days=5 - weekday)
    sunday = saturday + datetime.timedelta(days=1)

    return [saturday.strftime("%Y%m%d"), sunday.strftime("%Y%m%d")]


def get_latest_holidays_list(target_datetime):
    """Return the most recent weekend as ``[saturday, sunday]`` strings in ``YYYYMMDD`` form.

    The Saturday is the latest one on or before ``target_datetime``; the Sunday
    is the day after that Saturday.

    Args:
        target_datetime: ``datetime.date`` / ``datetime.datetime`` to search back from.

    Returns:
        A two-element list ``[saturday_str, sunday_str]``.
    """
    # Saturday is weekday 5; the modulo gives how many days to step back (0..6).
    days_since_saturday = (target_datetime.weekday() - 5) % 7
    saturday = target_datetime - datetime.timedelta(days=days_since_saturday)
    sunday = saturday + datetime.timedelta(days=1)
    return [day.strftime("%Y%m%d") for day in (saturday, sunday)]

In [22]:
# Start the Chrome session; the scraping helpers in this notebook read this module-level `driver`.
driver = initialize_chrome_driver(parameters)

In [23]:
# Compute the weekend dates, then set the date string used by the cells below to a fixed day.
# NOTE(review): target_datetime_list is not referenced again in the visible cells and the
# date is hard-coded to '20190601' — confirm this is intentional before a full run.
target_datetime_list = get_nearest_future_holidays_list()
target_datetime_str = '20190601'
target_datetime_str

'20190601'

## KeibaLabScraper
### scraping_race_info_in

In [14]:
# Functions
def _back_chrome_window_for_number_of(times):
    for _ in range(times):
        driver.back()
    return None

def _load_target_url_page(target_datetime_str):
    target_url = parameters['TARGET_URL_OF_KAIBALAB_RACE'] + target_datetime_str
    try:
        driver.get(target_url)
        print('We could load the URL:', driver.current_url)
    except Exception as e:
        print(e)

def _make_web_driver_click_by(xpath, verbose=True):
    for i in range(parameters['RETRIES']):
        try:
            WebDriverWait(driver, parameters['PAGE_LOAD_TIMEOUT']).until(
                EC.element_to_be_clickable((By.XPATH, xpath)))
            driver.find_element_by_xpath(xpath).click()

            if verbose:
                print('We could load the XPATH and now locate in:', driver.current_url)

        except TimeoutException:
            print('Timeout, Retrying... ({TIME}/{MAX})'.format(TIME=i+1, MAX=parameters['RETRIES']))
            continue
        else:
            return 'click_success'
    return 'click_failed'

def _is_race_kakutei(xpath_table_of_race_result, table_tbody_idx):
    xpath_for_kakutei_box = \
        xpath_table_of_race_result + '/tbody/tr[{TBODY_IDX}]/td[3]'.format(TBODY_IDX=table_tbody_idx + 1)
    try:
        driver.find_element_by_xpath(xpath_for_kakutei_box).find_element_by_tag_name('a')
        return True
    except NoSuchElementException:
        return False

def _make_xpath_list_of_race_result_link():
    xpath_list_of_race_result_link = []

    xpath_race_info_div = '//*[@id="raceInfo"]/div[1]'
    race_table_header_list = \
        driver.find_element_by_xpath(xpath_race_info_div).find_elements_by_tag_name('table')

    for table_header_idx in range(len(race_table_header_list)):
        xpath_table_of_race_result = \
            xpath_race_info_div + '/table[{TABLE_IDX}]'.format(TABLE_IDX=table_header_idx + 1)
        race_table_body_elem_list = driver.find_element_by_xpath(
            xpath_table_of_race_result).find_element_by_tag_name('tbody').find_elements_by_tag_name('tr')

        for table_tbody_idx in range(len(race_table_body_elem_list)):
            xpath_tdoby_of_table_race_result = \
                xpath_table_of_race_result + '/tbody/tr[{TBODY_IDX}]/td[2]/a'.format(TBODY_IDX=table_tbody_idx + 1)
            if _is_race_kakutei(xpath_table_of_race_result, table_tbody_idx):
                xpath_list_of_race_result_link.append(['kakutei', xpath_tdoby_of_table_race_result])
            else:
                xpath_list_of_race_result_link.append(['not_kakutei', xpath_tdoby_of_table_race_result])

    if xpath_list_of_race_result_link is None:  # case that this day has no races
        return None

    return xpath_list_of_race_result_link

def _get_race_id(xpath_of_race_result_link):
    race_id = \
        int(re.sub('\\D', '', driver.current_url.split(parameters['TARGET_URL_OF_KAIBALAB_RACE'])[1]))
    while True:
        if len(str(race_id)) != 12:
            print('Failed to get the correct race_id, so retry to get the id')
            _make_web_driver_click_by(xpath_of_race_result_link[1])
            race_id = int(re.sub('\\D', '', driver.current_url.split(
                parameters['TARGET_URL_OF_KAIBALAB_RACE'])[1]))
        else:
            break
    return race_id

def _is_race_id_existing_in_db(race_id):
    select_query = 'SELECT race_id FROM keibalab_race_master WHERE race_id = "{RACE_ID}";'.format(RACE_ID=race_id)
    cursor = con.cursor()
    cursor.execute(select_query)
    fetch_result = cursor.fetchone()
    if fetch_result is None:
        return False
    elif int(fetch_result[0]) == race_id:
        return True

def _get_race_data_box_list(race_id):
    race_data_box = driver.find_element_by_xpath('//*[@id="tab1"]/div[2]/div[1]/div[1]/div[1]/div[2]')
    race_timing = race_data_box.find_element_by_xpath('//p[@class="bold"]').text
    race_title = race_data_box.find_element_by_xpath('//h1[@class="raceTitle fL"]').text
    try:
        race_weather = race_data_box.find_elements_by_xpath('//div[@class="weather_ground fL"]/ul/li')[0].text
        race_condition = race_data_box.find_elements_by_xpath('//div[@class="weather_ground fL"]/ul/li')[1].text
    except IndexError:
        race_weather = 'unknown'
        race_condition = 'unknown'
    course_syokin_elem = race_data_box.find_elements_by_xpath('//ul[@class="classCourseSyokin clearfix"]/li')
    course_syokin_list = ','.join([course_syokin_elem[i].text.replace('\u3000', ' ')
                                   for i in range(len(course_syokin_elem))]).replace(',', ' ')

    return [race_id, race_timing, race_title, race_weather, race_condition, course_syokin_list]

def _get_race_result_tbody_list(race_id):
    race_result_tbody_row_elem_list = \
        driver.find_elements_by_xpath('//*[@class="DbTable stripe resulttable"]/tbody/tr')
    race_result_tbody_list = []

    for i in range(len(race_result_tbody_row_elem_list)):
        arrival_order = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[1]').text
        post_position = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[2]').text

        horse_number = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[3]').text
        horse_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[4]').text
        href_to_the_horse = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[4]/a').\
            get_attribute("href")
        horse_age = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[5]').text
        horse_weight = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[15]').text
        horse_impost = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[6]').text

        jockey_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[7]').text
        href_to_the_jockey = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[7]/a').\
            get_attribute("href")

        popularity_order = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[8]').text
        win_odds = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[9]').text

        trainer_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[14]').text
        href_to_the_trainer = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[14]/a').\
            get_attribute("href")

        race_result_tbody_list.append(
            [race_id, arrival_order, post_position, horse_number, horse_name, href_to_the_horse, horse_age,
             horse_weight, horse_impost, jockey_name, href_to_the_jockey, popularity_order, win_odds, trainer_name,
             href_to_the_trainer])

    return race_result_tbody_list

def _get_race_prior_info_tbody_list(race_id):

    while True:
        xpath_to_web_newspaper = '//*[@id="dbNav"]/div/ul/li[1]/a'
        _make_web_driver_click_by(xpath_to_web_newspaper, verbose=False)
        xpath_to_yoko_gata = '//*[@id="dbnewWrap"]/div/article/div/section/div[4]/div[1]/div[1]/ul/li[2]/a'
        is_success = _make_web_driver_click_by(xpath_to_yoko_gata, verbose=False)

        if is_success == 'click_failed':
            return None

        race_info_tbody_row_elem_list = driver.find_elements_by_xpath('//*[@id="top"]/table/tbody/tr')
        if len(race_info_tbody_row_elem_list) == 0:
            print('NG')
            _back_chrome_window_for_number_of(2)
        else:
            break

    race_info_tbody_list = []
    for i in range(len(race_info_tbody_row_elem_list)):
#         try:
        post_position = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[1]').text
        horse_number = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[2]').text
        horse_name = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[1]').text
        href_to_the_horse = \
            race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[1]/a').get_attribute("href")
        jockey_name = \
            race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[4]/dl/dd[2]/a')[0].get_attribute("href")
        href_to_the_jockey = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[2]').text
        trainer_name = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[3]/a').text
        sex_and_age, interval = \
            race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[3]').text.split(trainer_name)
        horse_age = re.sub("\\D", "", sex_and_age)
        horse_sex = re.match('[0-9a-zA-Zあ-んア-ン一-鿐]', sex_and_age).group()

        try:
            popularity_order = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[0].text
            win_odds = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[1].text
            horse_weight = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[2].text
            horse_weight_increment_from_previous = \
                race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[3].text

        except IndexError:
            popularity_order = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[5]').text
            win_odds = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[5]').text
            horse_weight = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[5]').text
            horse_weight_increment_from_previous = \
                race_info_tbody_row_elem_list[i].find_element_by_xpath('td[5]').text

        owner_name = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[0].text
        href_to_the_owner = \
            race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[0].get_attribute("href")
        breeder_name = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[1].text

        jockey_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[0].text
        horse_number_finish_first_second = \
            race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[1].text
        stallion_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[2].text
        conbi_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[3].text

        zensou_info_elem = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td/a/dl')
        zensou_info_list = [(zensou_info_elem[i].text.replace('\n', ' ').replace('  ', ' '))
                            for i in range(len(zensou_info_elem))]
        zensou_info_list = [[zensou_info_list[i].replace(' ', ',')] for i in range(len(zensou_info_elem))]
        zensou_info_list = str(zensou_info_list).replace('\'', '')

#         except IndexError:
#             continue

        race_info_tbody_list.append(
            [race_id, post_position, horse_number, horse_name, href_to_the_horse, jockey_name, href_to_the_jockey,
             trainer_name, horse_age, horse_sex, popularity_order, win_odds, horse_weight,
             horse_weight_increment_from_previous, owner_name, href_to_the_owner, breeder_name,
             jockey_finish_first_second, horse_number_finish_first_second, stallion_finish_first_second,
             conbi_finish_first_second, zensou_info_list])

    return race_info_tbody_list

In [28]:
# Open the page for the chosen date; on success the helper prints the URL it landed on.
_load_target_url_page(target_datetime_str)

We could load the URL: https://www.keibalab.jp/db/race/20190601


In [29]:
# Collect the [status, xpath] pair of every race link on the loaded page and show how many were found.
xpath_list_of_race_result_link = _make_xpath_list_of_race_result_link()
print(len(xpath_list_of_race_result_link))
xpath_list_of_race_result_link

24


[['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[1]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[2]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[3]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[4]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[5]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[6]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[7]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[8]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[9]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[10]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[11]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[12]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[2]/tbody/tr[1]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[2

In [30]:
# Try the scraper on a single race: take entry 13 of the list (a [status, xpath] pair)
# and click its link; the helper returns 'click_success' or 'click_failed'.
xpath_of_race_result_link = xpath_list_of_race_result_link[13]
is_success = _make_web_driver_click_by(xpath_of_race_result_link[1])
is_success

We could load the XPATH and now locate in: https://www.keibalab.jp/db/race/201906010902/


'click_success'

In [31]:
# Pass the whole [status, xpath] pair: _get_race_id takes element [1] of its argument
# itself when it has to re-click the link, so handing it the bare xpath string would
# make it index into the string.
race_id = _get_race_id(xpath_of_race_result_link)
race_id

201906010902

In [34]:
# Look the race id up in keibalab_race_master through the shared DB connection.
_is_race_id_existing_in_db(race_id)

False

In [None]:
# Read the race header record (timing, title, weather, condition, course/prize text) from the open race page.
race_data_box_list = _get_race_data_box_list(race_id)
race_data_box_list

In [None]:
# Read one record per row of the result table on the open race page.
race_result_tbody_list = _get_race_result_tbody_list(race_id)
race_result_tbody_list

In [None]:
# Navigate to the pre-race table and read one record per row.
# The helper can return None (for example when the link click fails).
race_prior_info_tbody_list = _get_race_prior_info_tbody_list(race_id)
race_prior_info_tbody_list