In [1]:
import re
import glob
import os
import time 
import datetime
import pandas as pd
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import warnings
warnings.filterwarnings('ignore')

from Config import params_config, db_config
from Utils.bulk_insert import BulkInsert

In [2]:
parameters = params_config.parameters

In [3]:
class BulkInsert(object):
    def __init__(self, con):
        self.con = con

    def execute(self, insert_data, target_table, col_names):
        if len(insert_data) == 0:
            return None

        try:
            cursor = self.con.cursor()
            query = self._generate_query(col_names, insert_data, target_table)
            cursor.execute(query)
            self.con.commit()
        except Exception:
            raise RuntimeError()

    @staticmethod
    def _generate_query(col_names, insert_data, target_table):
        schema = '(' + ', '.join(map(lambda x: str(x), col_names)) + ')'
        data_text =  ', '.join(map(lambda x: str(tuple(x)), insert_data))
        return 'INSERT INTO ' + target_table + schema + ' VALUES ' + data_text

In [4]:
def _bulk_insert(insert_list, target_table_name, insert_col_names):
    bi = BulkInsert(con)
    bi.execute(insert_data=insert_list, target_table=target_table_name, col_names=insert_col_names)

In [145]:
db_params = db_config.db_params
con = pymysql.connect(**db_params)

##  競馬結果のスクレイピングロジック
- 将来直近の土日に関して
    - ウェブ新聞の予想情報を取得
- 過去の土日に関して
    - ウェブ新聞の予想情報を取得
    - 結果払い戻し情報を取得
<br>

1986/01/05 まで情報あり<br>
土曜と日曜（レースがない日もある）だけ記録があるので、そこだけスクレイピングする

In [6]:
def initialize_chrome_driver(parameters):
    driver = webdriver.Chrome(executable_path=parameters['DRIVER_DIR'])
    driver.set_page_load_timeout(parameters['PAGE_LOAD_TIMEOUT'])
    driver.maximize_window()
    return driver


def get_nearest_future_holidays_list():
    target_datetime = datetime.date.today()
#     target_datetime = datetime.date(2019, 7, 15)
    if target_datetime.weekday() == 6:
        sunday_datetime_str = target_datetime.strftime("%Y%m%d")
        target_datetime_minus_1 = target_datetime + datetime.timedelta(days=-1)
        saturday_datetime_str = target_datetime_minus_1.strftime("%Y%m%d")
        return [saturday_datetime_str, sunday_datetime_str]
    
    while True:
        if target_datetime.weekday() == 5:
            saturday_datetime_str = target_datetime.strftime("%Y%m%d")
            target_datetime_plus_1 = target_datetime + datetime.timedelta(days=1)
            sunday_datetime_str = target_datetime_plus_1.strftime("%Y%m%d")
            return [saturday_datetime_str, sunday_datetime_str]
        else:
            target_datetime = target_datetime + datetime.timedelta(days=1)


def get_latest_holidays_list(target_datetime):
    while True:
        if target_datetime.weekday() == 5:
            saturday_datetime_str = target_datetime.strftime("%Y%m%d")
            target_datetime_plus_1 = target_datetime + datetime.timedelta(days=1)
            sunday_datetime_str = target_datetime_plus_1.strftime("%Y%m%d")
            return [saturday_datetime_str, sunday_datetime_str]
        else:
            target_datetime = target_datetime + datetime.timedelta(days=-1)

In [7]:
def initialize_chrome_driver(parameters):
    driver = webdriver.Chrome(executable_path=parameters['DRIVER_DIR'])
    driver.set_page_load_timeout(parameters['PAGE_LOAD_TIMEOUT'])
    driver.maximize_window()
    
    return driver 

In [8]:
driver = initialize_chrome_driver(parameters)

In [199]:
driver.back()
# driver.quit()

## Class: KeibaLabScraper

In [9]:
# first process for scraping
target_datetime_list =  get_nearest_future_holidays_list()
target_datetime_list

['20190720', '20190721']

In [30]:
def _load_target_url_page(driver, parameters, target_datetime_str):
    target_url = parameters['TARGET_URL_OF_KAIBALAB_RACE'] + target_datetime_str
    try:
        driver.get(target_url)
        print('We could load the URL:', driver.current_url)
    except Exception as e:
        print(e)

def _make_web_driver_click_by(driver, parameters, xpath, verbose=True):
    for i in range(parameters['RETRIES']):
        try:
            WebDriverWait(driver, parameters['PAGE_LOAD_TIMEOUT']).until(
                EC.element_to_be_clickable((By.XPATH, xpath)))
            driver.find_element_by_xpath(xpath).click()

            if verbose:
                print('We could load the XPATH and now locate in:', driver.current_url)

        except TimeoutException:
            print('Timeout, Retrying... ({I}/{MAX})'.format(I=i, MAX=parameters['RETRIES']))
            continue
        else:
            return 'click_success'
    return 'click_failed'

In [13]:
def _is_race_kakutei(xpath_table_of_race_result, table_tbody_idx):
    xpath_for_kakutei_box = xpath_table_of_race_result + '/tbody/tr[{TBODY_IDX}]/td[3]'.format(TBODY_IDX=table_tbody_idx+1)
    try:
        driver.find_element_by_xpath(xpath_for_kakutei_box).find_element_by_tag_name('a')
        return True
    except Exception:
        return False

In [28]:
def _make_xpath_list_of_race_result_link():
    xpath_list_of_race_result_link = []
    
    xpath_race_info_div = '//*[@id="raceInfo"]/div[1]'
    race_table_header_list = driver.find_element_by_xpath(xpath_race_info_div).find_elements_by_tag_name('table')
    
    for table_header_idx in range(len(race_table_header_list)):
        xpath_table_of_race_result = xpath_race_info_div + '/table[{TABLE_IDX_IDX}]'.format(TABLE_IDX_IDX=table_header_idx+1)
        race_table_body_elem_list = driver.find_element_by_xpath(xpath_table_of_race_result).find_element_by_tag_name('tbody').find_elements_by_tag_name('tr')
        
        for table_tbody_idx in range(len(race_table_body_elem_list)):
            if _is_race_kakutei(xpath_table_of_race_result, table_tbody_idx):
                xpath_tdoby_of_table_race_result = xpath_table_of_race_result + '/tbody/tr[{TBODY_IDX}]/td[2]/a'.format(TBODY_IDX=table_tbody_idx+1)
                xpath_list_of_race_result_link.append(['kakutei', xpath_tdoby_of_table_race_result])
            else:
                xpath_tdoby_of_table_race_result = xpath_table_of_race_result + '/tbody/tr[{TBODY_IDX}]/td[2]/a'.format(TBODY_IDX=table_tbody_idx+1)
                xpath_list_of_race_result_link.append(['not_kakutei', xpath_tdoby_of_table_race_result])
                
    if xpath_list_of_race_result_link is None:
        return None
    
    return xpath_list_of_race_result_link

In [25]:
def get_race_id():
    race_id = \
        int(re.sub('\\D', '', driver.current_url.split(parameters['TARGET_URL_OF_KAIBALAB_RACE'])[1]))
    while True:
        if len(str(race_id)) != 12:
            print('failed to get the correct race_id, so retry to get the id')
            _make_web_driver_click_by(xpath_of_race_result_link[1])
            race_id = int(re.sub('\\D', '', driver.current_url.split(
                parameters['TARGET_URL_OF_KAIBALAB_RACE'])[1]))
        else:
            break
    return race_id

In [16]:
def get_race_data_box_list(race_id):
    race_data_box = driver.find_element_by_xpath('//*[@id="tab1"]/div[2]/div[1]/div[1]/div[1]/div[2]')
    race_timing = race_data_box.find_element_by_xpath('//p[@class="bold"]').text
    race_title = race_data_box.find_element_by_xpath('//h1[@class="raceTitle fL"]').text
    try:
        race_weather = race_data_box.find_elements_by_xpath('//div[@class="weather_ground fL"]/ul/li')[0].text
        race_condition =  race_data_box.find_elements_by_xpath('//div[@class="weather_ground fL"]/ul/li')[1].text
    except:
        race_weather = 'unknown'
        race_condition = 'unknown'
    
    course_syokin_elem = race_data_box.find_elements_by_xpath('//ul[@class="classCourseSyokin clearfix"]/li')
    course_syokin_list = ','.join([course_syokin_elem[i].text.replace('\u3000',' ') for i in range(len(course_syokin_elem))]).replace(',',' ')
    
    return [race_id, race_timing, race_title, race_weather, race_condition, course_syokin_list]

In [192]:
def _get_race_result_tbody_list(race_id):
    race_result_tbody_row_elem_list = driver.find_elements_by_xpath('//*[@class="DbTable stripe resulttable"]/tbody/tr')
    race_result_tbody_list = []
    
    for i in range(len(race_result_tbody_row_elem_list)):
        arrival_order = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[1]').text
        post_position = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[2]').text

        horse_number = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[3]').text
        horse_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[4]').text
        href_to_the_horse = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[4]/a').get_attribute("href")
        horse_age = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[5]').text
        horse_weight = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[15]').text 
        horse_impost = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[6]').text 

        jockey_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[7]').text
        href_to_the_jockey = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[7]/a').get_attribute("href")

        popularity_order = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[8]').text
        win_odds = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[9]').text

        trainer_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[14]').text
        href_to_the_trainer = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[14]/a').get_attribute("href")
        
        race_result_tbody_list.append([race_id, arrival_order, post_position, horse_number, horse_name, href_to_the_horse, horse_age, horse_weight, horse_impost, 
 jockey_name, href_to_the_jockey, popularity_order, win_odds, trainer_name, href_to_the_trainer])
    
    return race_result_tbody_list

In [200]:
_get_race_result_tbody_list(race_id)

[[201907140210,
  '1',
  '8',
  '11',
  'ポンデザール',
  'https://www.keibalab.jp/db/horse/2015104834/',
  '牝4',
  '492(＋6)',
  '53.0',
  '藤岡康太',
  'https://www.keibalab.jp/db/jockey/01116/',
  '3',
  '5.1',
  '[美]堀宣行',
  'https://www.keibalab.jp/db/trainer/01070/'],
 [201907140210,
  '2',
  '2',
  '2',
  'ドリームスピリット',
  'https://www.keibalab.jp/db/horse/2015102790/',
  '牡4',
  '512(＋8)',
  '54.0',
  '武藤雅',
  'https://www.keibalab.jp/db/jockey/01169/',
  '6',
  '14.6',
  '[美]武藤善則',
  'https://www.keibalab.jp/db/trainer/01064/'],
 [201907140210,
  '3',
  '1',
  '1',
  'ラクローチェ',
  'https://www.keibalab.jp/db/horse/2016100784/',
  '牡3',
  '458(0)',
  '53.0',
  '幸英明',
  'https://www.keibalab.jp/db/jockey/00732/',
  '1',
  '2.8',
  '[栗]角田晃一',
  'https://www.keibalab.jp/db/trainer/01121/'],
 [201907140210,
  '4',
  '6',
  '8',
  'トロピカルストーム',
  'https://www.keibalab.jp/db/horse/2013105912/',
  'セ6',
  '448(＋8)',
  '54.0',
  '池添謙一',
  'https://www.keibalab.jp/db/jockey/01032/',
  '7',
  '15.3',
  '[

In [None]:
# bulk_insert(insert_list=race_result_tbody_list,
#                              target_table_name='keibalab_race_result_list',
#                              insert_col_names=parameters['TABLE_COL_NAMES']['keibalab_race_result_list']) 

## 将来のレース情報を取得

In [181]:
target_datetime_str = target_datetime_list[1]
target_datetime_str = '20190714'
_load_target_url_page(driver, parameters, target_datetime_str)

We could load the URL: https://www.keibalab.jp/db/race/20190714


In [182]:
xpath_list_of_race_result_link = _make_xpath_list_of_race_result_link()
print(len(xpath_list_of_race_result_link))
xpath_list_of_race_result_link

36


[['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[1]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[2]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[3]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[4]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[5]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[6]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[7]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[8]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[9]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[10]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[11]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[12]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[2]/tbody/tr[1]/td[2]/a'],
 ['kakutei', '//*[@id="raceInfo"]/div[1]/table[2

In [183]:
xpath = xpath_list_of_race_result_link[33][1]
_make_web_driver_click_by(driver, parameters, xpath, verbose=True)

We could load the XPATH and now locate in: https://www.keibalab.jp/db/race/201907140210/


'click_success'

In [184]:
race_id = get_race_id()
race_id

201907140210

In [185]:
def is_race_id_existing_in_db(race_id):
    select_query = 'SELECT race_id FROM keibalab_race_master WHERE race_id = "{RACE_ID}";'.format(RACE_ID = race_id)
    cursor = con.cursor()
    cursor.execute(select_query)
    fetch_result = cursor.fetchone()
    if fetch_result is None:
        return False
    elif int(fetch_result[0]) == race_id:
        return True

In [186]:
is_race_id_existing_in_db(race_id)

False

In [187]:
race_data_box_list = get_race_data_box_list(race_id)
race_data_box_list

[201907140210,
 '2019/7/14(日) 2回函館4日目',
 '北海ハンデキャップ',
 '曇',
 '良',
 'サラ系3歳上1000万 (混)[指] ハンデ 芝2600m 12頭 15:11発走 本賞金 1500万 600万 380万 230万 150万']

In [201]:
def _get_race_prior_info_tbody_list(race_id):
    xpath_to_web_newspaper = '//*[@id="dbNav"]/div/ul/li[1]/a'
    _make_web_driver_click_by(driver, parameters, xpath_to_web_newspaper)
    xpath_to_yoko_gata = '//*[@id="dbnewWrap"]/div/article/div/section/div[4]/div[1]/div[1]/ul/li[2]/a'
    is_success = _make_web_driver_click_by(driver, parameters, xpath_to_yoko_gata)
    if is_success == 'click_failed':
        return None

    race_info_tbody_row_elem_list = driver.find_elements_by_xpath('//*[@id="top"]/table/tbody/tr')
    if len(race_info_tbody_row_elem_list) == 0:
        pass

    race_info_tbody_list = []
    for i in range(len(race_info_tbody_row_elem_list)):
        try:
            post_position = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[1]').text
            horse_number = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[2]').text
            horse_name = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[1]').text
            href_to_the_horse = \
                race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[1]/a').get_attribute("href")
            jockey_name = \
                race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[4]/dl/dd[2]/a')[0].get_attribute("href")
            href_to_the_jockey = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[2]').text
            trainer_name = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[3]/a').text
            sex_and_age, interval = \
                race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[3]').text.split(trainer_name)
            horse_age = re.sub("\\D", "", sex_and_age)
            horse_sex = re.match('[0-9a-zA-Zあ-んア-ン一-鿐]', sex_and_age).group()

            popularity_order = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[0].text
            win_odds = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[1].text
            horse_weight = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[2].text
            horse_weight_increment_from_previous = \
                race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[3].text
            owner_name = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[0].text
            href_to_the_owner = \
                race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[0].get_attribute("href")
            breeder_name = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[1].text

            jockey_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[0].text
            horse_number_finish_first_second = \
                race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[1].text
            stallion_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[2].text
            conbi_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[3].text

            zensou_info_elem = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td/a/dl')
            zensou_info_list = [(zensou_info_elem[i].text.replace('\n', ' ').replace('  ', ' '))
                                for i in range(len(zensou_info_elem))]
            zensou_info_list = [[zensou_info_list[i].replace(' ', ',')] for i in range(len(zensou_info_elem))]
            zensou_info_list = str(zensou_info_list).replace('\'', '')

        except IndexError:
            continue

        race_info_tbody_list.append(
            [race_id, post_position, horse_number, horse_name, href_to_the_horse, jockey_name, href_to_the_jockey,
             trainer_name, horse_age, horse_sex, popularity_order, win_odds, horse_weight,
             horse_weight_increment_from_previous, owner_name, href_to_the_owner, breeder_name,
             jockey_finish_first_second, horse_number_finish_first_second, stallion_finish_first_second,
             conbi_finish_first_second, zensou_info_list])

    return race_info_tbody_list

In [189]:
# xpath_to_web_newspaper = '//*[@id="dbNav"]/div/ul/li[1]/a'
# _make_web_driver_click_by(driver, parameters, xpath_to_web_newspaper)
# xpath_to_yoko_gata = '//*[@id="dbnewWrap"]/div/article/div/section/div[4]/div[1]/div[1]/ul/li[2]/a'
# _make_web_driver_click_by(driver, parameters, xpath_to_yoko_gata)

# race_info_tbody_row_elem_list = driver.find_elements_by_xpath('//*[@id="top"]/table/tbody/tr')

In [202]:
race_prior_info_tbody_list = _get_race_prior_info_tbody_list(race_id)

We could load the XPATH and now locate in: https://www.keibalab.jp/db/race/201907140210/umabashira.html
We could load the XPATH and now locate in: https://www.keibalab.jp/db/race/201907140210/umabashira.html?kind=yoko


In [203]:
race_prior_info_tbody_list

[[201907140210,
  '1',
  '1',
  'ラクローチェ',
  'https://www.keibalab.jp/db/horse/2016100784/',
  'https://www.keibalab.jp/db/jockey/00732/',
  '幸英明(53.0)◁◀◀◁',
  '栗・角田晃',
  '3',
  '牡',
  '1人気',
  '2.8倍',
  '458kg',
  '(0kg)',
  '畑佐博',
  'https://www.keibalab.jp/db/owner/867882/',
  '細川農場',
  '0.0%[0]',
  '21.4%[14]',
  '36.8%[19]',
  '9.4%[32]',
  '[[1,2東京11,19/5/25,芝24,3歳500万,②②②②,10頭8番3人,晴,良,2:25.5,33.8,S,458kg(－4),浜中俊,56.0,エデリー(0.2)], [7,3京都7,19/5/11,芝20,3歳500万,④⑤④④,8頭8番1人,晴,良,2:01.2,35.0,S,462kg(＋4),幸英明,56.0,ヒルノダカー(1.2)], [1,3京都1,19/4/20,芝20,3歳未勝利,②②②②,18頭12番1人,晴,良,2:00.6,35.7,M,458kg(---),幸英明,56.0,クラヴィーア(0.6)]]'],
 [201907140210,
  '2',
  '2',
  'ドリームスピリット',
  'https://www.keibalab.jp/db/horse/2015102790/',
  'https://www.keibalab.jp/db/jockey/01169/',
  '武藤雅(54.0)◁◁◀◁',
  '美・武藤善',
  '4',
  '牡',
  '6人気',
  '14.6倍',
  '512kg',
  '(＋8kg)',
  '落合幸弘',
  'https://www.keibalab.jp/db/owner/949808/',
  '中村俊紀',
  '100.0%[1]',
  '26.7%[15]',
  '0.0%[3]',
  '8.8%[159]',
  '[[1,1小倉3,19/2/16,芝26,