In [515]:
import re
import glob
import os
import time 
import datetime
import pandas as pd
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import warnings
warnings.filterwarnings('ignore')

from Config import params_config, db_config
from Utils.bulk_insert import BulkInsert

In [516]:
parameters = params_config.parameters

In [517]:
class BulkInsert(object):
    def __init__(self, con):
        self.con = con

    def execute(self, insert_data, target_table, col_names):
        if len(insert_data) == 0:
            return None

        try:
            cursor = self.con.cursor()
            query = self._generate_query(col_names, insert_data, target_table)
            cursor.execute(query)
            self.con.commit()
        except Exception:
            raise RuntimeError()

    @staticmethod
    def _generate_query(col_names, insert_data, target_table):
        schema = '(' + ', '.join(map(lambda x: str(x), col_names)) + ')'
        data_text =  ', '.join(map(lambda x: str(tuple(x)), insert_data))
        return 'INSERT INTO ' + target_table + schema + ' VALUES ' + data_text

In [518]:
def _bulk_insert(insert_list, target_table_name, insert_col_names):
    bi = BulkInsert(con)
    bi.execute(insert_data=insert_list, target_table=target_table_name, col_names=insert_col_names)

In [519]:
db_params = db_config.db_params
con = pymysql.connect(**db_params)

##  競馬結果のスクレイピングロジック
- 将来直近の土日に関して
    - ウェブ新聞の予想情報を取得
- 過去の土日に関して
    - ウェブ新聞の予想情報を取得
    - 結果払い戻し情報を取得
<br>

1986/01/05 まで情報あり<br>
土曜と日曜（レースがない日もある）だけ記録があるので、そこだけスクレイピングする

In [520]:
def initialize_chrome_driver(parameters):
    driver = webdriver.Chrome(executable_path=parameters['DRIVER_DIR'])
    driver.set_page_load_timeout(parameters['PAGE_LOAD_TIMEOUT'])
    driver.maximize_window()
    return driver


def get_nearest_future_holidays_list():
    target_datetime = datetime.date.today()
    target_datetime = datetime.date(2019, 7, 15)
    if target_datetime.weekday() == 6:
        sunday_datetime_str = target_datetime.strftime("%Y%m%d")
        target_datetime_minus_1 = target_datetime + datetime.timedelta(days=-1)
        saturday_datetime_str = target_datetime_minus_1.strftime("%Y%m%d")
        return [saturday_datetime_str, sunday_datetime_str]
    
    while True:
        if target_datetime.weekday() == 5:
            saturday_datetime_str = target_datetime.strftime("%Y%m%d")
            target_datetime_plus_1 = target_datetime + datetime.timedelta(days=1)
            sunday_datetime_str = target_datetime_plus_1.strftime("%Y%m%d")
            return [saturday_datetime_str, sunday_datetime_str]
        else:
            target_datetime = target_datetime + datetime.timedelta(days=1)


def get_latest_holidays_list(target_datetime):
    while True:
        if target_datetime.weekday() == 5:
            saturday_datetime_str = target_datetime.strftime("%Y%m%d")
            target_datetime_plus_1 = target_datetime + datetime.timedelta(days=1)
            sunday_datetime_str = target_datetime_plus_1.strftime("%Y%m%d")
            return [saturday_datetime_str, sunday_datetime_str]
        else:
            target_datetime = target_datetime + datetime.timedelta(days=-1)

In [521]:
def initialize_chrome_driver(parameters):
    driver = webdriver.Chrome(executable_path=parameters['DRIVER_DIR'])
    driver.set_page_load_timeout(parameters['PAGE_LOAD_TIMEOUT'])
    driver.maximize_window()
    
    return driver 

In [650]:
driver = initialize_chrome_driver(parameters)

In [157]:
driver.back()
# driver.quit()

## Class: KeibaLabScraper

In [522]:
# first process for scraping
target_datetime_list =  get_nearest_future_holidays_list()
target_datetime_list

['20190720', '20190721']

In [523]:
def _load_target_url_page(driver, parameters, target_datetime_str):
    target_url = parameters['TARGET_URL_OF_KAIBALAB_RACE'] + target_datetime_str
    try:
        driver.get(target_url)
        print('We could load the URL:', driver.current_url)
    except Exception as e:
        print(e)

def _make_web_driver_click_by(driver, parameters, xpath, verbose=True):
    try:
        WebDriverWait(driver, self.parameters['PAGE_LOAD_TIMEOUT']).until(
            EC.element_to_be_clickable((By.XPATH, xpath)))
        driver.find_element_by_xpath(xpath).click()

        if verbose:
            print('We could load the XPATH and now locate in:', driver.current_url)

    except Exception as e:
        print(e)

In [524]:
def _is_race_kakutei(xpath_table_of_race_result, table_tbody_idx):
    xpath_for_kakutei_box = xpath_table_of_race_result + '/tbody/tr[{TBODY_IDX}]/td[3]'.format(TBODY_IDX=table_tbody_idx+1)
    try:
        driver.find_element_by_xpath(xpath_for_kakutei_box).find_element_by_tag_name('a')
        return True
    except Exception:
        return False

In [525]:
def _make_xpath_list_of_race_result_link():
    xpath_list_of_race_result_link = []
    
    xpath_race_info_div = '//*[@id="raceInfo"]/div[1]'
    race_table_header_list = driver.find_element_by_xpath(xpath_race_info_div).find_elements_by_tag_name('table')
    
    for table_header_idx in range(len(race_table_header_list)):
        xpath_table_of_race_result = xpath_race_info_div + '/table[{TABLE_IDX_IDX}]'.format(TABLE_IDX_IDX=table_header_idx+1)
        race_table_body_elem_list = driver.find_element_by_xpath(xpath_table_of_race_result).find_element_by_tag_name('tbody').find_elements_by_tag_name('tr')
        
        for table_tbody_idx in range(len(race_table_body_elem_list)):
            if is_race_kakutei(xpath_table_of_race_result, table_tbody_idx):
                xpath_tdoby_of_table_race_result = xpath_table_of_race_result + '/tbody/tr[{TBODY_IDX}]/td[2]/a'.format(TBODY_IDX=table_tbody_idx+1)
                xpath_list_of_race_result_link.append(['kakutei', xpath_tdoby_of_table_race_result])
            else:
                xpath_tdoby_of_table_race_result = xpath_table_of_race_result + '/tbody/tr[{TBODY_IDX}]/td[2]/a'.format(TBODY_IDX=table_tbody_idx+1)
                xpath_list_of_race_result_link.append(['not_kakutei', xpath_tdoby_of_table_race_result])
                
    if xpath_list_of_race_result_link is None:
        return None
    
    return xpath_list_of_race_result_link

In [526]:
def get_race_id():
    return re.sub('\\D', '', driver.current_url.split(parameters['TARGET_URL_OF_KAIBALAB_RACE'])[1])

In [527]:
def get_race_data_box_list(race_id):
    race_data_box = driver.find_element_by_xpath('//*[@id="tab1"]/div[2]/div[1]/div[1]/div[1]/div[2]')
    race_timing = race_data_box.find_element_by_xpath('//p[@class="bold"]').text
    race_title = race_data_box.find_element_by_xpath('//h1[@class="raceTitle fL"]').text
    try:
        race_weather = race_data_box.find_elements_by_xpath('//div[@class="weather_ground fL"]/ul/li')[0].text
        race_condition =  race_data_box.find_elements_by_xpath('//div[@class="weather_ground fL"]/ul/li')[1].text
    except:
        race_weather = 'unknown'
        race_condition = 'unknown'
    
    course_syokin_elem = race_data_box.find_elements_by_xpath('//ul[@class="classCourseSyokin clearfix"]/li')
    course_syokin_list = ','.join([course_syokin_elem[i].text.replace('\u3000',' ') for i in range(len(course_syokin_elem))]).replace(',',' ')
    
    return [race_id, race_timing, race_title, race_weather, race_condition, course_syokin_list]

In [528]:
def get_race_result_tbody_list(race_id):
    race_result_tbody_row_elem_list = driver.find_elements_by_xpath('//*[@class="DbTable stripe resulttable"]/tbody/tr')
    race_result_tbody_list = []
    
    for i in range(len(race_result_tbody_row_elem_list)):
        arrival_order = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[1]').text
        post_position = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[2]').text

        horse_number = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[3]').text
        horse_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[4]').text
        href_to_the_horse = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[4]/a').get_attribute("href")
        horse_age = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[5]').text
        horse_weight = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[15]').text 
        horse_impost = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[6]').text 

        jockey_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[7]').text
        href_to_the_jockey = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[7]/a').get_attribute("href")

        popularity_order = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[8]').text
        win_odds = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[9]').text

        trainer_name = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[14]').text
        href_to_the_trainer = race_result_tbody_row_elem_list[i].find_element_by_xpath('td[14]/a').get_attribute("href")
        
        race_result_tbody_list.append([race_id, arrival_order, post_position, horse_number, horse_name, href_to_the_horse, horse_age, horse_weight, horse_impost, 
 jockey_name, href_to_the_jockey, popularity_order, win_odds, trainer_name, href_to_the_trainer])
    
    return race_result_tbody_list

In [None]:
# bulk_insert(insert_list=race_result_tbody_list,
#                              target_table_name='keibalab_race_result_list',
#                              insert_col_names=parameters['TABLE_COL_NAMES']['keibalab_race_result_list']) 

## 将来のレース情報を取得

In [651]:
target_datetime_str = target_datetime_list[1]
target_datetime_str = '20190720'
_load_target_url_page(driver, parameters, target_datetime_str)

We could load the URL: https://www.keibalab.jp/db/race/20190720


In [652]:
xpath_list_of_race_result_link = _make_xpath_list_of_race_result_link()
print(len(xpath_list_of_race_result_link))
xpath_list_of_race_result_link

9


[['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[1]/td[2]/a'],
 ['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[2]/td[2]/a'],
 ['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[1]/tbody/tr[3]/td[2]/a'],
 ['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[2]/tbody/tr[1]/td[2]/a'],
 ['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[2]/tbody/tr[2]/td[2]/a'],
 ['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[2]/tbody/tr[3]/td[2]/a'],
 ['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[3]/tbody/tr[1]/td[2]/a'],
 ['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[3]/tbody/tr[2]/td[2]/a'],
 ['not_kakutei', '//*[@id="raceInfo"]/div[1]/table[3]/tbody/tr[3]/td[2]/a']]

In [653]:
xpath = xpath_list_of_race_result_link[0][1]
make_web_driver_click_by(driver, parameters, xpath, verbose=True)

We can load the XPATH and now locate in: https://www.keibalab.jp/db/race/201907200309/


In [654]:
race_id = get_race_id()
race_id

'201907200309'

In [655]:
race_data_box_list = get_race_data_box_list(race_id)
race_data_box_list

['201907200309',
 '2019/7/20(土) 2回福島7日目',
 '栗子特別',
 'unknown',
 'unknown',
 '500万 (混)(特) 定量 芝2000m 21頭 発走']

In [638]:
def get_arrival_order_before(race_info_tbody_row_elem_list, i, number):
    try:
        arrival_order_before = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[{NUMBER}]/a/dl/dd'.format(NUMBER=number+11))[0].text
    except IndexError:
        try:
            arrival_order_before = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[{NUMBER}]/dl/dd'.format(NUMBER=number+11))[0].text
        except IndexError:
            arrival_order_before = ''
    return arrival_order_before

def get_race_prior_info_tbody_list(race_id):
    xpath_to_web_newspaper = '//*[@id="dbNav"]/div/ul/li[1]/a'
    make_web_driver_click_by(driver, parameters, xpath_to_web_newspaper)
    xpath_to_yoko_gata = '//*[@id="dbnewWrap"]/div/article/div/section/div[4]/div[1]/div[1]/ul/li[2]/a'
    make_web_driver_click_by(driver, parameters, xpath_to_yoko_gata)
    
    race_info_tbody_row_elem_list = driver.find_elements_by_xpath('//*[@id="top"]/table/tbody/tr')
    if len(race_info_tbody_row_elem_list) == 0:
        pass
    
    race_info_tbody_list = []
    for i in range(len(race_info_tbody_row_elem_list)):
        post_position = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[1]').text
        horse_number = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[2]').text
        horse_name = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[1]').text
        href_to_the_horse = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[1]/a').get_attribute("href")
        jockey_name = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[2]').text
        trainer_name = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[3]/a').text
        
        sex_and_age, interval = race_info_tbody_row_elem_list[i].find_element_by_xpath('td[4]/dl/dd[3]').text.split(trainer_name)
        horse_age = re.sub("\\D", "", sex_and_age)
        horse_sex = re.match('[0-9a-zA-Zあ-んア-ン一-鿐]' , sex_and_age).group()
        
        popularity_order = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[0].text
        win_odds = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[1].text
        horse_weight = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[2].text
        horse_weight_increment_from_previous = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[5]/dl/dd')[3].text
        owner_name = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[0].text
        href_to_owner_name = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[0].get_attribute("href")
        breeder_name = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[6]/dl/dd/a')[1].text
        
        jockey_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[0].text
        horse_number_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[1].text
        stallion_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[2].text
        conbi_finish_first_second = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td[7]/i')[3].text
        
        arrival_orders_before = []
        for number in range(5):
            arrival_orders_before.append(get_arrival_order_before(race_info_tbody_row_elem_list, i, number))
        arrival_orders_before = str(arrival_orders_before).replace('\'', '')
        
        zensou_info_elem = race_info_tbody_row_elem_list[i].find_elements_by_xpath('td/a/dl')
        zensou_info_list = [(zensou_info_elem[i].text.replace('\n',' ').replace('  ',' ')) for i in range(len(zensou_info_elem))]
        zensou_info_list = [[zensou_info_list[i].replace(' ', ',')] for i in range(len(zensou_info_elem))]
        zensou_info_list = str(zensou_info_list).replace('\'', '')
        
        race_info_tbody_list.append([race_id, post_position, horse_number, horse_name, href_to_the_horse, jockey_name, trainer_name,
                                   horse_age, horse_sex, popularity_order, win_odds, horse_weight, horse_weight_increment_from_previous,
                                   owner_name, href_to_owner_name, breeder_name, jockey_finish_first_second, horse_number_finish_first_second,
                                   stallion_finish_first_second, conbi_finish_first_second, arrival_orders_before, zensou_info_list])

    return race_info_tbody_list

In [639]:
race_prior_info_tbody_list = get_race_prior_info_tbody_list(race_id)

We can load the XPATH and now locate in: https://www.keibalab.jp/db/race/201907140310/umabashira.html
We can load the XPATH and now locate in: https://www.keibalab.jp/db/race/201907140310/umabashira.html?kind=yoko


In [640]:
race_prior_info_tbody_list[0]

['201907140310',
 '1',
 '1',
 'スナークライデン',
 'https://www.keibalab.jp/db/horse/2014100190/',
 '津村明(57.0)◁◀◀◁',
 '栗・川村禎',
 '5',
 '牡',
 '5人気',
 '8.9倍',
 '480kg',
 '(－2kg)',
 '杉本豊',
 'https://www.keibalab.jp/db/owner/167808/',
 '小泉牧場',
 '25.0%[44]',
 '11.5%[87]',
 '23.1%[52]',
 '20.0%[5]',
 '[2, 4, 1, 8, 7]',
 '[[2,2福島1,19/6/29,ダ115,郡山特別,－－⑧⑥,16頭5番7人,小雨,稍,1:07.6,36.2,H,482kg(0),三浦皇,57.0,レッドアネラ(0.2)], [4,3京都7,19/5/11,ダ12,4歳上1000万,－－⑩⑨,16頭14番15人,晴,良,1:11.9,36.5,H,482kg(－6),和田竜,57.0,ジャスパーウ(0.6)], [8,1小倉8,19/3/3,ダ10,4歳上500万,－－⑨⑥,14頭10番5人,雨,不,0:58.9,35.9,H,486kg(－4),中井裕,57.0,ガウル(0.5)], [7,1京都2,19/1/6,ダ12,4歳上500万,－－⑪⑩,16頭12番11人,曇,良,1:12.1,36.0,M,490kg(＋2),和田竜,57.0,ヒロシゲゴー(1.5)]]']