In [None]:
import re
import glob
import os
import time 
import datetime
import pandas as pd
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import warnings
warnings.filterwarnings('ignore')

from Config import params_config, db_config
from Utils.bulk_insert import BulkInsert

In [None]:
# Scraper settings, read from the project's Config.params_config module.
parameters = params_config.parameters
# A bare name as the last expression makes the notebook display its value.
parameters

In [None]:
class BulkInsert(object):
    """Insert many rows into one table through a DB-API connection.

    Row values are sent as bound parameters (``%s`` placeholders) instead of
    being rendered into the SQL text, so ``None`` becomes NULL, quotes inside
    strings are escaped by the driver, and single-column rows no longer
    produce a trailing comma.

    Args:
        con: an open DB-API connection (the notebook passes a pymysql
            connection) exposing ``cursor()``, ``commit()`` and ``rollback()``.
    """

    def __init__(self, con):
        self.con = con

    def execute(self, insert_data, target_table, col_names):
        """Insert ``insert_data`` into ``target_table`` and commit.

        Args:
            insert_data: sequence of rows; each row is a sequence of values
                ordered like ``col_names``.
            target_table: table name. It is concatenated into the statement
                as-is, so it must come from trusted configuration.
            col_names: column names, in the same order as the row values.
                Also concatenated as-is (identifiers cannot be bound).

        Returns:
            None. Nothing is executed when ``insert_data`` is empty.

        Raises:
            RuntimeError: if the insert or the commit fails. The original
                exception is attached as ``__cause__`` and the transaction
                is rolled back.
        """
        if len(insert_data) == 0:
            return None

        cursor = None
        try:
            cursor = self.con.cursor()
            query = self._generate_query(col_names, insert_data, target_table)
            cursor.executemany(query, [tuple(row) for row in insert_data])
            self.con.commit()
        except Exception as e:
            try:
                self.con.rollback()
            except Exception:
                # Best effort only: the connection may already be unusable,
                # and the insert failure below is the error worth reporting.
                pass
            raise RuntimeError(
                'bulk insert into {} failed: {}'.format(target_table, e)
            ) from e
        finally:
            if cursor is not None:
                cursor.close()

    @staticmethod
    def _generate_query(col_names, insert_data, target_table):
        """Build the parameterized INSERT statement.

        ``insert_data`` is accepted only to keep the original signature; the
        number of placeholders is derived from ``col_names``.
        """
        schema = '(' + ', '.join(map(str, col_names)) + ')'
        placeholders = '(' + ', '.join(['%s'] * len(col_names)) + ')'
        return 'INSERT INTO ' + target_table + schema + ' VALUES ' + placeholders

In [None]:
def bulk_insert(insert_list, target_table_name, insert_col_names):
    """Insert ``insert_list`` into ``target_table_name`` via ``BulkInsert``.

    Uses the module-level connection ``con``; ``insert_col_names`` gives the
    column order of each row in ``insert_list``.
    """
    BulkInsert(con).execute(
        insert_data=insert_list,
        target_table=target_table_name,
        col_names=insert_col_names,
    )

In [None]:
# Connection keyword arguments come from the project's Config.db_config module.
db_params = db_config.db_params
# Module-level connection; bulk_insert() reads this global `con`.
con = pymysql.connect(**db_params)

### 過去の競馬結果をスクレイピングするmain関数処理
- 1986/01/05 まで情報あり
- 土曜と日曜（レースがない日もある）だけ記録があるので、そこだけスクレイピングする

In [None]:
def get_today_datetime():
    """Return today's local date as a ``datetime.date``."""
    today = datetime.date.today()
    return today

In [None]:
def get_latest_holiday_list(target_datetime):
    """Return the most recent weekend on or before ``target_datetime``.

    Args:
        target_datetime: a ``datetime.date`` or ``datetime.datetime``.

    Returns:
        ``[saturday, sunday]`` as ``'%Y%m%d'`` strings, where ``saturday`` is
        the latest Saturday that is not after ``target_datetime`` and
        ``sunday`` is the day after it. When ``target_datetime`` is itself a
        Saturday, the returned Sunday is one day later than the input.
    """
    # weekday() == 5 is Saturday; the modulo gives how many days to step back.
    days_since_saturday = (target_datetime.weekday() - 5) % 7
    saturday = target_datetime - datetime.timedelta(days=days_since_saturday)
    sunday = saturday + datetime.timedelta(days=1)
    return [day.strftime("%Y%m%d") for day in (saturday, sunday)]

In [None]:
def scraping_race_result_until_start_date(parameters):
    """Visit weekends from the latest one back to ``parameters['START_DATE']``.

    For every weekend the Saturday and Sunday date strings are printed; the
    actual scraping is still a placeholder. The latest weekend is always
    processed. After each weekend the loop stops if the Sunday of the
    preceding weekend is earlier than START_DATE, so the loop terminates
    whether START_DATE is a Saturday, a Sunday, a weekday or a future date.
    The previous exact-match test never ended for the last two.

    Args:
        parameters: settings mapping; ``parameters['START_DATE']`` must be a
            ``'%Y%m%d'`` date (e.g. ``'19860105'``).

    Raises:
        ValueError: if START_DATE cannot be parsed as ``'%Y%m%d'``.
    """
    # Parse up front so a malformed START_DATE fails immediately instead of
    # silently never matching.
    start_date = datetime.datetime.strptime(
        str(parameters['START_DATE']), '%Y%m%d'
    ).date()

    target_datetime = get_today_datetime()  # type: datetime.date
    while True:
        target_datetime_list = get_latest_holiday_list(target_datetime)
        print(target_datetime_list)

        for target_date_str in target_datetime_list:
            # scraping keibalab web info
            print('scraping the web site info in {TARGET_DATE}'.format(TARGET_DATE=target_date_str))
            ## ENTER MY CODE ##

        saturday = datetime.datetime.strptime(target_datetime_list[0], '%Y%m%d').date()
        # The weekend before this one ends six days before this Saturday.
        previous_sunday = saturday - datetime.timedelta(days=6)
        if previous_sunday < start_date:
            break

        # Step to the day before this Saturday; the helper then walks back
        # to the preceding Saturday.
        target_datetime = saturday - datetime.timedelta(days=1)

In [None]:
# Walk back weekend by weekend until parameters['START_DATE'] is reached.
scraping_race_result_until_start_date(parameters)

In [None]:
def initialize_chrome_driver(parameters):
    """Start a maximized Chrome WebDriver session.

    Args:
        parameters: settings mapping providing ``'DRIVER_DIR'`` (passed to
            Chrome as ``executable_path``) and ``'PAGE_LOAD_TIMEOUT'``
            (passed to ``set_page_load_timeout``).

    Returns:
        The configured ``webdriver.Chrome`` instance.
    """
    chrome = webdriver.Chrome(executable_path=parameters['DRIVER_DIR'])
    page_load_timeout = parameters['PAGE_LOAD_TIMEOUT']
    chrome.set_page_load_timeout(page_load_timeout)
    chrome.maximize_window()
    return chrome

In [None]:
def close_chrome_window(driver):
    """Call ``driver.quit()`` on the given driver; returns None."""
    quit_browser = driver.quit
    quit_browser()

In [None]:
def back_chrome_window(driver):
    """Call ``driver.back()`` to go one step back in browser history; returns None."""
    go_back = driver.back
    go_back()

In [None]:
# Launch Chrome. Functions below that take no driver argument read this global `driver`.
driver = initialize_chrome_driver(parameters)

In [None]:
# back_chrome_window(driver)
# close_chrome_window(driver)

## Class: KeibaLabScraper

- 例：20181229,1230の日はレースがないため、例外処理を行うようにしておく

In [None]:
# First scraping pass: take today's date and resolve it to the latest
# [Saturday, Sunday] pair of '%Y%m%d' strings, then display the pair.
target_datetime =  get_today_datetime()  # type: datetime.date
target_datetime_list =  get_latest_holiday_list(target_datetime)
target_datetime_list

In [None]:
def make_web_driver_click_by(driver, parameters, xpath, verbose=True):
    """Wait until the element at ``xpath`` is clickable, then click it.

    The element handed back by the wait is clicked directly. The previous
    code looked the element up a second time, which left a gap between the
    wait and the click in which the page could change.

    Args:
        driver: Selenium WebDriver instance.
        parameters: settings mapping; ``'PAGE_LOAD_TIMEOUT'`` is used as the
            maximum wait in seconds.
        xpath: XPath of the element to click.
        verbose: when True, print the URL the driver is on after the click.

    Returns:
        None. Any failure (timeout, missing element, click error) is printed
        rather than raised, so that an interactive session keeps running.
    """
    try:
        clickable_elem = WebDriverWait(driver, parameters['PAGE_LOAD_TIMEOUT']).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )
        clickable_elem.click()
        if verbose:
            print('We can load the XPATH and now locate in:', driver.current_url)

    except Exception as e:
        print(e)

In [None]:
def load_target_url_page(driver, parameters, target_datetime_str):
    """Open ``parameters['TARGET_URL'] + target_datetime_str`` in the driver.

    Args:
        driver: WebDriver-like object exposing ``get`` and ``current_url``.
        parameters: settings mapping providing ``'TARGET_URL'``.
        target_datetime_str: string appended to the base URL (the notebook
            passes a ``'%Y%m%d'`` date).

    Returns:
        None. A failed load is reported on stdout instead of being raised.
        Only ``Exception`` subclasses are handled, so ``KeyboardInterrupt``
        and ``SystemExit`` still stop the run.
    """
    target_url = parameters['TARGET_URL'] + target_datetime_str

    try:
        driver.get(target_url)
        print('We can load the URL:', driver.current_url)
    except Exception as e:
        print("The page load was time out")
        # Also show the real error: not every failure here is a timeout.
        print(e)

In [None]:
# Open the listing page for the first date of the pair (the Saturday).
target_datetime_str = target_datetime_list[0]
load_target_url_page(driver, parameters, target_datetime_str)

In [None]:
def is_race_kakutei(xpath_table_of_race_result, table_tbody_idx):
    """Tell whether a race row has a link in its third cell.

    Looks up ``<table>/tbody/tr[idx + 1]/td[3]`` with the global ``driver``
    and checks for an ``<a>`` inside it.
    NOTE(review): "kakutei" presumably means the result is confirmed and the
    link is how the site marks that. Confirm against the page.

    Args:
        xpath_table_of_race_result: XPath of the race table.
        table_tbody_idx: zero-based row index (XPath rows are one-based).

    Returns:
        True when the link is found, False when either lookup raises.
    """
    cell_suffix = '/tbody/tr[{TBODY_IDX}]/td[3]'.format(TBODY_IDX=table_tbody_idx + 1)
    kakutei_cell_xpath = xpath_table_of_race_result + cell_suffix
    try:
        kakutei_cell = driver.find_element_by_xpath(kakutei_cell_xpath)
        kakutei_cell.find_element_by_tag_name('a')
    except Exception:
        return False
    return True

In [None]:
def make_xpath_list_of_race_result_link():
    """Collect XPaths of the result links on the page loaded in ``driver``.

    Every ``<table>`` under ``//*[@id="raceInfo"]/div[1]`` is scanned row by
    row; a row contributes the XPath of its ``td[2]/a`` link only when
    ``is_race_kakutei`` returns True for it.

    Returns:
        List of XPath strings, in table order and then row order.
    """
    race_info_xpath = '//*[@id="raceInfo"]/div[1]'
    tables = driver.find_element_by_xpath(race_info_xpath).find_elements_by_tag_name('table')

    link_xpaths = []
    # XPath positions are one-based, hence the 1..len(tables) range.
    for table_number in range(1, len(tables) + 1):
        table_xpath = race_info_xpath + '/table[{TABLE_IDX_IDX}]'.format(TABLE_IDX_IDX=table_number)
        table_elem = driver.find_element_by_xpath(table_xpath)
        rows = table_elem.find_element_by_tag_name('tbody').find_elements_by_tag_name('tr')

        link_xpaths.extend(
            table_xpath + '/tbody/tr[{TBODY_IDX}]/td[2]/a'.format(TBODY_IDX=row_idx + 1)
            for row_idx in range(len(rows))
            if is_race_kakutei(table_xpath, row_idx)
        )

    return link_xpaths

In [None]:
# Collect the result-link XPaths from the page currently loaded in `driver`,
# then show how many were found and the list itself.
xpath_list_of_race_result_link = make_xpath_list_of_race_result_link()
print(len(xpath_list_of_race_result_link))
xpath_list_of_race_result_link

In [None]:
# NOTE(review): index 18 is hard-coded and raises IndexError when fewer than
# 19 links were collected. Presumably a manual spot check; confirm.
xpath = xpath_list_of_race_result_link[18]
make_web_driver_click_by(driver, parameters, xpath, verbose=True)

In [None]:
def get_race_id():
    """Return the digits of the current URL that follow ``parameters['TARGET_URL']``.

    Reads the global ``driver`` and ``parameters``. Raises IndexError when the
    current URL does not contain the base URL.
    """
    url_tail = driver.current_url.split(parameters['TARGET_URL'])[1]
    digits_only = re.sub('\\D', '', url_tail)
    return digits_only

In [None]:
# Derive the race id from the URL the driver is currently on, and display it.
race_id = get_race_id()
race_id

In [None]:
def get_race_data_box_list(race_id):
    """Read the race header fields from the page loaded in the global ``driver``.

    Args:
        race_id: identifier stored as the first element of the result.

    Returns:
        ``[race_id, race_timing, race_title, race_weather, race_condition,
        course_syokin_list]``. The last item is one string: the texts of the
        ``classCourseSyokin`` list items joined by spaces, with full-width
        spaces (U+3000) and commas replaced by ASCII spaces.
    """
    data_box = driver.find_element_by_xpath('//*[@id="tab1"]/div[2]/div[1]/div[1]/div[1]/div[2]')

    # NOTE(review): an XPath starting with '//' is evaluated from the document
    # root even when called on an element, so these lookups are not limited
    # to `data_box`. Confirm whether './/' was intended.
    timing_text = data_box.find_element_by_xpath('//p[@class="bold"]').text
    title_text = data_box.find_element_by_xpath('//h1[@class="raceTitle fL"]').text

    # First list item is stored as the weather, second as the track condition.
    weather_text = data_box.find_elements_by_xpath('//div[@class="weather_ground fL"]/ul/li')[0].text
    condition_text = data_box.find_elements_by_xpath('//div[@class="weather_ground fL"]/ul/li')[1].text

    syokin_items = data_box.find_elements_by_xpath('//ul[@class="classCourseSyokin clearfix"]/li')
    course_syokin_text = ' '.join(
        item.text.replace('\u3000', ' ').replace(',', ' ') for item in syokin_items
    )

    return [race_id, timing_text, title_text, weather_text, condition_text, course_syokin_text]

In [None]:
# Scrape the race header fields of the page currently shown, and display them.
race_data_box_list = get_race_data_box_list(race_id)
race_data_box_list

In [None]:
def get_race_result_tbody_list(race_id):
    """Read every row of the result table on the page loaded in the global ``driver``.

    Args:
        race_id: identifier stored as the first element of every record.

    Returns:
        One list per table row, in page order, laid out as
        ``[race_id, arrival_order, post_position, horse_number, horse_name,
        href_to_the_horse, horse_age, horse_weight, horse_impost,
        jockey_name, href_to_the_jockey, popularity_order, win_odds,
        trainer_name, href_to_the_trainer]``.
    """
    result_rows = driver.find_elements_by_xpath('//*[@class="DbTable stripe resulttable"]/tbody/tr')

    def cell_text(row, cell_xpath):
        # Visible text of one cell, addressed relative to its row.
        return row.find_element_by_xpath(cell_xpath).text

    def link_href(row, link_xpath):
        # Target of the link inside one cell.
        return row.find_element_by_xpath(link_xpath).get_attribute("href")

    result_records = []
    for row in result_rows:
        # List items are evaluated top to bottom, so lookups run in this order.
        record = [
            race_id,
            cell_text(row, 'td[1]'),     # arrival order
            cell_text(row, 'td[2]'),     # post position
            cell_text(row, 'td[3]'),     # horse number
            cell_text(row, 'td[4]'),     # horse name
            link_href(row, 'td[4]/a'),   # link to the horse
            cell_text(row, 'td[5]'),     # horse age
            cell_text(row, 'td[15]'),    # horse weight
            cell_text(row, 'td[6]'),     # horse impost
            cell_text(row, 'td[7]'),     # jockey name
            link_href(row, 'td[7]/a'),   # link to the jockey
            cell_text(row, 'td[8]'),     # popularity order
            cell_text(row, 'td[9]'),     # win odds
            cell_text(row, 'td[14]'),    # trainer name
            link_href(row, 'td[14]/a'),  # link to the trainer
        ]
        result_records.append(record)

    return result_records

In [None]:
# Scrape one record per row of the result table on the page currently shown.
race_result_tbody_list = get_race_result_tbody_list(race_id)

In [None]:
# Display the scraped result records for inspection.
race_result_tbody_list

In [None]:
# The race header is a single record, so it is wrapped in a list to form a
# one-row batch; column names come from parameters['TABLE_COL_NAMES'].
bulk_insert(insert_list=[race_data_box_list],
                             target_table_name='keibalab_race_master',
                             insert_col_names=parameters['TABLE_COL_NAMES']['keibalab_race_master'])

In [None]:
# Insert all scraped result rows of this race in one batch; column names
# come from parameters['TABLE_COL_NAMES'].
bulk_insert(insert_list=race_result_tbody_list,
                             target_table_name='keibalab_race_result_list',
                             insert_col_names=parameters['TABLE_COL_NAMES']['keibalab_race_result_list']) 