In [None]:
# Silence every warning category for the rest of the session. Note that this
# also hides deprecation warnings emitted by the libraries imported below.
import warnings
warnings.filterwarnings('ignore')

In [None]:
import os
import sys
import datetime
import pymysql
import re
import time
import locale
import urllib3
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, InvalidSessionIdException

from Utils.bulk_insert import BulkInsert

In [None]:
# Connection settings for the MySQL database used by this notebook.
# Every value can be overridden through an environment variable so that
# credentials do not have to live in the notebook; the literals are only the
# historical defaults and keep existing setups working unchanged.
# NOTE(review): the default password is still committed here for backward
# compatibility -- rotate it and drop the fallback once NETKEIBA_DB_PASSWORD
# is set wherever this notebook runs.
db_params = {
    'host': os.environ.get('NETKEIBA_DB_HOST', '127.0.0.1'),
    'user': os.environ.get('NETKEIBA_DB_USER', 'root'),
    'password': os.environ.get('NETKEIBA_DB_PASSWORD', 'daigo1123'),
    'database': os.environ.get('NETKEIBA_DB_NAME', 'dev_netkeiba'),
    # Environment values arrive as strings, so normalise the port to int.
    'port': int(os.environ.get('NETKEIBA_DB_PORT', 3306)),
    'charset': 'utf8'
}
# Shared connection read by _fetchall_and_make_list_by and _bulk_insert.
con = pymysql.connect(**db_params)

# Central configuration read by the scraping helpers in this notebook:
# URL templates, the calendar window used in the SQL filters, WebDriver
# settings, and the column lists intended for the (currently commented-out)
# bulk inserts.
parameters = {

    # parameters about scraping
    # '{RACE_ID}' placeholders are filled with str.format by the
    # _make_target_url_about_* helpers; RACE_TABLE gets the race id appended.
    'URL_ABOUT_NETKEIBA': {
        'RACE_TABLE': 'https://race.netkeiba.com/?pid=race_old&id=c',
        'RACE_RESULT': 'https://race.netkeiba.com/?pid=race&id=c{RACE_ID}&mode=result',
        'RACE_PAST5_RESULT': 'https://race.netkeiba.com/?pid=race&id=c{RACE_ID}&mode=shutuba'
    },
    # Inclusive bounds interpolated into the race_year / race_month / race_date
    # conditions of the SQL queries below.
    'MIN_YEAR': 2019,
    'MAX_YEAR': 2019,
    'MIN_MONTH': 11,
    'MAX_MONTH': 12,
    'MIN_DATE': 1,
    'MAX_DATE': 31,
    # Passed to Chrome(executable_path=...) and driver.set_page_load_timeout(...).
    'DRIVER_DIR': './chromedriver',
    'PAGE_LOAD_TIMEOUT': 10,
#     'RETRIES': 5,
#     'RETRIES_WHEN_WEB_CLICK': 3,
#     'INITIALIZE_AND_RETRIES': 3,


    # parameters about model training

    # col names in database tables
    'TABLE_COL_NAMES': {
        'race_master': [
            'race_id',
            'race_title',
            'race_course',
            'race_weather',
            'race_condition',
            'race_year',
            'race_month',
            'race_date',
            'race_dow',
            'starting_time',
            'race_other_info'
        ],
        # NOTE(review): _extract_race_table_info builds 16 values per row (it
        # also collects href_to_horse) while this list names 15 columns --
        # reconcile the two before enabling the bulk insert.
        'race_table_info': [
            'race_id',
            'bracket_num',
            'horse_num',
            'horse_name',
            'horse_age',
            'horse_sex',
            'weight_penalty',
            'jockey_name',
            'href_to_jockey',
            'owner_name',
            'href_to_owner',
            'horse_weight',
            'horse_weight_increment',
            'win_odds',
            'popularity_order'
        ],
        # NOTE(review): 'arrrival_order' is spelled with three r's here but
        # 'arrival_order' in race_past_5_result_info -- confirm against the
        # actual table schema.
        'race_result_info': [
            'race_id',
            'bracket_num',
            'horse_num',
            'arrival_time',
            'arrival_diff',
            'arrrival_order'
        ],
        'race_refund_info': [
            'race_id',
            'refund_type',
            'groupby_index',
            'horse_num',
            'refund_yen',
            'popularity_order'
        ],
        # NOTE(review): _extract_past_5_race_result builds 7 values per row
        # (it adds the past race title and id) while this list names 5
        # columns -- reconcile before enabling the bulk insert.
        'race_past_5_result_info':[
            'race_id',
            'bracket_num',
            'horse_num',
            'past_x',
            'arrival_order'
        ]
    },

    # col names in dataframe
    # (empty placeholder for now)
    'DATAFRAME_COL_NAMES': {

    }
}

In [None]:
# Common functions
def _fetchall_and_make_list_by(query):
    try:
        cursor = con.cursor()
        cursor.execute(query)
        fetch_result = cursor.fetchall()
        fetch_result_list = [item for item in fetch_result]
        cursor.close()
        return fetch_result_list
    except Exception as e:
        print(e)

def _bulk_insert(insert_list, target_table_name, insert_col_names):
    try:
        bi = BulkInsert(con)
        bi.execute(insert_data=insert_list, target_table=target_table_name, col_names=insert_col_names)
    except TypeError as e:
        print(e)
        raise TypeError

## Initialize ChromeDriver (common functions)

In [None]:
def initialize_chrome_driver(parameters):
    """Start a Chrome WebDriver configured from ``parameters``.

    Args:
        parameters: Mapping providing ``'DRIVER_DIR'`` (passed as
            ``executable_path``) and ``'PAGE_LOAD_TIMEOUT'`` (passed to
            ``set_page_load_timeout``).

    Returns:
        The started driver, with ``--dns-prefetch-disable`` set and the window
        maximized.
    """
    options = Options()
    options.add_argument('--dns-prefetch-disable')

    browser = Chrome(chrome_options=options, executable_path=parameters['DRIVER_DIR'])
    browser.set_page_load_timeout(parameters['PAGE_LOAD_TIMEOUT'])
    browser.maximize_window()
    return browser

In [None]:
def _load_target_url_page(target_url):
    try:
        driver.get(target_url)
        print('We could load the URL:', driver.current_url)
    except (TimeoutException, urllib3.exceptions.MaxRetryError, InvalidSessionIdException) as e:
        print('We could not load the URL because of: ', e)
        driver.refresh()

In [None]:
# Start the shared browser session; the scraping helpers below read this
# module-level `driver` instead of taking it as an argument.
driver = initialize_chrome_driver(parameters)

## Get info about race master and race table

In [None]:
def _get_num_str(num):
    num_str = str(num) if num >= 10 else '0' + str(num)
    return num_str

def _make_race_ids_list():
    query = """
        SELECT race_year, race_place_id, race_kai, race_nichi, race_round
        FROM race_calender_master
        WHERE 0=0
        AND race_year>={MIN_YEAR} AND race_year<={MAX_YEAR}
        AND race_month>={MIN_MONTH} AND race_month<={MAX_MONTH}
        AND race_date>={MIN_DATE} AND race_date<={MAX_DATE};
    """.format(MIN_YEAR=parameters['MIN_YEAR'], MAX_YEAR=parameters['MAX_YEAR'], 
                        MIN_MONTH=parameters['MIN_MONTH'], MAX_MONTH=parameters['MAX_MONTH'],
                        MIN_DATE=parameters['MIN_DATE'], MAX_DATE=parameters['MAX_DATE'])
    return _fetchall_and_make_list_by(query)

def _make_race_id_and_target_url(race_calender):
    race_id = ''.join(map(lambda x: _get_num_str(x), race_calender))
    target_url = parameters['URL_ABOUT_NETKEIBA']['RACE_TABLE'] + race_id
    return race_id, target_url

def _is_the_race_id_existing_in_master(race_id):
    query = "SELECT race_id FROM race_master WHERE race_id = '{RACE_ID}'".format(RACE_ID=race_id)
    race_id_list_in_master_existing = _fetchall_and_make_list_by(query)
    query = "SELECT race_id FROM race_table_info WHERE race_id = '{RACE_ID}'".format(RACE_ID=race_id)
    race_id_list_in_table_existing = _fetchall_and_make_list_by(query)
    if len(race_id_list_in_master_existing) > 0 and len(race_id_list_in_table_existing) > 0:
        return True
    else:
        return False
    
def _does_the_race_master_url_have_info():
    try:
        driver.find_elements_by_xpath('//*[@id="page"]/div[2]/div/div[1]/div[3]/div[2]')[0]
        int(driver.find_element_by_class_name('HorseList').find_elements_by_class_name('Txt_C')[0].text)  # wakuban
        return True
    except (IndexError, ValueError):
        return False
    
def _get_race_date(race_id):
    race_date_info = driver.find_elements_by_id('RaceList_DateList')[0].find_elements_by_class_name('Active')[0].text
    race_year = race_id[:4]
    
    try:
        race_month = re.split('月|日|\(|\)', race_date_info)[0]
        race_date = re.split('月|日|\(|\)', race_date_info)[1]
    except IndexError:
        race_month = re.split('/', race_date_info)[0]
        race_date = re.split('/', race_date_info)[1]
    
    race_date_str = race_year + '-' +race_month + '-' + race_date
    locale.setlocale(locale.LC_TIME, 'ja_JP.UTF-8')
    race_dow = datetime.datetime.strptime(race_date_str, '%Y-%m-%d').strftime('%A')[0]
    
    return race_year, race_month, race_date, race_dow

In [None]:
def _extract_race_master_info(race_id):
    xpath_to_race_name = '//*[@id="page"]/div[2]/div/div[1]/div[3]/div[2]'
    race_master_info_elem = driver.find_elements_by_xpath(xpath_to_race_name)[0]

    race_title = race_master_info_elem.find_elements_by_class_name('RaceName')[0].text
    race_data_01 = race_master_info_elem.find_elements_by_class_name('RaceData01')[0].text.replace(u'\n',u'')
    starting_time = re.search('(.*)発走', race_data_01.split('/')[0]).group(1)
    race_coure = race_data_01.split('/')[1]
    race_weather = re.search('天候:(.*)', race_data_01.split('/')[2]).group(1)
    race_condition = re.search('馬場:(.*)', race_data_01.split('/')[3]).group(1)
    race_year, race_month, race_date, race_dow = _get_race_date(race_id)
    race_other_info = re.sub(r"\s+", " ", race_master_info_elem.find_elements_by_class_name('RaceData02')[0].text.replace(u'\n',u' '))

    return ([
        race_id, 
        race_title, 
        race_coure, 
        race_weather, 
        race_condition, 
        race_year, 
        race_month, 
        race_date, 
        race_dow, 
        starting_time, 
        race_other_info
    ])

In [None]:
def _get_horse_weight_and_increment_one(horse_weight_info):
    if horse_weight_info != '':
        horse_weight = int(re.split('\(|\)', horse_weight_info)[0])
        horse_weight_increment = re.split('\(|\)', horse_weight_info)[1]
    else:
        horse_weight = ''
        horse_weight_increment = ''
    return horse_weight, horse_weight_increment

In [None]:
def _extract_race_table_info(race_id):
    table_element_list = driver.find_elements_by_class_name('HorseList')
    this_race_table_info = []
    for row in range(len(table_element_list)):
        bracket_num = int(table_element_list[row].find_element_by_xpath('td[1]').text)
        horse_num = int(table_element_list[row].find_element_by_xpath('td[2]').text)
        horse_name = table_element_list[row].find_element_by_xpath('td[4]').text
        href_to_horse = table_element_list[row].find_element_by_xpath('td[4]/div/div/span/a').get_attribute("href")
        horse_age = int(re.sub("\\D", "", table_element_list[row].find_element_by_xpath('td[5]').text))
        horse_sex = re.match('[0-9a-zA-Zあ-んア-ン一-鿐]', table_element_list[row].find_element_by_xpath('td[5]').text).group()

        weight_penalty = table_element_list[row].find_element_by_xpath('td[6]').text
        weight_penalty = float(weight_penalty) if weight_penalty != '' else ''

        jockey_name = table_element_list[row].find_element_by_xpath('td[7]').text
        href_to_jockey = table_element_list[row].find_element_by_xpath('td[7]/a').get_attribute("href")
        owner_name = table_element_list[row].find_element_by_xpath('td[8]/a').text
        href_to_owner = table_element_list[row].find_element_by_xpath('td[8]/a').get_attribute("href")

        try:
            horse_weight_info = table_element_list[row].find_element_by_xpath('td[9]').text
            horse_weight, horse_weight_increment = _get_horse_weight_and_increment_one(horse_weight_info)
        except ValueError:
            horse_weight, horse_weight_increment = '', ''

        win_odds = table_element_list[row].find_element_by_xpath('td[10]').text
        popularity_order = table_element_list[row].find_element_by_xpath('td[11]').text

        this_race_table_info.append([
            race_id,
            bracket_num,
            horse_num,
            horse_name,
            href_to_horse,
            horse_age,
            horse_sex,
            weight_penalty,
            jockey_name,
            href_to_jockey,
            owner_name,
            href_to_owner,
            horse_weight,
            horse_weight_increment,
            win_odds,
            popularity_order
        ])

    return this_race_table_info

In [None]:
def get_race_master_and_table_info():
    """Walk the configured race calendar and scrape master/table info per race.

    For every calendar row the race id and URL are built; races already
    present in the database are skipped, as are pages without race
    information. For the remaining races the master and table rows are
    extracted from the loaded page.

    Note: the extracted lists are currently neither stored nor returned by
    this function.
    """
    race_calender_master_list = _make_race_ids_list()
    for race_calender in race_calender_master_list:
        race_id, target_url = _make_race_id_and_target_url(race_calender)

        if _is_the_race_id_existing_in_master(race_id):
            print('Information about', target_url, 'is already existing in master')
            continue

        _load_target_url_page(target_url)
        if not _does_the_race_master_url_have_info():
            print('\tThis URL has no information about: ', race_id)
            continue

        race_master_list = _extract_race_master_info(race_id)
        # _extract_race_table_info takes only the race id; the previous call
        # also passed an undefined name `soup`, which raised NameError.
        race_table_info_list = _extract_race_table_info(race_id)

In [None]:
# Scratch cell: build the race id / URL for a single race so the helpers
# above can be exercised by hand.
race_calender_master_list = _make_race_ids_list()
print(len(race_calender_master_list))
race_calender = race_calender_master_list[0]
# The row picked above is immediately replaced by this hard-coded tuple.
race_calender = (2020, 3, 1, 4, 1)
race_id, target_url = _make_race_id_and_target_url(race_calender)
print('Target URL to requests: ', target_url)

In [None]:
# Load the race-table page built in the previous cell into the shared driver.
_load_target_url_page(target_url)

In [None]:
# Display whether this race id is already present in both database tables.
_is_the_race_id_existing_in_master(race_id)

In [None]:
# Extract the race-level fields from the loaded page and display them.
race_master_list = _extract_race_master_info(race_id)
race_master_list

In [None]:
# Extract the per-horse rows from the loaded page and display them.
race_table_info_list = _extract_race_table_info(race_id)
race_table_info_list

In [None]:
# _bulk_insert([race_master_list], 'race_master', parameters['TABLE_COL_NAMES']['race_master'])

In [None]:
# _bulk_insert(race_table_info_list, 'race_table_info', parameters['TABLE_COL_NAMES']['race_table_info'])

## Get info about race result and refund

In [None]:
def _extract_race_ids_in_master_not_exist_in_race_result():
    query = """
        SELECT DISTINCT race_id 
        FROM race_master
        WHERE 0=0 
        AND (race_id NOT IN (SELECT DISTINCT race_id FROM race_result_info) OR race_id NOT IN (SELECT DISTINCT race_id FROM race_refund_info))
            AND race_year>={MIN_YEAR} AND race_year<={MAX_YEAR}
            AND race_month>={MIN_MONTH} AND race_month<={MAX_MONTH}
            AND race_date>={MIN_DATE} AND race_date<={MAX_DATE};
        """.format(MIN_YEAR=parameters['MIN_YEAR'], MAX_YEAR=parameters['MAX_YEAR'], 
                   MIN_MONTH=parameters['MIN_MONTH'], MAX_MONTH=parameters['MAX_MONTH'],
                   MIN_DATE=parameters['MIN_DATE'], MAX_DATE=parameters['MAX_DATE'])
    return _fetchall_and_make_list_by(query)

def _make_target_url_about_race_result(race_id):
        return parameters['URL_ABOUT_NETKEIBA']['RACE_RESULT'].format(RACE_ID=race_id)
    
def _does_the_race_result_url_have_info():
    try:
        driver.find_elements_by_class_name('HorseList')
        driver.find_element_by_class_name('FullWrap')
        return True
    except IndexError:
        return False

### Result info

In [None]:
def _extract_race_result_info(race_id):
    this_race_result_info = []
    table_element_list = driver.find_elements_by_class_name('HorseList')
    for row in range(len(table_element_list)):
        arrrival_order  = table_element_list[row].find_element_by_xpath('td[1]').text
        bracket_num = table_element_list[row].find_element_by_xpath('td[2]').text
        horse_num = table_element_list[row].find_element_by_xpath('td[3]').text
        arrival_time = table_element_list[row].find_element_by_xpath('td[8]').text
        arrival_diff = table_element_list[row].find_element_by_xpath('td[9]').text
        
        this_race_result_info.append([
            race_id,
            bracket_num,
            horse_num,
            arrival_time,
            arrival_diff,
            arrrival_order
        ])

    return this_race_result_info

### Refund info

In [None]:
def _get_tansho_or_fukusho_result(i, race_id, refund_type, result_list, payout_list, ninki_list):
    return [
            race_id, 
            refund_type,  
            i+1,
            result_list[i],
            payout_list[i].replace(',', ''),
            ninki_list[i]
    ]

def _get_wide_result(i, race_id, refund_type, result_list, payout_list, ninki_list):
    return [
            race_id, 
            refund_type,  
            int((i+2)/2),
            result_list[i],
            payout_list[int(i/2)].replace(',', ''),
            ninki_list[int(i/2)]
    ]

def _get_other_result(i, race_id, refund_type, result_list, payout_list, ninki_list):
        return [
            race_id, 
            refund_type,  
            1,
            result_list[i],
            payout_list[0].replace(',', ''),
            ninki_list[0]
        ]

In [None]:
def _get_payout_info(race_id, refund_table_elem, refund_type):
    result_list = re.split('\n| ' '', refund_table_elem.find_elements_by_tag_name('td')[0].text)
    payout_list = re.split('\n| ' '', refund_table_elem.find_elements_by_tag_name('td')[1].text.replace('円', ''))
    ninki_list = re.split('\n| ' '', refund_table_elem.find_elements_by_tag_name('td')[2].text.replace('人気', ''))

    payout_result_list = []
    for i in range(len(result_list)):
        if refund_type in ['単勝', '複勝']:
            payout_result_list.append(_get_tansho_or_fukusho_result(i, race_id, refund_type, result_list, payout_list, ninki_list))
        elif refund_type in ['ワイド']:
            payout_result_list.append(_get_wide_result(i, race_id, refund_type, result_list, payout_list, ninki_list))
        else:
            payout_result_list.append(_get_other_result(i, race_id, refund_type, result_list, payout_list, ninki_list))
        
    return payout_result_list

In [None]:
def _extract_race_refund_info(race_id):
    empty_refund_list = []
    refund_table_list = driver.find_element_by_class_name('FullWrap').find_elements_by_tag_name('tr')
    for idx in range(len(refund_table_list)):
        refund_table_elem = refund_table_list[idx]
        refund_type = refund_table_elem.find_element_by_tag_name('th').text
        empty_refund_list += _get_payout_info(race_id, refund_table_elem, refund_type)

    return empty_refund_list

In [None]:
def get_race_result_and_refund_info():
    """Scrape result and refund info for races that still lack them.

    For each pending race id the result page is loaded; pages without result
    information are reported and skipped. The extracted lists are not
    persisted while the bulk-insert calls below stay commented out.
    """
    for race_row in _extract_race_ids_in_master_not_exist_in_race_result():
        race_id = race_row[0]  # each fetched row carries the id in its first column

        _load_target_url_page(_make_target_url_about_race_result(race_id))
        if not _does_the_race_result_url_have_info():
            print('\tThis URL has no information about: ', race_id)
            continue

        race_result_info_list = _extract_race_result_info(race_id)
        race_refund_info_list = _extract_race_refund_info(race_id)

#         _bulk_insert(race_result_info_list, 'race_result_info', parameters['TABLE_COL_NAMES']['race_result_info'])
#         _bulk_insert(race_refund_info_list, 'race_refund_info', parameters['TABLE_COL_NAMES']['race_refund_info'])                        

In [None]:
# get_race_result_and_refund_info()

In [None]:
# Scratch cell: list the race ids that still lack result or refund rows.
existing_race_ids_in_master = _extract_race_ids_in_master_not_exist_in_race_result()
existing_race_ids_in_master

In [None]:
# Pick one pending race by position (hard-coded index) and build its result URL.
id_idx = 9
race_id = existing_race_ids_in_master[id_idx][0]
target_url = _make_target_url_about_race_result(race_id)
print(target_url)

In [None]:
# Load the result page built in the previous cell into the shared driver.
_load_target_url_page(target_url)

In [None]:
# Display whether the loaded page exposes result information.
_does_the_race_result_url_have_info()

In [None]:
# Extract the finishing rows from the loaded page and display them.
race_result_info_list = _extract_race_result_info(race_id)
race_result_info_list

In [None]:
# Extract the refund records from the loaded page and display them.
race_refund_info_list = _extract_race_refund_info(race_id)
race_refund_info_list

## Get info about the past 5 race results

In [None]:
def _extract_race_ids_in_master_not_exist_in_race_past_5_result():
    query = """
            SELECT DISTINCT race_id 
            FROM race_master
            WHERE 0=0
            AND race_id NOT IN (SELECT DISTINCT race_id FROM race_past_5_result_info)
            AND race_year>={MIN_YEAR} AND race_year<={MAX_YEAR}
            AND race_month>={MIN_MONTH} AND race_month<={MAX_MONTH}
            AND race_date>={MIN_DATE} AND race_date<={MAX_DATE};
        """.format(MIN_YEAR=parameters['MIN_YEAR'], MAX_YEAR=parameters['MAX_YEAR'], 
                   MIN_MONTH=parameters['MIN_MONTH'], MAX_MONTH=parameters['MAX_MONTH'],
                   MIN_DATE=parameters['MIN_DATE'], MAX_DATE=parameters['MAX_DATE'])
    result = _fetchall_and_make_list_by(query)
    return result

def _make_target_url_about_past_5_race_result(race_id):
    return parameters['URL_ABOUT_NETKEIBA']['RACE_PAST5_RESULT'].format(RACE_ID=race_id)

In [None]:
def _extract_past_5_race_result(race_id):
    this_race_past5_result_info = []
    table_element = driver.find_element_by_class_name('Shutuba_HorseList').find_elements_by_class_name('HorseList')
    table_length = len(table_element)

    for row in range(table_length):
        bracket_num = table_element[row].find_elements_by_tag_name('td')[0].text
        horse_num = table_element[row].find_elements_by_tag_name('td')[1].text

        race_name_elem_list = table_element[row].find_elements_by_class_name('Data_Item')
        for i in range(len(race_name_elem_list)):
            past_x = i+1
            race_name_element = race_name_elem_list[i].find_element_by_class_name('Data02')
            past_x_race_title = race_name_element.text
            past_x_race_id = int(re.sub('\\D', '', race_name_element.find_element_by_tag_name('a').get_attribute("href")))
            arrival_order = race_name_elem_list[i].find_element_by_class_name('Num').text
            this_race_past5_result_info.append([
                race_id, 
                bracket_num, 
                horse_num, 
                past_x, 
                past_x_race_title, 
                past_x_race_id, 
                arrival_order
            ])

    return this_race_past5_result_info

In [None]:
def get_past_5_race_result_info():
    """Scrape the past-race entries for races that have none stored yet.

    For each pending race id the page is loaded and the entries extracted;
    races that yield no rows are reported and skipped. The rows are not
    persisted while the bulk-insert call below stays commented out.
    """
    for race_row in _extract_race_ids_in_master_not_exist_in_race_past_5_result():
        race_id = race_row[0]  # each fetched row carries the id in its first column

        _load_target_url_page(_make_target_url_about_past_5_race_result(race_id))

        race_past5_result_info_list = _extract_past_5_race_result(race_id)
        if len(race_past5_result_info_list) == 0:
            print('\tThis race has no past 5 race result info')
            continue

#         _bulk_insert(race_past5_result_info_list, 'race_past_5_result_info', 
#                      parameters['TABLE_COL_NAMES']['race_past_5_result_info'])                        

In [None]:
# Scratch cell: list the race ids that have no past-race rows stored yet.
existing_race_ids_in_master = _extract_race_ids_in_master_not_exist_in_race_past_5_result()
existing_race_ids_in_master

In [None]:
# Pick the first pending race and build its past-results URL.
id_idx = 0
race_id = existing_race_ids_in_master[id_idx][0]
target_url = _make_target_url_about_past_5_race_result(race_id)
# Alternative hard-coded URL kept for manual testing:
# target_url = 'https://race.netkeiba.com/race/shutuba_past.html?race_id=202009020511'
print(target_url)

In [None]:
# Load the past-results page built in the previous cell into the shared driver.
_load_target_url_page(target_url)

In [None]:
# Extract the past-race rows from the loaded page and display them.
race_past5_result_info_list = _extract_past_5_race_result(race_id)
race_past5_result_info_list