In [1]:
import getpass
import os
import re
import sys
import time

from bs4 import BeautifulSoup
import pandas as pd
import pymysql
import requests

from Utils.bulk_insert import BulkInsert

In [171]:
# Connection settings for the local MySQL database that stores the scraped data.
# The password is intentionally not kept in the notebook: it is read from the
# NETKEIBA_DB_PASSWORD environment variable, with an interactive prompt as fallback.
db_params = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': os.environ.get('NETKEIBA_DB_PASSWORD') or getpass.getpass('MySQL password for dev_netkeiba: '),
    'database': 'dev_netkeiba',
    'port': 3306,
    'charset': 'utf8'
}
# Module-level connection used by the query helpers defined further down.
con = pymysql.connect(**db_params)

# Notebook-wide settings: scraping URLs and the column order of each target table.
parameters = dict(
    # parameters about scraping
    URL_ABOUT_NETKEIBA=dict(
        # Prefix of the race-table page; the race id is appended to it.
        RACE_TABLE='https://race.netkeiba.com/?pid=race_old&id=c',
        # Result-page template; {RACE_ID} is presumably filled in with str.format.
        RACE_RESULT='https://race.netkeiba.com/?pid=race&id=c{RACE_ID}&mode=result',
    ),

    # parameters about model training (none defined yet)

    # col names in database tables
    TABLE_COL_NAMES=dict(
        # NOTE(review): 'race_coure' looks like a typo of "course"; it presumably
        # matches the existing column name, so it is kept as is.
        race_master=[
            'race_id', 'race_title', 'race_coure', 'race_weather', 'race_condition',
            'race_year', 'race_month', 'race_date', 'race_dow', 'starting_time',
            'race_info_1', 'race_info_2', 'race_info_3',
        ],
        race_table_info=[
            'race_id', 'bracket_num', 'horse_num', 'horse_name', 'horse_age',
            'horse_sex', 'weight_penalty', 'jockey_name', 'href_to_jockey', 'owner_name',
            'href_to_owner', 'horse_weight', 'horse_weight_increment', 'win_odds',
            'popularity_order',
        ],
    ),

    # col names in dataframe (none defined yet)
    DATAFRAME_COL_NAMES=dict(),
)

## Get info from the pre-race table

In [3]:
def get_num_str(num):
    """Return ``num`` as a string, prefixed with '0' when it is below 10."""
    if num >= 10:
        return str(num)
    return '0' + str(num)

In [4]:
def make_race_id_and_target_url(event_year, event_place, event_month, event_time, event_race):
    """Build a race id and the URL of its race-table page.

    The id is the year followed by the four remaining components, each rendered
    by ``get_num_str``; for example (2019, 4, 2, 6, 2) gives '201904020602'.

    Returns:
        Tuple ``(race_id, target_url)`` where ``target_url`` is the configured
        RACE_TABLE prefix with ``race_id`` appended.
    """
    race_id = str(event_year) + ''.join(
        get_num_str(part) for part in (event_place, event_month, event_time, event_race)
    )
    target_url = parameters['URL_ABOUT_NETKEIBA']['RACE_TABLE'] + race_id

    return race_id, target_url


def make_target_url_about_race_result(race_id):
    """Return the result-page URL for ``race_id``.

    Fills the ``{RACE_ID}`` placeholder of the configured RACE_RESULT template,
    e.g. '201901010101' gives
    'https://race.netkeiba.com/?pid=race&id=c201901010101&mode=result'.
    """
    return parameters['URL_ABOUT_NETKEIBA']['RACE_RESULT'].format(RACE_ID=race_id)

In [5]:
def is_request_result_ng(html, soup):
    """Tell whether a fetched race-table page is unusable.

    Args:
        html: response object exposing ``status_code``.
        soup: parsed page exposing ``find_all``.

    Returns:
        False when the status code is 200 and at least one
        ``race_table_old nk_tb_common`` table is present; True otherwise.
    """
    if html.status_code != 200:
        return True
    # class_ is the keyword the other table lookups in this notebook use; an
    # attrs argument has to be a mapping ({'class': ...}), not a set literal.
    race_tables = soup.find_all('table', class_='race_table_old nk_tb_common')
    return not race_tables

In [6]:
def extract_common_info(soup, race_id):
    """Extract the race-level fields from a parsed race-table page.

    Everything is read from the ``div.data_intro`` element: the title from its
    ``h1``, the course and the "weather/ground/start time" line from its first
    two ``p`` elements, and the date plus three free-text lines from the ``p``
    elements of its ``div.race_otherdata`` child. Non-breaking spaces are
    replaced with plain spaces in the text fields.

    Args:
        soup: parsed page (BeautifulSoup-like object).
        race_id: id of the race; returned unchanged as the first field.

    Returns:
        Tuple in the order race_id, race_title, race_coure, race_weather,
        race_condition, race_year, race_month, race_date, race_dow,
        starting_time, race_info_1, race_info_2, race_info_3.

    Raises:
        AttributeError: if an expected element or labelled value is missing.
        IndexError: if fewer ``p`` elements or separators exist than expected.
    """
    def _clean(text):
        # Normalise non-breaking spaces to plain spaces.
        return text.replace(u'\xa0', u' ')

    # Look each container up once instead of re-searching the document per field.
    data_intro = soup.find('div', class_='data_intro')
    intro_paragraphs = data_intro.find_all('p')
    other_paragraphs = data_intro.find('div', class_='race_otherdata').find_all('p')

    race_title = _clean(data_intro.find('h1').text)
    race_coure = _clean(intro_paragraphs[0].text)

    # Second paragraph holds three '/'-separated, labelled values.
    conditions = _clean(intro_paragraphs[1].text).split('/')
    race_weather = re.search('天気：(.*)', conditions[0]).group(1)
    race_condition = re.search('馬場：(.*)', conditions[1]).group(1)
    starting_time = re.search('発走：(.*)', conditions[2]).group(1)

    # Date text is split on '/', '(' and ')'; the raw string avoids the invalid
    # escape sequences a plain literal would contain.
    date_parts = re.split(r'/|\(|\)', other_paragraphs[0].text)
    race_year = date_parts[0]
    race_month = date_parts[1]
    race_date = date_parts[2]
    race_dow = date_parts[3]

    race_info_1 = _clean(other_paragraphs[1].text)
    race_info_2 = _clean(other_paragraphs[2].text)
    race_info_3 = _clean(other_paragraphs[3].text)

    return race_id, race_title, race_coure, race_weather, race_condition, race_year, race_month, race_date, race_dow, starting_time, race_info_1, race_info_2, race_info_3

In [7]:
def extract_race_table(soup, race_id):
    """Extract one record per data row of the race table on a parsed page.

    Reads the ``race_table_old nk_tb_common`` table, skips its first three
    ``tr`` rows and turns every remaining row into a list of 15 values.

    Args:
        soup: parsed page (BeautifulSoup-like object).
        race_id: id of the race; stored as the first value of every record.

    Returns:
        List of records ordered as race_id, bracket_num, horse_num, horse_name,
        horse_age, horse_sex, weight_penalty, jockey_name, href_to_jockey,
        owner_name, href_to_owner, horse_weight, horse_weight_increment,
        win_odds, popularity_order. ``horse_weight`` and
        ``horse_weight_increment`` are empty strings when the weight cell is
        empty.

    Raises:
        ValueError, AttributeError, IndexError: if a row does not have the
        expected cells or cell contents.
    """
    header_row_count = 3  # leading rows that carry no horse data

    # Find the table and list its rows once, not once per field per row.
    rows = soup.find('table', class_='race_table_old nk_tb_common').find_all('tr')

    this_race_table_info = []
    for table_row in rows[header_row_count:]:
        cells = table_row.find_all('td')

        bracket_num = int(cells[0].text)
        horse_num = int(cells[1].text)
        horse_name = cells[3].find('a').text

        # One cell holds both values: the digits are the age, the first
        # character is taken as the sex.
        sex_and_age = cells[4].text
        horse_age = int(re.sub(r'\D', '', sex_and_age))
        horse_sex = re.match('[0-9a-zA-Zあ-んア-ン一-鿐]', sex_and_age).group()

        weight_penalty = float(cells[5].text)
        jockey_name = cells[6].text
        href_to_jockey = cells[6].find('a').attrs['href']
        owner_name = cells[7].text
        href_to_owner = cells[7].find('a').attrs['href']

        # Weight cell looks like 'weight(increment)' and may be empty.
        horse_weight_info = cells[8].text
        if horse_weight_info != '':
            weight_parts = re.split(r'\(|\)', horse_weight_info)
            horse_weight = int(weight_parts[0])
            horse_weight_increment = weight_parts[1]
        else:
            horse_weight = ''
            horse_weight_increment = ''

        win_odds = cells[9].text
        popularity_order = cells[10].text

        this_race_table_info.append([
            race_id,
            bracket_num,
            horse_num,
            horse_name,
            horse_age,
            horse_sex,
            weight_penalty,
            jockey_name,
            href_to_jockey,
            owner_name,
            href_to_owner,
            horse_weight,
            horse_weight_increment,
            win_odds,
            popularity_order
        ])

    return this_race_table_info

In [12]:
def _execute_query(query):
    """Run one SQL statement on the module-level connection and commit it.

    Best-effort: any exception is printed instead of raised, so callers are not
    told whether the statement succeeded.

    Args:
        query: SQL text to execute.
    """
    try:
        # The context manager closes the cursor even when execute() raises.
        with con.cursor() as cursor:
            cursor.execute(query)
        con.commit()
    except Exception as e:
        print(e)

def _truncate_target_rows(race_id):
    """Delete the rows of one race from race_master and race_table_info.

    Each statement is printed and then run through ``_execute_query``.

    Args:
        race_id: id of the race whose rows are removed.
    """
    # TRUNCATE TABLE takes no WHERE clause, so row-level cleanup needs DELETE.
    queries = [
        "DELETE FROM race_master WHERE race_id = '{RACE_ID}';".format(RACE_ID=race_id),
        "DELETE FROM race_table_info WHERE race_id = '{RACE_ID}';".format(RACE_ID=race_id)
    ]
    for query in queries:
        print(query)
        _execute_query(query)

def _bulk_insert(race_id, insert_list, target_table_name, insert_col_names):
    """Insert rows into a table through ``BulkInsert``, cleaning up on TypeError.

    Args:
        race_id: race whose rows are removed if the insert raises TypeError.
        insert_list: rows handed to ``BulkInsert.execute`` as ``insert_data``.
        target_table_name: name of the destination table.
        insert_col_names: column names, passed as ``col_names``.

    Raises:
        TypeError: re-raised after the error is printed and the rows for
            ``race_id`` have been removed.
    """
    try:
        bi = BulkInsert(con)
        bi.execute(insert_data=insert_list, target_table=target_table_name, col_names=insert_col_names)
    except TypeError as e:
        print(e)
        _truncate_target_rows(race_id)
        # A bare raise keeps the original message and traceback.
        raise

In [None]:
def get_and_insert_race_master_and_table_info(
        event_years=range(2019, 2020),
        event_places=range(1, 2),
        event_months=range(1, 2),
        event_times=range(1, 2),
        event_races=range(1, 2)):
    """Scrape race-table pages and bulk-insert them, one batch per event_time.

    For every (year, place, month, time) combination the races are fetched in
    order until a page is reported as not available; the races collected so far
    are then inserted into ``race_master`` and ``race_table_info``. One second
    is slept after each successfully scraped page.

    Args:
        event_years, event_places, event_months, event_times, event_races:
            iterables of the id components to visit. The defaults visit the
            single race 2019/1/1/1/1.

    Returns:
        Tuple ``(race_master_list, race_table_info_list)`` holding every record
        that was scraped during the call.

    Raises:
        TypeError: propagated from ``_bulk_insert``.
    """
    all_race_master = []
    all_race_table_info = []

    for event_year in event_years:
        for event_place in event_places:
            for event_month in event_months:
                for event_time in event_times:
                    # Start a new batch per event_time; resetting inside the
                    # race loop would keep only the last race of the batch.
                    race_master_list = []
                    race_table_info_list = []
                    last_scraped_race_id = None

                    for event_race in event_races:
                        race_id, target_url = make_race_id_and_target_url(event_year, event_place, event_month, event_time, event_race)
                        html = requests.get(target_url)
                        html.encoding = 'EUC-JP'
                        soup = BeautifulSoup(html.text, 'html.parser')

                        if is_request_result_ng(html, soup):
                            print('Target URL to requests ', target_url, 'does not exist.')
                            break

                        print('Target URL to requests: ', target_url)
                        race_master_list.append(extract_common_info(soup, race_id))
                        race_table_info_list.extend(extract_race_table(soup, race_id))
                        last_scraped_race_id = race_id
                        time.sleep(1)

                    # Nothing scraped for this batch: skip the inserts.
                    if not race_master_list:
                        continue

                    # NOTE(review): on failure only the rows of this one race id
                    # are cleaned up, not the whole batch.
                    _bulk_insert(last_scraped_race_id, race_master_list, 'race_master', parameters['TABLE_COL_NAMES']['race_master'])
                    _bulk_insert(last_scraped_race_id, race_table_info_list, 'race_table_info', parameters['TABLE_COL_NAMES']['race_table_info'])

                    all_race_master.extend(race_master_list)
                    all_race_table_info.extend(race_table_info_list)

    return all_race_master, all_race_table_info

In [None]:
# Run the scrape-and-insert job and keep both returned lists for the cells below.
race_master_list, race_table_info_list = get_and_insert_race_master_and_table_info()

In [None]:
# Sanity check: (rows, columns) of each scraped list once loaded into a DataFrame.
print(pd.DataFrame(race_master_list).shape)
print(pd.DataFrame(race_table_info_list).shape)

In [None]:
# Dump the race-level records to race_master.csv (relative path, no index column).
pd.DataFrame(race_master_list).to_csv('race_master.csv', index=False)

In [None]:
# Dump the per-horse records to race_table_info_list.csv (relative path, no index column).
pd.DataFrame(race_table_info_list).to_csv('race_table_info_list.csv', index=False)

In [76]:
# Manual check on a single race: pick the id components by hand.
event_year = 2019
event_place = 4 
event_month = 2
event_time = 6
event_race = 2

# Build the id and race-table URL, and show both.
race_id, target_url = make_race_id_and_target_url(event_year, event_place, event_month, event_time, event_race)
print(race_id)
print('Target URL to requests: ', target_url)

# Fetch and parse the page; the response is decoded as EUC-JP before parsing.
# These names (html, soup, race_id) are reused by the cells that follow.
html = requests.get(target_url)
html.encoding = 'EUC-JP'
soup = BeautifulSoup(html.text, 'html.parser')

201904020602
Target URL to requests:  https://race.netkeiba.com/?pid=race_old&id=c201904020602


In [78]:
# False means the page answered with status 200 and contains the race table.
is_request_result_ng(html, soup)

False

In [50]:
# Race-level fields of the manually fetched race, as a flat list.
# Note that this rebinds race_master_list, which the batch run above also assigns.
race_master_list = list(extract_common_info(soup, race_id))
# race_master_list

In [51]:
# Per-horse records of the manually fetched race (one list per table row).
# Note that this rebinds race_table_info_list, which the batch run above also assigns.
race_table_info_list = extract_race_table(soup, race_id)
# race_table_info_list

In [52]:
# _bulk_insert(race_id, race_master_list, 'race_master', parameters['TABLE_COL_NAMES']['race_master'])

In [53]:
# _bulk_insert(race_id, race_table_info_list, 'race_table_info', parameters['TABLE_COL_NAMES']['race_table_info'])                        

## Get info about race result

In [172]:
def _fetchall_and_make_list_by(query, con):
    try:
        cursor = con.cursor()
        cursor.execute(query)
        fetch_result = cursor.fetchall()
        fetch_result_list = [item for item in fetch_result]
        cursor.close()
        return fetch_result_list
    except Exception as e:
        print(e)

In [173]:
def extract_existing_race_ids_in_master():
    """Return the race ids in race_master that are absent from race_result_info.

    The rows come back exactly as ``_fetchall_and_make_list_by`` returns them
    for the module-level connection.
    """
    sql = """
        SELECT DISTINCT race_id 
        FROM race_master
        WHERE race_id NOT IN (SELECT DISTINCT race_id FROM race_result_info);
    """
    return _fetchall_and_make_list_by(sql, con)

In [174]:
# Race ids still lacking result rows; show the first row to check its shape.
existing_race_ids_in_master = extract_existing_race_ids_in_master()
existing_race_ids_in_master[0]

('201901010101',)

In [175]:
# Take the first pending race; each row is a 1-tuple, hence the trailing [0].
id_idx = 0
race_id = existing_race_ids_in_master[id_idx][0]
# Build and show the result-page URL for that race.
target_url_about_result = make_target_url_about_race_result(race_id)
print(target_url_about_result) 

https://race.netkeiba.com/?pid=race&id=c201901010101&mode=result


In [136]:
# Fetch and parse the result page; the response is decoded as EUC-JP before parsing.
# This rebinds html and soup, which earlier cells used for the race-table page.
html = requests.get(target_url_about_result)
html.encoding = 'EUC-JP'
soup = BeautifulSoup(html.text, 'html.parser')

In [137]:
# True when the result table is missing from the fetched page.
# class_ is used because attrs must be a mapping, not a set literal.
not soup.find_all('table', class_='race_table_01 nk_tb_common')

False

In [141]:
# Number of <tr> rows in the result table (includes any non-data rows).
table_length = len(soup.find('table', class_='race_table_01 nk_tb_common').find_all('tr'))
table_length

10

In [None]:
def extract_race_result_info

In [158]:
# Index of the result-table row inspected by the exploratory cells below.
row = 2

In [180]:
# Text of cell 0 of the selected row (presumably the finishing position).
# NOTE(review): the variable name is spelled with three r's.
arrrival_order  = soup.find('table', class_='race_table_01 nk_tb_common').find_all('tr')[row].find_all('td')[0].text
arrrival_order

'2'

In [160]:
# Text of cell 1 of the selected row; still a string here, not converted to int.
bracket_num = soup.find('table', class_='race_table_01 nk_tb_common').find_all('tr')[row].find_all('td')[1].text
bracket_num

'3'

In [161]:
# Text of cell 2 of the selected row; still a string here, not converted to int.
horse_num = soup.find('table', class_='race_table_01 nk_tb_common').find_all('tr')[row].find_all('td')[2].text
horse_num

'3'

In [178]:
# Text of cell 7 of the selected row (presumably the finishing time, e.g. '1:50.1').
arrival_time = soup.find('table', class_='race_table_01 nk_tb_common').find_all('tr')[row].find_all('td')[7].text
arrival_time

'1:50.1'

In [179]:
# Text of cell 8 of the selected row (presumably the margin; may be non-numeric text).
arrival_diff = soup.find('table', class_='race_table_01 nk_tb_common').find_all('tr')[row].find_all('td')[8].text
arrival_diff

'大'

In [179]:
# NOTE(review): verbatim repeat of the preceding cell (same code and execution count).
arrival_diff = soup.find('table', class_='race_table_01 nk_tb_common').find_all('tr')[row].find_all('td')[8].text
arrival_diff

'大'

In [168]:
# All <tr> rows under the first <dd class="fc"> element; each row has a <th> label
# followed by three <td> cells, and a cell may hold several <br/>-separated values.
soup.find('dd', class_='fc').find_all('tr')

[<tr>
 <th class="tan">単勝</th>
 <td>01</td>
 <td class="txt_r">140円</td>
 <td class="txt_r">1人気</td>
 </tr>, <tr>
 <th align="center" class="fuku">複勝</th>
 <td>01<br/>03<br/>04</td>
 <td class="txt_r">110円<br/>110円<br/>470円</td>
 <td class="txt_r">1人気<br/>2人気<br/>7人気</td>
 </tr>, <tr>
 <th align="center" class="waku">枠連</th>
 <td>01 - 03</td>
 <td class="txt_r">190円</td>
 <td class="txt_r">1人気</td>
 </tr>, <tr>
 <th align="center" class="uren">馬連</th>
 <td>01 - 03</td>
 <td class="txt_r">190円</td>
 <td class="txt_r">1人気</td>
 </tr>, <tr>
 <th class="wide">ワイド</th>
 <td>01 - 03<br/>01 - 04<br/>03 - 04</td>
 <td class="txt_r">120円<br/>840円<br/>1,100円</td>
 <td class="txt_r">1人気<br/>11人気<br/>13人気</td>
 </tr>, <tr>
 <th class="utan">馬単</th>
 <td>01 → 03</td>
 <td class="txt_r">290円</td>
 <td class="txt_r">1人気</td>
 </tr>, <tr>
 <th class="sanfuku">三連複</th>
 <td>01 - 03 - 04</td>
 <td class="txt_r">1,610円</td>
 <td class="txt_r">6人気</td>
 </tr>, <tr>
 <th class="santan">三連単</th>
 <td>01 → 0

In [74]:
def get_race_result_info():
    existing_race_ids_in_master = extract_existing_race_ids_in_master()
    
    race_result_info_list = []
    rece_refund_info_list = []
    
    for id_idx in len(existing_race_ids_in_master):
        race_id = existing_race_ids_in_master[id_idx][0]
        target_url = make_target_url_about_race_result(race_id)
        
        html = requests.get(target_url)
        html.encoding = 'EUC-JP'
        soup = BeautifulSoup(html.text, 'html.parser')

        if not soup.find_all('table', attrs={'class', 'race_table_01 nk_tb_common'}):
            print('Target URL to requests ', target_url, 'does not exist.')
            break

        print('Target URL to requests: ', target_url)
        
        race_result_list = 