In [3]:
import os
import sys
import pymysql
import re
import time
import pandas as pd
import numpy as np

from Utils.bulk_insert import BulkInsert

import warnings
warnings.filterwarnings('ignore')

In [4]:
# Raise pandas' display limits to 100 columns / 100 rows so the wide race
# frames previewed below are not truncated when rendered.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

In [27]:
# Connection settings for the local MySQL instance holding the scraped race data.
# NOTE(review): the password can be overridden through the NETKEIBA_DB_PASSWORD
# environment variable. The literal fallback is kept only so the notebook still
# runs unchanged; it should be removed from the notebook and the credential rotated.
db_params = {
    'host': '127.0.0.1',
    'user': 'root',
    'password': os.environ.get('NETKEIBA_DB_PASSWORD', 'daigo1123'),
    'database': 'dev_netkeiba',
    'port': 3306,
    'charset': 'utf8'
}
# Single shared connection passed to every query helper in this notebook.
con = pymysql.connect(**db_params)

# Column labels applied to each query result when it is turned into a DataFrame.
# Each list must stay in the same order (and length) as the SELECT list of the
# query with the matching name in `queries`.
parameters = {
    'DATAFRAME_COL_NAMES': {
        'RACE_MASTER_INFO_COLS': [
            'race_id',
            'race_course_baba',
            'race_course_dist',
            'race_course_mawari',
            'race_weather',
            'race_condition',
            'race_year',
            'race_month',
            'race_date',
            'race_dow',
            'starting_hour',
            'starting_minutes'
        ],
        'RACE_TABLE_RESULT_INFO_COLS': [
            'race_id',
            'horse_num',
            'bracket_num',
            'href_to_horse',
            'horse_age',
            'horse_sex',
            'weight_penalty',
            'href_to_jockey',
            'href_to_owner',
            'popularity_order',
            'win_odds',
            'arrival_sec_diff_from_first',
            'arrival_order'
        ],
        'RACE_PAST_X_RESULT_INFO_COLS': [
            'race_id',
            'horse_num',
            'past_x',
            'past_x_arrival_order',
            'arrival_sec_diff_from_first'
        ]
    }
}

# SQL text keyed by dataset name.
#  - RACE_MASTER_INFO: one row per race, splitting `race_course` and
#    `starting_time` into separate columns.
#  - RACE_TABLE_RESULT_INFO: race_table_info joined to race_result_info on
#    (race_id, horse_num). The WHERE clause previously lacked the `AND` between
#    `0=0` and the arrival_order filter, which made the statement invalid SQL.
#  - RACE_PAST_X_RESULT_INFO: race_past_5_result_info joined to the result of
#    the referenced past race on (past_x_race_id, arrival_order).
queries = {
    'RACE_MASTER_INFO':
        '''
        SELECT 
            race_id
            , SUBSTRING(race_course, 1, 1) AS race_course_baba
            , CAST(SUBSTRING(race_course, 2, 4) AS SIGNED) AS race_course_dist
            , SUBSTRING(race_course, LOCATE('(',race_course)+1, LOCATE(')',race_course)- LOCATE('(',race_course)-1) AS race_course_mawari
            , race_weather
            , race_condition
            , race_year
            , race_month
            , race_date
            , race_dow
            , CAST(SUBSTRING(starting_time, 1, 2) AS SIGNED) AS starting_hour
            , CAST(SUBSTRING(starting_time, 4, 5) AS SIGNED) AS starting_minutes
        FROM race_master
        ''',

    'RACE_TABLE_RESULT_INFO':
        '''
        SELECT 
            A.race_id
            , A.horse_num
            , A.bracket_num
            , A.href_to_horse
            , A.horse_age
            , A.horse_sex
            , CAST(A.weight_penalty AS FLOAT) AS weight_penalty
            , A.href_to_jockey
            , A.href_to_owner
            , CAST(A.popularity_order AS SIGNED) AS popularity_order
            , CAST(A.win_odds AS FLOAT) AS win_odds
            , CASE WHEN B.arrival_order <> 1 THEN 
              CAST(ROUND(TIMEDIFF(B.arrival_time, FIRST_VALUE(B.arrival_time) OVER (PARTITION BY B.race_id ORDER BY CAST(B.arrival_order AS SIGNED)))/60, 3) AS FLOAT) 
              ELSE 0.000 END AS arrival_sec_diff_from_first
            , B.arrival_order
        FROM race_table_info AS A
        LEFT JOIN race_result_info AS B
        ON A.race_id = B.race_id
        AND A.horse_num = B.horse_num
        WHERE 0=0
        AND CAST(B.arrival_order AS SIGNED) NOT IN ('', 0)
        AND A.win_odds <> 0
        ''',

    'RACE_PAST_X_RESULT_INFO': 
        '''
        SELECT 
            A.race_id
            , A.horse_num
            , A.past_x
            , A.arrival_order AS past_x_arrival_order
            , B.arrival_sec_diff_from_first
        FROM race_past_5_result_info AS A
        INNER JOIN (
            SELECT *
            , ROUND(TIMEDIFF(arrival_time, FIRST_VALUE(arrival_time) OVER (PARTITION BY race_id ORDER BY arrival_order))/60, 3) AS arrival_sec_diff_from_first
            FROM race_result_info
            WHERE CAST(arrival_order AS SIGNED) NOT IN ('', 0)
        ) AS B
        ON A.past_x_race_id = B.race_id
        AND A.arrival_order = B.arrival_order
        WHERE 0=0
        AND CAST(A.arrival_order AS SIGNED) NOT IN ('', 0)
        AND A.past_x_race_id <> '201008060411'
        '''
}

## Extract Data from DB

In [8]:
def _fetchall_and_make_list_by(query, con):
    try:
        cursor = con.cursor()
        cursor.execute(query)
        fetch_result = cursor.fetchall()
        fetch_result_list = [item for item in fetch_result]
        cursor.close()
        return fetch_result_list
    except Exception as e:
        print(e)

In [9]:
def _get_race_master_data_frame(queries, parameters, con):
    """Run the ``RACE_MASTER_INFO`` query and wrap its rows in a DataFrame.

    Args:
        queries: Mapping holding the SQL text under ``'RACE_MASTER_INFO'``.
        parameters: Mapping holding the column labels under
            ``['DATAFRAME_COL_NAMES']['RACE_MASTER_INFO_COLS']``.
        con: DB connection handed to ``_fetchall_and_make_list_by``.

    Returns:
        A DataFrame of the fetched rows labelled with those column names.
    """
    rows = _fetchall_and_make_list_by(queries['RACE_MASTER_INFO'], con)
    column_names = parameters['DATAFRAME_COL_NAMES']['RACE_MASTER_INFO_COLS']
    return pd.DataFrame(rows, columns=column_names)

In [12]:
# Load the race master rows through the shared connection `con`.
race_master_df = _get_race_master_data_frame(queries, parameters, con)

In [14]:
# Sanity check: print the (rows, columns) shape, then preview the first rows.
print(race_master_df.shape)
race_master_df.head()

(40234, 12)


Unnamed: 0,race_id,race_course_baba,race_course_dist,race_course_mawari,race_weather,race_condition,race_year,race_month,race_date,race_dow,starting_hour,starting_minutes
0,200801010101,芝,1500,右,曇,良,2008,8,16,土,10,40
1,200801010102,ダ,1000,右,曇,稍重,2008,8,16,土,11,10
2,200801010103,ダ,1700,右,曇,稍重,2008,8,16,土,11,40
3,200801010104,芝,1500,右,曇,良,2008,8,16,土,12,30
4,200801010105,ダ,1700,右,曇,稍重,2008,8,16,土,13,5


In [17]:
def _get_race_table_result_data_frame(queries, parameters, con):
    """Run the ``RACE_TABLE_RESULT_INFO`` query and wrap its rows in a DataFrame.

    Args:
        queries: Mapping holding the SQL text under ``'RACE_TABLE_RESULT_INFO'``.
        parameters: Mapping holding the column labels under
            ``['DATAFRAME_COL_NAMES']['RACE_TABLE_RESULT_INFO_COLS']``.
        con: DB connection handed to ``_fetchall_and_make_list_by``.

    Returns:
        A DataFrame of the fetched rows labelled with those column names.
    """
    # Must run the query matching RACE_TABLE_RESULT_INFO_COLS. The previous code
    # executed the RACE_PAST_X_RESULT_INFO query, whose 5-column rows do not
    # line up with these 13 labels.
    race_table_result_list = _fetchall_and_make_list_by(queries['RACE_TABLE_RESULT_INFO'], con)
    return pd.DataFrame(race_table_result_list,
                        columns=parameters['DATAFRAME_COL_NAMES']['RACE_TABLE_RESULT_INFO_COLS'])

In [18]:
# Load the race table / result rows through the shared connection `con`.
race_table_result_df = _get_race_table_result_data_frame(queries, parameters, con)

In [20]:
# Sanity check: print the (rows, columns) shape, then preview the first rows.
print(race_table_result_df.shape)
race_table_result_df.head()

(571259, 13)


Unnamed: 0,race_id,horse_num,bracket_num,href_to_horse,horse_age,horse_sex,weight_penalty,href_to_jockey,href_to_owner,popularity_order,win_odds,arrival_sec_diff_from_first,arrival_order
0,200801010101,6,4,https://db.netkeiba.com/horse/2006102194/,2,牝,54.0,https://db.netkeiba.com/jockey/00700/,https://db.netkeiba.com/trainer/01027/,2,4.3,,1
1,200801010101,14,8,https://db.netkeiba.com/horse/2006101653/,2,牡,54.0,https://db.netkeiba.com/jockey/00705/,https://db.netkeiba.com/trainer/01099/,4,5.2,0.007,2
2,200801010101,1,1,https://db.netkeiba.com/horse/2006104966/,2,牡,54.0,https://db.netkeiba.com/jockey/00945/,https://db.netkeiba.com/trainer/01059/,1,3.1,0.008,3
3,200801010101,11,7,https://db.netkeiba.com/horse/2006102534/,2,牡,54.0,https://db.netkeiba.com/jockey/00722/,https://db.netkeiba.com/trainer/00388/,8,23.0,0.01,4
4,200801010101,2,2,https://db.netkeiba.com/horse/2006110090/,2,牝,54.0,https://db.netkeiba.com/jockey/01084/,https://db.netkeiba.com/trainer/00419/,6,21.9,0.01,5


In [28]:
def _get_race_past_x_result_data_frame(queries, parameters, con):
    """Run the ``RACE_PAST_X_RESULT_INFO`` query and wrap its rows in a DataFrame.

    Args:
        queries: Mapping holding the SQL text under ``'RACE_PAST_X_RESULT_INFO'``.
        parameters: Mapping holding the column labels under
            ``['DATAFRAME_COL_NAMES']['RACE_PAST_X_RESULT_INFO_COLS']``.
        con: DB connection handed to ``_fetchall_and_make_list_by``.

    Returns:
        A DataFrame of the fetched rows labelled with those column names.
    """
    rows = _fetchall_and_make_list_by(queries['RACE_PAST_X_RESULT_INFO'], con)
    column_names = parameters['DATAFRAME_COL_NAMES']['RACE_PAST_X_RESULT_INFO_COLS']
    return pd.DataFrame(rows, columns=column_names)

In [29]:
# Load the past-race result rows through the shared connection `con`.
race_past_x_result_df = _get_race_past_x_result_data_frame(queries, parameters, con)

In [30]:
# Sanity check: print the (rows, columns) shape, then preview the first rows.
print(race_past_x_result_df.shape)
race_past_x_result_df.head()

(1974449, 5)


Unnamed: 0,race_id,horse_num,past_x,past_x_arrival_order,arrival_sec_diff_from_first
0,200801010101,1,1,3,0.003
1,200801010101,2,1,9,1.677
2,200801010101,3,1,3,0.99
3,200801010101,3,2,2,0.992
4,200801010101,4,1,5,1.667


## Define explanatory variables and objective variable

## Feature Engineering with 'featuretools'

#### Final check: data profiling

In [None]:
# import pandas_profiling as pdp
# profile = pdp.ProfileReport(training_race_df)
# profile.to_file(output_file="Model/profile_report.html")
# profile