In [1]:
import os
import sys
import pymysql
import re
import time
import pandas as pd
import numpy as np
import category_encoders as ce
import featuretools as ft
import h2o
from h2o.automl import H2OAutoML

from Utils.bulk_insert import BulkInsert
from Config import params_config, db_config, queries_config

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Let pandas display up to 400 columns and 400 rows before truncating output.
pd.set_option('display.max_columns', 400)
pd.set_option('display.max_rows', 400)

In [3]:
# Connection settings are read from the environment so that credentials are
# never stored in the notebook. Only the password is mandatory (a missing
# NETKEIBA_DB_PASSWORD raises KeyError immediately); the remaining settings
# fall back to the local development defaults.
db_params = {
    'host': os.environ.get('NETKEIBA_DB_HOST', '127.0.0.1'),
    'user': os.environ.get('NETKEIBA_DB_USER', 'root'),
    'password': os.environ['NETKEIBA_DB_PASSWORD'],
    'database': os.environ.get('NETKEIBA_DB_NAME', 'dev_netkeiba'),
    'port': int(os.environ.get('NETKEIBA_DB_PORT', 3306)),
    'charset': 'utf8'
}
con = pymysql.connect(**db_params)
# Column-name definitions and SQL strings come from the project's Config package.
parameters = params_config.parameters
queries = queries_config.queries

## Extract Data from DB

In [4]:
def _fetchall_and_make_list_by(query, con):
    try:
        cursor = con.cursor()
        cursor.execute(query)
        fetch_result = cursor.fetchall()
        fetch_result_list = [item for item in fetch_result]
        cursor.close()
        return fetch_result_list
    except Exception as e:
        print(e)

In [5]:
def _get_race_master_data_frame(queries, parameters, con):
    """Fetch the rows of the 'RACE_MASTER_INFO' query into a DataFrame.

    Column names are taken from
    ``parameters['DATAFRAME_COL_NAMES']['RACE_MASTER_INFO_COLS']``.
    """
    rows = _fetchall_and_make_list_by(queries['RACE_MASTER_INFO'], con)
    column_names = parameters['DATAFRAME_COL_NAMES']['RACE_MASTER_INFO_COLS']
    return pd.DataFrame(rows, columns=column_names)

In [6]:
def _get_race_table_result_data_frame(queries, parameters, con):
    """Fetch the rows of the 'RACE_TABLE_RESULT_INFO' query into a DataFrame.

    Column names are taken from
    ``parameters['DATAFRAME_COL_NAMES']['RACE_TABLE_RESULT_INFO_COLS']``.
    """
    rows = _fetchall_and_make_list_by(queries['RACE_TABLE_RESULT_INFO'], con)
    column_names = parameters['DATAFRAME_COL_NAMES']['RACE_TABLE_RESULT_INFO_COLS']
    return pd.DataFrame(rows, columns=column_names)

In [7]:
def _get_race_past_x_result_data_frame(queries, parameters, con):
    """Fetch the rows of the 'RACE_PAST_X_RESULT_INFO' query into a DataFrame.

    Column names are taken from
    ``parameters['DATAFRAME_COL_NAMES']['RACE_PAST_X_RESULT_INFO_COLS']``.
    """
    rows = _fetchall_and_make_list_by(queries['RACE_PAST_X_RESULT_INFO'], con)
    column_names = parameters['DATAFRAME_COL_NAMES']['RACE_PAST_X_RESULT_INFO_COLS']
    return pd.DataFrame(rows, columns=column_names)

In [8]:
# Load the race master query result into a DataFrame.
race_master_df = _get_race_master_data_frame(queries, parameters, con)

In [9]:
# Load the race table result query result into a DataFrame.
race_table_result_df = _get_race_table_result_data_frame(queries, parameters, con)

In [10]:
# Load the past-X result query result into a DataFrame.
race_past_x_result_df = _get_race_past_x_result_data_frame(queries, parameters, con)

## Define index variable and objective variable

In [11]:
def _make_race_horse_id(row):
    horse_num_str = str(row['horse_num']) if row['horse_num'] >= 10 else '0' + str(row['horse_num'])
    return row['race_id'] + '_' + horse_num_str

In [12]:
def preprocess_race_table_result_df_idx(df):
    """Add a 'race_horse_id' column to ``df`` (in place) and return ``df``.

    The value of each row is produced by ``_make_race_horse_id``.
    """
    race_horse_ids = df.apply(_make_race_horse_id, axis='columns')
    df['race_horse_id'] = race_horse_ids
    return df

In [13]:
# Add the 'race_horse_id' key column to the race table results.
race_table_result_df = preprocess_race_table_result_df_idx(race_table_result_df)

In [14]:
def _make_race_horse_past_x_id(row):
    horse_num_str = str(row['horse_num']) if row['horse_num'] >= 10 else '0' + str(row['horse_num'])
    return row['race_id'] + '_' + horse_num_str + '_' + str(row['past_x'])

In [15]:
def preprocess_race_past_x_result_df_idx(df):
    """Add 'race_horse_id' and 'race_horse_past_x_id' columns to ``df``.

    ``df`` is modified in place and also returned.
    """
    id_builders = (
        ('race_horse_id', _make_race_horse_id),
        ('race_horse_past_x_id', _make_race_horse_past_x_id),
    )
    for column_name, builder in id_builders:
        df[column_name] = df.apply(builder, axis=1)
    return df

In [None]:
# Add the 'race_horse_id' and 'race_horse_past_x_id' key columns to the past-X results.
race_past_x_result_df = preprocess_race_past_x_result_df_idx(race_past_x_result_df)

In [None]:
def _define_target_variable(row, model_type='win', obj_type='odds_or_zero'):
    if model_type == 'win' and obj_type == 'odds_or_zero':
        if row['arrival_order'] == 1 or row['arrival_sec_diff_from_first'] <= 0.002:
            return row['win_odds']
        else:
            return 0

In [None]:
def preprocess_target_variable(df):
    """Add the target column 'y' to ``df`` (in place) and return ``df``.

    Each row's value comes from ``_define_target_variable`` with
    ``model_type='win'`` and ``obj_type='odds_or_zero'``.
    """
    target_kwargs = {'model_type': 'win', 'obj_type': 'odds_or_zero'}
    targets = df.apply(_define_target_variable, axis=1, **target_kwargs)
    df['y'] = targets
    return df

In [None]:
# Add the target column 'y' to the race table results.
race_table_result_df = preprocess_target_variable(race_table_result_df)

#### Check data frame

In [None]:
# Sanity check: shape and first rows of the race master frame.
print(race_master_df.shape)
race_master_df.head()

In [None]:
# Sanity check: shape and first rows of the race table result frame.
print(race_table_result_df.shape)
race_table_result_df.head()

In [None]:
# Sanity check: shape and first rows of the past-X result frame.
print(race_past_x_result_df.shape)
race_past_x_result_df.head()

## ここまでがデータ固有の前処理,　以降はAutoProcessing

In [None]:
# Central configuration read by the encoding, feature-engineering and
# train/test-split steps below.
model_params = {    
    # Column name -> name of the category_encoders encoder applied to it
    # (see encode_category_variables).
    'CATEGORICAL_FEATURES_DICT': {
        'race_place': 'OrdinalEncoder',
        'race_corse_baba': 'OrdinalEncoder',
        'race_corse_mawari': 'OrdinalEncoder',
        'race_weather': 'OrdinalEncoder',
        'race_condition': 'OrdinalEncoder',
        'race_dow': 'OrdinalEncoder',
        
        'href_to_horse': 'LeaveOneOutEncoder',
        'horse_sex': 'OrdinalEncoder',
        'href_to_jockey': 'LeaveOneOutEncoder',
        'href_to_owner': 'LeaveOneOutEncoder'
    },
    
    # Columns handed to featuretools for each of the three entities, plus the
    # primitives passed to ft.dfs.
    'FEATURETOOLS_PARAMS': {
        # Key columns per entity (index and relationship columns).
        'INDEX_COL': {
            'RACE_MASTER': ['race_id'],
            'RACE_TABLE_RESULT': ['race_id', 'race_horse_id'],
            'RACE_PAST_X_RESULT': ['race_horse_id', 'race_horse_past_x_id']
        },
        # Feature columns per entity.
        'FEATURE_COL': {
            'RACE_MASTER': [
                'race_round',
                'race_kai',
                'race_place',
                'race_corse_baba',
                'race_corse_dist',
                'race_corse_mawari',
                'race_weather',
                'race_condition',
                'race_year',
                'race_month',
                'race_date',
                'race_dow',
                'starting_hour',
                'starting_minutes'
            ],
            'RACE_TABLE_RESULT': [
                'bracket_num',
                'href_to_horse',
                'horse_age',
                'horse_sex',
                'weight_penalty',
                'href_to_jockey',
                'href_to_owner',
                'popularity_order',
                'win_odds'
            ],
            'RACE_PAST_X_RESULT': [
                'past_x_arrival_order',
                'arrival_sec_diff_from_first'
            ]
        },
        # No transform primitives are used; only aggregations.
        'PRIMITIVES': {
            'aggregation': ['sum', 'mean', 'std', 'max', 'min', 'count', 'skew'],
            'transform': []
        }
    },
    
    # Settings read by make_train_test_data.
    'TRAIN_TEST_SPLIT': {
        'INDEX_COL': ['race_id', 'horse_num'],
        # Columns removed from the feature set (keys and the target).
        'EXCLUDE_COL': ['race_id', 'horse_num', 'y'],
        'TARGET_COL': 'y',
        # Ordered thresholds (year first, then month) used to split train from test.
        'CRITERIA_TO_SPLIT_DATA': {'race_master.race_year': 2019, 'race_master.race_month': 3}
    }
    
}

## Encoding categorical features
- 参考URL: https://qiita.com/Hyperion13fleet/items/afa49a84bd5db65ffc31

In [None]:
def encode_category_variables(df, parameters):
    """Encode the categorical columns of ``df`` listed in ``model_params``.

    For every entry of ``model_params['CATEGORICAL_FEATURES_DICT']`` whose
    column exists in ``df``, the named category_encoders encoder is fitted on
    ``df`` and applied to it. Columns missing from ``df`` are skipped, and
    encoder names other than the three handled below are ignored.

    Args:
        df: DataFrame to encode.
        parameters: Accepted for call-site compatibility; it is not read.
            The configuration comes from the module-level ``model_params``.

    Returns:
        The encoded DataFrame.

    Raises:
        KeyError: If a 'LeaveOneOutEncoder' column is present in ``df`` but
            the target column is not.
    """
    # The target column name lives under TRAIN_TEST_SPLIT; the previously used
    # top-level 'TARGET_COL_NAME' key does not exist in model_params.
    target_col = model_params['TRAIN_TEST_SPLIT']['TARGET_COL']

    for key, value in model_params['CATEGORICAL_FEATURES_DICT'].items():
        if key not in df.columns:
            continue
        # ``cols`` is passed as a one-element list rather than a bare string.
        if value == 'OrdinalEncoder':
            ce_oe = ce.OrdinalEncoder(cols=[key], handle_unknown='impute')
            df = ce_oe.fit_transform(df)
        elif value == 'OneHotEncoder':
            ce_ohe = ce.OneHotEncoder(cols=[key], handle_unknown='impute')
            df = ce_ohe.fit_transform(df)
        elif value == 'LeaveOneOutEncoder':
            # NOTE(review): the encoder is fitted on every row of df, including
            # rows that later end up in the test split -- confirm that this
            # target leakage is acceptable.
            ce_looe = ce.LeaveOneOutEncoder(cols=[key], handle_unknown='impute')
            df = ce_looe.fit_transform(df, y=df[target_col])
    return df

In [None]:
# Encode the categorical columns of the race master frame, then inspect the result.
race_master_df = encode_category_variables(race_master_df, parameters)
print(race_master_df.shape)
race_master_df.head()

In [None]:
# Encode the categorical columns of the race table result frame, then inspect the result.
race_table_result_df = encode_category_variables(race_table_result_df, parameters)
print(race_table_result_df.shape)
race_table_result_df.head()

In [None]:
# Encode the categorical columns of the past-X result frame, then inspect the result.
race_past_x_result_df = encode_category_variables(race_past_x_result_df, parameters)
print(race_past_x_result_df.shape)
race_past_x_result_df.head()

## Feature Engineering by 'featuretools'
- 参考URL <br>
: https://qiita.com/Hyperion13fleet/items/4eaca365f28049fe11c7 <br>
: https://docs.featuretools.com/en/stable/generated/featuretools.dfs.html#featuretools.dfs

In [None]:
# # Check the primitives
# ft.primitives.list_primitives()
# # print(ft.primitives.list_primitives().iloc[4,2])

In [None]:
def decode_race_horse_id(feature_matrix_df):
    """Split the 'race_horse_id' index of ``feature_matrix_df`` into columns.

    The index must be named 'race_horse_id' and hold '<race_id>_<horse_num>'
    strings. The result has a fresh 0..n-1 index with two columns: 'race_id'
    (the text before the first underscore) and 'horse_num' (the second
    underscore-separated part, converted to int).
    """
    index_frame = pd.DataFrame(feature_matrix_df.index)
    id_parts = index_frame['race_horse_id'].map(lambda key: key.split('_'))
    return pd.DataFrame({
        'race_id': id_parts.map(lambda parts: parts[0]),
        'horse_num': id_parts.map(lambda parts: int(parts[1])),
    })

In [None]:
def engeneer_features_by_featuretools(model_params, race_master_df=None,
                                      race_table_result_df=None, race_past_x_result_df=None):
    """Run featuretools Deep Feature Synthesis over the three race tables.

    Builds an EntitySet with the entities 'race_master' (index 'race_id'),
    'race_table' (index 'race_horse_id') and 'race_past_x' (index
    'race_horse_past_x_id'), links master -> table on 'race_id' and
    table -> past_x on 'race_horse_id', and runs ``ft.dfs`` with 'race_table'
    as the target entity and ``max_depth=2``.

    Args:
        model_params: Configuration dict; only 'FEATURETOOLS_PARAMS' is read.
        race_master_df, race_table_result_df, race_past_x_result_df: Source
            frames. Each one falls back to the module-level variable of the
            same name when omitted, which keeps the original one-argument
            call working.

    Returns:
        tuple: ``(feature_matrix_df, table_index_df)`` -- the feature matrix
        with missing values replaced by 0 and its index reset, and the
        matching 'race_id'/'horse_num' frame decoded from the original index.
    """
    # A parameter shadows the global of the same name, so the fallback has to
    # go through globals().
    if race_master_df is None:
        race_master_df = globals()['race_master_df']
    if race_table_result_df is None:
        race_table_result_df = globals()['race_table_result_df']
    if race_past_x_result_df is None:
        race_past_x_result_df = globals()['race_past_x_result_df']

    ft_params = model_params['FEATURETOOLS_PARAMS']

    def _entity_columns(entity_key):
        # Key columns first, then the feature columns of that entity.
        return ft_params['INDEX_COL'][entity_key] + ft_params['FEATURE_COL'][entity_key]

    es = ft.EntitySet(id='netkeiba')
    es.entity_from_dataframe(entity_id='race_master',
                             dataframe=race_master_df[_entity_columns('RACE_MASTER')],
                             index='race_id')
    es.entity_from_dataframe(entity_id='race_table',
                             dataframe=race_table_result_df[_entity_columns('RACE_TABLE_RESULT')],
                             index='race_horse_id')
    es.entity_from_dataframe(entity_id='race_past_x',
                             dataframe=race_past_x_result_df[_entity_columns('RACE_PAST_X_RESULT')],
                             index='race_horse_past_x_id')

    r_master_table = ft.Relationship(es['race_master']['race_id'], es['race_table']['race_id'])
    r_table_past_x = ft.Relationship(es['race_table']['race_horse_id'], es['race_past_x']['race_horse_id'])

    es.add_relationships(relationships=[r_master_table])
    es.add_relationships(relationships=[r_table_past_x])

    feature_matrix_df, _ = ft.dfs(
        entityset=es,
        target_entity='race_table',
        agg_primitives=ft_params['PRIMITIVES']['aggregation'],
        trans_primitives=ft_params['PRIMITIVES']['transform'],
        max_depth=2
    )
    feature_matrix_df = feature_matrix_df.fillna(0)
    # Decode the ids before the index is dropped by reset_index below.
    table_index_df = decode_race_horse_id(feature_matrix_df)
    feature_matrix_df = feature_matrix_df.reset_index(drop=True)
    return feature_matrix_df, table_index_df

In [None]:
# Build the feature matrix and the matching race_id / horse_num index frame.
feature_matrix_df, table_index_df  = engeneer_features_by_featuretools(
    model_params, race_master_df, race_table_result_df, race_past_x_result_df
)

In [None]:
# Sanity check: both frames should have the same number of rows.
print(table_index_df.shape)
print(feature_matrix_df.shape)
feature_matrix_df.head(10)

## Feature Selection by 'boruta'
- 参考URL: https://dev.classmethod.jp/machine-learning/yoshim-featuretools-boruta-optuna/

### Train Test Split

In [None]:
def _split_masks_by_criteria(dataset, criteria):
    """Return ``(is_train, is_test)`` boolean masks for ordered split criteria."""
    is_train = pd.Series(False, index=dataset.index)
    is_tied = pd.Series(True, index=dataset.index)
    has_all_criteria = pd.Series(True, index=dataset.index)
    for col, threshold in criteria.items():
        values = dataset[col]
        # A row is "before" the cut-off when all earlier criteria are equal to
        # their thresholds and this one is strictly below its threshold.
        is_train = is_train | (is_tied & (values < threshold))
        is_tied = is_tied & (values == threshold)
        has_all_criteria = has_all_criteria & values.notna()
    # Rows with a missing criterion that are not already train go to neither split.
    is_test = ~is_train & has_all_criteria
    return is_train, is_test


def make_train_test_data(feature_df, y_df, idx_df, model_params):
    """Split the dataset into train and test parts at a cut-off point.

    ``idx_df``, ``feature_df`` and the target column of ``y_df`` are
    concatenated column-wise. The criteria in
    ``model_params['TRAIN_TEST_SPLIT']['CRITERIA_TO_SPLIT_DATA']`` are compared
    lexicographically in dict order (e.g. year, then month): rows strictly
    before the thresholds are train, rows at or after them are test. The
    previous ``year < Y or month < M`` test sent rows such as (Y+1, M-1) to
    the training set.

    NOTE(review): the three inputs are aligned purely by row position/index;
    confirm that ``feature_df`` and ``y_df`` share the same row order.

    Args:
        feature_df: Feature matrix containing the criteria columns.
        y_df: Frame containing the target column.
        idx_df: Frame containing the index columns.
        model_params: Configuration dict; only 'TRAIN_TEST_SPLIT' is read.

    Returns:
        tuple: ``(Idx_train, X_train, y_train, Idx_test, X_test, y_test)``.
    """
    split_params = model_params['TRAIN_TEST_SPLIT']
    index_cols = split_params['INDEX_COL']
    target_col = split_params['TARGET_COL']
    criteria_to_split_dict = split_params['CRITERIA_TO_SPLIT_DATA']

    dataset = pd.concat([feature_df, y_df[target_col]], axis='columns')
    dataset = pd.concat([idx_df, dataset], axis='columns')
    feature_cols = [col for col in list(dataset.columns) if col not in split_params['EXCLUDE_COL']]

    is_train, is_test = _split_masks_by_criteria(dataset, criteria_to_split_dict)

    # An index column may appear twice after the concat (once from idx_df and
    # once from feature_df); keep only the first occurrence.
    Idx_train = dataset[is_train][index_cols]
    Idx_train = Idx_train.loc[:, ~Idx_train.columns.duplicated()]
    X_train = dataset[is_train][feature_cols]
    y_train = dataset[is_train][target_col]

    Idx_test = dataset[is_test][index_cols]
    Idx_test = Idx_test.loc[:, ~Idx_test.columns.duplicated()]
    X_test = dataset[is_test][feature_cols]
    y_test = dataset[is_test][target_col]

    return Idx_train, X_train, y_train, Idx_test, X_test, y_test

In [None]:
# Split index, feature and target data into train and test parts.
Idx_train, X_train, y_train, Idx_test, X_test, y_test = make_train_test_data(
    feature_matrix_df, race_table_result_df, table_index_df, model_params
)

In [None]:
# Sanity check: row counts should match within the train part ...
print(Idx_train.shape)
print(X_train.shape)
print(y_train.shape)

# ... and within the test part.
print(Idx_test.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Preview the training features.
X_train.head()

### Boruta

In [None]:
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# import matplotlib.pyplot as plt
# import seaborn as sns

In [None]:
def select_features_by_boruta(X_train, y_train, X_test=None):
    """Select features with Boruta wrapped around a random-forest regressor.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training target Series.
        X_test: Optional test feature DataFrame with the same columns as
            ``X_train``. When omitted, the module-level ``X_test`` is used,
            which keeps the original two-argument call working.

    Returns:
        tuple: ``(feature_selected_cols, X_train_selected, X_test_selected)``
        -- the names of the supported columns and both frames restricted to
        those columns.
    """
    # A parameter shadows the global of the same name, so the fallback has to
    # go through globals().
    if X_test is None:
        X_test = globals()['X_test']

    model = RandomForestRegressor(
        n_estimators=50,
        max_depth=10,
        max_features='sqrt',
        n_jobs=-1,
        verbose=True
    )

    features_selector = BorutaPy(
        model,
        n_estimators='auto',
        verbose=2,
        alpha=0.5,  # significance level
        max_iter=30,  # maximum number of iterations
        random_state=1
    )

    features_selector.fit(X_train.values, y_train.values)
    # support_ is used as a positional boolean mask over the columns.
    X_train_selected = X_train.iloc[:, features_selector.support_]
    X_test_selected = X_test.iloc[:, features_selector.support_]
    # ``columns`` is an attribute; calling it raised TypeError.
    feature_selected_cols = list(X_train_selected.columns)

    return feature_selected_cols, X_train_selected, X_test_selected

In [None]:
# Run Boruta feature selection on the training data.
feature_selected_cols, X_train_selected, X_test_selected = select_features_by_boruta(X_train, y_train)

## Supervised Learning by 'h2o'

In [None]:
# Connect to (or start) an H2O instance on localhost with a 2 GB memory limit.
h2o.init(ip="127.0.0.1", max_mem_size_GB = 2)

In [None]:
# feature_cols / target_col only existed as locals inside make_train_test_data,
# so they are derived here from the data the model is trained on.
feature_cols = list(X_train.columns)
target_col = model_params['TRAIN_TEST_SPLIT']['TARGET_COL']

# H2O expects features and target in one frame.
hdf = h2o.H2OFrame(pd.concat([X_train, y_train], axis=1))

aml = H2OAutoML(max_models=2, seed=1, max_runtime_secs=28800)
aml.train(
    x = feature_cols, 
    y = target_col, 
    training_frame = hdf
)

In [None]:
# Show every row of the AutoML leaderboard.
lb = aml.leaderboard
lb.head(rows=lb.nrows) 

In [None]:
# Variable importance of the leading model, as a pandas object.
aml.leader.varimp(use_pandas=True)

In [None]:
# Evaluate the AutoML leader on the test set. scikit-learn metrics take
# (y_true, y_pred); the order matters for R2, which is not symmetric.
y_test_pred = aml.predict(h2o.H2OFrame(X_test)).as_data_frame()['predict']
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('R2: ', r2_score(y_test, y_test_pred))

### とりあえずRFで結果を出力する機構を作成する

In [None]:
# Random-forest regressor with 100 trees of depth <= 10.
# No random_state is passed here, so the fitted model is not seeded.
rf_reg = RandomForestRegressor(
    n_estimators=100, 
    max_depth = 10, 
    max_features = 'sqrt', 
    n_jobs=-1, 
    verbose=1
)

In [None]:
# Fit the random forest on the full (unselected) training features.
rf_reg.fit(X_train, y_train)

In [None]:
# Label each importance with the column the forest was fitted on
# (feature_cols is not defined at notebook scope).
feature_importance = pd.DataFrame(rf_reg.feature_importances_, columns=["importance"], index=list(X_train.columns))
feature_importance.sort_values("importance", ascending=False)

In [None]:
# Evaluate the random forest on the test set. scikit-learn metrics take
# (y_true, y_pred); the order matters for R2, which is not symmetric.
y_test_pred = rf_reg.predict(X_test)
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_test_pred)))
print('R2: ', r2_score(y_test, y_test_pred))

### ここまでがRFによるもの

In [None]:
# Put the test-set index columns next to the predictions, rounded to 3 decimals.
# Idx_test is re-indexed from 0 so the two frames line up row by row.
predited_score_df = pd.concat(
    [Idx_test.reset_index(drop=True), pd.DataFrame(np.round(y_test_pred, 3), columns=['predicted_score'])], axis=1
)

In [None]:
# Preview the predicted scores.
predited_score_df.head()

In [None]:
# Convert the frame to a list of row lists for the bulk insert.
predited_score_list = predited_score_df.values.tolist()

In [None]:
def _bulk_insert(insert_list, target_table_name, insert_col_names):
    """Insert ``insert_list`` into ``target_table_name`` via ``BulkInsert``.

    Uses the module-level connection ``con``.

    Args:
        insert_list: Rows to insert.
        target_table_name: Name of the destination table.
        insert_col_names: Column names matching the values of each row.

    Raises:
        TypeError: When ``BulkInsert`` raises ``RuntimeError``. The exception
            type is unchanged for existing callers, but it now carries the
            original message and is chained to the original error.
    """
    try:
        bi = BulkInsert(con)
        bi.execute(insert_data=insert_list, target_table=target_table_name, col_names=insert_col_names)
    except RuntimeError as e:
        print(e)
        raise TypeError(str(e)) from e

In [None]:
# Write the predicted scores to the 'race_predicted_score' table.
_bulk_insert(predited_score_list, 'race_predicted_score', parameters['TABLE_COL_NAMES']['race_predicted_score'])

#### Profiling to check finally

In [None]:
# import pandas_profiling as pdp
# profile = pdp.ProfileReport(training_race_df)
# profile.to_file(output_file="Model/profile_report.html")
# profile