In [1]:
import os
import sys
import pymysql
import pandas as pd
import re
import datetime
import category_encoders
import joblib

from Config import params_config, query_config, db_config
from Utils.bulk_insert import BulkInsert

import warnings
warnings.filterwarnings('ignore')

## fit_race_info_into_model.py

In [2]:
# Pull the SQL texts, the parameter dictionary and the DB settings from the project Config modules.
queries = query_config.queries
parameters = params_config.parameters
db_params = db_config.db_params
# Open the MySQL connection that is passed to the query helper further down.
# NOTE(review): no matching con.close() is visible in this part of the notebook — confirm it is closed later.
con = pymysql.connect(**db_params)

In [3]:
def fetchall_and_make_list_by(query, con):
    """Run ``query`` on ``con`` and return every fetched row as a list.

    Args:
        query: SQL text handed unchanged to ``cursor.execute``.
        con: Open DB-API style connection exposing ``cursor()``.

    Returns:
        A list of the rows returned by ``cursor.fetchall()``, or ``None``
        when creating the cursor, executing or fetching raised. In that
        case the exception is printed, not re-raised.
    """
    cursor = None
    try:
        cursor = con.cursor()
        cursor.execute(query)
        return list(cursor.fetchall())
    except Exception as e:
        # Best-effort contract: report the problem and signal it with None.
        print(e)
        return None
    finally:
        # Release the cursor on every path, including failed execute/fetchall.
        if cursor is not None:
            try:
                cursor.close()
            except Exception as e:
                print(e)

def get_race_prior_info_list_for_training(parameters, con):
    """Fetch the rows produced by the ``RACE_PRIOR_INFO_FOR_TRAINING`` query.

    ``parameters`` is accepted but not read here; the SQL text is looked up
    in the module-level ``queries`` mapping and executed on ``con``.
    """
    return fetchall_and_make_list_by(queries['RACE_PRIOR_INFO_FOR_TRAINING'], con)

In [4]:
# Load the training rows from the database (the query helper returns None if it hit an error).
race_prior_info_list_trained = get_race_prior_info_list_for_training(parameters, con)

In [5]:
# Wrap the fetched rows in a DataFrame, naming the columns from the parameter config.
race_prior_info_df_trained = pd.DataFrame(race_prior_info_list_trained, 
                                          columns=parameters['DATAFRAME_COL_NAMES']['race_prior_info_for_training'])

In [6]:
# Sanity check: (rows, columns) of the loaded frame.
race_prior_info_df_trained.shape

(177695, 26)

## Class: Preprocessing

In [7]:
def _get_year_month_day_from_race_timing(x):
    date_str = re.match('([0-9]+)/([0-9]+)/([0-9]+)' , x).group()
    year = datetime.datetime.strptime(date_str, '%Y/%m/%d').year
    month = datetime.datetime.strptime(date_str, '%Y/%m/%d').month
    day = datetime.datetime.strptime(date_str, '%Y/%m/%d').day
    return pd.Series([year, month, day])

def _get_dow_from_race_timing(x):
    return re.search("土|日" , x).group()

def _get_time_in_the_racecourse_from_race_timing(x):
    return int(re.split('([0-9]+)回([ぁ-んァ-ン 一-龥]+)([0-9]+)日目' , x)[1])

def _get_racecourse_from_race_timing(x):
    return re.split('([0-9]+)回([ぁ-んァ-ン 一-龥]+)([0-9]+)日目' , x)[2]

def _get_what_day_in_the_racecourse_from_race_timing(x):
    return int(re.split('([0-9]+)回([ぁ-んァ-ン 一-龥]+)([0-9]+)日目' , x)[3])

def _encode_race_course(df):
    race_course_mapping = {'函館': 1, '札幌': 2, '福島': 3, '東京': 4, '中山': 5, '新潟': 6, '中京': 7, '阪神': 8, '京都': 9, '小倉': 10}
    return df['race_course'].map(race_course_mapping)

In [8]:
def preprocess_race_timing(df):
    """Derive date, weekday and meeting columns from ``df['race_timing']``.

    Adds ``year``, ``month``, ``day``, ``dow``, ``race_course``,
    ``race_course_encoded``, ``time_in_racecourse`` and
    ``what_day_in_racecourse`` to ``df`` in place and returns the same frame.
    """
    timing = df['race_timing']

    df[['year', 'month', 'day']] = timing.apply(_get_year_month_day_from_race_timing)
    df['dow'] = timing.apply(_get_dow_from_race_timing)
    df['race_course'] = timing.apply(_get_racecourse_from_race_timing)
    # Must come after the 'race_course' assignment: the encoder reads that column.
    df['race_course_encoded'] = _encode_race_course(df)

    trailing_columns = (
        ('time_in_racecourse', _get_time_in_the_racecourse_from_race_timing),
        ('what_day_in_racecourse', _get_what_day_in_the_racecourse_from_race_timing),
    )
    for column, extractor in trailing_columns:
        df[column] = timing.apply(extractor)
    return df

In [9]:
# Add the date / weekday / meeting columns derived from 'race_timing' (the frame is modified in place).
race_prior_info_df_trained = preprocess_race_timing(race_prior_info_df_trained)

In [10]:
def encode_race_weather(df):
    """Map ``df['race_weather']`` labels to integer codes 1-6.

    Labels missing from the mapping come back as NaN (``Series.map`` behaviour).
    """
    weather_codes = {
        '晴': 1,
        '曇': 2,
        '小雨': 3,
        '雨': 4,
        '小雪': 5,
        '雪': 6,
    }
    weather = df['race_weather']
    return weather.map(weather_codes)

In [11]:
# Store the integer weather code next to the original 'race_weather' label.
race_prior_info_df_trained['race_weather_encoded'] = encode_race_weather(race_prior_info_df_trained)

In [12]:
def encode_race_condition(df):
    """Map ``df['race_condition']`` labels to integer codes 1-4.

    Labels missing from the mapping come back as NaN (``Series.map`` behaviour).
    """
    condition_codes = {
        '良': 1,
        '稍': 2,
        '重': 3,
        '不': 4,
    }
    condition = df['race_condition']
    return condition.map(condition_codes)

In [13]:
# Store the integer track-condition code next to the original 'race_condition' label.
race_prior_info_df_trained['race_condition_encoded'] = encode_race_condition(race_prior_info_df_trained)

In [17]:
def encode_fit_and_transform_href_to_the_horse(df):
    """Fit a category encoder on ``href_to_the_horse``, persist it, and append the encoded column.

    The encoder class is chosen by
    ``parameters['HYPER_PARAMETERS']['CATEGORY_ENCODERS_FOR_HORSE']``
    (``'TargetEncoder'`` or ``'OrdinalEncoder'``). It is fitted on ``df``
    against the configured target column and dumped with joblib to
    ``parameters['FILE_NAME_OF_HORSE_CATEGORY_ENCODERS']``.

    Args:
        df: DataFrame holding ``href_to_the_horse`` and the target column.

    Returns:
        A new DataFrame: ``df`` plus an ``href_to_the_horse_encoded`` column.

    Raises:
        ValueError: when the configured encoder name is not supported.
    """
    hyper_parameters = parameters['HYPER_PARAMETERS']
    encoder_name = hyper_parameters['CATEGORY_ENCODERS_FOR_HORSE']
    encoder_classes = {
        'TargetEncoder': category_encoders.TargetEncoder,
        'OrdinalEncoder': category_encoders.OrdinalEncoder,
    }
    if encoder_name not in encoder_classes:
        # Fail with a clear message instead of an unbound local further down.
        raise ValueError(
            "Unsupported CATEGORY_ENCODERS_FOR_HORSE: {!r}".format(encoder_name))

    # handle_unknown is an encoder constructor option in category_encoders;
    # handed to fit() it does not configure the encoder.
    # NOTE(review): confirm the configured value is valid for the installed version.
    ce = encoder_classes[encoder_name](
        cols=['href_to_the_horse'],
        handle_unknown=hyper_parameters['CATEGORY_ENCODERS_HANDLE_UNKNOWN'])
    ce.fit(df, df[parameters['DATAFRAME_COL_NAMES']['target_col']])
    joblib.dump(ce, parameters['FILE_NAME_OF_HORSE_CATEGORY_ENCODERS'])

    df_ce = ce.transform(df)
    df_ce = df_ce.rename(columns={'href_to_the_horse': 'href_to_the_horse_encoded'})
    return pd.concat([df, df_ce['href_to_the_horse_encoded']], axis=1)

In [18]:
# ce_loaded = joblib.load(parameters['FILE_NAME_OF_HORSE_CATEGORY_ENCODERS'])
# ce_loaded

In [19]:
# Fit and save the horse encoder, then append 'href_to_the_horse_encoded'.
race_prior_info_df_trained = encode_fit_and_transform_href_to_the_horse(race_prior_info_df_trained)

In [20]:
def _get_horse_age_and_sex(x):
    horse_sex = re.split('([ぁ-んァ-ン 一-龥]+)([0-9]+)' , x)[1]
    horse_age = int(re.split('([ぁ-んァ-ン 一-龥]+)([0-9]+)' , x)[2])
    return pd.Series([horse_sex, horse_age])

def  _encode_horse_sex(df):
    horse_sex_mapping = {'牡': 1, '牝': 2, 'セ': 3}
    return df['horse_sex'].map(horse_sex_mapping)

def preprocess_horse_sex_age(df):
    """Split ``df['horse_sex_age']`` into ``horse_sex`` / ``horse_age`` and add ``horse_sex_encoded``.

    The columns are written to ``df`` in place; the same frame is returned.
    """
    sex_and_age = df['horse_sex_age'].apply(_get_horse_age_and_sex)
    df[['horse_sex', 'horse_age']] = sex_and_age
    # The encoder reads the 'horse_sex' column written just above.
    df['horse_sex_encoded'] = _encode_horse_sex(df)
    return df

In [21]:
# Add 'horse_sex', 'horse_age' and 'horse_sex_encoded' (the frame is modified in place).
race_prior_info_df_trained = preprocess_horse_sex_age(race_prior_info_df_trained)

In [22]:
def _parse_horse_weight_increment(x):
    return int(x.replace('＋', '+').replace('－', '-').replace('---', '0'))

def _get_horse_weight_info(x):
    """Split a ``weight(increment)`` string into ``pd.Series([weight, increment])``.

    The text before the first parenthesis is parsed as an int weight; the
    text inside the parentheses is converted by
    ``_parse_horse_weight_increment``.
    """
    # Raw string: the escaped parentheses are invalid escape sequences in a
    # normal literal and trigger a warning on recent Python versions.
    # re.split keeps the captured groups: [weight, '(', increment, ')', rest].
    pieces = re.split(r'(\()(.*)(\))', x)
    horse_weight = int(pieces[0])
    horse_weight_increment = _parse_horse_weight_increment(pieces[2])
    return pd.Series([horse_weight, horse_weight_increment])

def preprocess_horse_weight_and_increment(df):
    """Split ``df['horse_weight_and_increment']`` into ``horse_weight`` and ``horse_weight_increment``.

    The columns are written to ``df`` in place; the same frame is returned.
    """
    weight_info = df['horse_weight_and_increment'].apply(_get_horse_weight_info)
    df[['horse_weight', 'horse_weight_increment']] = weight_info
    return df

In [23]:
# Add 'horse_weight' and 'horse_weight_increment' (the frame is modified in place).
race_prior_info_df_trained = preprocess_horse_weight_and_increment(race_prior_info_df_trained)

In [24]:
def _get_and_encode_weight_loss_flg(x):
    try:
        weight_loss_flg = re.search('▲|△|☆' , x).group()
        weight_loss_encode = int(weight_loss_flg.replace('▲', '3').replace('△', '2').replace('☆', '1'))
    except AttributeError:
        weight_loss_encode = 0
    return weight_loss_encode

def preprocess_jockey_name(df):
    """Add ``weight_loss_encode``, computed from the marker in ``df['jockey_name']``.

    The column is written to ``df`` in place; the same frame is returned.
    """
    jockey_names = df['jockey_name']
    df['weight_loss_encode'] = jockey_names.apply(_get_and_encode_weight_loss_flg)
    return df

In [25]:
# Add 'weight_loss_encode' from the marker in the jockey name (the frame is modified in place).
race_prior_info_df_trained = preprocess_jockey_name(race_prior_info_df_trained)

In [26]:
def encode_fit_and_transform_href_to_the_jockey(df):
    """Fit a category encoder on ``href_to_the_jockey``, persist it, and append the encoded column.

    The encoder class is chosen by
    ``parameters['HYPER_PARAMETERS']['CATEGORY_ENCODERS_FOR_JOCKEY']``
    (``'TargetEncoder'`` or ``'OrdinalEncoder'``). It is fitted on ``df``
    against the configured target column and dumped with joblib to
    ``parameters['FILE_NAME_OF_JOCKEY_CATEGORY_ENCODERS']``.

    Args:
        df: DataFrame holding ``href_to_the_jockey`` and the target column.

    Returns:
        A new DataFrame: ``df`` plus an ``href_to_the_jockey_encoded`` column.

    Raises:
        ValueError: when the configured encoder name is not supported.
    """
    hyper_parameters = parameters['HYPER_PARAMETERS']
    encoder_name = hyper_parameters['CATEGORY_ENCODERS_FOR_JOCKEY']
    encoder_classes = {
        'TargetEncoder': category_encoders.TargetEncoder,
        'OrdinalEncoder': category_encoders.OrdinalEncoder,
    }
    if encoder_name not in encoder_classes:
        # Fail with a clear message instead of an unbound local further down.
        raise ValueError(
            "Unsupported CATEGORY_ENCODERS_FOR_JOCKEY: {!r}".format(encoder_name))

    # handle_unknown is an encoder constructor option in category_encoders;
    # handed to fit() it does not configure the encoder.
    # NOTE(review): confirm the configured value is valid for the installed version.
    ce = encoder_classes[encoder_name](
        cols=['href_to_the_jockey'],
        handle_unknown=hyper_parameters['CATEGORY_ENCODERS_HANDLE_UNKNOWN'])
    ce.fit(df, df[parameters['DATAFRAME_COL_NAMES']['target_col']])
    joblib.dump(ce, parameters['FILE_NAME_OF_JOCKEY_CATEGORY_ENCODERS'])

    df_ce = ce.transform(df)
    df_ce = df_ce.rename(columns={'href_to_the_jockey': 'href_to_the_jockey_encoded'})
    return pd.concat([df, df_ce['href_to_the_jockey_encoded']], axis=1)

In [27]:
# ce_loaded = joblib.load(parameters['FILE_NAME_OF_JOCKEY_CATEGORY_ENCODERS'])
# ce_loaded

In [28]:
# Fit and save the jockey encoder, then append 'href_to_the_jockey_encoded'.
race_prior_info_df_trained = encode_fit_and_transform_href_to_the_jockey(race_prior_info_df_trained)

In [29]:
def _get_trainer_belonging(x):
    return re.split('\[(.*)\]' , x)[1]

def _encode_trainer_belonging(df):
    trainer_belonging_mapping = {'美': 1, '栗': 2, '招': 3}
    return df['trainer_belonging'].map(trainer_belonging_mapping)

def preprocess_trainer_name(df):
    """Add ``trainer_belonging`` (bracketed part of ``trainer_name``) and its integer code.

    The columns are written to ``df`` in place; the same frame is returned.
    """
    trainer_names = df['trainer_name']
    df['trainer_belonging'] = trainer_names.apply(_get_trainer_belonging)
    # The encoder reads the 'trainer_belonging' column written just above.
    df['trainer_belonging_encoded'] = _encode_trainer_belonging(df)
    return df

In [30]:
# Add 'trainer_belonging' and 'trainer_belonging_encoded' (the frame is modified in place).
race_prior_info_df_trained = preprocess_trainer_name(race_prior_info_df_trained)

In [31]:
def encode_fit_and_transform_href_to_the_trainer(df):
    """Fit a category encoder on ``href_to_the_trainer``, persist it, and append the encoded column.

    The encoder class is chosen by
    ``parameters['HYPER_PARAMETERS']['CATEGORY_ENCODERS_FOR_TRAINER']``
    (``'TargetEncoder'`` or ``'OrdinalEncoder'``). It is fitted on ``df``
    against the configured target column and dumped with joblib to
    ``parameters['FILE_NAME_OF_TRAINER_CATEGORY_ENCODERS']``.

    Args:
        df: DataFrame holding ``href_to_the_trainer`` and the target column.

    Returns:
        A new DataFrame: ``df`` plus an ``href_to_the_trainer_encoded`` column.

    Raises:
        ValueError: when the configured encoder name is not supported.
    """
    hyper_parameters = parameters['HYPER_PARAMETERS']
    encoder_name = hyper_parameters['CATEGORY_ENCODERS_FOR_TRAINER']
    encoder_classes = {
        'TargetEncoder': category_encoders.TargetEncoder,
        'OrdinalEncoder': category_encoders.OrdinalEncoder,
    }
    if encoder_name not in encoder_classes:
        # Fail with a clear message instead of an unbound local further down.
        raise ValueError(
            "Unsupported CATEGORY_ENCODERS_FOR_TRAINER: {!r}".format(encoder_name))

    # handle_unknown is an encoder constructor option in category_encoders;
    # handed to fit() it does not configure the encoder.
    # NOTE(review): confirm the configured value is valid for the installed version.
    ce = encoder_classes[encoder_name](
        cols=['href_to_the_trainer'],
        handle_unknown=hyper_parameters['CATEGORY_ENCODERS_HANDLE_UNKNOWN'])
    ce.fit(df, df[parameters['DATAFRAME_COL_NAMES']['target_col']])
    joblib.dump(ce, parameters['FILE_NAME_OF_TRAINER_CATEGORY_ENCODERS'])

    df_ce = ce.transform(df)
    df_ce = df_ce.rename(columns={'href_to_the_trainer': 'href_to_the_trainer_encoded'})
    return pd.concat([df, df_ce['href_to_the_trainer_encoded']], axis=1)

In [32]:
# Fit and save the trainer encoder, then append 'href_to_the_trainer_encoded'.
race_prior_info_df_trained = encode_fit_and_transform_href_to_the_trainer(race_prior_info_df_trained)

In [34]:
# Preview the first rows with all derived / encoded columns appended.
race_prior_info_df_trained.head()

Unnamed: 0,race_id,race_timing,race_title,race_weather,race_condition,course_syokin_list,post_position,horse_number,href_to_the_horse,horse_sex_age,...,horse_sex,horse_age,horse_sex_encoded,horse_weight,horse_weight_increment,weight_loss_encode,href_to_the_jockey_encoded,trainer_belonging,trainer_belonging_encoded,href_to_the_trainer_encoded
0,201510100501,2015/10/10(土) 4回東京1日目,サラ系2歳未勝利,曇,良,サラ系2歳未勝利 牝 [指] 馬齢 ダ1400m 16頭 11:10発走 本賞金 500万 ...,1,1,https://www.keibalab.jp/db/horse/2013101018/,牝2,...,牝,2,2,494,0,3,9.365854,美,1,9.477301
1,201510100501,2015/10/10(土) 4回東京1日目,サラ系2歳未勝利,曇,良,サラ系2歳未勝利 牝 [指] 馬齢 ダ1400m 16頭 11:10発走 本賞金 500万 ...,1,2,https://www.keibalab.jp/db/horse/2013104095/,牝2,...,牝,2,2,442,-8,0,7.139848,美,1,8.058151
2,201510100501,2015/10/10(土) 4回東京1日目,サラ系2歳未勝利,曇,良,サラ系2歳未勝利 牝 [指] 馬齢 ダ1400m 16頭 11:10発走 本賞金 500万 ...,2,3,https://www.keibalab.jp/db/horse/2013102296/,牝2,...,牝,2,2,492,2,0,7.397448,美,1,7.429495
3,201510100501,2015/10/10(土) 4回東京1日目,サラ系2歳未勝利,曇,良,サラ系2歳未勝利 牝 [指] 馬齢 ダ1400m 16頭 11:10発走 本賞金 500万 ...,2,4,https://www.keibalab.jp/db/horse/2013105492/,牝2,...,牝,2,2,434,-8,0,8.4662,美,1,7.913892
4,201510100501,2015/10/10(土) 4回東京1日目,サラ系2歳未勝利,曇,良,サラ系2歳未勝利 牝 [指] 馬齢 ダ1400m 16頭 11:10発走 本賞金 500万 ...,3,5,https://www.keibalab.jp/db/horse/2013102168/,牝2,...,牝,2,2,434,-6,0,8.27238,美,1,7.556059


In [43]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [37]:
class Net(nn.Module):
    """Two-layer scorer: ``Linear(D, 10)`` -> sigmoid -> ``Linear(10, 1)``."""

    def __init__(self, D):
        """Create the two linear layers for inputs with ``D`` features."""
        super().__init__()
        self.l1 = nn.Linear(D, 10)
        self.l2 = nn.Linear(10, 1)

    def forward(self, x):
        """Return one score per input row (last dimension of size 1)."""
        hidden = torch.sigmoid(self.l1(x))
        return self.l2(hidden)

In [39]:
def listnet_loss(y_i, z_i):
    """Cross entropy between the softmax distributions of two score lists.

    Both arguments are soft-maxed along dim 0, so each should hold the
    scores of one list.

    Args:
        y_i: (n_i, 1) scores defining the first distribution.
        z_i: (n_i, 1) scores defining the second distribution.

    Returns:
        Scalar tensor ``-sum(softmax(y_i) * log_softmax(z_i))``.
    """
    P_y_i = F.softmax(y_i, dim=0)
    # log_softmax stays finite where log(softmax(z)) underflows to -inf
    # for widely separated scores.
    log_P_z_i = F.log_softmax(z_i, dim=0)
    return - torch.sum(P_y_i * log_P_z_i)

def make_dataset(N_train, N_valid, D, seed=None):
    """Generate a synthetic ranking dataset with five relevance grades.

    Features are standard-normal. A random linear scorer ``ws`` produces
    scores that are binned at ``[-2, -1, 0, 1]`` into grades 0-4.

    Args:
        N_train: Number of training rows.
        N_valid: Number of validation rows.
        D: Number of features per row.
        seed: Optional int. When given, a private ``torch.Generator`` seeded
            with it is used, so repeated calls return identical data. When
            ``None`` (default) the global torch RNG is used.

    Returns:
        ``(X_train, X_valid, ys_train_rel, ys_valid_rel)``: float feature
        tensors of shape (N, D) created with ``requires_grad=True``, and
        relevance tensors of shape (N, 1).
    """
    generator = None
    if seed is not None:
        generator = torch.Generator()
        generator.manual_seed(seed)

    # Draw order (ws, X_train, X_valid) is part of the behaviour: it fixes
    # which random numbers each tensor receives.
    ws = torch.randn(D, 1, generator=generator)

    X_train = torch.randn(N_train, D, generator=generator, requires_grad=True)
    X_valid = torch.randn(N_valid, D, generator=generator, requires_grad=True)

    ys_train_score = torch.mm(X_train, ws)
    ys_valid_score = torch.mm(X_valid, ws)

    bins = [-2, -1, 0, 1]  # 5 relevances
    # detach() is required: the scores carry grad history from the features.
    ys_train_rel = torch.Tensor(
        np.digitize(ys_train_score.clone().detach().numpy(), bins=bins)
    )
    ys_valid_rel = torch.Tensor(
        np.digitize(ys_valid_score.clone().detach().numpy(), bins=bins)
    )

    return X_train, X_valid, ys_train_rel, ys_valid_rel


def swapped_pairs(ys_pred, ys_target):
    """Count item pairs that the predictions order opposite to the targets.

    A pair (i, j) with i < j counts when the targets are strictly ordered
    one way and the predictions strictly the other way. Ties in either
    sequence never count.
    """
    n_items = ys_target.shape[0]
    count = 0
    for first in range(n_items - 1):
        for second in range(first + 1, n_items):
            if ys_target[first] < ys_target[second]:
                discordant = ys_pred[first] > ys_pred[second]
            elif ys_target[first] > ys_target[second]:
                discordant = ys_pred[first] < ys_pred[second]
            else:
                # Equal targets: the pair carries no ordering information.
                continue
            if discordant:
                count += 1
    return count


def ndcg(ys_true, ys_pred):
    """Normalised DCG of ``ys_pred`` against the relevances ``ys_true``.

    DCG uses gain ``2 ** rel - 1`` and discount ``log2(1 + rank)`` with ranks
    starting at 1. The result is the DCG of the predicted ordering divided
    by the DCG of the ordering induced by ``ys_true`` itself.
    """
    def dcg(relevance, scores):
        # Rank items by descending score and read the relevances in that order.
        ranking = torch.sort(scores, descending=True, dim=0)[1]
        ranked_relevance = relevance[ranking]
        total = 0
        for rank, rel in enumerate(ranked_relevance, 1):
            gain = 2 ** rel - 1
            total += gain / np.log2(1 + rank)
        return total

    best_possible = dcg(ys_true, ys_true)
    achieved = dcg(ys_true, ys_pred)
    return achieved / best_possible


In [44]:
# Settings for the synthetic ranking experiment: sample counts and feature dimension.
N_train = 500
N_valid = 100
D = 50
# NOTE(review): epochs and batch_size are not used in the cells visible here — presumably consumed by a later training loop.
epochs = 10
batch_size = 16

# Random synthetic data; make_dataset draws from the torch RNG, so values differ between runs unless it is seeded.
X_train, X_valid, ys_train, ys_valid = make_dataset(N_train, N_valid, D)

In [47]:
# Display the generated training features.
X_train

tensor([[-0.4171, -1.8220, -0.5783,  ...,  1.5002, -0.4274, -1.1665],
        [ 1.7215, -1.9836,  0.5270,  ...,  0.6701, -0.9603,  0.3600],
        [ 0.0219, -1.3083,  0.8111,  ...,  0.6885, -0.5924, -0.1169],
        ...,
        [-0.2086, -0.0378,  1.2788,  ...,  0.3110, -0.2704, -1.0314],
        [-0.5935, -0.6379,  1.3678,  ..., -0.3148, -0.7610,  0.2626],
        [ 0.7759,  0.8268,  1.5696,  ...,  2.0907,  1.7582, -0.2129]],
       requires_grad=True)