In [18]:
import os
import sys
import pymysql
import pandas as pd
import joblib
import h2o

from scipy.optimize import minimize
import matplotlib.pyplot as plt

from Config import params_config, db_config, queries_config, model_params_config

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Let wide/long DataFrames render in full (up to 400 columns and 400 rows).
pd.set_option(
    'display.max_columns', 400,
    'display.max_rows', 400,
)

In [7]:
# Connection settings are read from environment variables so that credentials
# are not stored in the notebook. Non-secret settings fall back to the local
# development values; the password has no fallback and must be provided.
# NOTE(review): `db_config` is imported from Config but unused here -- it may be
# the intended home for these settings; confirm.
db_params = {
    'host': os.environ.get('NETKEIBA_DB_HOST', '127.0.0.1'),
    'user': os.environ.get('NETKEIBA_DB_USER', 'root'),
    'password': os.environ['NETKEIBA_DB_PASSWORD'],  # raises KeyError when unset
    'database': os.environ.get('NETKEIBA_DB_NAME', 'dev_netkeiba'),
    'port': int(os.environ.get('NETKEIBA_DB_PORT', '3306')),
    'charset': 'utf8'
}
con = pymysql.connect(**db_params)

# Shortcuts to the project configuration used by the cells below.
parameters = params_config.parameters
model_params = model_params_config.model_params
queries = queries_config.queries

## Get the predicted score data

- Predicted Scoreから閾値を決める
- Pickleファイルと閾値から、将来データのScore及びFlagを計算

In [None]:
def _fetchall_and_make_list_by(query, con):
    try:
        cursor = con.cursor()
        cursor.execute(query)
        fetch_result = cursor.fetchall()
        fetch_result_list = [item for item in fetch_result]
        cursor.close()
        return fetch_result_list
    except Exception as e:
        print(e)

In [None]:
def _get_race_prediction_data_frame(queries, parameters, con):
    """Fetch the prediction/result rows and wrap them in a labelled DataFrame.

    The query stored under ``queries['PREDICTION_SCORE_AND_RESULT_INFO']`` is
    executed on ``con``; the fetched rows are labelled with the fixed column
    names below. ``parameters`` is accepted but not used by this function.
    """
    column_names = [
        'race_id',
        'race_year',
        'race_month',
        'race_date',
        'race_dow',
        'starting_time',
        'race_place',
        'race_round',
        'race_kai',
        'race_title',
        'horse_num',
        'popularity_order',
        'predicted_score',
        'refund_type',
        'refund_yen',
    ]
    fetched_rows = _fetchall_and_make_list_by(queries['PREDICTION_SCORE_AND_RESULT_INFO'], con)
    return pd.DataFrame(fetched_rows, columns=column_names)

In [None]:
# Run the prediction/result query on `con` and keep the rows as a DataFrame.
race_prediction_df = _get_race_prediction_data_frame(queries, parameters, con)

In [None]:
# Sanity check of the loaded data: (rows, columns), then the first rows.
print(race_prediction_df.shape)
race_prediction_df.head()

In [None]:
# Summary statistics of the predicted scores.
race_prediction_df['predicted_score'].describe()

## 閾値シミュレーション: 単勝のみを購入する場合
購入ルール
- 各レースIDにおいて、スコアが閾値以上の馬のうち、最高スコアの馬に100円をBet
- どの馬のスコアも閾値に達しなかったら、当該レースにはBetしない

シミュレーション方法
- 閾値を動かしながら、利益(=Total Refund額 - Total Bet額)が最大化されるように調整
- Refund額は、単勝の払戻金額(`refund_yen`)を使用
- Test期間でどのような利益推移になるのかを可視化

In [None]:
# Number of distinct race_id values in the prediction data.
len(pd.unique(race_prediction_df['race_id']))

In [None]:
def get_profit_in_each_race(race_prediction_df, race_id, refund_type, score_threshold):
    """Simulate one 100-yen bet on the top-scored horse of a single race.

    Parameters
    ----------
    race_prediction_df : pandas.DataFrame
        Must contain the columns ``race_id``, ``horse_num``,
        ``predicted_score``, ``refund_type`` and ``refund_yen``.
    race_id
        Value matched against the ``race_id`` column.
    refund_type
        Value matched against the ``refund_type`` column (e.g. ``'単勝'``).
    score_threshold : float
        A bet is placed only when the highest ``predicted_score`` of the race
        is greater than or equal to this value.

    Returns
    -------
    tuple
        ``(bet_price, refund_price, profit)``:

        * ``(0, 0, 0)`` when no bet is placed (threshold not reached, race id
          not present, or no usable score);
        * ``(100, 0, -100)`` when the bet horse has no row of ``refund_type``;
        * ``(100, refund, refund - 100)`` otherwise.
    """
    bet_unit_price = 100
    no_bet = (0, 0, 0)

    race_rows = race_prediction_df[race_prediction_df['race_id'] == race_id]

    # One row per (horse, score); rows without a score cannot be ranked.
    horse_scores = (
        race_rows[['horse_num', 'predicted_score']]
        .drop_duplicates()
        .dropna(subset=['predicted_score'])
    )
    if horse_scores.empty:
        return no_bet

    # Positional argmax keeps the first row on ties and always yields exactly
    # one row, regardless of the frame's index labels.
    top_position = horse_scores['predicted_score'].astype(float).to_numpy().argmax()
    top_horse = horse_scores.iloc[top_position]
    if not float(top_horse['predicted_score']) >= score_threshold:
        return no_bet

    bet_horse_num = int(top_horse['horse_num'])
    is_bet_horse = race_rows['horse_num'] == bet_horse_num
    is_refund_type = race_rows['refund_type'] == refund_type
    refunds = race_rows.loc[is_bet_horse & is_refund_type, 'refund_yen']
    if refunds.empty:
        return bet_unit_price, 0, -bet_unit_price

    # Several matching rows are treated as duplicates of the same payout, so
    # only the first one is counted.
    refund_price = float(refunds.iloc[0])
    return bet_unit_price, refund_price, refund_price - bet_unit_price

In [None]:
def get_total_profit_price_with(thr, race_prediction_df, refund_type):
    """Add up bet, refund and profit over every race for threshold ``thr``.

    Each distinct ``race_id`` of ``race_prediction_df`` is simulated once with
    ``get_profit_in_each_race``; the three per-race amounts are summed and
    returned as ``(total_bet_price, total_refund_price, total_profit_price)``.
    """
    per_race_results = [
        get_profit_in_each_race(
            race_prediction_df=race_prediction_df,
            race_id=current_race_id,
            refund_type=refund_type,
            score_threshold=thr,
        )
        for current_race_id in pd.unique(race_prediction_df['race_id'])
    ]
    bet_sum = sum(result[0] for result in per_race_results)
    refund_sum = sum(result[1] for result in per_race_results)
    profit_sum = sum(result[2] for result in per_race_results)
    return bet_sum, refund_sum, profit_sum

In [None]:
def optimize_threshold_of_score(race_prediction_df, refund_type, max_iter, thr_list):
    """Search for a score threshold with a high profit rate.

    Every threshold in ``thr_list`` is evaluated first. Then, for at most
    ``max_iter`` steps, the midpoint between the most recently evaluated
    threshold and the threshold with the best profit rate so far is evaluated
    and appended. The search stops early when the total profit changes by less
    than 0.01 between two consecutive evaluations.

    Parameters
    ----------
    race_prediction_df : pandas.DataFrame
        Passed through to ``get_total_profit_price_with``.
    refund_type
        Passed through to ``get_total_profit_price_with``.
    max_iter : int
        Maximum number of refinement steps.
    thr_list : list of float
        Initial thresholds. The caller's list is not modified.

    Returns
    -------
    tuple of lists
        ``(thr_list, total_profit_list, profit_rate_list)``, aligned by
        position. The profit rate is NaN for a threshold at which no bet was
        placed (total bet of 0), instead of raising ZeroDivisionError.
    """
    import math

    def _evaluate(threshold):
        # Returns (total profit, profit rate) for one threshold.
        total_bet_price, _total_refund_price, total_profit_price = get_total_profit_price_with(
            threshold, race_prediction_df, refund_type
        )
        if total_bet_price:
            profit_rate = total_profit_price / total_bet_price
        else:
            profit_rate = float('nan')  # nothing was bet, so the rate is undefined
        return total_profit_price, profit_rate

    def _rate_for_ranking(rate):
        # NaN never wins the "best rate" comparison.
        return -math.inf if math.isnan(rate) else rate

    thr_list = list(thr_list)  # work on a copy; do not mutate the caller's list
    total_profit_list = []
    profit_rate_list = []
    for thr in thr_list:
        total_profit_price, profit_rate = _evaluate(thr)
        total_profit_list.append(total_profit_price)
        profit_rate_list.append(profit_rate)

    for _ in range(max_iter):
        last_thr = thr_list[-1]
        # First position holding the best defined profit rate so far.
        best_index = max(
            range(len(profit_rate_list)),
            key=lambda position: _rate_for_ranking(profit_rate_list[position]),
        )
        thr = (last_thr + thr_list[best_index]) / 2
        total_profit_price, profit_rate = _evaluate(thr)
        total_profit_list.append(total_profit_price)
        profit_rate_list.append(profit_rate)
        thr_list.append(thr)

        if abs(total_profit_list[-1] - total_profit_list[-2]) < 0.01:
            break

    return thr_list, total_profit_list, profit_rate_list

In [None]:
# Threshold search for win ('単勝') bets: start from thresholds 0 and 2 and allow
# at most 10 refinement steps.
thr_list, total_profit_list, profit_rate_list = optimize_threshold_of_score(
            race_prediction_df=race_prediction_df, 
            refund_type='単勝', 
            max_iter=10, 
            thr_list=[0, 2]
)

In [None]:
# Tabulate the search history: one row per evaluated threshold.
opt_profit_result_df = pd.DataFrame([thr_list, total_profit_list, profit_rate_list]).T
opt_profit_result_df.columns = ['thr', 'total_profit', 'profit_rate']
opt_profit_result_df

In [None]:
# Profit rate (y) against threshold (x) for every threshold evaluated by the search.
plt.scatter(thr_list, profit_rate_list)

In [None]:
def get_simulated_refund_result(race_prediction_df, refund_type, thr_list=None):
    """Simulate every race at several thresholds and return the per-race results.

    Parameters
    ----------
    race_prediction_df : pandas.DataFrame
        Passed through to ``get_profit_in_each_race``; its ``race_id`` column
        defines the races that are simulated.
    refund_type
        Passed through to ``get_profit_in_each_race``.
    thr_list : iterable of float, optional
        Thresholds to simulate. Defaults to ``[0, 1.0, 1.5, 2, 2.5]``.

    Returns
    -------
    pandas.DataFrame
        One row per (threshold, race) with the columns ``threshold``,
        ``race_id``, ``bet_price``, ``refund_price`` and ``profit_price``.
    """
    if thr_list is None:
        thr_list = [0, 1.0, 1.5, 2, 2.5]

    # The set of races does not depend on the threshold, so compute it once.
    race_ids = pd.unique(race_prediction_df['race_id'])

    result_list = []
    for thr in thr_list:
        for race_id in race_ids:
            bet_price, refund_price, profit_price = get_profit_in_each_race(
                race_prediction_df=race_prediction_df,
                race_id=race_id,
                refund_type=refund_type,
                score_threshold=thr
            )
            result_list.append([thr, race_id, bet_price, refund_price, profit_price])
    return pd.DataFrame(result_list, columns=['threshold', 'race_id', 'bet_price', 'refund_price', 'profit_price'])

In [None]:
# Per-race simulation of win ('単勝') bets at the function's default thresholds.
simulated_result_df = get_simulated_refund_result(race_prediction_df=race_prediction_df, refund_type='単勝')

In [None]:
# Total bet, refund and profit per threshold. Only the numeric price columns are
# summed: race_id is an identifier, and summing it along with the rest would
# aggregate (concatenate) its values on pandas versions where sum() is not
# numeric-only by default.
simulated_result_df.groupby('threshold')[['bet_price', 'refund_price', 'profit_price']].sum()

In [None]:
# Profit rate per threshold = total profit / total bet. The grouping is done
# once, and only over the two numeric columns that are needed.
totals_by_threshold = simulated_result_df.groupby('threshold')[['profit_price', 'bet_price']].sum()
totals_by_threshold['profit_price'] / totals_by_threshold['bet_price']

In [None]:
# race_id = '201910010705'
# each_race_prediction_df = race_prediction_df[race_prediction_df['race_id']==race_id]
# each_horse_score_df = each_race_prediction_df[['horse_num', 'predicted_score']]
# each_horse_score_df = each_horse_score_df[~each_horse_score_df.duplicated()]
# horse_with_max_score_df = each_horse_score_df[
#     each_horse_score_df['predicted_score'] == max(each_horse_score_df['predicted_score'])
# ]

## 次回レースにおける予想算出

In [19]:
# Path of the serialized model file. `features` and `model_type` are assigned
# here so that this cell runs on a fresh kernel without relying on a later cell
# having been executed first.
features = 'selected'
model_type = 'h2o'
model_params['DIR_NAME_OF_MODEL_PICKLE']+'fitted_'+features+'_features_'+model_type+'_model.pkl'

'Model/pickles_files/191123/fitted_selected_features_h2o_model.pkl'

In [21]:
# Connect to the H2O instance (the recorded output shows an already-running
# local cluster at localhost:54321 being reused).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,14 days 7 hours 17 mins
H2O cluster timezone:,Asia/Tokyo
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,24 days
H2O cluster name:,H2O_from_python_daigomiyoshi_2o3kxx
H2O cluster total nodes:,1
H2O cluster free memory:,898 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [24]:
# predict score by model pickled
features='selected'
model_type='h2o'
h2o.load_model(model_params['DIR_NAME_OF_MODEL_PICKLE']+'fitted_'+features+'_features_'+model_type+'_model.pkl')

H2OResponseError: Server error java.lang.IllegalArgumentException:
  Error: Missing magic number 0x1CED at stream start
  Request: POST /99/Models.bin/
    data: {'dir': 'Model/pickles_files/191123/fitted_selected_features_h2o_model.pkl'}
