In [1]:
import pandas as pd
import numpy as np
import numpy.linalg as npl
import matplotlib.pyplot as plt

In [5]:
# Load the race results; the first CSV column is used as the row index.
raceResultPath = '../data/raceResultTrainTest.csv'
raceResultDf = pd.read_csv(raceResultPath, index_col=0)

In [7]:
# Show the column names of the loaded frame.
raceResultDf.columns

Index(['age', 'chulNo', 'diffUnit', 'hrNo', 'ord', 'rating', 'rcDist',
       'rcTime', 'sex', 'wgBudam', 'wgHr', 'winOdds', 'plcOdds', 'rcIdx',
       'hrRcCnt', 'ord1Rate', 'ord2Rate', 'ord3Rate', 'trackState',
       'deltaWgHr', 'sexNeutral', 'sexMale', 'sexFemale', 'chulNo_1',
       'chulNo_2', 'chulNo_3', 'chulNo_4', 'chulNo_5', 'chulNo_6', 'chulNo_7',
       'chulNo_8', 'chulNo_9', 'chulNo_10', 'chulNo_11', 'chulNo_12',
       'chulNo_13'],
      dtype='object')

In [16]:
def groupFunc(x):
    """Return the mean of the 'age' and 'rating' columns of ``x``.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame that contains at least the columns 'age' and 'rating'.

    Returns
    -------
    pandas.Series
        Two entries, indexed 'ageMean' and 'ratingMean', in that order.
    """
    means = [x['age'].mean(), x['rating'].mean()]
    return pd.Series(means, index=['ageMean', 'ratingMean'])

In [19]:
# Per-rcIdx means of age and rating, indexed by rcIdx with the columns
# 'ageMean' and 'ratingMean'. Named aggregation computes both means in one
# vectorised pass instead of calling a Python function once per group, and it
# does not hand the grouping column to a callback the way groupby.apply does.
rcIdxMean = raceResultDf.groupby('rcIdx').agg(
    ageMean=('age', 'mean'),
    ratingMean=('rating', 'mean'),
)

In [32]:
def demeanAge(x):
    """Return the row's age minus the mean age recorded for its rcIdx.

    ``x`` is a single row exposing 'age' and 'rcIdx'; the mean is read from
    the module-level ``rcIdxMean`` table (column 'ageMean').
    """
    raceKey = x['rcIdx']
    raceMeanAge = rcIdxMean.loc[raceKey, 'ageMean']
    return x['age'] - raceMeanAge

def demeanRating(x):
    """Return the row's rating minus the mean rating recorded for its rcIdx.

    ``x`` is a single row exposing 'rating' and 'rcIdx'; the mean is read from
    the module-level ``rcIdxMean`` table (column 'ratingMean').
    """
    raceKey = x['rcIdx']
    raceMeanRating = rcIdxMean.loc[raceKey, 'ratingMean']
    return x['rating'] - raceMeanRating

In [34]:
# Centre age and rating on the mean of each row's rcIdx group.
# Mapping rcIdx through the per-group means yields the same values as calling
# demeanAge / demeanRating on every row, but in one vectorised step instead of
# one Python call plus one .loc lookup per row.
raceKeys = raceResultDf['rcIdx']
raceResultDf['demeanAge'] = raceResultDf['age'] - raceKeys.map(rcIdxMean['ageMean'])
raceResultDf['demeanRating'] = raceResultDf['rating'] - raceKeys.map(rcIdxMean['ratingMean'])

In [80]:
# Rows whose rcIdx lies above the cutoff become the test set; all rows at or
# below it become the training set.
rcIdxCutoff = 202305000
isTest = raceResultDf['rcIdx'] > rcIdxCutoff
isTrain = raceResultDf['rcIdx'] <= rcIdxCutoff
testDf = raceResultDf[isTest]
trainDf = raceResultDf[isTrain]

# Share of the split rows that landed in the test set, in percent.
nTest, nTrain = len(testDf), len(trainDf)
nTest / (nTest + nTrain) * 100

16.85126582278481

In [81]:
# Columns kept for modelling. 'ord' is retained here because the label is
# derived from it in a later step.
modelCols = ['demeanAge', 'demeanRating', 'sexNeutral', 'sexMale', 'sexFemale',
             'wgBudam', 'wgHr', 'deltaWgHr', 'hrRcCnt', 'trackState', 'ord']

# One shared column list keeps the two frames in sync. The explicit .copy()
# detaches them from raceResultDf so that adding columns later does not raise
# SettingWithCopyWarning.
trainDf = trainDf[modelCols].copy()
testDf = testDf[modelCols].copy()

In [82]:
# Binary target: 1 when ord < 3, otherwise 0.
# NOTE(review): this marks only ord 1 and 2 as positive, while the prediction
# column written later is called 'probPlc' — confirm that "ord < 3" is the
# intended definition.
# The comparison is vectorised (no per-row lambda), and assign() returns a new
# frame instead of writing into a slice of raceResultDf, which avoids
# SettingWithCopyWarning.
trainDf = trainDf.assign(label=(trainDf['ord'] < 3).astype('int64'))
testDf = testDf.assign(label=(testDf['ord'] < 3).astype('int64'))

In [83]:
# Display the training frame, including the derived 'label' column.
trainDf

Unnamed: 0,demeanAge,demeanRating,sexNeutral,sexMale,sexFemale,wgBudam,wgHr,deltaWgHr,hrRcCnt,trackState,ord,label
302,0.0,0.0,0,0,1,54.0,469,-7,3,13,1,1
303,0.0,0.0,0,1,0,56.0,470,-7,5,13,2,1
304,0.0,0.0,0,0,1,54.0,466,4,2,13,3,0
305,0.0,0.0,1,0,0,56.0,467,-2,4,13,4,0
306,0.0,0.0,1,0,0,56.0,511,-21,4,13,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1366,-0.5,8.7,1,0,0,58.0,481,-7,11,3,6,0
1367,0.5,5.7,1,0,0,54.5,447,-5,11,3,7,0
1368,0.5,-3.3,1,0,0,52.0,472,1,6,3,8,0
1369,-1.5,-4.3,1,0,0,51.5,479,-1,9,3,9,0


In [122]:
# Model inputs exclude the target itself and 'ord', the column the target was
# derived from.
nonFeatureCols = ['label', 'ord']

train_input = trainDf.drop(columns=nonFeatureCols)
train_target = trainDf['label']

test_input = testDf.drop(columns=nonFeatureCols)
test_target = testDf['label']

In [116]:
from sklearn.ensemble import RandomForestClassifier

# Depth-limited random forest with a fixed seed; oob_score=True makes the
# out-of-bag score available after fitting, n_jobs=-1 uses all cores.
rf = RandomForestClassifier(
    max_depth=5,
    oob_score=True,
    random_state=42,
    n_jobs=-1,
)

In [117]:
from sklearn.model_selection import cross_validate
# Cross-validate on the training data only, keeping the train-fold scores so
# they can be compared with the validation-fold scores.
scores = cross_validate(
    estimator=rf,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

In [118]:
scores

{'fit_time': array([0.2193799 , 0.19902706, 0.19842625, 0.20664692, 0.19776678]),
 'score_time': array([0.01922584, 0.0194869 , 0.02288055, 0.01851892, 0.0190742 ]),
 'test_score': array([0.8056872, 0.8      , 0.8      , 0.8      , 0.8047619]),
 'train_score': array([0.80952381, 0.8216409 , 0.81331748, 0.8216409 , 0.80499405])}

In [119]:
# Fit on the full training set. `fit` holds whatever rf.fit() returns and is
# the handle used by the following cells.
fit = rf.fit(X=train_input, y=train_target)

In [120]:
print('param importance')
# Pair each importance value with the name of the input column it belongs to.
featureImportance = pd.Series(fit.feature_importances_, index=train_input.columns)
featureImportance

param importance


demeanAge       0.275912
demeanRating    0.199498
sexNeutral      0.013313
sexMale         0.014443
sexFemale       0.008155
wgBudam         0.105327
wgHr            0.137935
deltaWgHr       0.104531
hrRcCnt         0.096109
trackState      0.044775
dtype: float64

In [121]:
# Out-of-bag score of the fitted forest (the forest was created with oob_score=True).
fit.oob_score_

0.796384395813511

In [125]:
# Score of the fitted model on the held-out test rows.
# NOTE(review): the label is 1 only for ord < 3, so negatives presumably
# dominate; compare this figure with an always-predict-0 baseline before
# reading it as evidence of skill.
fit.score(X=test_input, y=test_target)

0.8028169014084507

In [131]:
# Display the test frame, including the derived 'label' column.
testDf

Unnamed: 0,demeanAge,demeanRating,sexNeutral,sexMale,sexFemale,wgBudam,wgHr,deltaWgHr,hrRcCnt,trackState,ord,label
0,0.000,0.0,0,0,1,54.0,437,9,0,6,1,1
1,0.000,0.0,0,0,1,54.0,455,1,1,6,2,1
2,0.000,0.0,0,0,1,54.0,456,12,0,6,3,0
3,0.000,0.0,0,1,0,56.0,482,-8,1,6,4,0
4,0.000,0.0,0,0,1,54.0,465,-5,0,6,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
211,0.375,0.0,0,0,1,54.0,495,8,8,20,4,0
212,0.375,0.0,0,0,1,54.0,503,7,4,20,5,0
213,-0.625,0.0,0,0,1,54.0,464,4,7,20,6,0
214,0.375,0.0,0,0,1,54.0,437,-12,4,20,7,0


In [132]:
# predict_proba orders its columns like fit.classes_, so look up the column of
# label 1 explicitly instead of assuming it is always the second column.
positiveCol = list(fit.classes_).index(1)

# assign() builds a new frame rather than writing into testDf in place; testDf
# originated as a slice of raceResultDf, where in-place column writes raise
# SettingWithCopyWarning.
testDf = testDf.assign(probPlc=fit.predict_proba(test_input)[:, positiveCol])

In [134]:
# Persist the test rows together with their predicted probabilities.
predictionPath = '../data/prediction.csv'
testDf.to_csv(predictionPath)