In [1]:
import numpy as np
import numpy.linalg as npl
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
raceHorseInfoDf = pd.read_csv('data/raceHorseInfo.csv', index_col=0)
raceResultDf = pd.read_csv('data/raceResult_0514.csv', index_col=0)

In [3]:
raceHorseInfoDf.set_index('hrNo', inplace=True)
raceResultDf = raceResultDf[['age','chulNo', 'diffUnit', 'hrNo', 'ord', 'rating', 'rcDate', 'rcDist', 'rcNo', 'rcTime', 'sex', 'track', 'wgBudam', 'wgHr', 'winOdds', 'plcOdds']]

raceResultDf = raceResultDf.assign(rcIdx=lambda x:x['rcDate']*10 + x['rcNo'])
del raceResultDf['rcDate']
del raceResultDf['rcNo']

hrRcCnt = []
ord1Rate = []
ord2Rate = []
ord3Rate = []
for i, row in raceResultDf.iterrows():
    hrNo = row['hrNo']
    rcCnt = int(raceHorseInfoDf.loc[hrNo, ['rcCntY']])
    if rcCnt:
        ord1Rate.append(round(int(raceHorseInfoDf.loc[hrNo, ['ord1CntY']])/rcCnt, 2))
        ord2Rate.append(round(int(raceHorseInfoDf.loc[hrNo, ['ord2CntY']])/rcCnt, 2))
        ord3Rate.append(round(int(raceHorseInfoDf.loc[hrNo, ['ord3CntY']])/rcCnt, 2))
    else:
        ord1Rate.append(0)
        ord2Rate.append(0)
        ord3Rate.append(0)
    hrRcCnt.append(rcCnt)
raceResultDf['hrRcCnt'] = hrRcCnt
raceResultDf['ord1Rate'] = ord1Rate
raceResultDf['ord2Rate'] = ord2Rate
raceResultDf['ord3Rate'] = ord3Rate

In [4]:
def trackProcessing(strings):
    trackStates = []
    for string in strings:
        string = str(string)
        string = string.split('%')
        string = string[0].split('(')
        trackStates.append(int(string[1]))
    return trackStates
raceResultDf = raceResultDf.assign(trackState=lambda x:trackProcessing(x['track']))
del raceResultDf['track']

In [5]:
wgHr = []
deltaWgHr = []
for wgs in raceResultDf['wgHr'].tolist():
    wg = wgs.split('(')
    deltaWg = wg[1].split(')')
    wg = int(wg[0])
    deltaWg = int(deltaWg[0]) if deltaWg[0] else 0
    wgHr.append(wg)
    deltaWgHr.append(deltaWg)
raceResultDf['wgHr'] = wgHr
raceResultDf['deltaWgHr'] = deltaWgHr

raceResultDf['ord'].replace([92,93,94,95,99], pd.NA, inplace=True)
raceResultDf.dropna(axis=0, inplace=True)

In [6]:
one_hot = pd.get_dummies(raceResultDf['sex'], prefix='sex', prefix_sep='_').rename(columns={
    'sex_암': 'sexFemale',
    'sex_수': 'sexMale',
    'sex_거': 'sexNeutral'
})
raceResultDf = pd.concat([raceResultDf, one_hot], axis=1)

In [7]:
diffUnitReplace = {'-':0, '3':3, '2½':2.5, '1':1, '¾':0.75, '1¾':1.75, '11':11, '24':24, 
                   '4':4, '2':2, '목':0.25, '코':1/16, '1¼':1.25, '7':7, '½':0.5, '8':8, 
                   '19':19, '1½':1.5, '머리':1/8, '5':5, '6':6, '16':16, '동순위':0, '9':9,
                   '15':15, '10':10, '70':70, '62':62, '14':14, '28':28, '12':12, '23':23, '25':25, '22':22, '20':20, '37':37, '44':44}
raceResultDf['diffUnit'] = raceResultDf['diffUnit'].replace(diffUnitReplace)

In [8]:
one_hot = pd.get_dummies(raceResultDf['chulNo'], prefix='chulNo')
raceResultDf = pd.concat([raceResultDf, one_hot], axis=1)

In [9]:
raceResultDf['rating'] = raceResultDf['rating'].replace('-', 0)
raceResultDf['rating'] = raceResultDf['rating'].astype(int)

In [10]:
raceResultDf.groupby('rcIdx').max()['ord'].describe()

count    127.000000
mean       9.952756
std        1.194287
min        7.000000
25%        9.000000
50%       10.000000
75%       11.000000
max       13.000000
Name: ord, dtype: float64

In [11]:
print(raceResultDf.columns)
raceResultDf.describe()

Index(['age', 'chulNo', 'diffUnit', 'hrNo', 'ord', 'rating', 'rcDist',
       'rcTime', 'sex', 'wgBudam', 'wgHr', 'winOdds', 'plcOdds', 'rcIdx',
       'hrRcCnt', 'ord1Rate', 'ord2Rate', 'ord3Rate', 'trackState',
       'deltaWgHr', 'sexNeutral', 'sexMale', 'sexFemale', 'chulNo_1',
       'chulNo_2', 'chulNo_3', 'chulNo_4', 'chulNo_5', 'chulNo_6', 'chulNo_7',
       'chulNo_8', 'chulNo_9', 'chulNo_10', 'chulNo_11', 'chulNo_12',
       'chulNo_13'],
      dtype='object')


Unnamed: 0,age,chulNo,diffUnit,hrNo,rating,rcDist,rcTime,wgBudam,wgHr,winOdds,...,chulNo_4,chulNo_5,chulNo_6,chulNo_7,chulNo_8,chulNo_9,chulNo_10,chulNo_11,chulNo_12,chulNo_13
count,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,...,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0,1264.0
mean,4.193829,5.622627,2.21158,49718.31,32.25712,1364.398734,87.845016,53.960839,476.431962,21.859098,...,0.09731,0.099684,0.09731,0.099684,0.099684,0.084652,0.071203,0.050633,0.001582,0.000791
std,1.237291,3.0108,4.129196,85647.42,26.494886,204.343072,14.908078,1.78025,26.668698,23.325325,...,0.296497,0.299696,0.296497,0.299696,0.299696,0.278473,0.257265,0.219334,0.039762,0.028127
min,2.0,1.0,0.0,33142.0,0.0,1200.0,71.6,50.0,401.0,1.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,3.0,0.5,43167.5,0.0,1200.0,76.6,52.0,459.0,6.075,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,6.0,1.25,44529.0,31.0,1300.0,82.9,54.0,476.0,13.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,5.0,8.0,3.0,45766.5,46.0,1400.0,89.8,55.125,494.0,29.125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10.0,13.0,70.0,1412364.0,138.0,2000.0,133.4,59.0,560.0,167.5,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [12]:
raceResultDf.to_csv('data/raceResultTrainTest.csv')

In [13]:
raceResultTrainTest = raceResultDf.drop(['chulNo','diffUnit', 'hrNo', 'sex'], axis=1)

In [14]:
raceResultTrainTest.head()

Unnamed: 0,age,ord,rating,rcDist,rcTime,wgBudam,wgHr,winOdds,plcOdds,rcIdx,...,chulNo_4,chulNo_5,chulNo_6,chulNo_7,chulNo_8,chulNo_9,chulNo_10,chulNo_11,chulNo_12,chulNo_13
0,3,1,0,1200,75.5,54.0,437,1.7,1.1,202305131,...,0,1,0,0,0,0,0,0,0,0
1,3,2,0,1200,76.0,54.0,455,13.6,3.1,202305131,...,0,0,0,0,0,0,0,0,0,0
2,3,3,0,1200,76.4,54.0,456,51.6,4.5,202305131,...,0,0,0,0,0,0,0,0,0,0
3,3,4,0,1200,76.6,56.0,482,7.7,2.1,202305131,...,0,0,0,0,0,0,0,0,0,0
4,3,5,0,1200,76.7,54.0,465,13.3,2.2,202305131,...,0,0,0,0,1,0,0,0,0,0


In [15]:
raceResultDfTestIdx = raceResultTrainTest['rcIdx'] // 10 >= 20230513

In [16]:
raceResultTest = raceResultTrainTest[raceResultDfTestIdx]
raceResultTrain = raceResultTrainTest[-raceResultDfTestIdx]

In [17]:
len(raceResultTrain), len(raceResultTest), len(raceResultDf)

(1163, 101, 1264)

In [18]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(raceResultTrain.drop(['rcIdx','ord','rcTime'], axis=1))

In [47]:
X_Test = raceResultTest.drop(["ord", "rcIdx", "rcTime"], axis=1)
X_scaled_test = scaler.transform(X_Test)
test_y = list(map(lambda x: 1 if x <= 2 else 0,raceResultTest["ord"]))

### Statsmodel

In [61]:
X_train = raceResultTrain.drop(["ord", "rcIdx", "rcTime"], axis=1)
y_train = list(map(lambda x: 1 if x <= 2 else 0, raceResultTrain["ord"]))
X_scaled_train = scaler.transform(X_train)

In [67]:
import statsmodels.api as sm
print(y_train)
logit = sm.Logit(y_train,X_train.iloc[:,:13])
model = logit.fit(method='bfgs')
model.summary()

[1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))
  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


0,1,2,3
Dep. Variable:,y,No. Observations:,1163.0
Model:,Logit,Df Residuals:,1150.0
Method:,MLE,Df Model:,12.0
Date:,"Tue, 16 May 2023",Pseudo R-squ.:,0.2566
Time:,17:53:00,Log-Likelihood:,-434.1
converged:,False,LL-Null:,-583.9
Covariance Type:,nonrobust,LLR p-value:,5.685000000000001e-57

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
age,0.1789,0.123,1.455,0.146,-0.062,0.420
rating,-0.0231,0.006,-3.741,0.000,-0.035,-0.011
rcDist,-0.0003,0.001,-0.555,0.579,-0.001,0.001
wgBudam,-0.0104,0.030,-0.343,0.732,-0.070,0.049
wgHr,-0.0015,0.003,-0.451,0.652,-0.008,0.005
winOdds,0.0020,0.017,0.120,0.905,-0.031,0.035
plcOdds,-0.3693,0.116,-3.177,0.001,-0.597,-0.141
hrRcCnt,0.0430,0.034,1.264,0.206,-0.024,0.110
ord1Rate,6.0068,0.848,7.083,0.000,4.345,7.669


### Sklearn SGDClassifier

In [46]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

sgd = SGDClassifier(loss='log_loss', random_state=42)

In [57]:
groups = raceResultTrain.groupby('rcIdx')
val_f1 = []
test_f1 = []
n = 0
while n < 1000:
    n += 1
    valn_f1 = []
    testn_f1 = []
    for i, (name, group) in enumerate(groups):
        # 데이터셋 분리
        X = group.drop(["ord", "rcIdx", "rcTime"], axis=1)
        y = list(map(lambda x: 1 if x <= 2 else 0, group["ord"]))
        X_scaled = scaler.transform(X)
        sgd.partial_fit(X_scaled, y, classes=[0,1])

        if i % 10 == 0:
            val_data = raceResultTrain[raceResultTrain["rcIdx"] != name]
            val_X = val_data.drop(["ord", "rcIdx", "rcTime"], axis=1)
            val_y = list(map(lambda x: 1 if x <= 2 else 0,val_data["ord"]))
            val_scaled_X = scaler.transform(val_X)
            y_pred = sgd.predict(val_scaled_X)
            acc = accuracy_score(val_y, y_pred)
            recall = recall_score(val_y, y_pred)
            precision = precision_score(val_y, y_pred)
            f1 = f1_score(val_y, y_pred)
            valn_f1.append(f1)
            #print(f"Group {i} accuracy: {acc} recall: {recall} precision: {precision} f1: {f1}")
            y_pred = sgd.predict(X_scaled_test)
            acc = accuracy_score(test_y, y_pred)
            recall = recall_score(test_y, y_pred)
            precision = precision_score(test_y, y_pred)
            f1 = f1_score(test_y, y_pred)
            testn_f1.append(f1)
            #print(f"Test accuracy: {acc} recall: {recall} precision: {precision} f1: {f1}")
    val_f1.append(np.mean(valn_f1))
    test_f1.append(np.mean(testn_f1))
plt.plot(range(len(val_f1)),val_f1, 'g--')
plt.plot(range(len(test_f1)),test_f1, 'r-')

KeyboardInterrupt: 

In [58]:
test_y = list(map(lambda x: 1 if x <= 2 else 0,raceResultTest["ord"]))
#tol = 0.6
#y_pred = list(map(lambda x: 1 if x > tol else 0,list(sgd.predict_proba(X_scaled_test)[:,1])))
y_pred = sgd.predict(X_scaled_test)
acc = accuracy_score(test_y, y_pred)
recall = recall_score(test_y, y_pred)
precision = precision_score(test_y, y_pred)
f1 = f1_score(test_y, y_pred)
print(f"Test accuracy: {acc} recall: {recall} precision: {precision} f1: {f1}")

Test accuracy: 0.8316831683168316 recall: 0.25 precision: 0.7142857142857143 f1: 0.37037037037037035


In [115]:
raceResultTest['predict'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raceResultTest['predict'] = y_pred


In [116]:
raceResultTest

Unnamed: 0,age,ord,rating,rcDist,rcTime,wgBudam,wgHr,winOdds,plcOdds,rcIdx,...,chulNo_5,chulNo_6,chulNo_7,chulNo_8,chulNo_9,chulNo_10,chulNo_11,chulNo_12,chulNo_13,predict
0,3,1,0,1200,75.5,54.0,437,1.7,1.1,202305131,...,1,0,0,0,0,0,0,0,0,0
1,3,2,0,1200,76.0,54.0,455,13.6,3.1,202305131,...,0,0,0,0,0,0,0,0,0,0
2,3,3,0,1200,76.4,54.0,456,51.6,4.5,202305131,...,0,0,0,0,0,0,0,0,0,0
3,3,4,0,1200,76.6,56.0,482,7.7,2.1,202305131,...,0,0,0,0,0,0,0,0,0,0
4,3,5,0,1200,76.7,54.0,465,13.3,2.2,202305131,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,4,6,48,1200,75.0,55.0,449,4.0,1.6,202305140,...,0,0,0,0,0,0,0,0,0,0
98,5,7,36,1200,75.1,52.0,488,50.3,9.4,202305140,...,0,0,1,0,0,0,0,0,0,0
99,5,8,36,1200,75.4,52.0,470,27.8,3.9,202305140,...,0,0,0,1,0,0,0,0,0,0
100,3,9,36,1200,75.8,52.0,420,23.4,5.4,202305140,...,0,1,0,0,0,0,0,0,0,0


In [117]:
raceResultTest.to_csv('data/predict.csv')