In [410]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

local_path = '/Users/csizi/Development/marathon_statistics/dataset/csv/bszm_2008_2020/'

bszm = pd.read_csv(local_path + 'bszm_cleaned.csv', encoding = "utf8", 
                    sep=';', header=0)

In [411]:
bszm.drop(['Category/Placement', 'event_year', '1.day/1.length', '1.day/2.length', '1.day/3.length', '1.day/length', '2.day/1.length', '2.day/2.length', '2.day/3.length', '2.day/length',
           '3.day/1.length', '3.day/2.length', '3.day/3.length', '3.day/length', '4.day/1.length', '4.day/2.length', '4.day/3.length', '4.day/length', 'event_length'], axis = 1, inplace=True)

In [412]:
bszm['1/1_tempo_category'] = pd.cut(bszm['1/1_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['1/2_tempo_category'] = pd.cut(bszm['1/2_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['1/3_tempo_category'] = pd.cut(bszm['1/3_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['1_tempo_category'] = pd.cut(bszm['1_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['2/1_tempo_category'] = pd.cut(bszm['2/1_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['2/2_tempo_category'] = pd.cut(bszm['2/2_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['2/3_tempo_category'] = pd.cut(bszm['2/3_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['2_tempo_category'] = pd.cut(bszm['2_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['3/1_tempo_category'] = pd.cut(bszm['3/1_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['3/2_tempo_category'] = pd.cut(bszm['3/2_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['3/3_tempo_category'] = pd.cut(bszm['3/3_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['3_tempo_category'] = pd.cut(bszm['3_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['4/1_tempo_category'] = pd.cut(bszm['4/1_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['4/2_tempo_category'] = pd.cut(bszm['4/2_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['4/3_tempo_category'] = pd.cut(bszm['4/3_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])
bszm['4_tempo_category'] = pd.cut(bszm['4_tempo'], 5, labels=['fast', 'mid fast', 'average', 'mid slow', 'slow'])

In [413]:
bszm.drop(['Name', 'Placement', 'Born', 'Team', 'City', 'Number', '1.day/1.time', '1.day/2.time', '1.day/3.time', '1.day/sum', '2.day/1.time'
           , '2.day/2.time', '2.day/3.time', '2.day/sum', '3.day/1.time', '3.day/2.time', '3.day/3.time', '3.day/sum', '4.day/1.time', '4.day/2.time'
           , '4.day/3.time', '4.day/sum', 'Result', 'Ran km'], axis = 1, inplace=True)

In [414]:
bszm.drop(['1/1_tempo', '1/2_tempo', '1/3_tempo', '1_tempo', '2/1_tempo', '2/2_tempo', '2/3_tempo', '2_tempo'
           ,'3/1_tempo', '3/2_tempo', '3/3_tempo', '3_tempo'
           , '4/1_tempo', '4/2_tempo', '4/3_tempo', '4_tempo'], axis = 1, inplace=True)

In [415]:
bszm.dtypes

Category                       object
Country                        object
Gender                         object
finished                         bool
age                           float64
finisher_result               float64
average_tempo(minutes/km)     float64
tempo_category                 object
club_member                      bool
1_weather_temp                float64
1_weather_rain                float64
1_weather_cloud               float64
1_weather_press               float64
1_weather_wind                float64
1_weather_gust                float64
2_weather_temp                float64
2_weather_rain                float64
2_weather_cloud               float64
2_weather_press               float64
2_weather_wind                float64
2_weather_gust                float64
3_weather_temp                float64
3_weather_rain                float64
3_weather_cloud               float64
3_weather_press               float64
3_weather_wind                float64
3_weather_gu

# Random Forest


In [416]:
bszm.drop(['age', 'finisher_result', 'average_tempo(minutes/km)'], axis=1, inplace=True)

In [417]:
bszm.drop(['1_weather_temp', '2_weather_temp', '3_weather_temp', '4_weather_temp'], axis = 1, inplace=True)
bszm.drop(['1_weather_rain', '2_weather_rain', '3_weather_rain', '4_weather_rain'], axis = 1, inplace=True)
bszm.drop(['1_weather_cloud', '2_weather_cloud', '3_weather_cloud', '4_weather_cloud'], axis = 1, inplace=True)
bszm.drop(['1_weather_press', '2_weather_press', '3_weather_press', '4_weather_press'], axis = 1, inplace=True)
bszm.drop(['1_weather_wind', '2_weather_wind', '3_weather_wind', '4_weather_wind'], axis = 1, inplace=True)
bszm.drop(['1_weather_gust', '2_weather_gust', '3_weather_gust', '4_weather_gust'], axis = 1, inplace=True)

In [418]:
bszm.drop(['1/1_tempo_category', '1/2_tempo_category', '1/3_tempo_category', '1_tempo_category'], axis = 1, inplace=True)
bszm.drop(['2/1_tempo_category', '2/2_tempo_category', '2/3_tempo_category', '2_tempo_category'], axis = 1, inplace=True)
bszm.drop(['3/1_tempo_category', '3/2_tempo_category', '3/3_tempo_category', '3_tempo_category'], axis = 1, inplace=True)
bszm.drop(['4/1_tempo_category', '4/2_tempo_category', '4/3_tempo_category', '4_tempo_category'], axis = 1, inplace=True)

In [419]:
bszm['temperature'] = pd.cut(bszm['temp_avg'], 3, labels=['cold', 'average', 'warm'])
bszm['rain'] = pd.cut(bszm['rain_avg'], 3, labels=['no rain', 'moderate rain', 'rainy'])
bszm['clouds'] = pd.cut(bszm['cloud_avg'], 3, labels=['no clouds', 'moderate cloudness', 'cloudy'])
bszm['pressure'] = pd.cut(bszm['press_avg'], 3, labels=['low pressure', 'moderate pressure', 'high pressure'])
bszm['wind'] = pd.cut(bszm['wind_avg'], 3, labels=['no wind', 'moderate wind', 'windy'])
bszm['gusts'] = pd.cut(bszm['gust_avg'], 3, labels=['weak gusts', 'moderate gusts', 'strong gusts'])


In [420]:
bszm.drop(['temp_avg', 'rain_avg', 'cloud_avg', 'press_avg', 'wind_avg', 'gust_avg'], axis=1, inplace=True)

In [421]:
bszm = bszm.sample(frac = 1)
bszm.columns


Index(['Category', 'Country', 'Gender', 'finished', 'tempo_category',
       'club_member', 'temperature', 'rain', 'clouds', 'pressure', 'wind',
       'gusts'],
      dtype='object')

In [422]:
bszm.shape

(2506, 12)

In [423]:
bszm[bszm['tempo_category'].astype(str) == 'nan'] = np.nan
bszm = bszm.dropna()

In [424]:
bszm

Unnamed: 0,Category,Country,Gender,finished,tempo_category,club_member,temperature,rain,clouds,pressure,wind,gusts
535,M2,HUN,M,True,slow,False,warm,no rain,no clouds,high pressure,no wind,weak gusts
1766,M3,HUN,M,True,fast,False,cold,no rain,moderate cloudness,low pressure,windy,strong gusts
1758,M2,HUN,M,True,fast,True,cold,no rain,moderate cloudness,low pressure,windy,strong gusts
310,M4,HUN,M,True,fast,True,warm,no rain,no clouds,high pressure,no wind,weak gusts
1982,M4,GER,M,True,mid fast,False,warm,no rain,no clouds,high pressure,windy,strong gusts
...,...,...,...,...,...,...,...,...,...,...,...,...
1085,M3,HUN,M,True,slow,False,warm,no rain,no clouds,moderate pressure,moderate wind,moderate gusts
554,M3,HUN,M,True,slow,True,warm,no rain,no clouds,high pressure,no wind,weak gusts
1978,M3,HUN,M,True,mid fast,True,warm,no rain,no clouds,high pressure,windy,strong gusts
623,M2,HUN,M,True,fast,False,cold,no rain,cloudy,low pressure,moderate wind,moderate gusts


In [425]:
bszm['Hungarian'] = np.where(bszm['Country'] == 'HUN', 1, 0)

In [426]:
category = pd.get_dummies(bszm['club_member'], drop_first=True)
tempo_category = pd.get_dummies(bszm['tempo_category'], drop_first=True)
bszm['Hungarian'] = pd.get_dummies(bszm['Hungarian'], drop_first=True)
bszm['Gender'] = pd.get_dummies(bszm['Gender'], drop_first=True)
bszm['finished'] = pd.get_dummies(bszm['finished'], drop_first=True)
bszm['club_member'] = pd.get_dummies(bszm['club_member'], drop_first=True)
temp = pd.get_dummies(bszm['temperature'], drop_first=True)
rain = pd.get_dummies(bszm['rain'], drop_first=True)
clouds = pd.get_dummies(bszm['clouds'], drop_first=True)
pressure = pd.get_dummies(bszm['pressure'], drop_first=True)
wind = pd.get_dummies(bszm['wind'], drop_first=True)
gusts = pd.get_dummies(bszm['gusts'], drop_first=True)

bszm = bszm.join([category, tempo_category, temp, rain, clouds, pressure, wind, gusts])

bszm.drop(['Category', 'Country', 'tempo_category', 'temperature', 'rain', 'wind', 'clouds', 'pressure', 'gusts'], axis = 1, inplace=True)

In [427]:
y = bszm['finished']
bszm.drop(['finished'], axis=1, inplace=True)
X= bszm

In [428]:
y = y.values.reshape(-1,1)

In [429]:
bszm

Unnamed: 0,Gender,club_member,Hungarian,True,fast,mid fast,mid slow,slow,average,warm,moderate rain,rainy,moderate cloudness,cloudy,moderate pressure,high pressure,moderate wind,windy,moderate gusts,strong gusts
535,1,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0
1766,1,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
1758,1,1,1,1,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1
310,1,1,1,1,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1982,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085,1,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,1,0
554,1,1,1,1,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0
1978,1,1,1,1,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,1
623,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0


In [451]:
# Tréning és teszt adatok létrehozása 2/3 és 1/3 vágással
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.90)

In [452]:
clf = DecisionTreeClassifier(criterion="entropy", splitter="best", max_depth=7, min_samples_split=3)
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print("Paraméteres pontosság:",accuracy_score(y_test, pred))

Paraméteres pontosság: 0.8564334085778781
