In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn.metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score


from catboost import Pool, CatBoostClassifier

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Global seaborn theme: dark-grid style with fonts scaled by 1.5.
sns.set(style='darkgrid', font_scale=1.5)

### Подготовка данных

In [2]:
# Load the two input tables, each indexed by its `_id` column:
#   data   - signal metrics (x/y/z statistics, energies, IQRs, correlations)
#   person - height, mass, position, is_valid, age, sex
data, person = (
    pd.read_csv(csv_path, index_col='_id')
    for csv_path in ('metrics_data.csv', '../person.csv')
)

In [3]:
# Preview the first five rows of the metrics table.
data.head(n=5)

Unnamed: 0_level_0,x_mean,x_std,x_max,x_min,y_mean,y_std,y_max,y_min,z_mean,z_std,...,sma,x_energy,y_energy,z_energy,x_iqr,y_iqr,z_iqr,xy_corr,xz_corr,yz_corr
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,13110.861519,1489.074033,17584.0,628.0,-231.04531,1249.368959,3960.0,-5076.0,-9781.608168,1204.070214,...,16514.114425,174110600.0,1613309.0,97128720.0,684.0,1788.0,808.0,0.389405,0.895658,0.367969
2,10398.993311,636.795316,11924.0,8728.0,1220.230769,712.538755,3100.0,-1344.0,-12743.73913,516.853783,...,16528.702152,108544200.0,1996250.0,162669800.0,1136.0,735.0,866.0,-0.186981,0.95049,-0.062792
3,12969.669749,807.711275,15236.0,10760.0,-36.354029,1327.7851,4640.0,-5000.0,-10092.491413,1051.603116,...,16538.716131,168863900.0,1762006.0,102962800.0,1164.0,964.0,1368.0,-0.023526,0.921424,-0.023458
6,12765.113725,3289.505788,29072.0,6396.0,815.027451,2927.223542,12488.0,-5844.0,-10411.811765,3428.969363,...,16912.948421,173726500.0,9199305.0,120117500.0,3768.0,3326.0,4846.0,0.078659,-0.394979,-0.021597
7,15146.131868,160.385778,16424.0,12984.0,-1865.611722,122.331084,-1588.0,-3500.0,-6243.567766,290.745677,...,16491.939761,229431000.0,3495458.0,39066590.0,196.0,132.0,485.0,-0.203383,0.750925,-0.452449


In [4]:
# Preview the first five rows of the person table.
person.head(n=5)

Unnamed: 0_level_0,height,mass,position,is_valid,age,sex
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,180,85,2,0,18,1
2,185,75,1,0,18,1
3,185,75,2,0,18,1
6,185,75,3,0,18,1
7,185,75,1,1,18,1


In [5]:
# Attach the `person` columns (except `is_valid` and `age`, which are not used
# as features here) to the metrics, aligning rows on the shared `_id` index.
#  * how='inner' keeps only ids present in both tables, so no all-NaN rows are
#    produced when the two indexes differ (pd.concat(axis=1) outer-joins).
#  * Person columns already present in `data` are dropped first, so re-running
#    this cell does not append duplicate columns.
person_features = person.drop(columns=['is_valid', 'age'])
data = (
    data
    .drop(columns=person_features.columns, errors='ignore')
    .join(person_features, how='inner')
)

In [6]:
# Preview the first five rows of the merged table.
data.head(n=5)

Unnamed: 0_level_0,x_mean,x_std,x_max,x_min,y_mean,y_std,y_max,y_min,z_mean,z_std,...,x_iqr,y_iqr,z_iqr,xy_corr,xz_corr,yz_corr,height,mass,position,sex
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,13110.861519,1489.074033,17584.0,628.0,-231.04531,1249.368959,3960.0,-5076.0,-9781.608168,1204.070214,...,684.0,1788.0,808.0,0.389405,0.895658,0.367969,180,85,2,1
2,10398.993311,636.795316,11924.0,8728.0,1220.230769,712.538755,3100.0,-1344.0,-12743.73913,516.853783,...,1136.0,735.0,866.0,-0.186981,0.95049,-0.062792,185,75,1,1
3,12969.669749,807.711275,15236.0,10760.0,-36.354029,1327.7851,4640.0,-5000.0,-10092.491413,1051.603116,...,1164.0,964.0,1368.0,-0.023526,0.921424,-0.023458,185,75,2,1
6,12765.113725,3289.505788,29072.0,6396.0,815.027451,2927.223542,12488.0,-5844.0,-10411.811765,3428.969363,...,3768.0,3326.0,4846.0,0.078659,-0.394979,-0.021597,185,75,3,1
7,15146.131868,160.385778,16424.0,12984.0,-1865.611722,122.331084,-1588.0,-3500.0,-6243.567766,290.745677,...,196.0,132.0,485.0,-0.203383,0.750925,-0.452449,185,75,1,1


In [7]:
# Separate the target column (`position`) from the feature columns.
# Work on a copy so that `data` itself is left untouched.
X = data.copy()
y = X.pop('position')

In [8]:
# Positional index of the `sex` column, later passed to Pool as `cat_features`.
cat_feature = [X.columns.tolist().index('sex')]

In [9]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1234)

# Wrap each split in a catboost Pool, marking `sex` as categorical and passing
# the column names through as feature names.
train_pool, test_pool = (
    Pool(
        features,
        label=target,
        cat_features=cat_feature,
        feature_names=features.columns.tolist(),
    )
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [10]:
# Multiclass classifier for `position`, tracked by accuracy.
# NOTE(review): `test_pool` is also the eval set consulted by use_best_model,
# so the iteration count is selected on the test data and the reported test
# accuracy is optimistic. Consider a separate validation split or CV.
model = CatBoostClassifier(
    loss_function='MultiClass',
    eval_metric='Accuracy',
    use_best_model=True,
    random_seed=0,
)

# metric_period=100: log the metrics every 100 iterations.
model.fit(train_pool, eval_set=test_pool, metric_period=100)

Learning rate set to 0.102876
0:	learn: 0.8426966	test: 0.7435897	best: 0.7435897 (0)	total: 53.7ms	remaining: 53.6s
100:	learn: 1.0000000	test: 0.7948718	best: 0.7948718 (100)	total: 164ms	remaining: 1.46s
200:	learn: 1.0000000	test: 0.8205128	best: 0.8205128 (200)	total: 293ms	remaining: 1.17s
300:	learn: 1.0000000	test: 0.8461538	best: 0.8461538 (300)	total: 399ms	remaining: 927ms
400:	learn: 1.0000000	test: 0.8717949	best: 0.8717949 (400)	total: 512ms	remaining: 765ms
500:	learn: 1.0000000	test: 0.8461538	best: 0.8717949 (400)	total: 628ms	remaining: 625ms
600:	learn: 1.0000000	test: 0.8717949	best: 0.8717949 (400)	total: 812ms	remaining: 539ms
700:	learn: 1.0000000	test: 0.8461538	best: 0.8717949 (400)	total: 919ms	remaining: 392ms
800:	learn: 1.0000000	test: 0.8717949	best: 0.8717949 (400)	total: 1.07s	remaining: 267ms
900:	learn: 1.0000000	test: 0.8717949	best: 0.8717949 (400)	total: 1.18s	remaining: 130ms
999:	learn: 1.0000000	test: 0.8461538	best: 0.8717949 (400)	total: 1.31s	

<catboost.core.CatBoostClassifier at 0x7feec9f556d0>

In [11]:
# Predict class labels (the default prediction type) for the held-out rows.
y_pred = model.predict(X_test, prediction_type='Class')

In [12]:
# Show the predictions as a flat 1-D array, for visual comparison with y_test.
np.ravel(y_pred)

array([3, 1, 3, 1, 2, 3, 3, 3, 1, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 3, 3, 3,
       1, 1, 3, 1, 1, 2, 3, 3, 3, 1, 2, 2, 1, 2, 3, 3, 1])

In [13]:
# Show the true labels of the held-out rows as a plain array.
np.asarray(y_test)

array([3, 1, 3, 1, 2, 3, 3, 2, 1, 1, 3, 1, 3, 2, 3, 1, 2, 1, 2, 3, 3, 3,
       2, 1, 3, 1, 1, 2, 3, 3, 3, 1, 1, 2, 1, 2, 3, 3, 2])

Попробуем предсказать горбатость. $pos=3$

In [31]:
# Re-read both tables from disk (each indexed by `_id`), so this section starts
# from the files' contents rather than from the reassigned `data`.
data, person = (
    pd.read_csv(csv_path, index_col='_id')
    for csv_path in ('metrics_data.csv', '../person.csv')
)

In [32]:
# Attach the `person` columns (except `age`) to the metrics, aligning rows on
# the shared `_id` index. `is_valid` is kept: it is the target of this section.
#  * how='inner' keeps only ids present in both tables, so no all-NaN rows are
#    produced when the two indexes differ (pd.concat(axis=1) outer-joins).
#  * Person columns already present in `data` are dropped first, so re-running
#    this cell does not append duplicate columns.
person_features = person.drop(columns=['age'])
data = (
    data
    .drop(columns=person_features.columns, errors='ignore')
    .join(person_features, how='inner')
)

In [33]:
# Keep only the rows with position 3, then drop the now-constant `position`
# column. Re-running this cell without re-running the merge above raises a
# KeyError, because `position` is already gone.
data = data.loc[data['position'] == 3].drop(columns='position')

In [34]:
# Preview the first five rows of the filtered table.
data.head(n=5)

Unnamed: 0_level_0,x_mean,x_std,x_max,x_min,y_mean,y_std,y_max,y_min,z_mean,z_std,...,x_iqr,y_iqr,z_iqr,xy_corr,xz_corr,yz_corr,height,mass,is_valid,sex
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,12765.113725,3289.505788,29072.0,6396.0,815.027451,2927.223542,12488.0,-5844.0,-10411.811765,3428.969363,...,3768.0,3326.0,4846.0,0.078659,-0.394979,-0.021597,185,75,0,1
9,15071.435323,3140.794595,29164.0,10240.0,-1579.159204,1482.553175,3308.0,-7412.0,-6577.5199,1970.134951,...,3827.0,1986.0,2640.0,-0.058404,-0.595581,0.095259,185,75,1,1
14,12002.647646,2301.407845,23460.0,8132.0,1024.348074,1488.899667,5840.0,-2820.0,-11312.720875,1956.064704,...,3002.0,2422.0,2828.0,0.089644,-0.727482,-0.066262,178,68,0,1
17,15046.406723,3399.854465,27084.0,8920.0,-545.532773,1597.479157,4180.0,-6240.0,-6689.328852,1891.960641,...,5048.0,2468.0,2776.0,-0.045948,-0.598055,0.034319,178,68,1,1
21,13891.260826,2469.622894,30348.0,9740.0,-231.43001,1909.370131,5540.0,-5772.0,-8757.39577,1630.688783,...,2606.0,3155.0,1823.0,-0.061139,-0.478953,-0.022014,182,68,0,1


In [35]:
# Separate the target column (`is_valid`) from the feature columns.
# Work on a copy so that `data` itself is left untouched.
X = data.copy()
y = X.pop('is_valid')

In [36]:
# Positional index of the `sex` column, later passed to Pool as `cat_features`.
cat_feature = [X.columns.tolist().index('sex')]

In [37]:
# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1234)

# Wrap each split in a catboost Pool, marking `sex` as categorical and passing
# the column names through as feature names.
train_pool, test_pool = (
    Pool(
        features,
        label=target,
        cat_features=cat_feature,
        feature_names=features.columns.tolist(),
    )
    for features, target in ((X_train, y_train), (X_test, y_test))
)

In [38]:
# Classifier for `is_valid`, tracked by accuracy. No loss_function is passed,
# so CatBoost's own default for this target is used.
# NOTE(review): `test_pool` is also the eval set consulted by use_best_model,
# so the iteration count is selected on the test data and the reported test
# accuracy is optimistic. Consider a separate validation split or CV.
model = CatBoostClassifier(
    eval_metric='Accuracy',
    use_best_model=True,
    random_seed=0,
)

# metric_period=100: log the metrics every 100 iterations.
model.fit(train_pool, eval_set=test_pool, metric_period=100)

Learning rate set to 0.013328
0:	learn: 0.8666667	test: 0.6153846	best: 0.6153846 (0)	total: 502us	remaining: 502ms
100:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 65.2ms	remaining: 581ms
200:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 102ms	remaining: 406ms
300:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 139ms	remaining: 322ms
400:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 176ms	remaining: 263ms
500:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 231ms	remaining: 231ms
600:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 269ms	remaining: 178ms
700:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 306ms	remaining: 130ms
800:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 344ms	remaining: 85.4ms
900:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 385ms	remaining: 42.3ms
999:	learn: 1.0000000	test: 0.6923077	best: 0.6923077 (100)	total: 422m

<catboost.core.CatBoostClassifier at 0x7fbf348e4a90>

In [39]:
# Print every feature with its importance score, highest first
# (ties are broken by feature name, descending).
# `type=` replaces the deprecated `fstr_type=` keyword of get_feature_importance.
importances = model.get_feature_importance(type="FeatureImportance")
for value, name in sorted(zip(importances, X_train.columns), reverse=True):
    print("{}\t{:.2f}".format(name, value))

z_min	13.31
z_energy	8.88
x_mean	8.68
z_mean	8.14
x_energy	6.94
z_std	5.70
xy_corr	4.91
y_mean	4.66
x_min	3.54
y_min	3.52
y_iqr	3.25
height	2.99
z_max	2.91
mass	2.72
z_iqr	2.59
yz_corr	2.19
sma	2.08
y_energy	2.06
x_max	1.98
x_iqr	1.95
sex	1.91
y_max	1.64
y_std	1.39
xz_corr	1.05
x_std	1.01
