In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn.metrics

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.multiclass import OneVsOneClassifier
from sklearn.linear_model import LogisticRegression

from sklearn import preprocessing

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import classification_report

from tqdm import tqdm

from collections import defaultdict
from collections import OrderedDict

import time

%matplotlib inline
sns.set(style='darkgrid', font_scale=1.5)

In [2]:
data = pd.read_csv('../datasets/added_pos1_metrics_windowed.csv', index_col=[0, 1])
data = data.droplevel(0)
data.head()

Unnamed: 0_level_0,x_mean,x_std,x_max,x_min,x_energy,x_iqr,y_mean,y_std,y_max,y_min,...,z_iqr_pos1,xy_corr_pos1,xz_corr_pos1,yz_corr_pos1,height,mass,position,is_valid,age,sex
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1673,11156.76,123.532772,11380.0,10836.0,124488400.0,177.0,1158.8,73.172275,1348.0,920.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
1698,11156.88,128.858961,11380.0,10836.0,124492400.0,193.0,1137.96,69.416023,1300.0,920.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
1723,11154.36,134.410311,11532.0,10836.0,124437600.0,190.0,1088.32,171.829072,1380.0,164.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
1748,10998.28,415.506022,11532.0,9384.0,121133100.0,323.0,853.36,499.190851,1380.0,-1032.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
1773,10883.04,409.57593,11532.0,9384.0,118606600.0,543.0,537.68,622.192456,1380.0,-1032.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1


In [3]:
data = data.drop(['is_valid', 'age', 'sex'], axis=1)

In [4]:
y = data['position']
X = data.drop('position', axis=1)

In [5]:
sorted_metrics = ['x_max',
 'x_std',
 'xz_corr',
 'x_min',
 'y_std',
 'y_energy',
 'x_mean',
 'y_max',
 'z_max',
 'y_min',
 'sma',
 'z_mean',
 'x_mean_pos1',
 'sma_pos1',
 'y_iqr',
 'z_energy',
 'z_iqr_pos1',
 'z_max_pos1',
 'x_min_pos1',
 'x_energy',
 'z_min',
 'x_iqr_pos1',
 'y_mean',
 'y_energy_pos1',
 'x_std_pos1',
 'y_max_pos1',
 'yz_corr',
 'x_max_pos1',
 'height',
 'z_iqr',
 'y_mean_pos1',
 'x_energy_pos1',
 'z_mean_pos1',
 'z_min_pos1',
 'z_std',
 'xy_corr',
 'y_iqr_pos1',
 'z_std_pos1',
 'y_std_pos1',
 'x_iqr',
 'z_energy_pos1',
 'yz_corr_pos1',
 'xz_corr_pos1',
 'y_min_pos1',
 'xy_corr_pos1',
 'mass']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=1234
)

scaler = preprocessing.StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)

In [7]:
max_iter = 10000
C = 2

In [8]:
ovo_strategy = OneVsOneClassifier(
    LogisticRegression(random_state=42, max_iter=max_iter, C=C), n_jobs=-1
).fit(X_train, y_train)

ovr_strategy = LogisticRegression(
    multi_class='ovr', max_iter=max_iter, solver='lbfgs', random_state=42, C=C
).fit(X_train, y_train)

multinomial = LogisticRegression(
    multi_class='multinomial', max_iter=max_iter, solver='lbfgs', random_state=42, C=C
).fit(X_train, y_train)


for clf, strategy in zip([ovo_strategy, ovr_strategy, multinomial], 
                         ['OvO', 'OvR', 'multinomial']):
    start_time = time.time_ns()
    X_test = scaler.transform(X_test)
    y_pred = clf.predict(X_test)
    elapsed_time = time.time_ns() - start_time
    
    print('Accuracy for {}: {}, time : {} ns'.format(
        strategy, accuracy_score(y_test, y_pred), elapsed_time
    ))

Accuracy for OvO: 0.9026642984014209, time : 3864500 ns
Accuracy for OvR: 0.31900532859680286, time : 755757 ns
Accuracy for multinomial: 0.31900532859680286, time : 591534 ns


In [9]:
odds = np.exp(multinomial.coef_[0])
pd.DataFrame(odds, 
             X.columns, 
             columns=['coef'])\
            .sort_values(by='coef', ascending=False)

Unnamed: 0,coef
z_max,4.996455
y_max,3.454068
x_energy,3.301435
x_mean_pos1,2.83314
xz_corr,2.565099
x_std_pos1,2.318837
x_max_pos1,2.100476
yz_corr_pos1,1.999
y_mean_pos1,1.903788
z_std,1.842431


### Определение горбатости в ходячем положении

In [10]:
data = pd.read_csv('../datasets/added_pos1_metrics_windowed.csv', index_col=[0, 1])
data = data.droplevel(0)
data = data[data['position'] == 3].drop(['position', 'age'], axis=1)
data.head()

Unnamed: 0_level_0,x_mean,x_std,x_max,x_min,x_energy,x_iqr,y_mean,y_std,y_max,y_min,...,x_iqr_pos1,y_iqr_pos1,z_iqr_pos1,xy_corr_pos1,xz_corr_pos1,yz_corr_pos1,height,mass,is_valid,sex
level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3621,12685.04,2477.023409,22084.0,6396.0,166984500.0,3433.0,1969.0,3074.25009,12488.0,-3760.0,...,196.0,132.0,485.0,-0.203383,0.750925,-0.452449,185,75,0,1
3646,12148.96,2494.978107,22084.0,6396.0,153759900.0,2141.0,1951.48,3311.899216,12488.0,-3760.0,...,196.0,132.0,485.0,-0.203383,0.750925,-0.452449,185,75,0,1
3671,12151.88,3207.290657,25876.0,6396.0,157852000.0,2697.0,1784.28,3319.946075,12488.0,-5016.0,...,196.0,132.0,485.0,-0.203383,0.750925,-0.452449,185,75,0,1
3696,12398.36,3620.581932,29072.0,6396.0,166696900.0,3260.0,753.16,2951.146042,10968.0,-5704.0,...,196.0,132.0,485.0,-0.203383,0.750925,-0.452449,185,75,0,1
3721,12812.64,3791.848513,29072.0,8804.0,178398100.0,3707.0,97.68,2808.045249,8604.0,-5844.0,...,196.0,132.0,485.0,-0.203383,0.750925,-0.452449,185,75,0,1


In [11]:
y = data['is_valid']
X = data.drop('is_valid', axis=1)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=1234
)

scaler = preprocessing.StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)

In [13]:
max_iter = 10000
C = 10000

In [14]:
logreg = LogisticRegression(random_state=42, max_iter=max_iter, C=C).fit(X_train, y_train)
logreg.fit(X_train, y_train)

start_time = time.time_ns()
X_test = scaler.transform(X_test)
y_pred = logreg.predict(X_test)
time_elapsed = time.time_ns() - start_time

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.3f}, time: {time_elapsed} ns')

Accuracy: 0.924, time: 1835712 ns


Сохранение весов:

In [19]:
pd.DataFrame(logreg.coef_[0], 
             X.columns, 
             columns=['coef'])\
            .sort_values(by='coef', ascending=False).to_json("weight_valid.json")

In [16]:
odds = np.exp(logreg.coef_[0])
pd.DataFrame(odds, 
             X.columns, 
             columns=['coef'])\
            .sort_values(by='coef', ascending=False)

Unnamed: 0,coef
x_mean,8.600931e+36
z_energy,3.69421e+20
xy_corr_pos1,20862770000.0
z_iqr_pos1,2639758.0
x_energy_pos1,3877.514
y_min_pos1,3394.712
xz_corr_pos1,2898.24
y_max_pos1,1349.263
z_max_pos1,908.1603
y_iqr_pos1,174.782
