In [1]:
import sys

# Extend the module search path with the parent and the current directory,
# presumably so that the project-local `src` package imported in the next cell
# resolves regardless of where the notebook is launched from -- TODO confirm.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends the same entries to sys.path again and again.
for _extra_path in ('../', './'):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

In [2]:
import pandas as pd

from src.data.uniting import unite_datasets
from src.data.drop_duplicates import drop_duplicates
from src.data.metrics import extract_metrics
from src.data.split import prepare_split
from src.models.train import svm_train

import src.config as CFG

In [3]:
# Relative paths (resolved against the current working directory) of the two
# CSV files that make up the new batch: the samples table and the person table.
new_data_path, new_person_path = 'data_table.csv', 'person_table.csv'

In [4]:
def re_fit(new_data_path, new_person_path):
    """Run the full re-training pipeline on a new batch of data.

    Stages, executed strictly in this order:
      1. remove duplicates from the new data;
      2. append the new data to the existing data;
      3. extract the metrics needed for training;
      4. split into training and test sets;
      5. train the model.

    A sixth stage (computing scores and checking the dataset for problems,
    via ``check_mistakes``) is planned but is not wired in yet.

    Args:
        new_data_path: path to the CSV file with the new data table.
        new_person_path: path to the CSV file with the new person table.

    Returns:
        None. The results of the individual stages are discarded.

    NOTE(review): ``unite_datasets`` receives four arguments here, whereas the
    standalone ``unite_datasets`` cell in this notebook passes only the two
    new-batch paths. Confirm which signature is current.
    """
    # Each stage is a zero-argument thunk so that names and config attributes
    # are looked up only when the stage actually runs, one after another.
    stages = (
        lambda: drop_duplicates(new_data_path),
        lambda: unite_datasets(
            CFG.RAW_DATA_PATH, CFG.RAW_PERSON_PATH, new_data_path, new_person_path
        ),
        lambda: extract_metrics(),
        lambda: prepare_split(),
        lambda: svm_train(),
        # TODO: check_mistakes() -- stage 6, currently disabled.
    )
    for run_stage in stages:
        run_stage()

### drop_duplicates

In [27]:
# Snapshot of the new batch BEFORE drop_duplicates() is run, to compare with
# the reload performed after it.
new_data = pd.read_csv(new_data_path, index_col=0)  # first CSV column is used as the row index
new_person = pd.read_csv(new_person_path)
new_data

Unnamed: 0,_id,x,y,z
0,1,12848.0,704.0,-10332.0
1,1,12832.0,668.0,-10424.0
2,1,12808.0,580.0,-10372.0
3,1,12828.0,644.0,-10232.0
4,1,12796.0,600.0,-10320.0
...,...,...,...,...
3682,1,12820.0,624.0,-10376.0
3683,1,12932.0,656.0,-10240.0
3684,1,12852.0,696.0,-10296.0
3685,1,12848.0,732.0,-10344.0


In [28]:
# Display the person table that accompanies the new batch.
new_person

Unnamed: 0,_id,height,mass,position,is_valid,age,sex
0,1,176,76,1,1,50,1
1,5,176,76,3,1,50,1
2,6,176,76,3,0,50,1
3,7,179,80,1,1,21,1
4,8,179,80,1,0,21,1
5,9,179,80,2,1,21,1
6,10,179,80,2,0,21,1
7,11,179,80,3,0,21,1
8,12,179,80,3,0,21,1
9,13,179,80,3,1,21,1


In [29]:
# Step 1 of re_fit(), run in isolation. Nothing is displayed for this call;
# presumably it rewrites the file at new_data_path in place -- TODO confirm.
drop_duplicates(new_data_path)

In [30]:
# Reload both files from disk AFTER drop_duplicates() so the displayed frame
# reflects what is actually stored, not the stale in-memory copy.
new_data = pd.read_csv(new_data_path, index_col=0)
new_person = pd.read_csv(new_person_path)
new_data

Unnamed: 0,_id,x,y,z
0,1,12848.0,704.0,-10332.0
1,1,12832.0,668.0,-10424.0
2,1,12808.0,580.0,-10372.0
3,1,12828.0,644.0,-10232.0
4,1,12796.0,600.0,-10320.0
...,...,...,...,...
3682,1,12820.0,624.0,-10376.0
3683,1,12932.0,656.0,-10240.0
3684,1,12852.0,696.0,-10296.0
3685,1,12848.0,732.0,-10344.0


In [33]:
# Keep only the person record(s) with _id == 1 and write them back to the same
# CSV; index=False stops pandas from adding an extra index column.
# NOTE(review): presumably done because the new data table only contains
# _id 1 -- confirm. This overwrites the original person file on disk.
new_person = new_person.loc[new_person['_id'].eq(1)]
new_person.to_csv(path_or_buf=new_person_path, index=False)

In [39]:
# Reload from disk to confirm that the filtered person table was persisted.
new_data = pd.read_csv(new_data_path, index_col=0)
new_person = pd.read_csv(new_person_path)
new_person

Unnamed: 0,_id,height,mass,position,is_valid,age,sex
0,1,176,76,1,1,50,1


**CORRECT**

### unite_datasets

In [40]:
# Snapshot of the existing raw tables BEFORE the new batch is united with them.
data = pd.read_csv(CFG.RAW_DATA_PATH, index_col=0)
person = pd.read_csv(CFG.RAW_PERSON_PATH)  # loaded but not displayed in this cell
data

Unnamed: 0,_id,x,y,z
0,1,13916,-1172,-8424
1,1,13816,-1156,-8304
2,1,14188,-1256,-8012
3,1,14128,-1360,-8100
4,1,14228,-832,-7976
...,...,...,...,...
248660,311,12104,-660,-5384
248661,311,12612,-400,-4408
248662,311,12368,-384,-3992
248663,311,16172,-156,-6656


In [41]:
# Step 2 of re_fit(), run in isolation.
# NOTE(review): called with two arguments here but with four inside re_fit()
# (CFG.RAW_DATA_PATH and CFG.RAW_PERSON_PATH first) -- confirm which signature is current.
# NOTE(review): the reload in the next cell shows more rows in CFG.RAW_DATA_PATH,
# so re-running this cell presumably appends the batch again -- TODO confirm.
unite_datasets(new_data_path, new_person_path)

In [42]:
# Reload the raw tables AFTER unite_datasets() to inspect the appended rows.
data = pd.read_csv(CFG.RAW_DATA_PATH, index_col=0)
person = pd.read_csv(CFG.RAW_PERSON_PATH)  # loaded but not displayed in this cell
data

Unnamed: 0,_id,x,y,z
0,1,13916.0,-1172.0,-8424.0
1,1,13816.0,-1156.0,-8304.0
2,1,14188.0,-1256.0,-8012.0
3,1,14128.0,-1360.0,-8100.0
4,1,14228.0,-832.0,-7976.0
...,...,...,...,...
252347,313,12820.0,624.0,-10376.0
252348,313,12932.0,656.0,-10240.0
252349,313,12852.0,696.0,-10296.0
252350,313,12848.0,732.0,-10344.0


In [43]:
# Displays the same in-memory frame as the previous cell (no reload in between).
data

Unnamed: 0,_id,x,y,z
0,1,13916.0,-1172.0,-8424.0
1,1,13816.0,-1156.0,-8304.0
2,1,14188.0,-1256.0,-8012.0
3,1,14128.0,-1360.0,-8100.0
4,1,14228.0,-832.0,-7976.0
...,...,...,...,...
252347,313,12820.0,624.0,-10376.0
252348,313,12932.0,656.0,-10240.0
252349,313,12852.0,696.0,-10296.0
252350,313,12848.0,732.0,-10344.0


**CORRECT**

### extract_metrics

In [44]:
# Snapshot of the feature dataset BEFORE extract_metrics() is re-run.
# NOTE: this rebinds `data`, which previously held the raw samples table.
data = pd.read_csv(CFG.DATASET_PATH, index_col=0)
data

Unnamed: 0_level_0,level_1,x_mean,x_std,x_max,x_min,x_energy,x_iqr,y_mean,y_std,y_max,...,z_iqr_pos1,xy_corr_pos1,xz_corr_pos1,yz_corr_pos1,height,mass,position,is_valid,age,sex
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1673,11156.76,123.532772,11380.0,10836.0,1.244884e+08,177.0,1158.80,73.172275,1348.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
2,1698,11156.88,128.858961,11380.0,10836.0,1.244924e+08,193.0,1137.96,69.416023,1300.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
2,1723,11154.36,134.410311,11532.0,10836.0,1.244376e+08,190.0,1088.32,171.829072,1380.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
2,1748,10998.28,415.506022,11532.0,9384.0,1.211331e+08,323.0,853.36,499.190851,1380.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
2,1773,10883.04,409.575930,11532.0,9384.0,1.186066e+08,543.0,537.68,622.192456,1380.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,248547,14922.72,3592.622199,23688.0,9424.0,2.354654e+08,5910.0,-895.72,1385.373319,3096.0,...,264.0,0.213569,0.786830,0.045659,170,61,3,1,20,0
311,248572,14993.44,3544.279603,23688.0,9424.0,2.372395e+08,5291.0,-1130.24,1549.973927,3096.0,...,264.0,0.213569,0.786830,0.045659,170,61,3,1,20,0
311,248597,15152.04,3581.770072,23688.0,9212.0,2.422851e+08,5213.0,-994.12,1720.060648,3168.0,...,264.0,0.213569,0.786830,0.045659,170,61,3,1,20,0
311,248622,14821.28,3600.001207,24076.0,9212.0,2.325007e+08,5696.0,-948.92,1772.984961,3760.0,...,264.0,0.213569,0.786830,0.045659,170,61,3,1,20,0


In [45]:
# Step 3 of re_fit(), run in isolation. Takes no arguments; presumably reads
# the raw tables and rewrites CFG.DATASET_PATH -- TODO confirm.
extract_metrics()

In [46]:
# Reload the feature dataset AFTER extract_metrics() to check that rows for
# the newly added recording are present.
data = pd.read_csv(CFG.DATASET_PATH, index_col=0)
data

Unnamed: 0_level_0,level_1,x_mean,x_std,x_max,x_min,x_energy,x_iqr,y_mean,y_std,y_max,...,z_iqr_pos1,xy_corr_pos1,xz_corr_pos1,yz_corr_pos1,height,mass,position,is_valid,age,sex
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1673,11156.76,123.532772,11380.0,10836.0,1.244884e+08,177.0,1158.80,73.172275,1348.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
2,1698,11156.88,128.858961,11380.0,10836.0,1.244924e+08,193.0,1137.96,69.416023,1300.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
2,1723,11154.36,134.410311,11532.0,10836.0,1.244376e+08,190.0,1088.32,171.829072,1380.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
2,1748,10998.28,415.506022,11532.0,9384.0,1.211331e+08,323.0,853.36,499.190851,1380.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
2,1773,10883.04,409.575930,11532.0,9384.0,1.186066e+08,543.0,537.68,622.192456,1380.0,...,485.0,-0.203383,0.750925,-0.452449,185,75,1,0,18,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
313,252246,12888.32,56.892399,13028.0,12780.0,1.661120e+08,88.0,707.60,53.817499,856.0,...,220.0,-0.334250,0.872003,-0.319047,176,76,1,1,50,1
313,252271,12882.92,56.878691,13028.0,12768.0,1.659728e+08,85.0,699.16,56.771867,856.0,...,220.0,-0.334250,0.872003,-0.319047,176,76,1,1,50,1
313,252296,12877.20,55.825124,13028.0,12768.0,1.658254e+08,77.0,672.28,42.218307,804.0,...,220.0,-0.334250,0.872003,-0.319047,176,76,1,1,50,1
313,252321,12880.88,50.239282,13028.0,12768.0,1.659196e+08,60.0,668.36,40.000384,756.0,...,220.0,-0.334250,0.872003,-0.319047,176,76,1,1,50,1


### prepare_split

In [1]:
from src.data.split import prepare_split

In [2]:
# Step 4 of re_fit(), run in isolation. The value displayed below is the
# call's result: a pair of CSV paths (train ids, test ids).
prepare_split()

('datasets/X_train_id.csv', 'datasets/X_test_id.csv')

**CORRECT**

### svm_train

In [5]:
# Step 5 of re_fit(), run in isolation. The output below appears to be a
# shape tuple, a classification report and the class labels.
svm_train()

(9525, 48)
              precision    recall  f1-score   support

           0       0.89      0.69      0.78      1604
           1       0.74      0.91      0.82      1555

    accuracy                           0.80      3159
   macro avg       0.82      0.80      0.80      3159
weighted avg       0.82      0.80      0.80      3159
 ['crooked', 'straight']
