In [183]:
import pandas as pd
import numpy as np
from tsfresh import extract_features
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tsfresh.feature_selection.significance_tests import target_binary_feature_real_test
import sys
sys.path.append("../")
from src.dwt_utils import *
from src.baseline_utils import train_model

# **Dataset import**

In [93]:
df = pd.read_csv("../datasets/esr_dataset.csv")
df = df.drop("Unnamed", axis=1)
# Трансформируем целевую переменную из многоклассовой в бинарную
df['y'] = df['y'].apply(lambda y: 1 if y==1 else 0)
df['y'].value_counts()

y
0    9200
1    2300
Name: count, dtype: int64

In [103]:
# Подготавливаем датасет для tsfresh
new_df = pd.DataFrame(columns=['id', 'time', 'value'])
time_names = df.drop(['y'], axis=1).columns.to_list()
n_times = len(time_names)

for row in df.iterrows():
    values = row[1].drop(['y']).to_numpy().flatten()

    id = np.full(n_times, row[0])
    row_df = pd.DataFrame({"id" : id, "time" : time_names, "value" : values})
    new_df = pd.concat([new_df, row_df])
    
new_df.to_csv("../datasets/tsfresh_dataset.csv", index=False)

In [123]:
data = pd.read_csv("../datasets/tsfresh_dataset.csv")
data.head(5)

Unnamed: 0,id,time,value
0,0,X1,135
1,0,X2,190
2,0,X3,229
3,0,X4,223
4,0,X5,192


# **TSFresh Automatic Feature Generation & ML**

In [129]:
data.id = data.id.astype('object')
data.dtypes

id       object
time     object
value     int64
dtype: object

Извлечем все возможные признаки из датасета

In [130]:
extracted_features = extract_features(data, column_id="id", column_sort="time", column_value="value")

Feature Extraction: 100%|██████████| 10/10 [27:37<00:00, 165.78s/it]


In [147]:
extracted_features['y'] = df['y'].to_numpy()
extracted_features.head(5)

Unnamed: 0,value__variance_larger_than_standard_deviation,value__has_duplicate_max,value__has_duplicate_min,value__has_duplicate,value__sum_values,value__abs_energy,value__mean_abs_change,value__mean_change,value__mean_second_derivative_central,value__median,...,value__fourier_entropy__bins_10,value__fourier_entropy__bins_100,value__permutation_entropy__dimension_3__tau_1,value__permutation_entropy__dimension_4__tau_1,value__permutation_entropy__dimension_5__tau_1,value__permutation_entropy__dimension_6__tau_1,value__permutation_entropy__dimension_7__tau_1,value__query_similarity_count__query_None__threshold_0.0,value__mean_n_absolute_max__number_of_maxima_7,y
0,1.0,0.0,0.0,1.0,-3010.0,1681484.0,52.016949,-1.949153,0.247159,-11.5,...,1.195352,2.951483,1.568466,2.552025,3.483084,4.219962,4.690669,,245.714286,0
1,1.0,0.0,0.0,1.0,5004.0,39768644.0,168.310734,-0.00565,0.474432,220.5,...,0.538372,2.288039,1.556706,2.538147,3.435382,4.161452,4.601271,,1260.571429,1
2,1.0,0.0,0.0,1.0,-7840.0,692846.0,20.19209,-0.180791,0.15625,-44.5,...,0.692489,2.313938,1.647364,2.631261,3.541738,4.236422,4.69401,,113.571429,0
3,1.0,0.0,0.0,1.0,-12266.0,890386.0,9.254237,0.271186,-0.059659,-69.0,...,1.00017,2.852154,1.641484,2.764789,3.798963,4.537381,4.949949,,99.857143,0
4,1.0,0.0,1.0,1.0,-1184.0,274368.0,21.355932,0.20339,0.193182,-1.0,...,0.914344,2.302692,1.619943,2.718243,3.723965,4.426841,4.851762,,97.285714,0


Определим значимость сгенерированных признаков используя U-критерий Манна—Уитни, встроенный в TSFRESH

In [162]:
x = extracted_features.drop('y', axis=1)
x = x.dropna(axis=1)
y = extracted_features['y']
p_values = np.array([])
for col_name in x.columns:
    p_val = target_binary_feature_real_test(x[col_name], y, 'mann')
    p_values = np.append(p_values, p_val)

Разделим выборку на тренировочную и тестовую, предварительно отобрав наиболее значимые колонки

In [215]:
best_feature_indexes = np.where(p_values < 0.05)[0]
X = x[x.columns[best_feature_indexes]]
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Тренируем модели

In [199]:
train_model(RandomForestClassifier(), X_train, Y_train, X_test, Y_test)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1835
           1       0.98      0.96      0.97       465

    accuracy                           0.99      2300
   macro avg       0.98      0.98      0.98      2300
weighted avg       0.99      0.99      0.99      2300



In [200]:
train_model(SVC(), X_train, Y_train, X_test, Y_test )

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1835
           1       0.94      0.86      0.90       465

    accuracy                           0.96      2300
   macro avg       0.96      0.93      0.94      2300
weighted avg       0.96      0.96      0.96      2300



In [201]:
train_model(KNeighborsClassifier(), X_train, Y_train, X_test, Y_test )

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1835
           1       0.94      0.92      0.93       465

    accuracy                           0.97      2300
   macro avg       0.96      0.95      0.96      2300
weighted avg       0.97      0.97      0.97      2300



Результат получился немного лучше бейзлайна