Imports

In [None]:
!pip install sktime
!pip install pandas
!pip install seaborn
!pip install sklearn
!pip install xgboost

In [None]:
# misc
from sktime.datasets import load_from_arff_to_dataframe
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
import zipfile
import shutil
import os
import random

# for feature extractor
from scipy.signal import find_peaks
from scipy.stats import skew
from scipy.stats import kurtosis

# for gridsearch and models
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


Creating datasets and splitting

In [None]:
# Extract every member of the bundled archive into the current working directory.
with zipfile.ZipFile("datasets.zip", mode="r") as datasets_archive:
    datasets_archive.extractall()


In [None]:
# Load the three datasets extracted from datasets.zip.
# Paths are built with os.path.join so they resolve on every OS; the previous
# backslash-separated literals only worked on Windows and relied on invalid
# string escapes (which newer Python versions warn about).
sports_dir = 'RacketSports'
ecg_dir = 'ECG'

# RacketSports ships with a predefined train/test split.
X_train_sports, y_train_sports = load_from_arff_to_dataframe(os.path.join(sports_dir, 'RacketSports_TRAIN.arff'))
X_test_sports, y_test_sports = load_from_arff_to_dataframe(os.path.join(sports_dir, 'RacketSports_TEST.arff'))

# MIT-BIH also ships pre-split; the last CSV column is the class label.
data_mitbih_train = pd.read_csv(os.path.join(ecg_dir, 'mitbih_train.csv'), header=None)
data_mitbih_test = pd.read_csv(os.path.join(ecg_dir, 'mitbih_test.csv'), header=None)

X_train_mitbih = data_mitbih_train.iloc[:, 0:-1]
y_train_mitbih = data_mitbih_train.iloc[:, -1]

X_test_mitbih = data_mitbih_test.iloc[:, 0:-1]
y_test_mitbih = data_mitbih_test.iloc[:, -1]

# PTB comes as one file per class; concatenate them and split 80/20 ourselves.
data_ptbdb_abnormal = pd.read_csv(os.path.join(ecg_dir, 'ptbdb_abnormal.csv'), header=None)
data_ptbdb_normal = pd.read_csv(os.path.join(ecg_dir, 'ptbdb_normal.csv'), header=None)

data_ptbdb = pd.concat([data_ptbdb_abnormal, data_ptbdb_normal])

X_ptbdb = data_ptbdb.iloc[:, 0:-1]
y_ptbdb = data_ptbdb.iloc[:, -1]

# Fixed random_state keeps the split reproducible across runs.
X_train_ptbdb, X_test_ptbdb, y_train_ptbdb, y_test_ptbdb = train_test_split(X_ptbdb, y_ptbdb, test_size=0.2, random_state=80085)


In [None]:
# Shared plotting configuration: class labels, titles and colour scheme.
labels_sports = ['Badminton_Smash', 'Badminton_Clear', 'Squash_ForehandBoast', 'Squash_BackhandBoast']
labels_mitbih = [0, 1, 2, 3, 4]
labels_ptbdb = [0, 1]

# Label vectors laid out as (train, test) pairs, one pair per dataset, so that
# `index // 2` selects the dataset and `index % 2` selects train vs. test.
datasets = [
    y_train_sports, y_test_sports,
    y_train_mitbih, y_test_mitbih,
    y_train_ptbdb, y_test_ptbdb,
]

plot_labels = [labels_sports, labels_mitbih, labels_ptbdb]
plot_titles = [
    'RacketSports',
    'ECG Heartbeat Categorization Dataset - MIT-BIH',
    'ECG Heartbeat Categorization Dataset - PTB Diagnostic',
]
plot_subtitles = ['Train Set', 'Test Set']

sns.set(rc={'figure.autolayout': True})

# Figure background, bar edge colour and the blue palette used by the plots.
bg_color = '#E5E6F0'
color_line = '#0F101A'
colors = [
    '#5465FF',
    '#788BFF',
    '#9BB1FF',
    '#BFD7FF',
    '#D1EAFF',
    '#E2FDFF',
]

sns.set_theme(style="whitegrid", palette=sns.color_palette(colors))


**3.1.1**

Grafice ale frecvenței de apariție a fiecărei etichete (clase) în setul de date de antrenare / test

In [None]:
# Class-frequency plots: for every (dataset, split) pair draw a bar chart of
# the label counts next to a pie chart of the label proportions, and save it.
results_dir = '3.1/1/'

# exist_ok keeps the cell re-runnable without a separate existence check.
os.makedirs(results_dir, exist_ok=True)

for index, y_values in enumerate(datasets):
    # `datasets` is laid out as train/test pairs: index // 2 -> dataset, index % 2 -> split.
    class_labels = plot_labels[index // 2]
    title = plot_titles[index // 2]
    subtitle = plot_subtitles[index % 2]

    fig, axes = plt.subplots(1, 2, figsize=(15, 7), facecolor=bg_color)
    fig.suptitle(subtitle + ' - ' + title, fontsize=20)

    sns.countplot(ax=axes[0], x=y_values, order=np.unique(class_labels), edgecolor=color_line)
    axes[0].set_facecolor(bg_color)

    # Annotate the bars exactly once. The previous version repeated this call
    # for every bar patch, drawing the same labels on top of each other.
    axes[0].bar_label(container=axes[0].containers[0])

    counts = [len(y_values[y_values == label]) for label in class_labels]
    axes[1].pie(counts, labels=class_labels, autopct='%1.1f%%')

    plt.savefig(results_dir + title + '_' + subtitle + '.png')
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)


**3.1.2**

1) Câte un exemplu de serie pentru fiecare tip de acțiune din RacketSports - valorile de la accelerometru și de la giroscop

In [None]:
# Example series for every RacketSports action: six random training samples
# per class, plotted for the accelerometer (dims 0-2) and the gyroscope
# (dims 3-5), both as a 3D trajectory and as per-axis curves.
random.seed(80085)
results_dir = '3.1/2/1/'

os.makedirs(results_dir, exist_ok=True)

# Convert the nested frame once instead of on every single access.
sports_values = X_train_sports.to_numpy()


def get_sensor_axes(sample_index, first_dim):
    """Return the x, y, z arrays of one sample, starting at dimension ``first_dim``."""
    return [sports_values[sample_index][first_dim + offset].to_numpy() for offset in range(3)]


def plot_examples_3d(label, indexes, first_dim, sensor_name):
    """Plot the selected samples of one sensor as 3D trajectories and save the figure."""
    fig = plt.figure(figsize=plt.figaspect(0.33), facecolor=bg_color)
    fig.suptitle('RacketSports ' + label + ' Exemplu - ' + sensor_name + ' - 3D', fontsize=20)
    for position, sample_index in enumerate(indexes):
        x_values, y_values, z_values = get_sensor_axes(sample_index, first_dim)

        ax = fig.add_subplot(2, 3, position + 1, projection='3d', facecolor=bg_color)
        # Line first (pushed to the back), then star markers on top of it.
        ax.plot(x_values, y_values, z_values, color=colors[0], zorder=-1)
        ax.plot(x_values, y_values, z_values, color=colors[4], marker='*', markeredgecolor=colors[0],
                markeredgewidth=1, markersize=10, linestyle=' ')
    plt.tight_layout()
    plt.savefig(results_dir + 'RacketSports_' + label + '_' + sensor_name + '_3d' + '.png')
    plt.show()
    plt.close(fig)


def plot_examples_axes(label, indexes, first_dim, sensor_name):
    """Plot the selected samples of one sensor as three per-axis curves and save the figure."""
    fig = plt.figure(figsize=plt.figaspect(0.33), facecolor=bg_color)
    fig.suptitle('RacketSports ' + label + ' Exemplu - ' + sensor_name + ' - axe', fontsize=20)
    for position, sample_index in enumerate(indexes):
        x_values, y_values, z_values = get_sensor_axes(sample_index, first_dim)

        ax = fig.add_subplot(2, 3, position + 1, facecolor=bg_color)
        ax.plot(x_values, color=colors[0])
        ax.plot(y_values, color=colors[2])
        ax.plot(z_values, color=colors[3])
    plt.tight_layout()
    plt.savefig(results_dir + 'RacketSports_' + label + '_' + sensor_name + '_axis' + '.png')
    plt.show()
    plt.close(fig)


for label in labels_sports:
    indexes = random.sample([i for i in range(y_train_sports.shape[0]) if y_train_sports[i] == label], 6)

    plot_examples_3d(label, indexes, 0, 'Accelerometru')
    plot_examples_axes(label, indexes, 0, 'Accelerometru')

    plot_examples_3d(label, indexes, 3, 'Giroscop')
    # Bug fix: the per-axis gyroscope figure used to read dims 0-2 (the
    # accelerometer) and plotted a stale z array from the previous figure.
    plot_examples_axes(label, indexes, 3, 'Giroscop')


2) Câte un exemplu de serie pentru fiecare categorie de aritmie din seturile de date MIT-BIH / PTB.

In [None]:
# Example heartbeats: six random training series per class, for MIT-BIH and PTB.
random.seed(80085)
results_dir = '3.1/2/2/'

if not os.path.isdir(results_dir):
    os.makedirs(results_dir)

# 'ECG Heartbeat Categorization Dataset - MIT-BIH', 'ECG Heartbeat Categorization Dataset - PTB Diagnostic'
# x axis shared by every series (187 time steps per heartbeat).
time_sample = list(range(187))

# One entry per dataset: (class labels, targets, signals, figure title prefix, file name prefix).
ecg_example_sets = [
    (labels_mitbih,
     y_train_mitbih.to_numpy(),
     X_train_mitbih.to_numpy(),
     'ECG Heartbeat Categorization Dataset - MIT-BIH ',
     'ECG_Heartbeat_Categorization_Dataset_MIT_BIH_'),
    (labels_ptbdb,
     y_train_ptbdb.to_numpy(),
     X_train_ptbdb.to_numpy(),
     'ECG Heartbeat Categorization Dataset - PTB Diagnostic ',
     'ECG_Heartbeat_Categorization_Dataset_PTB_Diagnostic_'),
]

for class_labels, targets, signals, title_prefix, file_prefix in ecg_example_sets:
    for label in class_labels:
        # Positions of every training series belonging to this class.
        candidates = [position for position, target in enumerate(targets) if target == label]
        indexes = random.sample(candidates, 6)

        fig = plt.figure(figsize=plt.figaspect(0.33), facecolor=bg_color)
        fig.suptitle(title_prefix + str(label), fontsize=20)
        for slot, sample_index in enumerate(indexes, start=1):
            series = signals[sample_index]

            ax = fig.add_subplot(2, 3, slot, facecolor=bg_color)
            ax.plot(time_sample, series, color=colors[0], zorder=-1)
        plt.tight_layout()
        plt.savefig(results_dir + file_prefix + str(label) + '.png')
        plt.show()


3) Grafic al mediei și deviației standard per unitate de timp, pentru fiecare clasă de aritmie.

In [None]:
# Per-time-step mean and standard deviation of the ECG series, one figure per class.
results_dir = '3.1/2/3/'

if not os.path.isdir(results_dir):
    os.makedirs(results_dir)

# MIT-BIH statistics are computed over train and test together.
X_mitbih = pd.concat([X_train_mitbih, X_test_mitbih]).to_numpy()
y_mitbih = pd.concat([y_train_mitbih, y_test_mitbih]).to_numpy()

# One entry per dataset: (class labels, signals, targets, figure title prefix, file name prefix).
mean_std_sets = [
    (labels_mitbih,
     X_mitbih,
     y_mitbih,
     'ECG Heartbeat Categorization Dataset - MIT-BIH ',
     'ECG_Heartbeat_Categorization_Dataset_MIT_BIH_mean_std_'),
    (labels_ptbdb,
     X_ptbdb.to_numpy(),
     y_ptbdb.to_numpy(),
     'ECG Heartbeat Categorization Dataset - PTB Diagnostic ',
     'ECG_Heartbeat_Categorization_Dataset_PTB_Diagnostic_mean_std_'),
]

for class_labels, signals, targets, title_prefix, file_prefix in mean_std_sets:
    for label in class_labels:
        indexes = [position for position, target in enumerate(targets) if target == label]
        class_signals = signals[indexes]

        # axis=0 aggregates across series, leaving one value per time step.
        std_curve = np.std(class_signals, axis=0)
        mean_curve = np.mean(class_signals, axis=0)

        fig = plt.figure(facecolor=bg_color)
        fig.suptitle(title_prefix + str(label), fontsize=20)
        ax = fig.add_subplot(1, 1, 1, facecolor=bg_color)
        ax.set_title('Standard Deviation and Mean')
        ax.plot(time_sample, std_curve, color=colors[3], linewidth=2.5)
        ax.plot(time_sample, mean_curve, color=colors[2], linewidth=2, zorder=-1)

        plt.savefig(results_dir + file_prefix + str(label) + '.png')
        plt.show()

4) Grafic al distribuției de valori per axă și per acțiune pentru RacketSports

In [None]:
# Distribution of the raw sensor values per axis and per action (RacketSports).
# Every time step of every series (train + test) becomes one row of a long
# table, which is then drawn as one histogram + KDE per dimension, coloured by
# action. Dimensions are plotted two at a time, giving three figures.

results_dir = '3.1/2/4/'

os.makedirs(results_dir, exist_ok=True)

# NOTE(review): colors2 is defined here but the plots below still use the
# global `colors` palette - confirm which palette was intended.
colors2 = ['#cafe48', '#70f8ba', '#d14081', '#2583d0', '#733ab1', '#ff8811']

data_sports_all = pd.concat([X_train_sports, X_test_sports], ignore_index=True)
columns_sports = data_sports_all.columns
y_sports_all = np.append(y_train_sports, y_test_sports)


def build_value_frame(dims):
    """Flatten the series of the given dimensions into a long DataFrame.

    Returns one row per time step with the columns 'value', 'dimension'
    (the source column name) and 'label' (the action of the source sample).
    """
    values = []
    dimensions = []
    labels = []
    for row in range(len(data_sports_all)):
        for col in dims:
            time_series = data_sports_all[col][row]
            values.extend(time_series)
            dimensions.extend([col] * len(time_series))
            labels.extend([y_sports_all[row]] * len(time_series))
    return pd.DataFrame({'value': values, 'dimension': dimensions, 'label': labels})


dimension_pairs = [['dim_0', 'dim_1'], ['dim_2', 'dim_3'], ['dim_4', 'dim_5']]

for plot_number, dims in enumerate(dimension_pairs, start=1):
    parsed_data = build_value_frame(dims)

    g = sns.FacetGrid(parsed_data, col='dimension', hue='label', aspect=1.5, palette=sns.color_palette(colors))
    g.map(sns.histplot, 'value', kde=True)
    g.add_legend()
    g.set_titles("{col_name}")

    # Bug fix: file names used to embed a `label` variable leaked from an
    # earlier cell, and the third figure overwrote the second one.
    plt.savefig(results_dir + 'value_distr_' + str(plot_number) + '.png')



3.2.1

Clasa pentru extragerea de feature-uri
- are window size si sample size reglabile

In [None]:
class feature_extractor:
    """Sliding-window statistical feature extractor for a group of time series.

    Every input series is cut into windows of (at most) ``W`` samples that start
    every ``sample_step`` samples; windows that run past the end of the series
    are simply shorter. Each ``get_*`` method computes one statistic per window
    and returns ``(values, names)`` where ``names[k]`` is
    ``"<series label>_<window index>_<statistic>"``.

    Parameters
    ----------
    time_series : sequence of 1-D numpy arrays
        The series to process. The boolean-mask and ``**`` operations used by
        some features require numpy arrays.
    labels : sequence of str
        One name per series, used as prefix of the feature names.
    W : int
        Window length.
    sample_step : int
        Distance between the starts of two consecutive windows.

    Attributes
    ----------
    data : list
        All windows, series after series.
    indexed_labels : list of str
        ``"<label>_<window index>"`` for every entry of ``data``.
    count : int
        Number of windows of the *last* series (0 when there are no series).
        The 3-D features assume every series yields this many windows.
    """

    def __init__(self, time_series, labels, W=20, sample_step=15):
        self.time_series = time_series
        self.labels = labels
        self.W = W
        self.data = []
        self.indexed_labels = []

        # Defined before the loop so that an empty `time_series` yields
        # count == 0 instead of failing on an unbound local.
        count = 0
        for i in range(len(time_series)):
            count = 0
            for step in range(0, len(self.time_series[i]), sample_step):
                self.data.append(self.time_series[i][step : step + self.W])
                self.indexed_labels.append(self.labels[i] + '_' + str(count))
                count += 1
        self.count = count

    def _per_window(self, func, suffix):
        """Apply ``func`` to every window; return (values, feature names ending in ``suffix``)."""
        return [func(window) for window in self.data], [label + suffix for label in self.indexed_labels]

    def _per_triplet(self, func):
        """Apply ``func(x, y, z)`` to aligned windows of each group of three consecutive series."""
        if self.count == 0:
            # No windows at all; also avoids a zero step in range() below.
            return []
        return [func(self.data[start + i],
                     self.data[start + i + self.count],
                     self.data[start + i + self.count * 2])
                for start in range(0, len(self.data), 3 * self.count)
                for i in range(self.count)]

    def get_max(self):
        """Maximum of each window."""
        return self._per_window(np.max, '_max')

    def get_min(self):
        """Minimum of each window."""
        return self._per_window(np.min, '_min')

    def get_std(self):
        """Standard deviation of each window."""
        return self._per_window(np.std, '_std')

    def get_ptp(self):
        """Peak-to-peak range (max - min) of each window."""
        return self._per_window(np.ptp, '_ptp')

    def get_avg(self):
        """Unweighted average of each window (same value as ``get_mean``)."""
        return self._per_window(np.average, '_avg')

    def get_median(self):
        """Median of each window."""
        return self._per_window(np.median, '_median')

    def get_mean(self):
        """Arithmetic mean of each window."""
        return self._per_window(np.mean, '_mean')

    def get_pos(self):
        """Number of samples >= 0 in each window."""
        return self._per_window(lambda window: len(window[window >= 0]), '_pos')

    def get_neg(self):
        """Number of samples < 0 in each window."""
        return self._per_window(lambda window: len(window[window < 0]), '_neg')

    def get_iqr(self):
        """Interquartile range (75th - 25th percentile) of each window."""
        return self._per_window(lambda window: np.percentile(window, 75) - np.percentile(window, 25), '_iqr')

    def get_over_avg(self):
        """Number of samples strictly above the window average."""
        return self._per_window(lambda window: len(window[window > np.average(window)]), '_over_avg')

    def get_peaks(self):
        """Number of local maxima found by ``scipy.signal.find_peaks`` in each window."""
        return self._per_window(lambda window: len(find_peaks(window)[0]), '_peaks')

    def get_skew(self):
        """Skewness of each window."""
        return self._per_window(skew, '_skew')

    def get_kurtosis(self):
        """Kurtosis of each window (scipy default settings)."""
        return self._per_window(kurtosis, '_kurtosis')

    def get_energy(self):
        """Sum of squares of each window divided by ``W``.

        Note: shorter trailing windows are still divided by ``W``, not by
        their actual length.
        """
        return self._per_window(lambda window: np.sum(window ** 2) / self.W, '_energy')

    def get_acc(self):
        """Mean Euclidean magnitude over aligned windows of each series triplet.

        Returns ``(values, 'acc_')``; the caller appends an index to the prefix.
        Requires the number of series to be a multiple of three.
        """
        return self._per_triplet(lambda x, y, z: np.average(np.sqrt(x ** 2 + y ** 2 + z ** 2))), 'acc_'

    def get_aria(self):
        """Sum of absolute values over aligned windows of each series triplet.

        Returns ``(values, 'aria_')``; the caller appends an index to the prefix.
        Requires the number of series to be a multiple of three.
        """
        return self._per_triplet(lambda x, y, z: np.sum(np.abs(x) + np.abs(y) + np.abs(z))), 'aria_'

    def get_all(self, no_3D = False):
        """Compute every feature and return ``{feature name: [value]}``.

        The single-element lists make the result directly usable as a one-row
        ``pd.DataFrame.from_dict`` input. Set ``no_3D=True`` to skip the
        triplet features when the series do not come in groups of three.
        """
        per_window_getters = (self.get_max, self.get_min, self.get_std, self.get_ptp, self.get_avg,
                              self.get_median, self.get_mean, self.get_pos, self.get_neg, self.get_iqr,
                              self.get_over_avg, self.get_peaks, self.get_skew, self.get_kurtosis,
                              self.get_energy)

        results = []
        final_labels = []
        for getter in per_window_getters:
            values, names = getter()
            results.extend(values)
            final_labels.extend(names)

        if not no_3D:
            for getter in (self.get_acc, self.get_aria):
                values, prefix = getter()
                results.extend(values)
                final_labels.extend(prefix + str(i) for i in range(len(values)))

        return {name: [value] for name, value in zip(final_labels, results)}

Extragerea de atribute pt fiecare dataset
- creeaza cate un csv pt fiecare ca sa pot sa le citesc din el direct

In [None]:
# Feature extraction for every dataset/split. Each sample becomes one row of
# features (time domain + FFT magnitude); the resulting tables are also saved
# as CSV files. Rows are collected in a list and turned into a DataFrame once,
# instead of growing a DataFrame with pd.concat inside the loop.

SPORTS_DIMS = ['dim_0', 'dim_1', 'dim_2', 'dim_3', 'dim_4', 'dim_5']
SPORTS_SERIES_LABELS = SPORTS_DIMS + [dim + '_fft' for dim in SPORTS_DIMS]


def extract_sports_features(X_sports):
    """Return one feature row per RacketSports sample.

    For every sample the six sensor series and the magnitudes of their FFTs
    are passed to ``feature_extractor`` (default window settings), including
    the 3-D features computed over each group of three series.
    """
    # One object array per dimension, converted once rather than per sample.
    columns = [X_sports[dim].to_numpy() for dim in SPORTS_DIMS]

    records = []
    for row in range(len(X_sports)):
        time_series = [column[row].to_numpy() for column in columns]
        time_series_fft = np.abs(np.fft.fft(time_series))

        fe = feature_extractor([*time_series, *time_series_fft], SPORTS_SERIES_LABELS)
        # get_all() wraps every value in a one-element list; unwrap it.
        records.append({name: value[0] for name, value in fe.get_all().items()})
    return pd.DataFrame(records)


def extract_ecg_features(X_ecg):
    """Return one feature row per ECG sample (series + FFT magnitude).

    W and sample_step are both 200, so for series shorter than 200 samples
    each series forms a single window. 3-D features are skipped.
    """
    # Convert once; the previous code converted the whole frame on every iteration.
    signals = X_ecg.to_numpy()

    records = []
    for time_series in signals:
        time_series_fft = np.abs(np.fft.fft(time_series))

        fe = feature_extractor([time_series, time_series_fft], ['series', 'series_fft'], W=200, sample_step=200)
        records.append({name: value[0] for name, value in fe.get_all(no_3D=True).items()})
    return pd.DataFrame(records)


# RacketSports train
features_sports_train = extract_sports_features(X_train_sports)
features_sports_train.to_csv('features_sports_train.csv')

# RacketSports test
features_sports_test = extract_sports_features(X_test_sports)
features_sports_test.to_csv('features_sports_test.csv')

# MIT-BIH train
features_mitbih_train = extract_ecg_features(X_train_mitbih)
features_mitbih_train.to_csv('features_mitbih_train.csv')

# MIT-BIH test
features_mitbih_test = extract_ecg_features(X_test_mitbih)
features_mitbih_test.to_csv('features_mitbih_test.csv')

# PTB train
features_ptbdb_train = extract_ecg_features(X_train_ptbdb)
features_ptbdb_train.to_csv('features_ptbdb_train.csv')

# PTB test
features_ptbdb_test = extract_ecg_features(X_test_ptbdb)
features_ptbdb_test.to_csv('features_ptbdb_test.csv')


In [None]:
# Feature selection and label encoding for RacketSports.

# Drop features whose variance on the training set is not above 0.2; the
# selector is fitted on the training features only and applied to both splits.
var = VarianceThreshold(threshold=0.2)
var.fit(features_sports_train)
X_train_sports_selected = var.transform(features_sports_train)
X_test_sports_selected = var.transform(features_sports_test)

# Fit the encoder on the training labels only and reuse it for the test
# labels, so both splits are guaranteed to share one label -> integer mapping
# (a second encoder fitted on the test labels could map classes differently).
le = LabelEncoder().fit(y_train_sports)
y_sports_train_enc = le.transform(y_train_sports)
y_sports_test_enc = le.transform(y_test_sports)


In [None]:
# Per-class scorers, used to report one score per class from the grid searches.
def scorer_f1_by_class(y_true, y_predicted, class_):
  """Return the F1 score of the single class ``class_`` (0 when it is undefined)."""
  # labels=[class_] pins the result to the requested class. Indexing the
  # unrestricted per-class array by position is wrong (or raises) whenever a
  # class is absent from both y_true and y_predicted, e.g. in a small CV fold.
  result = f1_score(y_true, y_predicted, labels=[class_], average=None, zero_division=0)
  return result[0]

def scorer_precision_by_class(y_true, y_predicted, class_):
  """Return the precision of the single class ``class_`` (0 when it is undefined)."""
  # labels=[class_] pins the result to the requested class. Indexing the
  # unrestricted per-class array by position is wrong (or raises) whenever a
  # class is absent from both y_true and y_predicted, e.g. in a small CV fold.
  result = precision_score(y_true, y_predicted, labels=[class_], average=None, zero_division=0)
  return result[0]

def scorer_recall_by_class(y_true, y_predicted, class_):
  """Return the recall of the single class ``class_`` (0 when it is undefined)."""
  # labels=[class_] pins the result to the requested class. Indexing the
  # unrestricted per-class array by position is wrong (or raises) whenever a
  # class is absent from both y_true and y_predicted, e.g. in a small CV fold.
  result = recall_score(y_true, y_predicted, labels=[class_], average=None, zero_division=0)
  return result[0]

In [None]:
# Grid search over RandomForest hyperparameters on the selected RacketSports
# features, reporting macro F1, accuracy and per-class F1/precision/recall.
rf = RandomForestClassifier()

scoring = {'f1_score': make_scorer(f1_score, average='macro'),
           'accuracy': make_scorer(accuracy_score)}

# Add the per-class scorers for the four encoded classes, metric by metric.
per_class_metrics = (('f1', scorer_f1_by_class),
                     ('precision', scorer_precision_by_class),
                     ('recall', scorer_recall_by_class))
for metric_name, metric_function in per_class_metrics:
    for class_id in range(4):
        scoring[metric_name + '_class_' + str(class_id)] = make_scorer(metric_function, class_=class_id)

params = dict(
    max_features=[0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35],
    n_estimators=[200, 235, 250, 270, 300],
    max_depth=[30, 35, 40, 45],
    criterion=['gini'],
    n_jobs=[-1],
    oob_score=[True],
    warm_start=[True, False],
)

# refit='f1_score': the best configuration is chosen (and refitted) by macro F1.
grid = GridSearchCV(
    estimator=rf,
    param_grid=params,
    cv=5,
    scoring=scoring,
    refit='f1_score',
    verbose=3,
    return_train_score=True,
)
grid.fit(X_train_sports_selected, y_sports_train_enc)

for attribute in ('best_score_', 'best_params_', 'best_estimator_', 'best_index_', 'cv_results_'):
    print(getattr(grid, attribute))

In [None]:
# Final RandomForest model. The hyperparameters below are the configuration
# recorded in this cell from an earlier grid-search run.
rf_best_params = {
    'criterion': 'gini',
    'max_depth': 35,
    'max_features': 0.1,
    'n_estimators': 200,
    'n_jobs': -1,
    'oob_score': True,
    'warm_start': True,
}

rf = RandomForestClassifier(**rf_best_params)
rf.fit(X_train_sports_selected, y_sports_train_enc)

# Hard predictions, class probabilities and the confusion matrix on the test split.
y_pred_sports_rf = rf.predict(X_test_sports_selected)
y_pred_proba_sports_rf = rf.predict_proba(X_test_sports_selected)
cm_rf = confusion_matrix(y_sports_test_enc, y_pred_sports_rf)


In [None]:
# Grid search over XGBoost hyperparameters (number of trees, maximum tree
# depth, learning rate, booster type, gamma) on the selected RacketSports
# features, reporting macro F1, accuracy and per-class F1/precision/recall.
boosted = XGBClassifier()

scoring = {'f1_score': make_scorer(f1_score, average='macro'),
           'accuracy': make_scorer(accuracy_score)}

# Add the per-class scorers for the four encoded classes, metric by metric.
per_class_metrics = (('f1', scorer_f1_by_class),
                     ('precision', scorer_precision_by_class),
                     ('recall', scorer_recall_by_class))
for metric_name, metric_function in per_class_metrics:
    for class_id in range(4):
        scoring[metric_name + '_class_' + str(class_id)] = make_scorer(metric_function, class_=class_id)

# `max_features` was removed from the grid: it is a RandomForest parameter
# that XGBoost does not use, so it only tripled the number of fits without
# changing any model. The XGBoost analogue would be `colsample_bynode`.
# NOTE(review): the 'gblinear' booster is a linear model, so max_depth and
# gamma presumably have no effect for it - confirm and consider a separate grid.
params = {'n_estimators': [50, 100, 200],
          'max_depth': [None, 30, 35, 40],
          'n_jobs': [-1],
          'verbosity': [0],
          'booster': ['gbtree', 'gblinear', 'dart'],
          'gamma': [0.3, 0.4, 0.5],
          'learning_rate': [0.05, 0.1, 0.15]}


# refit='f1_score': the best configuration is chosen (and refitted) by macro F1.
grid = GridSearchCV(estimator=boosted, param_grid=params, cv=5, scoring=scoring, refit='f1_score', verbose=3, return_train_score=True)
grid.fit(X_train_sports_selected, y_sports_train_enc)

print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_index_)
print(grid.cv_results_)

In [None]:
# Final XGBoost model, using the configuration recorded from an earlier grid-search run:
# {'booster': 'gblinear', 'gamma': 0.3, 'learning_rate': 0.05, 'max_depth': 30,
# 'max_features': 0.15, 'n_estimators': 50, 'n_jobs': -1, 'verbosity': 0}

# NOTE(review): this model is trained on the full feature table, while the
# grid search and the other two models use the variance-selected features -
# confirm whether X_train_sports_selected / X_test_sports_selected was intended.
boosted = XGBClassifier(booster='gblinear', gamma=0.3, learning_rate=0.05, max_depth=30,
                        max_features=0.15, n_estimators=50, n_jobs=-1, verbosity=0)
boosted.fit(features_sports_train, y_sports_train_enc, eval_set=[(features_sports_test, y_sports_test_enc)], verbose=0)
y_pred_sports_boosted = boosted.predict(features_sports_test)
# Bug fix: the probabilities used to come from the RandomForest (`rf`), so the
# "boosted" ROC data did not describe this model at all.
y_pred_proba_sports_boosted = boosted.predict_proba(features_sports_test)
cm_boosted = confusion_matrix(y_sports_test_enc, y_pred_sports_boosted)


In [None]:
# Grid search over SVC hyperparameters (kernel type, regularisation parameter C,
# and others) on the selected RacketSports features, reporting macro F1,
# accuracy and per-class F1/precision/recall.
svc = SVC()

scoring = {'f1_score': make_scorer(f1_score, average='macro'),
           'accuracy': make_scorer(accuracy_score)}

# Add the per-class scorers for the four encoded classes, metric by metric.
per_class_metrics = (('f1', scorer_f1_by_class),
                     ('precision', scorer_precision_by_class),
                     ('recall', scorer_recall_by_class))
for metric_name, metric_function in per_class_metrics:
    for class_id in range(4):
        scoring[metric_name + '_class_' + str(class_id)] = make_scorer(metric_function, class_=class_id)

params = dict(
    C=[0.1, 0.2, 0.3, 0.5, 1],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    degree=[2, 3, 4],
    gamma=['scale', 'auto'],
    shrinking=[True, False],
    decision_function_shape=['ovo', 'ovr'],
)

# refit='f1_score': the best configuration is chosen (and refitted) by macro F1.
grid = GridSearchCV(
    estimator=svc,
    param_grid=params,
    cv=5,
    scoring=scoring,
    refit='f1_score',
    verbose=3,
    return_train_score=True,
)
grid.fit(X_train_sports_selected, y_sports_train_enc)

for attribute in ('best_score_', 'best_params_', 'best_estimator_', 'best_index_', 'cv_results_'):
    print(getattr(grid, attribute))

In [None]:
# Final SVC model, using the configuration recorded from an earlier grid-search run:
# {'C': 0.1, 'decision_function_shape': 'ovo', 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}

# probability=True is required for predict_proba (used by the ROC plots); the
# probability calibration is randomised, hence the fixed random_state.
svc = SVC(C=0.1, kernel='linear', degree=2, gamma='scale', shrinking=True,
          decision_function_shape='ovo', probability=True, random_state=80085)

svc.fit(X_train_sports_selected, y_sports_train_enc)
# Bug fix: predictions and probabilities used to be taken from the
# RandomForest (`rf`), so cm_svc was just a copy of the RandomForest results.
y_pred_sports_svc = svc.predict(X_test_sports_selected)
y_pred_proba_sports_svc = svc.predict_proba(X_test_sports_selected)
cm_svc = confusion_matrix(y_sports_test_enc, y_pred_sports_svc)


In [None]:
# Confusion matrix and one-vs-rest ROC curves for the three RacketSports models.
results_dir = '3.2/'

os.makedirs(results_dir, exist_ok=True)

# Parallel lists: entry i of each list belongs to the same model.
cms = [cm_rf, cm_boosted, cm_svc]
probas = [y_pred_proba_sports_rf, y_pred_proba_sports_boosted, y_pred_proba_sports_svc]
name = ['RandomForest', 'Gradient Boosted Tree', 'SVC']

# Encoded class k is le.classes_[k]. LabelEncoder sorts the class names, so
# this order differs from `labels_sports`, which must not be indexed by k.
class_names = list(le.classes_)

for i in range(len(cms)):
    disp = ConfusionMatrixDisplay(confusion_matrix=cms[i], display_labels=class_names)
    disp.plot(include_values=True, cmap='cool', ax=None, xticks_rotation="vertical")
    plt.title(name[i] + ' Confusion Matrix')
    # Disable the grid before saving so the saved image matches the displayed
    # one, and save into results_dir like every other figure.
    plt.grid(False)
    plt.savefig(results_dir + name[i] + '_cm.png')
    plt.show()

for i in range(len(cms)):
    for class_pos, class_name in enumerate(class_names):
        # One-vs-rest target: 1 for the current class, 0 for all the others.
        y_test_all_vs_one = np.array([1 if y == class_pos else 0 for y in y_sports_test_enc])
        # Bug fix: use the probabilities of model i (the SVC probabilities
        # used to be plotted for every model).
        class_scores = probas[i][:, class_pos]

        roc_auc = roc_auc_score(y_test_all_vs_one, class_scores)
        fpr, tpr, thresholds = roc_curve(y_test_all_vs_one, class_scores)
        roc_display = RocCurveDisplay(fpr=fpr, tpr=tpr).plot()
        plt.text(0.95, 0.01, 'AUC = ' + str(roc_auc), verticalalignment='bottom', horizontalalignment='right')
        plt.title(name[i] + ' ROC-AUC curve ' + class_name)
        plt.savefig(results_dir + name[i] + '_ROC_AUC_' + class_name)
        plt.show()


In [None]:
# Remove the directories extracted from datasets.zip; missing ones are ignored.
for extracted_dir in ('ECG', 'RacketSports'):
    shutil.rmtree(extracted_dir, ignore_errors=True)