In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pandas as pd
import numpy as np
from tsfresh import extract_features
import plotly.graph_objs as go
import plotly.express as px
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFECV
import random
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,  f1_score

In [None]:
# One seed for NumPy's and the stdlib's global RNGs; the same value is reused
# below as `random_state` for the train/test splits and the tree-based models.
seed = 42
np.random.seed(seed)
random.seed(seed)

In [None]:
# Load the measurement table (presumably one row per patient examination --
# TODO confirm) and drop the index column that was saved into the CSV.
# Forward slashes keep the path portable and avoid "\c" / "\d" being parsed as
# (invalid) escape sequences inside a normal string literal.
data_to_extract = pd.read_csv("../chapter4/data_to_extraction.csv").drop(columns="Unnamed: 0")
data_to_extract.columns

In [None]:
# Take the last BADANIE_NR value of every patient and show the largest of them.
data_to_extract.groupby("PACJENT_NR").last()["BADANIE_NR"].max()

In [None]:
# Target vector: the last ZGON value of each patient, in groupby order, with the
# patient-id index replaced by 0..n-1 (so it is used positionally further down).
y=data_to_extract.groupby("PACJENT_NR").last()["ZGON"].reset_index(drop=True)

In [None]:
# Signals passed to tsfresh. The list is assembled from its groups (raw
# measurements, their "_kw" and "_pn" variants, two Euclidean-distance columns);
# the resulting order is exactly the original hand-written one.
_raw_signals = ['BETET', 'CO2TET', 'HCO3ACTE', 'HCO3STTE', 'O2SATTET', 'O2TET', 'IONH']
_derived_bases = ['BETET', 'O2TET', 'HCO3ACTE', 'HCO3STTE', 'O2SATTET', 'IONH', 'CO2TET']

var_columns = [
    *_raw_signals,
    *(f"{base}_kw" for base in _derived_bases),
    *(f"{base}_pn" for base in _derived_bases),
    'euclidean_kw',
    'euclidean_kw_skum',
]


In [None]:
# Number of input signals.
len(var_columns)

In [None]:
# tsfresh feature calculators (and their parameters) applied to every signal.
selected_features = {
    'minimum': None,
    'maximum': None,
    'mean': None,
    'median': None,
    'variance': None,
    'number_peaks': [{'n': 2}, {'n': 4}],
    'first_location_of_minimum': None,
    'first_location_of_maximum': None,
    'linear_trend': [{'attr': 'slope'}],
    'agg_linear_trend': [
        {'f_agg': 'mean', 'chunk_len': 3, 'attr': 'slope'},
        {'f_agg': 'mean', 'chunk_len': 6, 'attr': 'slope'},
        {'f_agg': 'max', 'chunk_len': 3, 'attr': 'slope'},
        {'f_agg': 'max', 'chunk_len': 6, 'attr': 'slope'}
    ]
}

# Extract the features one signal at a time (series grouped by PACJENT_NR and
# ordered by BADANIE_NR), then join the per-signal frames side by side in a
# single concat instead of re-copying an ever-growing frame on each iteration.
per_signal_features = [
    extract_features(data_to_extract, column_id="PACJENT_NR", column_sort="BADANIE_NR",
                     column_value=col, default_fc_parameters=selected_features)
    for col in var_columns
]
extracted_features = pd.concat(per_signal_features, axis=1)

extracted_features

In [None]:
# Labels of all extracted feature columns; indexed later with the RFECV support mask.
names=extracted_features.columns

In [None]:
# Number of extracted feature columns.
len(names)

In [None]:
# Last recorded ZGON value per patient, indexed by PACJENT_NR.
last_outcome = data_to_extract[['PACJENT_NR', 'ZGON']].groupby("PACJENT_NR").last()["ZGON"]

# Outcome + extracted features in one table, with the patient id as a regular column.
data_with_features = (
    pd.concat([last_outcome, extracted_features], axis=1)
    .rename_axis("PACJENT_NR")
    .reset_index()
)
data_with_features

In [None]:
# Labels and design matrix (everything except the label and the patient id).
y_data = data_with_features["ZGON"]
X_data = data_with_features.drop(columns=["ZGON", "PACJENT_NR"])

# Stratified 80/20 hold-out split, reproducible through `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.2, stratify=y_data, random_state=seed
)

In [None]:
# (rows, feature columns) of the design matrix.
X_data.shape

In [None]:
# Standardise the features: the scaler is fitted on the training part only and
# the same transformation is then applied to the test part.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def plot_confusion_matrix(model, X_train, X_test, y_train, y_test, ax, title):
    """Fit `model`, predict the test set and draw the confusion matrix on `ax`.

    The model is fitted in place on (X_train, y_train). The axis title shows
    `title` followed by accuracy, precision, recall and F1 computed on
    (X_test, y_test). Nothing is returned; the function only draws on `ax`.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Summary metrics shown in the panel title.
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt='d',
        ax=ax,
        cmap="Reds",
        annot_kws={"fontsize": 14},
    )
    ax.set_title(f'{title}\ndokładność: {accuracy:.2f}, precyzja: {precision:.2f},\nczułość: {recall:.2f}, F1-score: {f1:.2f}', fontsize=16)
    ax.set_xlabel('Przewidywane', fontsize=14)
    ax.set_ylabel('Prawdziwe', fontsize=14)

# Computes the ROC curve and AUC of an already fitted classifier (no training or timing happens here)
def model_evaluate(classifier, X_train, X_test, y_train, y_test):
    """Return (fpr, tpr, auc) of a fitted classifier on the test set.

    The ROC curve is built from the second column of
    `classifier.predict_proba(X_test)`. `X_train` and `y_train` are accepted
    but not used by this function.
    """
    positive_scores = classifier.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, positive_scores)
    return false_pos_rate, true_pos_rate, auc(false_pos_rate, true_pos_rate)


### Klasyfikacja na wszystkich 324 zmiennych

In [None]:
# Candidate classifiers, created with library defaults; tuned values come from the grids.
classifiers = {
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
}

# Hyper-parameter grids, keyed like `classifiers`.
param_grids = {
    'KNN': {'n_neighbors': [5, 10, 15, 20], 'weights': ['uniform', 'distance']},
    'LDA': {'solver': ['svd', 'lsqr', 'eigen']},
    'SVC': {'C': [0.1, 0.5, 1, 1.5], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'probability': [True]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50, None], 'random_state': [seed]},
    'RF': {
        'n_estimators': [5, 10, 50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20, 30, 40, 50],
        'random_state': [seed],
    },
}

# name -> (estimator class, best params) / best CV accuracy / hold-out accuracy
best_models_all, best_scores_all, test_scores_all = {}, {}, {}

# Stage 1: 4-fold cross-validated grid search on the scaled training set.
for name, classifier in classifiers.items():
    print(f"Przetwarzanie {name}...")
    grid_search = GridSearchCV(classifier, param_grids[name], scoring='accuracy', cv=4, n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)
    best_scores_all[name] = grid_search.best_score_
    best_models_all[name] = (type(grid_search.best_estimator_), grid_search.best_params_)
    print(f"Najlepsze parametry dla {name}: {grid_search.best_params_}")
    print(f"Najlepszy wynik dla {name}: {grid_search.best_score_}\n")

# Stage 2: rebuild every winner from class + parameters, refit it and score the hold-out set.
for name, (model_type, best_params) in best_models_all.items():
    model = model_type(**best_params)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    test_score = accuracy_score(y_test, y_pred)
    test_scores_all[name] = test_score
    print(f"Dokładność {name} na zbiorze testowym: {test_score}")

In [None]:
# Best cross-validated accuracy per classifier (all features).
best_scores_all

In [None]:
# Hold-out accuracy per classifier (all features).
test_scores_all

In [None]:
# Confusion matrices of the tuned models (all features), one panel per model.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
panels = axes.flatten()

for ax, (title, (model_type, best_params)) in zip(panels, best_models_all.items()):
    model = model_type(**best_params)
    plot_confusion_matrix(model, X_train_scaled, X_test_scaled, y_train, y_test, ax, title)

# Remove every panel that received no model. The previous odd/even test only
# happened to work for five models on a six-panel grid.
for unused_ax in panels[len(best_models_all):]:
    fig.delaxes(unused_ax)

plt.tight_layout()

plt.savefig("images5/conf_matr_all.png")

plt.show()

In [None]:
# ROC curves on the hold-out set for the tuned models (all features).
plt.figure(figsize=(8, 6))
for name, (model_type, best_params) in best_models_all.items():
    model = model_type(**best_params)
    print(model)
    model.fit(X_train_scaled, y_train)
    # model_evaluate works from predict_proba, so the former (unused) predict() call was dropped.
    fpr, tpr, roc_auc = model_evaluate(model, X_train_scaled, X_test_scaled, y_train, y_test)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')  # thicker lines

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Axis limits
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

# Larger axis labels
plt.xlabel('Odsetek fałszywie pozytywnych', fontsize=14)
plt.ylabel('Odsetek prawdziwie pozytywnych', fontsize=14)

# Larger legend
plt.legend(loc='lower right', fontsize=12)

plt.savefig("images5/roc_all.png")
# Show the figure
plt.show()

### Klasyfikacja na podstawie najważniejszych składowych

In [None]:
# PCA keeping as many components as needed to explain 95% of the variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_train_scaled)

# Per-component and cumulative share of explained variance
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_explained_variance_ratio = explained_variance_ratio.cumsum()

# Scree plot: explained-variance share against the component number (1-based)
component_numbers = list(range(1, len(explained_variance_ratio) + 1))
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=component_numbers,
        y=explained_variance_ratio,
        mode='lines+markers',
        name='Explained Variance Ratio',
    )
)
# fig.add_trace(go.Scatter(x=list(range(1, len(cumulative_explained_variance_ratio) + 1)),
#                          y=cumulative_explained_variance_ratio,
#                          mode='lines+markers', name='Cumulative Explained Variance Ratio'))

# Axis titles, fonts and theme in one layout update
axis_fonts = dict(tickfont=dict(size=16), title_font=dict(size=20))
fig.update_layout(
    xaxis=dict(title='Liczba składowych', **axis_fonts),
    yaxis=dict(title='Odsetek wariancji', **axis_fonts),
    template="plotly_white",
)

# Show the figure, then export it
fig.show()
fig.write_image("images5/osypisko.png", width=1000, height=600, scale=3, format="png")

In [None]:
# PCA with three components, fitted on the scaled training data
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

explained_variance_ratio = pca.explained_variance_ratio_
print(sum(explained_variance_ratio))

X_pca_df=pd.DataFrame(X_train_pca, columns=['Czynnik 1', 'Czynnik 2', 'Czynnik 3'])
# The rows of X_train_pca follow X_train, so the *training* labels are attached
# by position. Assigning the full vector `y` (indexed 0..n-1 over all patients)
# would align on the index and give row i the label of patient i of the whole
# data set instead of the label of the i-th training sample.
X_pca_df['ZGON'] = y_train.to_numpy()
X_pca_df['Czy pacjent zmarł?'] = X_pca_df['ZGON'].map({1: 'tak', 0: 'nie'})

fig = px.scatter_3d(X_pca_df, x='Czynnik 1', y='Czynnik 2', z='Czynnik 3', color='Czy pacjent zmarł?', size_max=18, opacity=0.8)
fig.update_traces(textposition='top center', marker_size=6)

fig.update_layout(
    scene=dict(
        xaxis=dict(tickfont=dict(size=10)),
        yaxis=dict(tickfont=dict(size=10)),
        zaxis=dict(tickfont=dict(size=10)),
    )
)

fig.update_layout(legend=dict(font=dict(size=14)), legend_title=dict(font=dict(size=14)))
fig.update_layout(template="plotly_white")
fig.show()

# fig.write_image("images5/" + "pca_miernik_3d" + ".png", width=1000, height=600, scale=3, format="png")


In [None]:
# PCA representation used by the classifiers below: fitted on the scaled
# training data, then applied to the test data. Prints the total share of
# variance explained by the kept components.
# NOTE(review): the trailing "#22" is presumably an alternative component count
# that was tried earlier -- confirm or remove.
pca = PCA(n_components=7) #22
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
explained_variance_ratio = pca.explained_variance_ratio_
print(sum(explained_variance_ratio))

In [None]:
# Candidate classifiers, created with library defaults; tuned values come from the grids.
classifiers = {
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
}

# Hyper-parameter grids, keyed like `classifiers` (the SVC grid also scans gamma here).
param_grids = {
    'KNN': {'n_neighbors': [5, 10, 15, 20], 'weights': ['uniform', 'distance']},
    'LDA': {'solver': ['svd', 'lsqr', 'eigen']},
    'SVC': {
        'C': [0.1, 0.5, 1, 1.5],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': [0.001, 0.01, 0.1, 1],
        'probability': [True],
    },
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50, None], 'random_state': [seed]},
    'RF': {
        'n_estimators': [5, 10, 50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20, 30, 40, 50],
        'random_state': [seed],
    },
}

# name -> (estimator class, best params) / best CV accuracy / hold-out accuracy
best_models_pca, best_scores_pca, test_scores_pca = {}, {}, {}

# Stage 1: 4-fold cross-validated grid search on the PCA-transformed training set.
for name, classifier in classifiers.items():
    print(f"Przetwarzanie {name}...")
    grid_search = GridSearchCV(classifier, param_grids[name], scoring='accuracy', cv=4, n_jobs=-1)
    grid_search.fit(X_train_pca, y_train)
    best_scores_pca[name] = grid_search.best_score_
    best_models_pca[name] = (type(grid_search.best_estimator_), grid_search.best_params_)
    print(f"Najlepsze parametry dla {name}: {grid_search.best_params_}")
    print(f"Najlepszy wynik dla {name}: {grid_search.best_score_}\n")

# Stage 2: rebuild every winner from class + parameters, refit it and score the hold-out set.
for name, (model_type, best_params) in best_models_pca.items():
    model = model_type(**best_params)
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)
    test_score = accuracy_score(y_test, y_pred)
    test_scores_pca[name] = test_score
    print(f"Dokładność {name} na zbiorze testowym: {test_score}")

In [None]:
# Best cross-validated accuracy per classifier (PCA components).
best_scores_pca

In [None]:
# Hold-out accuracy per classifier (PCA components).
test_scores_pca

In [None]:
# ROC curves on the hold-out set for the models tuned on the PCA components.
plt.figure(figsize=(8, 6))
for name, (model_type, best_params) in best_models_pca.items():
    model = model_type(**best_params)
    model.fit(X_train_pca, y_train)
    # model_evaluate works from predict_proba, so the former (unused) predict() call was dropped.
    fpr, tpr, roc_auc = model_evaluate(model, X_train_pca, X_test_pca, y_train, y_test)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')  # thicker lines

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Axis limits
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

# Larger axis labels
plt.xlabel('Odsetek fałszywie pozytywnych', fontsize=14)
plt.ylabel('Odsetek prawdziwie pozytywnych', fontsize=14)

# Larger legend
plt.legend(loc='lower right', fontsize=12)

plt.savefig("images5/roc_pca.png")
# Show the figure
plt.show()



In [None]:
# Confusion matrices of the models tuned on the PCA components, one panel per model.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
panels = axes.flatten()

# Use the hyper-parameters found on the PCA data (best_models_pca). The cell
# previously iterated best_models_all, i.e. parameters tuned on the full
# feature set, while fitting on the PCA-transformed data.
for ax, (title, (model_type, best_params)) in zip(panels, best_models_pca.items()):
    model = model_type(**best_params)
    plot_confusion_matrix(model, X_train_pca, X_test_pca, y_train, y_test, ax, title)

# Remove every panel that received no model.
for unused_ax in panels[len(best_models_pca):]:
    fig.delaxes(unused_ax)

plt.tight_layout()

plt.savefig("images5/conf_matr_pca.png")

plt.show()

### Wyszukiwanie liczby cech (RFE)

In [None]:
# Cost values of the linear SVC that drives the recursive feature elimination.
C_values = [0.05, 0.1, 0.5, 1, 1.2]

# Classifiers and grids do not depend on C, so they are built once instead of
# on every iteration (GridSearchCV clones the estimator it is given).
classifiers = {
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier()
}

param_grids = {
    'KNN': {'n_neighbors': [5,10,15,20], 'weights': ['uniform', 'distance']},
    'LDA': {'solver': ['svd', 'lsqr', 'eigen']},
    'SVC': {'C': [0.1, 0.5, 1, 1.5], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'probability': [True]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50], 'random_state': [seed]},
    'RF': {'n_estimators': [5, 10, 50, 100, 200], 'criterion': ['gini', 'entropy'],
           'max_depth': [5, 10, 20, 30, 40, 50], 'random_state': [seed]}
}

# One record per (C, classifier): best CV accuracy and number of features kept.
results = []

for C in C_values:
    # `seed` replaces the literal 42 so every random_state in the notebook shares one source.
    svc_linear = SVC(kernel='linear', C=C, random_state=seed)

    # RFECV with the linear SVC as ranking estimator, removing one feature per step
    rfecv = RFECV(estimator=svc_linear, step=1, cv=4, scoring='accuracy')
    rfecv.fit(X_train_scaled, y_train)

    n_features = rfecv.n_features_

    # Keep only the selected features
    X_train_rfecv = rfecv.transform(X_train_scaled)
    X_test_rfecv = rfecv.transform(X_test_scaled)

    # Grid search of every classifier on the reduced training set
    for name, classifier in classifiers.items():
        grid_search = GridSearchCV(classifier, param_grids[name], cv=4, n_jobs=-1, scoring='accuracy')
        grid_search.fit(X_train_rfecv, y_train)
        results.append({'Model': name, 'C': C, 'Best Score': grid_search.best_score_, 'n_features': n_features})

# Collect the records in a data frame
results_df = pd.DataFrame(results)


In [None]:
# Bar chart: best CV accuracy per classifier for every RFECV cost value.
palette = sns.color_palette(["#2ecc71", "#3498db", "#e74c3c", "#9b59b6", "#f1c40f"])

# Decimal comma for the Polish tick labels. regex=False makes "." a literal dot:
# with pandas < 2.0 the pattern was a regular expression by default, where "."
# matches every character and would turn the whole value into commas.
results_df['C'] = results_df['C'].astype(str).str.replace('.', ',', regex=False)
# Tick label = cost value plus the number of features RFECV kept for it
# (this column was previously computed twice; the first result was overwritten).
results_df['C_n_features'] = results_df['C'] + ' (liczba cech=' + results_df['n_features'].astype(str) + ')'

plt.figure(figsize=(12, 8))

sns.barplot(data=results_df, x='C_n_features', y='Best Score', hue='Model', palette=palette)
plt.xlabel('\nKoszt C (wraz z liczbą wybranych cech)', fontsize=14)
plt.ylabel('Dokładność', fontsize=14)
plt.ylim([0,1])
plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=14,  title_fontsize=16)
plt.tick_params(axis='both', which='major', labelsize=12)

plt.savefig("images5/comparision_cost.png", bbox_inches='tight')
plt.show()

### Klasyfikacja na podstawie najważniejszych zmiennych

In [None]:
# Ranking estimator for the feature elimination: linear-kernel SVC with C=0.05
svc_linear = SVC(kernel='linear', C=0.05)

# Recursive feature elimination with 4-fold CV, one feature removed per step
rfecv = RFECV(estimator=svc_linear, step=1, cv=4, scoring='accuracy').fit(X_train_scaled, y_train)

# Reduce both data parts to the selected features
X_train_rfecv, X_test_rfecv = (
    rfecv.transform(part) for part in (X_train_scaled, X_test_scaled)
)

In [None]:
# Number of features selected by RFECV.
rfecv.n_features_

In [None]:
import plotly.graph_objects as go

# Mean CV accuracy for every candidate feature count, counting from `min_features_to_select`
min_features_to_select = 1
mean_cv_scores = rfecv.cv_results_["mean_test_score"]
n_scores = len(mean_cv_scores)
feature_counts = list(range(min_features_to_select, n_scores + min_features_to_select))

fig = go.Figure()
fig.add_trace(go.Scatter(x=feature_counts, y=mean_cv_scores, mode='lines'))

# Axis titles, fixed viewing window and theme
fig.update_layout(
    xaxis_title="Liczba zmiennych",
    yaxis_title="Dokładność",
    yaxis=dict(range=[0.5, 0.9]),
    xaxis=dict(range=[0, 100]),
)
fig.update_layout(template="plotly_white")

fig.write_image("images5/zmienne_rfecv.png", width=1000, height=600, scale=4, format="png")

fig.show()

In [None]:
# Candidate classifiers, created with library defaults; tuned values come from the grids.
classifiers = {
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier(),
}

# Hyper-parameter grids, keyed like `classifiers`.
param_grids = {
    'KNN': {'n_neighbors': [5, 10, 15, 20], 'weights': ['uniform', 'distance']},
    'LDA': {'solver': ['svd', 'lsqr', 'eigen']},
    'SVC': {'C': [0.1, 0.5, 1, 1.5], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'probability': [True]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50], 'random_state': [seed]},
    'RF': {
        'n_estimators': [5, 10, 50, 100, 200],
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 10, 20, 30, 40, 50],
        'random_state': [seed],
    },
}

# name -> (estimator class, best params) / best CV accuracy / hold-out accuracy / fitted best estimator
best_models_rfe, best_scores_rfe, test_scores_rfe, best_models_rfe_class = {}, {}, {}, {}

# Stage 1: 4-fold cross-validated grid search on the RFECV-selected features.
for name, classifier in classifiers.items():
    print(f"Przetwarzanie {name}...")
    grid_search = GridSearchCV(classifier, param_grids[name], scoring='accuracy', cv=4, n_jobs=-1)
    grid_search.fit(X_train_rfecv, y_train)
    best_models_rfe_class[name] = grid_search.best_estimator_
    best_scores_rfe[name] = grid_search.best_score_
    best_models_rfe[name] = (type(grid_search.best_estimator_), grid_search.best_params_)
    print(f"Najlepsze parametry dla {name}: {grid_search.best_params_}")
    print(f"Najlepszy wynik dla {name}: {grid_search.best_score_}\n")

# Stage 2: rebuild every winner from class + parameters, refit it and score the hold-out set.
for name, (model_type, best_params) in best_models_rfe.items():
    model = model_type(**best_params)
    model.fit(X_train_rfecv, y_train)
    y_pred = model.predict(X_test_rfecv)
    test_score = accuracy_score(y_test, y_pred)
    test_scores_rfe[name] = test_score
    print(f"Dokładność {name} na zbiorze testowym: {test_score}")

In [None]:
# Best cross-validated accuracy per classifier (RFECV-selected features).
best_scores_rfe

In [None]:
# Hold-out accuracy per classifier (RFECV-selected features).
test_scores_rfe

In [None]:
# Column label -> per-model score dict, in the order the columns should appear.
score_sources = {
    'Best Score all': best_scores_all,
    'Test Score all': test_scores_all,
    'Best Score pca': best_scores_pca,
    'Test Score pca': test_scores_pca,
    'Best Score rfe': best_scores_rfe,
    'Test Score rfe': test_scores_rfe,
}

# Wide table: one row per model, one column per score type
df = pd.DataFrame({
    'Model': list(best_scores_all),
    **{label: list(scores.values()) for label, scores in score_sources.items()},
})

# Long format for seaborn: one row per (model, score type)
df_melted = df.melt(id_vars='Model', var_name='Score Type', value_name='Score')

# Grouped bar chart
plt.figure(figsize=(14, 8))
sns.barplot(data=df_melted, x='Score Type', y='Score', hue='Model')

# Titles, value range and legend placement
plt.title('Porównanie wyników różnych modeli')
plt.xlabel('Score Type')
plt.ylabel('Score')
plt.ylim(0, 1)
plt.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
# (estimator class, best parameters) per classifier for the RFECV-selected features.
best_models_rfe

In [None]:
# Confusion matrices of the models tuned on the RFECV-selected features.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
panels = axes.flatten()

for ax, (title, (model_type, best_params)) in zip(panels, best_models_rfe.items()):
    model = model_type(**best_params)
    plot_confusion_matrix(model, X_train_rfecv, X_test_rfecv, y_train, y_test, ax, title)

# Remove every panel that received no model. The previous odd/even test only
# happened to work for five models on a six-panel grid.
for unused_ax in panels[len(best_models_rfe):]:
    fig.delaxes(unused_ax)

plt.tight_layout()

plt.savefig("images5/conf_matr_rfe.png")

plt.show()

In [None]:
import shap

In [None]:
# Refit the tuned SVC on the RFECV-selected features.
(model_type, best_params)=best_models_rfe["SVC"]
model = model_type(**best_params)
model.fit(X_train_rfecv,  y_train)

# Feature coefficients. NOTE: scikit-learn only exposes `coef_` for a linear
# kernel; if the grid search picked another kernel this line raises AttributeError.
coefs = model.coef_[0]

# One generic label per selected feature, in the column order of X_train_rfecv
# (yields "Cecha 1" .. "Cecha 6" when six features are selected).
feature_names = [f"Cecha {i}" for i in range(1, len(coefs) + 1)]

# Sort the coefficients AND their labels with the same permutation. Previously
# only the values were sorted, so every bar carried the label of another feature.
sorted_idx = np.argsort(coefs)
sorted_coefs = coefs[sorted_idx]
sorted_features = [feature_names[i] for i in sorted_idx]

plt.figure(figsize=(10, 6))
plt.barh(sorted_features, sorted_coefs, color='#0097ca')
plt.xlabel("Współczynniki cech w modelu", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# plt.title("Istotność cech w modelu SVC z liniową funkcją jądra")
plt.savefig("images5/waznosc_cech.png")
plt.show()

In [None]:
# Generic labels used for the selected features.
feature_names

In [None]:
# Raw coefficient array of the refitted SVC.
model.coef_

In [None]:
# Hold-out predictions next to the true labels; "index" keeps each sample's original row label.
y_pred = model.predict(X_test_rfecv)
por = y_test.reset_index()
por["y_pred"] = y_pred

# Original row labels of the misclassified test samples
is_wrong = por["ZGON"] != por["y_pred"]
por.loc[is_wrong, "index"]

In [None]:
# Selected training features as a frame with default integer column labels
X_train_df = pd.DataFrame(X_train_rfecv).reset_index(drop=True)

# Training labels as a one-column frame named "ZGON", re-indexed 0..n-1 to line up with the features
y_train_df = y_train.to_frame(name="ZGON").reset_index(drop=True)

# Features and label side by side
data_6_vars = pd.concat([X_train_df, y_train_df], axis=1)

# Display the resulting frame
data_6_vars

In [None]:
# Short descriptive names for six features.
# NOTE(review): not referenced anywhere else in the visible notebook; presumably
# meant to correspond to the RFECV-selected columns -- confirm order or remove.
feature_names_skr=['BETET_piki', 'BETET_loc_min',
       'HCO3STTE_loc_max', 'IONH_piki',
       'IONH_loc_max',
       'mean_slope_miernik2']

In [None]:
# Selected feature columns (unscaled values) plus label and patient id for all
# patients, relabelled with the generic feature names.
selected_columns = [*names[rfecv.support_], "ZGON", "PACJENT_NR"]
generic_labels = [*feature_names, "ZGON", "PACJENT_NR"]
data_6_vars = data_with_features[selected_columns].set_axis(generic_labels, axis=1)
# data_6_vars.columns=[0,1,2,3,4,5,"ZGON"]
data_6_vars.columns

In [None]:
# Patient numbers stored at row positions 59 and 13. Both are shown together:
# as two separate expressions only the last value was displayed.
data_with_features.iloc[[59, 13]]["PACJENT_NR"]

In [None]:
# Feature columns to plot: everything in data_6_vars except label and patient id.
cols = [c for c in data_6_vars.columns if c not in ("ZGON", "PACJENT_NR")]
my_blue="#0064B2"
my_red="#D61600"

# Long format: one row per (patient, feature)
melted_data = pd.melt(data_6_vars, id_vars=["ZGON", "PACJENT_NR"], value_vars=cols,
                      var_name="Cecha", value_name="Wartość")

# Box plot per feature, split by outcome class. `custom_data` lets Plotly
# Express attach the patient number to each point inside its own trace.
# Assigning the whole PACJENT_NR column to every trace afterwards gave both
# outcome classes the same array, so hover labels did not match the points.
fig = px.box(melted_data, x="Cecha", y="Wartość", color="ZGON",
             labels={"Cecha": "Cecha", "Wartość": "Wartość", "ZGON": "Klasa ZGON"}, points='all',
             custom_data=["PACJENT_NR"])

# Show the patient number in the hover label (customdata is one column wide)
fig.update_traces(
    hovertemplate='%{x}: %{y} <br> Pacjent: %{customdata[0]}'
)

# Axis fonts, size and theme
fig.update_layout(xaxis=dict(tickfont=dict(size=18), title=""), yaxis=dict(tickfont=dict(size=16),  title=dict(font=dict(size=18))))
fig.update_layout(showlegend=False)
fig.update_layout(height=600, width=1200)
fig.update_layout(template="plotly_white")
fig.show()
fig.write_image("images5/rozklad_najw_cech.png", format="png", scale=3)

In [None]:
# All columns of the row at position 59.
data_with_features.iloc[59, :]

In [None]:
# Names of the features kept by RFECV.
names[rfecv.support_]

In [None]:
# ROC curves on the hold-out set for the models tuned on the RFECV-selected features.
plt.figure(figsize=(8, 6))
for name, (model_type, best_params) in best_models_rfe.items():
    model = model_type(**best_params)
    model.fit(X_train_rfecv, y_train)
    # model_evaluate works from predict_proba, so the former (unused) predict() call was dropped.
    fpr, tpr, roc_auc = model_evaluate(model, X_train_rfecv, X_test_rfecv, y_train, y_test)
    plt.plot(fpr, tpr, lw=2, label=f'{name} (AUC = {roc_auc:.2f})')  # thicker lines

# Diagonal reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)

# Axis limits
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

# Larger axis labels
plt.xlabel('Odsetek fałszywie pozytywnych', fontsize=14)
plt.ylabel('Odsetek prawdziwie pozytywnych', fontsize=14)

# Larger legend
plt.legend(loc='lower right', fontsize=12)

plt.savefig("images5/roc_rfe.png")
# Show the figure
plt.show()



### Klasyfikacja w czasie

In [None]:
# Signals used for the "classification over time" experiment.
chosen_vars=['BETET_kw', 'HCO3STTE_kw', 'IONH_kw', 'euclidean_kw']

# Patient id, examination number and the chosen signals only.
data_to_extract2=data_to_extract[['PACJENT_NR', 'BADANIE_NR', *chosen_vars]]

# tsfresh calculators per signal: six features in total.
# NOTE(review): presumably these are the six features RFECV selected above -- confirm.
selected_features_dict = {
    'BETET_kw': {
        'number_peaks': [{'n': 2}],
        'first_location_of_minimum': None
    },
    'IONH_kw': {
        'number_peaks': [{'n': 2}],
        'first_location_of_maximum': None
    },
    'HCO3STTE_kw': {
        'first_location_of_maximum': None
    },
    'euclidean_kw': {
        'agg_linear_trend': [{'f_agg': 'mean', 'chunk_len': 6, 'attr': 'slope'}]
    }
}

In [None]:
from sklearn.utils import resample

In [None]:
# Classifiers and grids are the same for every examination cut-off, so they
# are built once (GridSearchCV clones the estimator it is given).
classifiers = {
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier()
}

param_grids = {
    'KNN': {'n_neighbors': [5,10,15,20], 'weights': ['uniform', 'distance']},
    'LDA': {'solver': ['svd', 'lsqr', 'eigen']},
    'SVC': {'C': [0.1, 0.5, 1, 1.5], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'probability': [True]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50], 'random_state': [seed]},
    'RF': {'n_estimators': [5, 10, 50, 100, 200], 'criterion': ['gini', 'entropy'],
           'max_depth': [5, 10, 20, 30, 40, 50], 'random_state': [seed]}
}

# One record per (cut-off, classifier).
results2 = []

# Cut-offs 1..64.
# NOTE(review): 64 is presumably the maximum BADANIE_NR computed earlier -- confirm.
for n_badan in range(1, 65):
    # Keep only the first n_badan examinations of every patient
    data_subset = data_to_extract2[data_to_extract2['BADANIE_NR'] <= n_badan]

    # Extract the chosen features signal by signal and join them with a single concat.
    # A local name is used for the calculator settings so that the notebook-level
    # `selected_features` dictionary is not overwritten.
    per_signal_features = []
    for col in chosen_vars:
        fc_parameters = selected_features_dict.get(col, {})
        features = extract_features(data_subset, column_id="PACJENT_NR", column_sort="BADANIE_NR",
                                    column_value=col, default_fc_parameters=fc_parameters)
        features.columns = [f"{col}_{feature}" for feature in features.columns]
        per_signal_features.append(features)
    extracted_features_6 = pd.concat(per_signal_features, axis=1)

    # Drop features that are missing for every patient at this cut-off
    X = extracted_features_6.dropna(axis=1, how='all')

    # Reference stratified 80/20 split (same seed as in the rest of the notebook)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed, stratify=y)

    # Standardise: fit on the training part, apply to the test part
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    for name, classifier in classifiers.items():
        # Tune the hyper-parameters on the reference training split
        grid_search = GridSearchCV(classifier, param_grids[name], cv=4, n_jobs=-1, scoring='accuracy')
        grid_search.fit(X_train_scaled, y_train)

        model_type=type(classifier)
        best_params=grid_search.best_params_

        test_scores=[]
        recalls=[]
        precisions=[]
        f1_scores=[]

        # Re-evaluate the tuned configuration on 100 different stratified splits.
        # NOTE(review): the parameters were selected on the reference split, whose
        # training part overlaps these test parts -- the averages may be optimistic.
        for i in range(100):
            X_btrain, X_btest, y_btrain, y_btest = train_test_split(X, y, test_size=0.2, random_state=i, stratify=y)

            scaler2 = StandardScaler()
            X_btrain_scaled = scaler2.fit_transform(X_btrain)
            X_btest_scaled = scaler2.transform(X_btest)

            best_model=model_type(**best_params)
            best_model.fit(X_btrain_scaled, y_btrain)
            y_bpred = best_model.predict(X_btest_scaled)
            test_scores.append(accuracy_score(y_btest, y_bpred))
            precisions.append(precision_score(y_btest, y_bpred))
            recalls.append(recall_score(y_btest, y_bpred))
            f1_scores.append(f1_score(y_btest, y_bpred))

        mean_test_score=np.mean(test_scores)
        mean_recall=np.mean(recalls)
        mean_precision=np.mean(precisions)
        mean_f1_score=np.mean(f1_scores)

        # Metrics on the reference hold-out split
        best_model=model_type(**best_params)
        best_model.fit(X_train_scaled, y_train)
        y_pred = best_model.predict(X_test_scaled)
        test_acc=accuracy_score(y_test, y_pred)
        test_prec=precision_score(y_test, y_pred)
        test_recall=recall_score(y_test, y_pred)
        test_f1=f1_score(y_test, y_pred)

        results2.append({'n_badan': n_badan, 'Model': name, 'parameters': best_params, 'Dokładność test': test_acc, 'Czułość test': test_recall,
                         'Precyzja test': test_prec, 'F1-score test': test_f1, 'Dokładność 100 test': mean_test_score, 'Czułość 100 test': mean_recall,
                         'Precyzja 100 test': mean_precision, 'F1-score 100 test': mean_f1_score})

In [None]:
# Results of the time experiment as a frame. This rebinds `results_df`, which
# earlier held the RFECV cost-scan results.
results_df=pd.DataFrame(results2)
results_df.head(20)

In [None]:
# Only the metrics averaged over the 100 repeated splits. This rebinds `df`,
# which earlier held the score-comparison table.
df=results_df[['n_badan', 'Model', 'Dokładność 100 test', 'Precyzja 100 test', 'Czułość 100 test', 'F1-score 100 test']]
df

In [None]:
x_label = 'Numer badania'
title = 'klasyfikacja_obserwacje_accuracy'
# results_df = results_df[results_df['Model'].isin(['LDA', 'KNN'])]

# Mean F1-score over the 100 splits against the examination cut-off, one line per model
fig = px.line(
    results_df,
    x='n_badan',
    y='F1-score 100 test',
    color='Model',
    markers=True,
    line_shape='linear',
)

# Axis titles, fonts, fixed 0-1 value range and theme
axis_fonts = dict(tickfont=dict(size=16), title_font=dict(size=20))
fig.update_layout(
    xaxis=dict(title=x_label, **axis_fonts),
    yaxis=dict(title='F1-score', range=[0, 1], **axis_fonts),
    template="plotly_white",
)

fig.show()
# fig.write_image("images5/wykresy_w_czasie_lda_knn.png", width=1000, height=600, scale=4, format="png")
fig.write_image("images5/wykresy_w_czasie_100_fscore.png", width=1000, height=600, scale=4, format="png")

In [None]:
# results_df2=results_df[results_df['n_badan']>=1]

x_label = 'Numer badania'
title = 'klasyfikacja_obserwacje_accuracy'
# results_df = results_df[results_df['Model'].isin(['LDA', 'KNN'])]

# Mean accuracy over the 100 splits against the examination cut-off, one line per model
fig = px.line(
    results_df,
    x='n_badan',
    y='Dokładność 100 test',
    color='Model',
    markers=True,
    line_shape='linear',
)

# Axis titles, fonts, fixed 0-1 value range and theme
axis_fonts = dict(tickfont=dict(size=16), title_font=dict(size=20))
fig.update_layout(
    xaxis=dict(title=x_label, **axis_fonts),
    yaxis=dict(title='Dokładność', range=[0, 1], **axis_fonts),
    template="plotly_white",
)

fig.show()
fig.write_image("images5/wykresy_w_czasie_100_dokladnosc_test.png", width=1000, height=600, scale=4, format="png")

In [None]:
n_badan=6
# Keep only the first n_badan examinations of every patient
data_subset = data_to_extract2[data_to_extract2['BADANIE_NR'] <= n_badan]

# Extract the chosen features signal by signal and join them with a single concat
per_signal_features = []
for col in chosen_vars:
    selected_features = selected_features_dict.get(col, {})
    features = extract_features(data_subset, column_id="PACJENT_NR", column_sort="BADANIE_NR",
                                column_value=col, default_fc_parameters=selected_features)
    features.columns = [f"{col}_{feature}" for feature in features.columns]
    per_signal_features.append(features)
extracted_features_6 = pd.concat(per_signal_features, axis=1)

# Drop features that are missing for every patient
X6 = extracted_features_6.dropna(axis=1, how='all')

# Stratified 80/20 split
X_train_raw6, X_test_raw6, y_train, y_test = train_test_split(X6, y, test_size=0.2, random_state=seed, stratify=y)

# Standardise: fit on the training part, apply to the test part
scaler = StandardScaler()
X_train_scaled6 = scaler.fit_transform(X_train_raw6)
X_test_scaled6 = scaler.transform(X_test_raw6)

# The models below are tuned and evaluated on the STANDARDISED data, as in the
# loop over all cut-offs. Previously the scaler was fitted but the raw features
# were used. X_train6 / X_test6 are bound to the standardised arrays so that
# cells reading those names work with the same data the models were tuned on.
X_train6, X_test6 = X_train_scaled6, X_test_scaled6

classifiers = {
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'SVC': SVC(),
    'DT': DecisionTreeClassifier(),
    'RF': RandomForestClassifier()
}

param_grids = {
    'KNN': {'n_neighbors': [5,10,15,20], 'weights': ['uniform', 'distance']},
    'LDA': {'solver': ['svd', 'lsqr', 'eigen']},
    'SVC': {'C': [0.1, 0.5, 1, 1.5], 'kernel': ['linear', 'rbf', 'poly', 'sigmoid'], 'probability': [True]},
    'DT': {'criterion': ['gini', 'entropy'], 'max_depth': [10, 20, 30, 40, 50], 'random_state': [seed]},
    'RF': {'n_estimators': [5, 10, 50, 100, 200], 'criterion': ['gini', 'entropy'],
           'max_depth': [5, 10, 20, 30, 40, 50], 'random_state': [seed]}
}

# name -> (estimator class, best params) / best CV accuracy / hold-out accuracy / fitted best estimator
best_models6 = {}
best_scores6 = {}
test_scores6 = {}
best_models_class6={}

# 4-fold cross-validated grid search for every classifier
for name, classifier in classifiers.items():
    print(f"Przetwarzanie {name}...")
    grid_search = GridSearchCV(classifier, param_grids[name], cv=4, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train_scaled6, y_train)
    best_models_class6[name]=grid_search.best_estimator_
    best_models6[name] = (type(grid_search.best_estimator_), grid_search.best_params_)
    best_scores6[name] = grid_search.best_score_
    print(f"Najlepsze parametry dla {name}: {grid_search.best_params_}")
    print(f"Najlepszy wynik dla {name}: {grid_search.best_score_}\n")

# Hold-out accuracy of the tuned models
for name, (model_type, best_params) in best_models6.items():
    model = model_type(**best_params)
    model.fit(X_train_scaled6, y_train)
    y_pred = model.predict(X_test_scaled6)
    test_score = accuracy_score(y_test, y_pred)
    test_scores6[name]=test_score
    print(f"Dokładność {name} na zbiorze testowym: {test_score}")



In [None]:
# Confusion matrices of the models tuned on the first six examinations.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
panels = axes.flatten()

for ax, (title, (model_type, best_params)) in zip(panels, best_models6.items()):
    model = model_type(**best_params)
    plot_confusion_matrix(model, X_train6, X_test6, y_train, y_test, ax, title)

# Remove every panel that received no model. The previous odd/even test only
# happened to work for five models on a six-panel grid.
for unused_ax in panels[len(best_models6):]:
    fig.delaxes(unused_ax)

plt.tight_layout()

plt.savefig("images5/conf_matr_6.png")

plt.show()