### Przygotowanie danych

In [2]:
# pip install requirements.txt

In [2]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error
from sktime.transformations.series.impute import Imputer
import plotly.express as px
import warnings

In [3]:
df_filtr_std = pd.read_csv("../chapter1/filtered_std_data_gasometry.csv")

### Poszukiwanie najlepszej metody uzupełniania braków danych

In [5]:
#Działanie na przefiltrowanych i ujednolicenie zmiennych 
df_imp=df_filtr_std.copy()
cols=["BETET",	"CO2TET",	"HCO3ACTE",	"HCO3STTE",	"O2SATTET",	"O2TET", "IONH", "ZGON"]
df_imp["czy_na"]=df_imp[cols].apply(lambda row: any(row.isna()), axis=1)
id=df_imp[df_imp["czy_na"]==True]["PACJENT_NR"]
df_bezna = df_imp[~df_imp['PACJENT_NR'].isin(id)]
df_bezna = df_bezna[["PACJENT_NR", *cols]]
df_bezna

Unnamed: 0,PACJENT_NR,BETET,CO2TET,HCO3ACTE,HCO3STTE,O2SATTET,O2TET,IONH,ZGON
50,2,-0.043478,3.22,0.983333,0.175,-1.52,-0.372,2.525596,0
51,2,0.739130,2.90,1.183333,0.675,0.16,0.868,1.982770,0
52,2,1.086957,3.73,1.516667,0.900,0.38,1.364,2.294287,0
53,2,0.739130,3.47,1.566667,0.875,0.02,0.572,2.033501,0
54,2,1.108696,2.23,1.516667,1.300,0.46,1.032,1.081419,0
...,...,...,...,...,...,...,...,...,...
2929,145,-0.717391,1.63,0.133333,-0.350,0.18,0.420,2.294287,1
2930,145,-0.891304,1.39,-0.016667,-0.450,0.76,1.640,2.254399,1
2931,145,-1.065217,1.71,-0.050000,-0.650,-0.06,0.316,2.708690,1
2932,145,-0.695652,1.15,0.066667,-0.275,0.62,1.600,1.857967,1


In [6]:
df_imp[df_imp["czy_na"]==True]["PACJENT_NR"]

1         1
36        1
49        1
96        3
980      50
2588    120
2778    133
2816    134
2818    134
Name: PACJENT_NR, dtype: int64

In [7]:
np.random.seed(13)
num_values_to_remove =int(1000)
rows_to_remove = np.random.choice(df_bezna.index, size=num_values_to_remove, replace=True)
cols_to_remove = np.random.choice(df_bezna.columns[1:10], size=num_values_to_remove, replace=True)
id=zip(rows_to_remove, cols_to_remove)
df_nowe=df_bezna.copy()

for row, col in id:
  df_nowe.loc[row, col] = np.nan

df_nowe.isnull().sum().sum()/19250

0.05033766233766234

In [8]:
#KNN na wszystkich danych
wyn_knn_all=list()
n_range=range(1,31)
for i in n_range:
  kmni_all = KNNImputer(n_neighbors=i)
  kmni=kmni_all.fit_transform(df_nowe[cols])
  wyn_knn_all.append(np.sqrt(mean_squared_error(df_bezna[cols], kmni)))

wyn_knn_all_df=pd.DataFrame(np.column_stack([n_range, wyn_knn_all]), columns=["k","rmse"])

fig = px.line(wyn_knn_all_df, x="k", y="rmse", markers=True, line_shape='linear')

# Dodajemy etykiety i tytuł
fig.update_layout(
    xaxis=dict(
        title='Liczba sąsiadów (k)',
        tickfont=dict(size=16),
        title_font=dict(size=20),
    ),
    yaxis=dict(
        title='Miara RMSE',
        tickfont=dict(size=16),
        title_font=dict(size=20)
    )
)
fig.update_layout(template="plotly_white")

# Wyświetlamy wykres
fig.show()
fig.write_image("images3/knn_all.png", width=1000, height=600, scale=4, format="png")

In [9]:
wyn_knn_all_df.iloc[wyn_knn_all_df["rmse"].idxmin()]

k       5.000000
rmse    0.098045
Name: 4, dtype: float64

In [10]:
# Inicjalizujemy słownik do przechowywania wyników
pacjenci_df_knn = {}
n_neighbors_range = range(1, 11)
wyn_knn_pac=list()

for n_neighbors in n_neighbors_range:
    knn_imp = KNNImputer(n_neighbors=n_neighbors)
    pacjenci_df_knn = np.concatenate([knn_imp.fit_transform(pacjent_df[cols]) for _, pacjent_df in df_nowe.groupby(by='PACJENT_NR')],axis=0)
    pacjenci_df_knn= pd.DataFrame(pacjenci_df_knn, columns=cols)
    wyn_knn_pac.append(np.sqrt(mean_squared_error(df_bezna[cols], pacjenci_df_knn)))

In [11]:
wyn_knn_pac_df=pd.DataFrame(np.column_stack([n_neighbors_range, wyn_knn_pac]), columns=["k","rmse"])

fig = px.line(wyn_knn_pac_df, x="k", y="rmse", markers=True, line_shape='linear')

# Dodajemy etykiety i tytuł
fig.update_layout(
    xaxis=dict(
        title='Liczba sąsiadów (k)',
        tickfont=dict(size=16),
        title_font=dict(size=20),
    ),
    yaxis=dict(
        title='Miara RMSE',
        tickfont=dict(size=16),
        title_font=dict(size=20)
    )
)
fig.update_layout(template="plotly_white")
# Wyświetlamy wykres
fig.show()
fig.write_image("images3/knn_patient.png", width=1000, height=600, scale=4, format="png")


In [12]:

imputers = {
    "mean": Imputer(method="mean"),
    "median": Imputer(method="median"),
    "drift": Imputer(method="drift"),
    "backfill": Imputer(method="backfill"),
    "ffill": Imputer(method="pad"),
    "knn_p": KNNImputer(n_neighbors=2)
}

pacjenci_df = {}

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for imp_name, imp in imputers.items():
        pacjenci_df[imp_name] = np.concatenate(
            [imp.fit_transform(pacjent_df[cols]) for _, pacjent_df in df_nowe.groupby(by='PACJENT_NR')],
            axis=0
        )
        pacjenci_df[imp_name] = pd.DataFrame(pacjenci_df[imp_name], columns=cols)

kmni_imputer = KNNImputer(n_neighbors=12)
pacjenci_df["knn_all"]=kmni_imputer.fit_transform(df_nowe[cols])


In [13]:
print("Podstawienie średniego wyniku pacjenta: ", str(np.round(mean_squared_error(df_bezna[cols], pacjenci_df['mean']), 4)))
print("Podstawienie mediany wyników pacjenta: ", str(np.round(mean_squared_error(df_bezna[cols], pacjenci_df['median']), 4)))
print("drift ", str(np.round(mean_squared_error(df_bezna[cols], pacjenci_df['drift']), 4)))
print("ffill ", str(np.round(mean_squared_error(df_bezna[cols], pacjenci_df['ffill']), 4)))
print("backfill ", str(np.round(mean_squared_error(df_bezna[cols], pacjenci_df['backfill']), 4)))
print("Metoda najbliższych sąsiadów k=2 na pacjentach: ", str(np.round(mean_squared_error(df_bezna[cols], pacjenci_df['knn_p']),4)))
print("Metoda najbliższych sąsiadów k=12 na wszystkich: ", str(np.round(mean_squared_error(df_bezna[cols], pacjenci_df["knn_all"]),4)))

Podstawienie średniego wyniku pacjenta:  0.0422
Podstawienie mediany wyników pacjenta:  0.0436
drift  0.0309
ffill  0.031
backfill  0.0365
Metoda najbliższych sąsiadów k=2 na pacjentach:  0.0179
Metoda najbliższych sąsiadów k=12 na wszystkich:  0.0101


### Uzupełnienie danych najlepszą metoda

In [9]:
#wypełnianie braków metodą K najbliższych sąsiadów
imputer = KNNImputer(n_neighbors=12)
df_prepared = pd.DataFrame(imputer.fit_transform(df_filtr_std[cols]), columns=cols)
df_prepared


Unnamed: 0,BETET,CO2TET,HCO3ACTE,HCO3STTE,O2SATTET,O2TET,IONH,ZGON
0,0.326087,2.21,0.850000,0.225,-2.48,-0.720,1.882698,0.0
1,0.086957,5.22,0.866667,0.200,0.86,4.880,3.297125,0.0
2,0.760870,3.10,1.250000,0.800,0.92,6.000,2.201643,0.0
3,0.869565,2.75,1.266667,0.675,0.62,1.440,1.882698,0.0
4,0.608696,2.61,1.083333,0.725,0.82,2.800,1.970160,0.0
...,...,...,...,...,...,...,...,...
2929,-0.717391,1.63,0.133333,-0.350,0.18,0.420,2.294287,1.0
2930,-0.891304,1.39,-0.016667,-0.450,0.76,1.640,2.254399,1.0
2931,-1.065217,1.71,-0.050000,-0.650,-0.06,0.316,2.708690,1.0
2932,-0.695652,1.15,0.066667,-0.275,0.62,1.600,1.857967,1.0


In [10]:
info_cols=df_filtr_std[["PACJENT_NR", "BADANIE_NR"]]
df_prepared2=info_cols.merge(df_prepared, left_index=True, right_index=True)
df_prepared2.to_csv("gasometry_prepared_data.csv")