### Klasyfikacja

In [14]:
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import StratifiedKFold
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from my_functions import *
from sklearn.tree import DecisionTreeClassifier
import plotly.express as px
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier

In [15]:
df_prep = pd.read_csv("..//datasets/gasometry_prepared_data.csv")
cols=["BETET",	"CO2TET",	"HCO3ACTE",	"HCO3STTE",	"O2SATTET",	"O2TET", "IONH"]
my_blue="#0064B2"
my_red="#D61600"

In [16]:
train_data, test_data = train_test_split(df_prep, test_size=0.2, stratify=df_prep["ZGON"], random_state=42)
train_data = train_data.drop(train_data.columns[0], axis=1)

In [17]:
X_train=train_data[cols]
y_train=train_data["ZGON"]

X_test=test_data[cols]
y_test=test_data["ZGON"]

#### Dummy classifier

In [18]:
dummy_model = DummyClassifier(strategy="most_frequent")
dummy_model.fit(X_train, y_train)
y_pred = dummy_model.predict(X_test)
accuracy_assessment(y_test, y_pred)

Accuracy: 0.5212947189097104
Balanced accuracy: 0.5
Precision: 0.271748183963154
Sensivity (recall): 0.5212947189097104
F1-Score: 0.35725909067496386



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



### Drzewo decyzyjne

In [19]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

ma_train = []
ma_test = []
depth = np.arange(1, 15)
for j in depth:
    model = DecisionTreeClassifier(random_state=42, max_depth=j)
    a_test = []
    a_tren = []

    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        model.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        a_test.append(model.score(X_train.iloc[test_index], y_train.iloc[test_index]).round(4))
        a_tren.append(model.score(X_train.iloc[train_index], y_train.iloc[train_index]).round(4))

    ma_test.append(np.mean(a_test))
    ma_train.append(np.mean(a_tren))

In [20]:
train_test_plot(depth, ma_train, ma_test, "Max Depth", "max_depth_tree")

In [21]:
op_depth_tree=np.argmax(ma_test)+1
op_depth_tree

5

In [29]:
model_dt= DecisionTreeClassifier(random_state=42)

model_dt.fit(X_train, y_train)

trees_param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': range(1,10),
    'min_samples_leaf': range(1,20)
}
 
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
dt_grid_search = GridSearchCV(model_dt, trees_param_grid, cv=cv_strategy, scoring='balanced_accuracy', return_train_score=True)
dt_grid_search.fit(X_train, y_train)

In [None]:
tree_results=dt_grid_search.cv_results_
results_df = pd.DataFrame(tree_results)
params_df = pd.json_normalize(results_df['params'])
metrics = ['mean_test_score', 'std_test_score']
tree_results_df = pd.concat([params_df, results_df[metrics]], axis=1)

tree_results_df.columns=["criterion", "max_depth", "min_samples_leaf", "mean_test_score", "std_test_score"]
tree_results_df.sort_values(by='mean_test_score', ascending=False).head(10)

Unnamed: 0,criterion,max_depth,min_samples_leaf,mean_test_score,std_test_score
286,entropy,7,2,0.668509,0.018463
105,gini,6,11,0.66769,0.018621
106,gini,6,12,0.667685,0.019376
104,gini,6,10,0.667555,0.014791
67,gini,4,11,0.666154,0.017338
285,entropy,7,1,0.666084,0.018649
66,gini,4,10,0.66571,0.016985
277,entropy,6,12,0.665502,0.015017
107,gini,6,13,0.665309,0.018402
71,gini,4,15,0.665164,0.01686


In [None]:
best_dt_model = dt_grid_search.best_estimator_
best_dt_model.fit(X_train, y_train)
y_pred = best_dt_model.predict(X_test)
accuracy_assessment(y_test, y_pred)

Accuracy: 0.6848381601362862
Balanced accuracy: 0.6825936780406112
Precision: 0.684922634556673
Sensivity (recall): 0.6848381601362862
F1-Score: 0.6838215832990123


In [41]:
# significant_variables(model, X_train, "trees")
indeksy = np.where(model_dt.feature_importances_!=0)[0]
variables= [X_train.columns[i] for i in indeksy]
importances = model_dt.feature_importances_[indeksy]

# sortowanie
importances, variables= zip(*sorted(zip(importances, variables), reverse=True))
df_importances = pd.DataFrame({'Variable': variables, 'Importance': importances})
df_importances 

Unnamed: 0,Variable,Importance
0,IONH,0.220266
1,O2SATTET,0.160454
2,O2TET,0.15836
3,CO2TET,0.152508
4,BETET,0.130298
5,HCO3ACTE,0.097104
6,HCO3STTE,0.081009


In [None]:
# kategorie=['minimum BETET \n measurement', 'minimum [H$^\plus$] \n (maximum pH)', 'maximum [H$^\plus$] (minimum pH)', 'maximum pCO$_2$', 'BETET below the norm [number of cases]',
#            '[H$^\plus$] below the norm \n (pH above the norm [number of cases])', 'pO$_2$ saturation below the norm \n [number of cases]', 'pCO$_2$ below the norm [number of cases]',
#            'BETET above the norm [number of cases]',
#            '[H$^\plus$] above the norm \n (pH below the norm) \n [number of cases]', 'average change in \n ion H concentration', 'average change in \n O$_2$ saturation', 'average change in pCO$_2$',
#            'BETET above the norm [number of cases in a row]', '[H$^\plus$] above the norm \n (pH below the norm) \n [number of cases in a row]', 'BETET below the norm [number of cases in a row]',
#            '[H$^\plus$] below the norm \n (pH above the norm) \n [number of cases in a row]', 'O$_2$ saturation above the norm \n [number of cases in a row]']

### Las losowy

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

ma_train = []
ma_test = []
depth = np.arange(1, 15)
for j in depth:
    model = RandomForestClassifier(random_state=42, max_depth=j)
    a_test = []
    a_tren = []

    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        model.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        a_test.append(model.score(X_train.iloc[test_index], y_train.iloc[test_index]).round(4))
        a_tren.append(model.score(X_train.iloc[train_index], y_train.iloc[train_index]).round(4))

    ma_test.append(np.mean(a_test))
    ma_train.append(np.mean(a_tren))

In [26]:
train_test_plot(depth, ma_train, ma_test, "Maksymalna głębokość drzew", "max_depth_forest")

In [27]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
n_estimators_values = range(1,20)
ma_rf_train = []
ma_rf_test = []

for n_estimators in n_estimators_values:
    model = RandomForestClassifier(random_state=10, n_estimators=n_estimators, max_depth=18)
    a_test = []
    a_tren = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        model.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        a_test.append(model.score(X_train.iloc[test_index], y_train.iloc[test_index]).round(4))
        a_tren.append(model.score(X_train.iloc[train_index], y_train.iloc[train_index]).round(4))

    ma_rf_test.append(np.mean(a_test))
    ma_rf_train.append(np.mean(a_tren))

optimal_n_estimators = n_estimators_values[np.argmax(ma_rf_test)]
print("Optimal value n_estimators:", optimal_n_estimators)

Optimal value n_estimators: 17


In [28]:
train_test_plot(n_estimators_values, ma_rf_train, ma_rf_test, "Liczba drzew", "n_estimators_forest")

In [32]:
model_rf=RandomForestClassifier(random_state=42)

rf_param_dist = {
    'n_estimators': range(10,50),
    'max_depth': range(1, 15),
    'min_samples_leaf': range(15)
}

model_rf.fit(X_train, y_train)

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
rf_grid_search = GridSearchCV(model_rf, trees_param_grid, cv=cv_strategy, scoring='balanced_accuracy', return_train_score=True)
rf_grid_search.fit(X_train, y_train)

In [33]:
best_rf_model = rf_grid_search.best_estimator_
best_rf_model.fit(X_train, y_train)
y_pred = best_rf_model.predict(X_test)
accuracy_assessment(y_test, y_pred)

Accuracy: 0.696763202725724
Balanced accuracy: 0.6946130765473449
Precision: 0.696902614050576
Sensivity (recall): 0.696763202725724
F1-Score: 0.6958473631171787


In [42]:
indeksy = np.where(model_rf.feature_importances_!=0)[0]
variables= [X_train.columns[i] for i in indeksy]
importances = model_rf.feature_importances_[indeksy]

# sortowanie
importances, variables= zip(*sorted(zip(importances, variables), reverse=True))
df_importances = pd.DataFrame({'Variable': variables, 'Importance': importances})
df_importances 

Unnamed: 0,Variable,Importance
0,IONH,0.174306
1,O2TET,0.151551
2,O2SATTET,0.142254
3,BETET,0.139096
4,CO2TET,0.135862
5,HCO3STTE,0.135081
6,HCO3ACTE,0.12185


### KNN

In [55]:
k_values = range (1,40)

ma_knn_train = []
ma_knn_test = []

for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    a_test = []
    a_tren = []
    for i, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        model.fit(X_train.iloc[train_index], y_train.iloc[train_index])
        a_test.append(model.score(X_train.iloc[test_index], y_train.iloc[test_index]).round(4))
        a_tren.append(model.score(X_train.iloc[train_index], y_train.iloc[train_index]).round(4))
    ma_knn_test.append(np.mean(a_test))
    ma_knn_train.append(np.mean(a_tren))

optimal_k= k_values[np.argmax(ma_knn_test)]
optimal_k

IndexError: range object index out of range

In [61]:
train_test_plot(k_values, ma_knn_train, ma_knn_test, "Liczba sąsiadów", "liczba_sasiadow")

In [62]:
knn = KNeighborsClassifier()
param_grid = {
    'n_neighbors': (15, 21),
    'leaf_size': (1, 20),
    'weights': ('uniform', 'distance'),
}
knn_grid = GridSearchCV(knn, param_grid, cv=10, scoring='balanced_accuracy', return_train_score=False)
knn_grid.fit(X_train, y_train)

In [67]:
knn_results=knn_grid.cv_results_
results_df = pd.DataFrame(knn_results)
params_df = pd.json_normalize(results_df['params'])
metrics = ['mean_test_score', 'std_test_score']
knn_results_df = pd.concat([params_df, results_df[metrics]], axis=1)
knn_results_df

Unnamed: 0,leaf_size,n_neighbors,weights,mean_test_score,std_test_score
0,1,15,uniform,0.662026,0.019936
1,1,15,distance,0.666248,0.019486
2,1,21,uniform,0.662826,0.01616
3,1,21,distance,0.669957,0.015635
4,20,15,uniform,0.662026,0.019936
5,20,15,distance,0.666248,0.019486
6,20,21,uniform,0.662826,0.01616
7,20,21,distance,0.669957,0.015635


In [63]:
best_knn_model = knn_grid.best_estimator_
best_knn_model.fit(X_train.values, y_train)
y_pred = best_knn_model.predict(X_test.values)
accuracy_assessment(y_test, y_pred)

Accuracy: 0.6882453151618398
Balanced accuracy: 0.6852801618868187
Precision: 0.6891540640998395
Sensivity (recall): 0.6882453151618398
F1-Score: 0.6864965176155545


### XG Boost

In [68]:
model_xgb = XGBClassifier()
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}
model_xgb = GridSearchCV(model_xgb, param_grid, cv=5, scoring='balanced_accuracy', return_train_score=False)
model_xgb.fit(X_train, y_train)

In [70]:
model_xgb_results=model_xgb.cv_results_
results_df = pd.DataFrame(model_xgb_results)
params_df = pd.json_normalize(results_df['params'])
metrics = ['mean_test_score', 'std_test_score']
model_xgb_results_df = pd.concat([params_df, results_df[metrics]], axis=1)
model_xgb_results_df

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample,mean_test_score,std_test_score
0,0.8,0.01,3,50,0.8,0.675963,0.014301
1,0.8,0.01,3,50,0.9,0.671579,0.010134
2,0.8,0.01,3,50,1.0,0.674669,0.009426
3,0.8,0.01,3,100,0.8,0.679026,0.013534
4,0.8,0.01,3,100,0.9,0.679880,0.011429
...,...,...,...,...,...,...,...
238,1.0,0.20,5,100,0.9,0.648789,0.011841
239,1.0,0.20,5,100,1.0,0.651021,0.002237
240,1.0,0.20,5,200,0.8,0.644546,0.013057
241,1.0,0.20,5,200,0.9,0.637532,0.013303


In [71]:
best_knn_model = knn_grid.best_estimator_
best_knn_model.fit(X_train.values, y_train)
y_pred = best_knn_model.predict(X_test.values)
accuracy_assessment(y_test, y_pred)

Accuracy: 0.6882453151618398
Balanced accuracy: 0.6852801618868187
Precision: 0.6891540640998395
Sensivity (recall): 0.6882453151618398
F1-Score: 0.6864965176155545


In [69]:
import dill

# Zapisz bieżący kernel do pliku
with open('saved_kernel.pkl', 'wb') as f:
    dill.dump_session(f)

ModuleNotFoundError: No module named 'dill'