In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pd.set_option('future.no_silent_downcasting', True)

In [2]:
from sklearn.tree import DecisionTreeClassifier

df_large = pd.read_csv('MPSI_large.csv', sep='\t', encoding='utf-8')

# Prepare as before: drop unnecessary columns
zgon_df_large = df_large.drop(columns=['KG', 'follow up 30 dni'], errors='ignore')

# Prepare features and target
X_zgon_large = zgon_df_large.drop(columns=['zgon'])
y_zgon_large = zgon_df_large['zgon']

# Preprocess object columns
for col in X_zgon_large.select_dtypes(include='object').columns:
    X_zgon_large[col] = X_zgon_large[col].replace('Nie znaleziono', np.nan)
   
    X_zgon_large[col] = X_zgon_large[col].str.replace(',', '.')
    X_zgon_large[col] = X_zgon_large[col].replace('M', 0)
    X_zgon_large[col] = X_zgon_large[col].replace('K', 1)
    X_zgon_large[col] = pd.to_numeric(X_zgon_large[col], errors='coerce')
    
# Split the data
X_train_large, X_test_large, y_train_large, y_test_large = train_test_split(
    X_zgon_large, y_zgon_large, test_size=0.2, random_state=42, stratify=y_zgon_large
)

# Train decision tree
clf_large = DecisionTreeClassifier( random_state=42)
clf_large.fit(X_train_large, y_train_large)

# Evaluate
y_train_pred_large = clf_large.predict(X_train_large)
y_test_pred_large = clf_large.predict(X_test_large)
acc_train_large = accuracy_score(y_train_large, y_train_pred_large)
acc_test_large = accuracy_score(y_test_large, y_test_pred_large)
print(f"Train accuracy (large): {acc_train_large:.3f}")
print(f"Test accuracy (large): {acc_test_large:.3f}")

# Feature importances
importances_large = pd.Series(clf_large.feature_importances_, index=X_train_large.columns)
print("Feature importances (large dataset):")
print(importances_large.sort_values(ascending=False))

Train accuracy (large): 1.000
Test accuracy (large): 0.641
Feature importances (large dataset):
pH (1. gaz. 3 TISS)                             0.235391
BE (1. gaz. 1sza doba)                          0.058982
temperatura ciała przy przyjęciu (TISS nr 1)    0.049927
Bil (1. gaz. 3 TISS)                            0.046685
Cl (1. gaz. 1 TISS)                             0.045277
                                                  ...   
average valeu ofscd 3tiss                       0.000000
SCD<30 average                                  0.000000
scd<30 tiss1                                    0.000000
Sepsa                                           0.000000
Sepsa (0/1)                                     0.000000
Length: 163, dtype: float64


In [3]:
from catboost import CatBoostClassifier

# Prepare features and target for CatBoost
X_cb = X_zgon_large
y_cb = y_zgon_large

cat_clf_new = CatBoostClassifier(verbose=0, random_state=42, max_depth=2, n_estimators=500)
cat_clf_new.fit(X_cb, y_cb)

# Evaluate on train
y_train_pred_cb = cat_clf_new.predict(X_cb)
acc_train_cb = accuracy_score(y_cb, y_train_pred_cb)
print(f"CatBoost Train Accuracy: {acc_train_cb:.3f}")

# Evaluate on test
y_test_pred_cb = cat_clf_new.predict(X_test_large)
acc_test_cb = accuracy_score(y_test_large, y_test_pred_cb)
print(f"CatBoost Test Accuracy: {acc_test_cb:.3f}")
# Feature importances
importances_cb = pd.Series(cat_clf_new.feature_importances_, index=X_cb.columns)
print("CatBoost Feature importances:")
print(importances_cb.sort_values(ascending=False).head(20))

CatBoost Train Accuracy: 0.889
CatBoost Test Accuracy: 0.891
CatBoost Feature importances:
pH (1. gaz. 3 TISS)                      9.529783
SAPS_RAZEM                               7.022840
pH (1. gaz. 2 TISS)                      4.765319
Lac (1. gaz. 3 TISS)                     4.543735
Ca2+ (1. gaz. 1 TISS)                    4.161629
Glukoza (1. gaz. 2 TISS)                 3.402984
DIUREZA W PIERWSZYCH 24 GODZIN Z TISS    3.275101
HCO3 (1. gaz. 2 TISS)                    3.071175
Glukoza (1. gaz. 3 TISS)                 2.548784
albuminy                                 2.498316
BE (1. gaz. 2 TISS)                      2.449735
BE (1. gaz. 3 TISS)                      2.290183
CTK skurczowe godz. 0.00 TISS nr 3       2.259664
bilans płynów TISS 2 (ml) (TISS 2)       1.690703
FIO2 (TISS nr 1)                         1.583980
mOsm (1. gaz. 3 TISS)                    1.580227
Lac (1. gaz. 2 TISS)                     1.572073
Bil (1. gaz. 2 TISS)                     1.504704
Lac (1. g

In [7]:
# Get top 20 features from CatBoost importances
top20_features = importances_cb.sort_values(ascending=False).head(20).index.tolist()

# Prepare data with top 20 features
X_train_top20 = X_train_large[top20_features]
X_test_top20 = X_test_large[top20_features]

# Retrain DecisionTreeClassifier on top 20 features
clf_top20 = DecisionTreeClassifier(random_state=42,min_samples_leaf=8)
clf_top20.fit(X_train_top20, y_train_large)

# Evaluate
y_train_pred_top20 = clf_top20.predict(X_train_top20)
y_test_pred_top20 = clf_top20.predict(X_test_top20)
acc_train_top20 = accuracy_score(y_train_large, y_train_pred_top20)
acc_test_top20 = accuracy_score(y_test_large, y_test_pred_top20)
print(f"Train accuracy (top 20): {acc_train_top20:.3f}")
print(f"Test accuracy (top 20): {acc_test_top20:.3f}")

Train accuracy (top 20): 0.847
Test accuracy (top 20): 0.826


In [9]:
# Get top 20 features from CatBoost importances
top20_features = importances_cb.sort_values(ascending=False).head(20).index.tolist()

# Prepare data with top 20 features
X_train_top20 = X_train_large[top20_features]
X_test_top20 = X_test_large[top20_features]

# Retrain DecisionTreeClassifier on top 20 features
clf_top20 = CatBoostClassifier(random_state=42)
clf_top20.fit(X_train_top20, y_train_large)

# Evaluate
y_train_pred_top20 = clf_top20.predict(X_train_top20)
y_test_pred_top20 = clf_top20.predict(X_test_top20)
acc_train_top20 = accuracy_score(y_train_large, y_train_pred_top20)
acc_test_top20 = accuracy_score(y_test_large, y_test_pred_top20)
print(f"Train accuracy (top 20): {acc_train_top20:.3f}")
print(f"Test accuracy (top 20): {acc_test_top20:.3f}")

Learning rate set to 0.006715
0:	learn: 0.6905366	total: 4.96ms	remaining: 4.95s
1:	learn: 0.6880838	total: 6.96ms	remaining: 3.47s
2:	learn: 0.6849656	total: 8.93ms	remaining: 2.97s
3:	learn: 0.6816608	total: 10.9ms	remaining: 2.71s
4:	learn: 0.6781121	total: 13.1ms	remaining: 2.6s
5:	learn: 0.6753587	total: 15.9ms	remaining: 2.63s
6:	learn: 0.6723322	total: 17.7ms	remaining: 2.52s
7:	learn: 0.6697644	total: 19.5ms	remaining: 2.42s
8:	learn: 0.6670012	total: 21.3ms	remaining: 2.34s
9:	learn: 0.6643244	total: 23ms	remaining: 2.28s
10:	learn: 0.6607535	total: 24.9ms	remaining: 2.24s
11:	learn: 0.6582951	total: 30.3ms	remaining: 2.5s
12:	learn: 0.6554819	total: 32.4ms	remaining: 2.46s
13:	learn: 0.6530237	total: 34.2ms	remaining: 2.41s
14:	learn: 0.6505183	total: 36ms	remaining: 2.36s
15:	learn: 0.6485834	total: 38ms	remaining: 2.34s
16:	learn: 0.6467140	total: 40.3ms	remaining: 2.33s
17:	learn: 0.6440922	total: 42.5ms	remaining: 2.32s
18:	learn: 0.6418783	total: 44.4ms	remaining: 2.29s


prevent overfitting?


In [22]:
clf_top20 = CatBoostClassifier(verbose=0, random_state=42, max_depth=2, n_estimators=500)
clf_top20.fit(X_train_top20, y_train_large)

# Evaluate
y_train_pred_top20 = clf_top20.predict(X_train_top20)
y_test_pred_top20 = clf_top20.predict(X_test_top20)
acc_train_top20 = accuracy_score(y_train_large, y_train_pred_top20)
acc_test_top20 = accuracy_score(y_test_large, y_test_pred_top20)
print(f"Train accuracy (top 20): {acc_train_top20:.3f}")
print(f"Test accuracy (top 20): {acc_test_top20:.3f}")

Train accuracy (top 20): 0.826
Test accuracy (top 20): 0.772


train with random forest

In [24]:
from sklearn.ensemble import RandomForestClassifier
clf_top20 = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=2)
clf_top20.fit(X_train_top20, y_train_large)

# Evaluate
y_train_pred_top20 = clf_top20.predict(X_train_top20)
y_test_pred_top20 = clf_top20.predict(X_test_top20)
acc_train_top20 = accuracy_score(y_train_large, y_train_pred_top20)
acc_test_top20 = accuracy_score(y_test_large, y_test_pred_top20)
print(f"Train accuracy (top 20): {acc_train_top20:.3f}")
print(f"Test accuracy (top 20): {acc_test_top20:.3f}")

Train accuracy (top 20): 0.763
Test accuracy (top 20): 0.761
