Wczytanie Danych

In [1]:
import pandas as pd

df = pd.read_csv('Dane_mpsi.csv',sep='\t', encoding='utf-8')

Najprostsza predykcja zgonu

In [2]:
zgon_df = df.drop(columns=['KG', 'follow up 30 dni'])
zgon_df

Unnamed: 0,zgon,male sex,WIEK,Operowany przed przyjęciem (0/1),Interleukina 6,Prokalcytonina,MAP 1sza doba,pao2/fio2 1sza doba,Waga (kg),Wzrost (cm),BMI,Glukoza (1. gaz. 1sza doba),Lac (1. gaz. 1sza doba),BE (1. gaz. 1sza doba),SOFA - punktacja,Sepsa (0/1)
0,0,0,51,0,110,649,80,4525,70,165,2571166208,32,23,-234,5,1
1,0,1,72,1,0,006,6333333333,1788888889,100,180,3086419753,86,27,-46,10,0
2,0,1,62,1,1115,704,120,1430,80,175,2612244898,72,24,-46,10,0
3,0,0,71,1,517,051,70,3125,80,170,276816609,65,06,-79,13,0
4,1,0,64,0,151844,179,60,0,80,160,3125,42,18,-195,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,0,0,65,1,0,002,8333333333,408,75,165,2754820937,94,2,221,7,0
455,0,1,62,1,738,589,60,385,80,178,252493372,11,74,-153,13,1
456,1,0,73,1,50000,4304,90,1388,60,165,2203856749,128,105,-141,15,1
457,0,1,37,0,4823,132,60,256,70,170,2422145329,56,146,-185,10,0


In [3]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

X_zgon = zgon_df.drop(columns=['zgon'])
y_zgon = zgon_df['zgon']

for col in X_zgon.select_dtypes(include='object').columns:
    X_zgon[col] = X_zgon[col].replace('Nie znaleziono', np.nan)
    X_zgon[col] = X_zgon[col].str.replace(',', '.').astype(float)
    
imputer = SimpleImputer(strategy='mean')
X_zgon = pd.DataFrame(imputer.fit_transform(X_zgon), columns=X_zgon.columns)	

# Split the data
X_train_zgon, X_test_zgon, y_train_zgon, y_test_zgon = train_test_split(
    X_zgon, y_zgon, test_size=0.2, random_state=42, stratify=y_zgon
)

# Pipeline: replace missing values with mean and train a decision tree
clf_zgon = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('tree', DecisionTreeClassifier(max_depth=4, random_state=42))
])
clf_zgon.fit(X_train_zgon, y_train_zgon)

# Evaluate on train
y_train_pred = clf_zgon.predict(X_train_zgon)
acc_train = accuracy_score(y_train_zgon, y_train_pred)
print(f"Train accuracy: {acc_train:.3f}")

# Evaluate on test
y_test_pred = clf_zgon.predict(X_test_zgon)
acc_test = accuracy_score(y_test_zgon, y_test_pred)
print(f"Test accuracy: {acc_test:.3f}")

Train accuracy: 0.763
Test accuracy: 0.674


In [4]:
from sklearn.ensemble import RandomForestClassifier

rf_clf_zgon = RandomForestClassifier(random_state=42, max_depth=3, n_estimators=300)
rf_clf_zgon.fit(X_train_zgon, y_train_zgon)

# Evaluate on train
y_train_pred_rf_zgon = rf_clf_zgon.predict(X_train_zgon)
acc_train_rf_zgon = accuracy_score(y_train_zgon, y_train_pred_rf_zgon)
print(f"Random Forest Train Accuracy: {acc_train_rf_zgon:.3f}")

# Evaluate on test
y_test_pred_rf_zgon = rf_clf_zgon.predict(X_test_zgon)
acc_test_rf_zgon = accuracy_score(y_test_zgon, y_test_pred_rf_zgon)
print(f"Random Forest Test Accuracy: {acc_test_rf_zgon:.3f}")

# Feature importances
rf_importances_zgon = pd.Series(rf_clf_zgon.feature_importances_, index=X_train_zgon.columns)
print("Random Forest Feature importances:")
print(rf_importances_zgon.sort_values(ascending=False))

Random Forest Train Accuracy: 0.777
Random Forest Test Accuracy: 0.761
Random Forest Feature importances:
Lac (1. gaz. 1sza doba)             0.182854
BE (1. gaz. 1sza doba)              0.137004
SOFA - punktacja                    0.136141
WIEK                                0.125826
pao2/fio2 1sza doba                 0.063737
Interleukina 6                      0.057396
Prokalcytonina                      0.055359
BMI                                 0.051958
Glukoza (1. gaz. 1sza doba)         0.050513
MAP 1sza doba                       0.046439
Wzrost (cm)                         0.040422
Waga (kg)                           0.030598
Sepsa (0/1)                         0.011385
Operowany przed przyjęciem (0/1)    0.005398
male sex                            0.004970
dtype: float64


Okej, widzimy że bez feature engineringu mamy 0.76 accuracy i że najważniejsze cechy to Wiek, SOFA i Interleukina  

In [17]:
from itertools import combinations
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

numeric_cols = X_zgon.select_dtypes(include=[np.number]).columns.tolist()

feature_scores = []
feature_data = {}

for col1, col2 in combinations(numeric_cols, 2):
    combinations_dict = {
        f"{col1}_plus_{col2}": X_zgon[col1] + X_zgon[col2],
        f"{col1}_mul_{col2}": X_zgon[col1] * X_zgon[col2],
        f"{col1}_minus_{col2}": X_zgon[col1] - X_zgon[col2],
        f"{col1}_div_{col2}": X_zgon[col1] / (X_zgon[col2].replace(0, np.nan) + 1e-6),
    }

    for op_name, feat in combinations_dict.items():
        score = cross_val_score(
            DecisionTreeClassifier(max_depth=3, random_state=42),
            feat.values.reshape(-1, 1), y_zgon, cv=5, scoring='accuracy'
        ).mean()
        feature_scores.append((op_name, score))
        feature_data[op_name] = feat.fillna(0)

# Sort top features
top_10 = sorted(feature_scores, key=lambda x: x[1], reverse=True)[:10]
top_10_df = pd.DataFrame(top_10, columns=['feature', 'cv_accuracy'])
top_10_df.to_csv('top_10_combined_features.csv', index=False)

# Create full DataFrame of just those 10
X_combined_feats = pd.DataFrame({
    name: feature_data[name] for name, _ in top_10
})

top_10_df

Unnamed: 0,feature,cv_accuracy
0,Lac (1. gaz. 1sza doba)_plus_SOFA - punktacja,0.679623
1,WIEK_mul_Lac (1. gaz. 1sza doba),0.671166
2,Glukoza (1. gaz. 1sza doba)_div_Lac (1. gaz. 1...,0.666842
3,male sex_plus_Lac (1. gaz. 1sza doba),0.662375
4,Lac (1. gaz. 1sza doba)_mul_SOFA - punktacja,0.658003
5,BE (1. gaz. 1sza doba)_minus_SOFA - punktacja,0.65571
6,WIEK_mul_BE (1. gaz. 1sza doba),0.653703
7,BE (1. gaz. 1sza doba)_minus_Sepsa (0/1),0.653703
8,pao2/fio2 1sza doba_div_BE (1. gaz. 1sza doba),0.653679
9,BE (1. gaz. 1sza doba)_mul_SOFA - punktacja,0.653583


In [19]:
X_train_feats = X_combined_feats.loc[X_train_zgon.index]
X_test_feats = X_combined_feats.loc[X_test_zgon.index]
X_train_feats.head()

Unnamed: 0,Lac (1. gaz. 1sza doba)_plus_SOFA - punktacja,WIEK_mul_Lac (1. gaz. 1sza doba),Glukoza (1. gaz. 1sza doba)_div_Lac (1. gaz. 1sza doba),male sex_plus_Lac (1. gaz. 1sza doba),Lac (1. gaz. 1sza doba)_mul_SOFA - punktacja,BE (1. gaz. 1sza doba)_minus_SOFA - punktacja,WIEK_mul_BE (1. gaz. 1sza doba),BE (1. gaz. 1sza doba)_minus_Sepsa (0/1),pao2/fio2 1sza doba_div_BE (1. gaz. 1sza doba),BE (1. gaz. 1sza doba)_mul_SOFA - punktacja
163,15.7,194.4,5.629628,3.7,35.1,-15.8,-201.6,-3.8,-206.428645,-36.4
182,3.1,186.0,6.387095,3.1,0.0,-9.3,-558.0,-10.3,-0.0,-0.0
169,15.3,65.0,7.38461,1.3,18.2,-24.5,-525.0,-10.5,-15.714287,-147.0
311,13.9,95.0,9.105258,1.9,22.8,-16.6,-230.0,-5.6,-29.347832,-55.2
99,19.4,486.0,1.185185,5.4,75.6,-25.3,-1017.0,-12.3,-11.401181,-158.2


In [32]:
from skopt import BayesSearchCV # scikit-optimize
from sklearn.ensemble import RandomForestClassifier
from skopt.space import Integer, Categorical
from sklearn.model_selection import StratifiedKFold

search_space = {
    'max_depth': Integer(2, 15),
    'n_estimators': Integer(100, 5000),
    'max_features': Categorical(['sqrt', 'log2', None]),
}

cv = StratifiedKFold(n_splits=5, shuffle=True)

opt = BayesSearchCV(
    estimator=RandomForestClassifier(),
    search_spaces=search_space,
    n_iter=50,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1
)

opt.fit(X_train_feats, y_train_zgon)

print("Best params:", opt.best_params_)
print(f"Best CV accuracy: {opt.best_score_:.3f}")

Best params: OrderedDict([('max_depth', 2), ('max_features', 'log2'), ('n_estimators', 2487)])
Best CV accuracy: 0.692


In [34]:
results = []
for seed in range(10):
    rf_clf_feats = RandomForestClassifier(max_depth=2, max_features='log2', n_estimators=2500, random_state=seed)
    rf_clf_feats.fit(X_train_feats, y_train_zgon)
    y_test_pred_feats = rf_clf_feats.predict(X_test_feats)
    acc = accuracy_score(y_test_zgon, y_test_pred_feats)
    results.append(acc)

print(f"Max accuracy: {np.max(results):.3f}")
print(f"Min accuracy: {np.min(results):.3f}")
print(f"Average accuracy: {np.mean(results):.3f}")

Max accuracy: 0.750
Min accuracy: 0.717
Average accuracy: 0.734


In [21]:
from catboost import CatBoostClassifier

# Train CatBoost on base features
cat_clf = CatBoostClassifier(verbose=0, random_state=42, max_depth=2, n_estimators=500)
cat_clf.fit(X_train_zgon, y_train_zgon)

# Evaluate on train
y_train_pred_cat = cat_clf.predict(X_train_zgon)
acc_train_cat = accuracy_score(y_train_zgon, y_train_pred_cat)
print(f"CatBoost Train Accuracy: {acc_train_cat:.3f}")

# Evaluate on test
y_test_pred_cat = cat_clf.predict(X_test_zgon)
acc_test_cat = accuracy_score(y_test_zgon, y_test_pred_cat)
print(f"CatBoost Test Accuracy: {acc_test_cat:.3f}")

CatBoost Train Accuracy: 0.777
CatBoost Test Accuracy: 0.750


In [23]:
catboost_results = []
for seed in range(10):
    cat_clf_seed = CatBoostClassifier(verbose=0, random_state=seed, max_depth=2, n_estimators=500)
    cat_clf_seed.fit(X_train_zgon, y_train_zgon)
    y_test_pred_cat_seed = cat_clf_seed.predict(X_test_zgon)
    acc_cat = accuracy_score(y_test_zgon, y_test_pred_cat_seed)
    catboost_results.append(acc_cat)

print(f"CatBoost Max accuracy: {np.max(catboost_results):.3f}")
print(f"CatBoost Min accuracy: {np.min(catboost_results):.3f}")
print(f"CatBoost Average accuracy: {np.mean(catboost_results):.3f}")

CatBoost Max accuracy: 0.772
CatBoost Min accuracy: 0.728
CatBoost Average accuracy: 0.753
