Wczytanie Danych

In [1]:
import pandas as pd

df = pd.read_csv('Dane_mpsi.csv',sep='\t', encoding='utf-8')

Najprostsza predykcja zgonu

In [2]:
zgon_df = df.drop(columns=['KG', 'follow up 30 dni'])
zgon_df

Unnamed: 0,zgon,male sex,WIEK,Operowany przed przyjęciem (0/1),Interleukina 6,Prokalcytonina,MAP 1sza doba,pao2/fio2 1sza doba,Waga (kg),Wzrost (cm),BMI,Glukoza (1. gaz. 1sza doba),Lac (1. gaz. 1sza doba),BE (1. gaz. 1sza doba),SOFA - punktacja,Sepsa (0/1)
0,0,0,51,0,110,649,80,4525,70,165,2571166208,32,23,-234,5,1
1,0,1,72,1,0,006,6333333333,1788888889,100,180,3086419753,86,27,-46,10,0
2,0,1,62,1,1115,704,120,1430,80,175,2612244898,72,24,-46,10,0
3,0,0,71,1,517,051,70,3125,80,170,276816609,65,06,-79,13,0
4,1,0,64,0,151844,179,60,0,80,160,3125,42,18,-195,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,0,0,65,1,0,002,8333333333,408,75,165,2754820937,94,2,221,7,0
455,0,1,62,1,738,589,60,385,80,178,252493372,11,74,-153,13,1
456,1,0,73,1,50000,4304,90,1388,60,165,2203856749,128,105,-141,15,1
457,0,1,37,0,4823,132,60,256,70,170,2422145329,56,146,-185,10,0


In [3]:
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer

X_zgon = zgon_df.drop(columns=['zgon'])
y_zgon = zgon_df['zgon']

for col in X_zgon.select_dtypes(include='object').columns:
    X_zgon[col] = X_zgon[col].replace('Nie znaleziono', np.nan)
    X_zgon[col] = X_zgon[col].str.replace(',', '.').astype(float)
    
imputer = SimpleImputer(strategy='mean')
X_zgon = pd.DataFrame(imputer.fit_transform(X_zgon), columns=X_zgon.columns)	

# Split the data
X_train_zgon, X_test_zgon, y_train_zgon, y_test_zgon = train_test_split(
    X_zgon, y_zgon, test_size=0.2, random_state=42, stratify=y_zgon
)

# Pipeline: replace missing values with mean and train a decision tree
clf_zgon = DecisionTreeClassifier(max_depth=4, random_state=42)
clf_zgon.fit(X_train_zgon, y_train_zgon)

# Evaluate on train
y_train_pred = clf_zgon.predict(X_train_zgon)
acc_train = accuracy_score(y_train_zgon, y_train_pred)
print(f"Train accuracy: {acc_train:.3f}")

# Evaluate on test
y_valid_pred = clf_zgon.predict(X_test_zgon)
acc_test = accuracy_score(y_test_zgon, y_valid_pred)
print(f"Test accuracy: {acc_test:.3f}")

Train accuracy: 0.763
Test accuracy: 0.674


In [4]:
from sklearn.ensemble import RandomForestClassifier

rf_clf_zgon = RandomForestClassifier(random_state=42, max_depth=3, n_estimators=300)
rf_clf_zgon.fit(X_train_zgon, y_train_zgon)

# Evaluate on train
y_train_pred_rf_zgon = rf_clf_zgon.predict(X_train_zgon)
acc_train_rf_zgon = accuracy_score(y_train_zgon, y_train_pred_rf_zgon)
print(f"Random Forest Train Accuracy: {acc_train_rf_zgon:.3f}")

# Evaluate on test
y_test_pred_rf_zgon = rf_clf_zgon.predict(X_test_zgon)
acc_test_rf_zgon = accuracy_score(y_test_zgon, y_test_pred_rf_zgon)
print(f"Random Forest Test Accuracy: {acc_test_rf_zgon:.3f}")

# Feature importances
rf_importances_zgon = pd.Series(rf_clf_zgon.feature_importances_, index=X_train_zgon.columns)
print("Random Forest Feature importances:")
print(rf_importances_zgon.sort_values(ascending=False))

Random Forest Train Accuracy: 0.777
Random Forest Test Accuracy: 0.761
Random Forest Feature importances:
Lac (1. gaz. 1sza doba)             0.182854
BE (1. gaz. 1sza doba)              0.137004
SOFA - punktacja                    0.136141
WIEK                                0.125826
pao2/fio2 1sza doba                 0.063737
Interleukina 6                      0.057396
Prokalcytonina                      0.055359
BMI                                 0.051958
Glukoza (1. gaz. 1sza doba)         0.050513
MAP 1sza doba                       0.046439
Wzrost (cm)                         0.040422
Waga (kg)                           0.030598
Sepsa (0/1)                         0.011385
Operowany przed przyjęciem (0/1)    0.005398
male sex                            0.004970
dtype: float64


Okej, widzimy że bez feature engineringu mamy 0.76 accuracy i że najważniejsze cechy to Wiek, SOFA i Interleukina  

### Feature Engineering

Trenujemy DecisionTreeClassifier i patrzymy na to które kombinacje cech dały najlepszy wynik

In [5]:
from itertools import combinations
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import pandas as pd

# Identify numeric columns
df = X_zgon.copy()
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist() # empty

feature_scores = []
feature_data = {}
clf = DecisionTreeClassifier(max_depth=3, random_state=42)

# Base features
for col in numeric_cols:
    arr = df[col].fillna(0).values.reshape(-1, 1)
    score = cross_val_score(clf, arr, y_zgon, cv=3, scoring='accuracy', n_jobs=-1).mean()
    feature_scores.append((col, score))
    feature_data[col] = df[col].fillna(0)

# 1. Simple pairwise combinations
for col1, col2 in combinations(numeric_cols, 2):
    combos = {
        f"{col1}_plus_{col2}": df[col1] + df[col2],
        f"{col1}_mul_{col2}": df[col1] * df[col2],
        f"{col1}_minus_{col2}": df[col1] - df[col2],
        f"{col1}_div_{col2}": df[col1] / (df[col2].replace(0, np.nan) + 1e-6),
    }
    for name, feat in combos.items():
        arr = feat.fillna(0).values.reshape(-1, 1)
        score = cross_val_score(clf, arr, y_zgon, cv=3, scoring='accuracy', n_jobs=-1).mean()
        feature_scores.append((name, score))
        feature_data[name] = feat.fillna(0)

# 2. Polynomial features (degree=3)
poly = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly.fit_transform(df[numeric_cols])
poly_names = poly.get_feature_names_out(numeric_cols)
for idx, name in enumerate(poly_names):
    arr = X_poly[:, idx].reshape(-1, 1)
    score = cross_val_score(clf, arr, y_zgon, cv=3, scoring='accuracy', n_jobs=-1).mean()
    feature_scores.append((f"poly_{name}", score))
    feature_data[f"poly_{name}"] = X_poly[:, idx]

# 3. Deviation from row stats
row_mean = df[numeric_cols].mean(axis=1)
row_std = df[numeric_cols].std(axis=1) + 1e-6
for col in numeric_cols:
    minus_mean = df[col] - row_mean
    zscore = (df[col] - row_mean) / row_std
    for name, feat in [(f"{col}_minus_mean", minus_mean), (f"{col}_zscore", zscore)]:
        arr = feat.values.reshape(-1, 1)
        score = cross_val_score(clf, arr, y_zgon, cv=3, scoring='accuracy', n_jobs=-1).mean()
        feature_scores.append((name, score))
        feature_data[name] = feat

# Sort and select top 30
top_30 = sorted(feature_scores, key=lambda x: x[1], reverse=True)[:30]
top_30_df = pd.DataFrame(top_30, columns=['feature', 'cv_accuracy'])
top_30_df.to_csv('top_30_combined_features.csv', index=False)
top_30_df

Unnamed: 0,feature,cv_accuracy
0,poly_WIEK Lac (1. gaz. 1sza doba)^2,0.675381
1,Lac (1. gaz. 1sza doba)_plus_SOFA - punktacja,0.673203
2,WIEK_mul_Lac (1. gaz. 1sza doba),0.671024
3,poly_WIEK Lac (1. gaz. 1sza doba),0.671024
4,poly_WIEK^2 Lac (1. gaz. 1sza doba),0.668845
5,poly_Waga (kg) Lac (1. gaz. 1sza doba) BE (1. ...,0.668845
6,Glukoza (1. gaz. 1sza doba)_div_Lac (1. gaz. 1...,0.664488
7,BE (1. gaz. 1sza doba)_minus_Sepsa (0/1),0.662309
8,poly_WIEK Wzrost (cm) Lac (1. gaz. 1sza doba),0.662309
9,Lac (1. gaz. 1sza doba)_mul_SOFA - punktacja,0.662309


Wybieramy ręcznie część z nich (tak aby nie było za dużo powtórek)

In [6]:
selected_features = [
    'poly_WIEK Lac (1. gaz. 1sza doba)^2',
    'Lac (1. gaz. 1sza doba)_plus_SOFA - punktacja',
    'poly_Waga (kg) Lac (1. gaz. 1sza doba) BE (1. gaz. 1sza doba)',
    'Glukoza (1. gaz. 1sza doba)_div_Lac (1. gaz. 1sza doba)',
    'BE (1. gaz. 1sza doba)_minus_Sepsa (0/1)',
    'poly_Wzrost (cm) Lac (1. gaz. 1sza doba)^2',
    'male sex_plus_Lac (1. gaz. 1sza doba)',
    'poly_WIEK Wzrost (cm) BE (1. gaz. 1sza doba)'
]

X_top_30_feats = pd.DataFrame({name: feature_data[name] for name, _ in top_30})
X_selected_feats = X_top_30_feats[[col for col in selected_features if col in X_top_30_feats.columns]]

X_train_feats = X_selected_feats.loc[X_train_zgon.index]
X_test_feats = X_selected_feats.loc[X_test_zgon.index]
X_train_feats.head()

Unnamed: 0,poly_WIEK Lac (1. gaz. 1sza doba)^2,Lac (1. gaz. 1sza doba)_plus_SOFA - punktacja,poly_Waga (kg) Lac (1. gaz. 1sza doba) BE (1. gaz. 1sza doba),Glukoza (1. gaz. 1sza doba)_div_Lac (1. gaz. 1sza doba),BE (1. gaz. 1sza doba)_minus_Sepsa (0/1),poly_Wzrost (cm) Lac (1. gaz. 1sza doba)^2,male sex_plus_Lac (1. gaz. 1sza doba),poly_WIEK Wzrost (cm) BE (1. gaz. 1sza doba)
163,524.88,15.7,-756.0,5.629628,-3.8,1312.2,3.7,-36288.0
182,576.6,3.1,-2594.7,6.387095,-10.3,1537.6,3.1,-89280.0
169,84.5,15.3,-1092.0,7.38461,-10.5,278.85,1.3,-86625.0
311,180.5,13.9,-655.5,9.105258,-5.6,577.6,1.9,-36800.0
99,2624.4,19.4,-5186.7,1.185185,-12.3,4519.8,5.4,-157635.0


In [7]:
from skopt import BayesSearchCV # scikit-optimize
from skopt.space import Integer, Categorical
from sklearn.model_selection import StratifiedKFold

search_space = {
    'max_depth': Integer(2, 15),
    'n_estimators': Integer(100, 2000),
    'max_features': Categorical(['sqrt', 'log2', None]),
}

opt = BayesSearchCV(
    estimator=RandomForestClassifier(),
    search_spaces=search_space,
    n_iter=30,
    scoring='accuracy',
    cv=StratifiedKFold(n_splits=5, shuffle=True),
    n_jobs=-1
)

opt.fit(X_train_feats, y_train_zgon)

print("Best params:", opt.best_params_)
print(f"Best CV accuracy: {opt.best_score_:.3f}")

Best params: OrderedDict([('max_depth', 6), ('max_features', None), ('n_estimators', 405)])
Best CV accuracy: 0.684


In [8]:
results = []
for seed in range(10):
    rf_clf_feats = RandomForestClassifier(max_depth=4, max_features='log2', n_estimators=1500, random_state=seed)
    rf_clf_feats.fit(X_train_feats, y_train_zgon)
    y_valid_pred_feats = rf_clf_feats.predict(X_test_feats)
    acc = accuracy_score(y_test_zgon, y_valid_pred_feats)
    results.append(acc)

print(f"Max accuracy: {np.max(results):.3f}")
print(f"Min accuracy: {np.min(results):.3f}")
print(f"Average accuracy: {np.mean(results):.3f}")

Max accuracy: 0.739
Min accuracy: 0.739
Average accuracy: 0.739


In [9]:
from catboost import CatBoostClassifier

# Train CatBoost on base features
cat_clf = CatBoostClassifier(verbose=0, random_state=42, max_depth=2, n_estimators=500)
cat_clf.fit(X_train_feats, y_train_zgon)

# Evaluate on train
y_train_pred_cat = cat_clf.predict(X_train_feats)
acc_train_cat = accuracy_score(y_train_zgon, y_train_pred_cat)
print(f"CatBoost Train Accuracy: {acc_train_cat:.3f}")

# Evaluate on test
y_valid_pred_feats = cat_clf.predict(X_test_feats)
acc_test_cat = accuracy_score(y_test_zgon, y_valid_pred_feats)
print(f"CatBoost Test Accuracy: {acc_test_cat:.3f}")

CatBoost Train Accuracy: 0.741
CatBoost Test Accuracy: 0.696


In [10]:
catboost_results = []
for seed in range(10):
    cat_clf_seed = CatBoostClassifier(verbose=0, random_state=seed, max_depth=2, n_estimators=500)
    cat_clf_seed.fit(X_train_feats, y_train_zgon)
    y_valid_pred_feats = cat_clf_seed.predict(X_test_feats)
    acc_cat = accuracy_score(y_test_zgon, y_valid_pred_feats)
    catboost_results.append(acc_cat)

print(f"CatBoost Max accuracy: {np.max(catboost_results):.3f}")
print(f"CatBoost Min accuracy: {np.min(catboost_results):.3f}")
print(f"CatBoost Average accuracy: {np.mean(catboost_results):.3f}")

CatBoost Max accuracy: 0.717
CatBoost Min accuracy: 0.685
CatBoost Average accuracy: 0.704


Używając bazowych cech + kombinacji

In [11]:
base_combined_features = numeric_cols + selected_features
X_feats = pd.DataFrame({name: feature_data[name] for name, _ in feature_scores})
X_selected_combined_feats = X_feats[[col for col in base_combined_features if col in X_feats.columns]]

X_train_combined_feats = X_selected_combined_feats.loc[X_train_zgon.index]
X_test_combined_feats = X_selected_combined_feats.loc[X_test_zgon.index]

X_train_combined_feats.head()

Unnamed: 0,male sex,WIEK,Operowany przed przyjęciem (0/1),Interleukina 6,Prokalcytonina,MAP 1sza doba,pao2/fio2 1sza doba,Waga (kg),Wzrost (cm),BMI,...,SOFA - punktacja,Sepsa (0/1),poly_WIEK Lac (1. gaz. 1sza doba)^2,Lac (1. gaz. 1sza doba)_plus_SOFA - punktacja,poly_Waga (kg) Lac (1. gaz. 1sza doba) BE (1. gaz. 1sza doba),Glukoza (1. gaz. 1sza doba)_div_Lac (1. gaz. 1sza doba),BE (1. gaz. 1sza doba)_minus_Sepsa (0/1),poly_Wzrost (cm) Lac (1. gaz. 1sza doba)^2,male sex_plus_Lac (1. gaz. 1sza doba),poly_WIEK Wzrost (cm) BE (1. gaz. 1sza doba)
163,1.0,72.0,1.0,390.0,3.59,63.333333,578.0,100.0,180.0,30.864198,...,13.0,1.0,524.88,15.7,-756.0,5.629628,-3.8,1312.2,3.7,-36288.0
182,0.0,60.0,0.0,516.0,100.0,63.333333,0.0,90.0,160.0,35.15625,...,0.0,1.0,576.6,3.1,-2594.7,6.387095,-10.3,1537.6,3.1,-89280.0
169,0.0,50.0,0.0,0.0,1.07,83.333333,165.0,80.0,165.0,29.384757,...,14.0,0.0,84.5,15.3,-1092.0,7.38461,-10.5,278.85,1.3,-86625.0
311,0.0,50.0,1.0,675.0,5.44,86.666667,135.0,75.0,160.0,29.296875,...,12.0,1.0,180.5,13.9,-655.5,9.105258,-5.6,577.6,1.9,-36800.0
99,0.0,90.0,1.0,24944.0,11.7,73.333333,128.833333,85.0,155.0,35.379813,...,14.0,1.0,2624.4,19.4,-5186.7,1.185185,-12.3,4519.8,5.4,-157635.0


In [12]:
results = []
for seed in range(10):
    rf_clf_feats = RandomForestClassifier(max_depth=4, max_features='log2', n_estimators=1500, random_state=seed)
    rf_clf_feats.fit(X_train_combined_feats, y_train_zgon)
    y_valid_pred_feats = rf_clf_feats.predict(X_test_combined_feats)
    acc = accuracy_score(y_test_zgon, y_valid_pred_feats)
    results.append(acc)

print(f"Max accuracy: {np.max(results):.3f}")
print(f"Min accuracy: {np.min(results):.3f}")
print(f"Average accuracy: {np.mean(results):.3f}")

Max accuracy: 0.728
Min accuracy: 0.717
Average accuracy: 0.723


In [13]:
catboost_results = []
for seed in range(10):
    cat_clf_seed = CatBoostClassifier(verbose=0, random_state=seed, max_depth=2, n_estimators=500)
    cat_clf_seed.fit(X_train_combined_feats, y_train_zgon)
    y_valid_pred_feats = cat_clf_seed.predict(X_test_combined_feats)
    acc_cat = accuracy_score(y_test_zgon, y_valid_pred_feats)
    catboost_results.append(acc_cat)

print(f"CatBoost Max accuracy: {np.max(catboost_results):.3f}")
print(f"CatBoost Min accuracy: {np.min(catboost_results):.3f}")
print(f"CatBoost Average accuracy: {np.mean(catboost_results):.3f}")

CatBoost Max accuracy: 0.739
CatBoost Min accuracy: 0.674
CatBoost Average accuracy: 0.707


## Ensemble

Czyli feature engineering nie działa lepiej. Spróbujmy więc ulepszyć model używając tylko bazowych cech i kilku 
różnych modeli głosujących

In [50]:
import numpy as np
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier,
    ExtraTreesClassifier, BaggingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
)
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier

seed = 42

models = {
    'rf': (
        RandomForestClassifier(random_state=seed),
        {
            'n_estimators': [100, 200, 300, 500],
            'max_depth': [3, 5, 7, 10],
            'max_features': ['sqrt', 'log2'],
            'min_samples_split': [2, 5, 7, 10],
        }
    ),
    'gb': (
        GradientBoostingClassifier(random_state=seed),
        {
            'n_estimators': [100, 500],
            'learning_rate': [0.1, 0.01],
            'max_depth': [3, 7],
            'subsample': [0.7, 1.0],
        }
    ),
    'lr': (
        LogisticRegression(solver='saga', max_iter=20000, random_state=seed),
        {
            'C': np.logspace(-3, 1, 5),
            'penalty': ['l1', 'l2'],
        }
    ),
    'svc': (
        SVC(kernel='rbf', probability=True, random_state=seed),
        {
            'C': [0.1, 1, 10, 100],
            'gamma': ['scale', 'auto', 0.01],
        }
    ),
    'knn': (
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 5, 9],
            'weights': ['uniform', 'distance'],
            'p': [1, 2],
        }
    ),
    'adb': (
        AdaBoostClassifier(random_state=seed),
        {
            'n_estimators': [50, 100, 500],
            'learning_rate': [0.01, 0.1, 1.0],
        }
    ),
    'et': (
        ExtraTreesClassifier(random_state=seed),
        {
            'n_estimators': [100, 300, 500],
            'max_depth': [3, 5, 10],
            'max_features': ['sqrt', 'log2'],
        }
    ),
    'bag': (
        BaggingClassifier(random_state=seed),
        {
            'n_estimators': [10, 100],
            'max_samples': [0.5, 1.0],
            'max_features': [0.5, 1.0],
        }
    ),
    'cat': (
        CatBoostClassifier(
            verbose=0,
            random_state=seed,
            depth=2,
            iterations=500
        ),
        {
            'learning_rate': [0.01, 0.1],
            'l2_leaf_reg': [1, 3, 5],
            'border_count': [32, 64]
        }
    ),
    'gnb': (
        GaussianNB(),
        {}
    ),
    'lda': (
        LinearDiscriminantAnalysis(),
        {
            'solver': ['svd', 'lsqr', 'eigen']
        }
    ),
    'qda': (
        QuadraticDiscriminantAnalysis(),
        {
            'reg_param': [0.0, 0.1, 0.5]
        }
    ),
    'mlp': (
        MLPClassifier(max_iter=5000, random_state=seed),
        {
            'hidden_layer_sizes': [(30,), (30, 30)],
            'alpha': [1e-4, 1e-3, 1e-2],
            'learning_rate_init': [0.01],
        }
    ),
}

In [51]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score

best_estimators = {}
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

for name, (estimator, param_grid) in models.items():  # unpack pair here
    gs = GridSearchCV(estimator, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    gs.fit(X_train_zgon, y_train_zgon)

    best = gs.best_estimator_
    best_estimators[name] = best

    cv_acc = gs.best_score_
    train_acc = accuracy_score(y_train_zgon, best.predict(X_train_zgon))

    print(f"{name:4s} | CV acc: {cv_acc:.3f} | Train acc: {train_acc:.3f} | params: {gs.best_params_}")

rf   | CV acc: 0.684 | Train acc: 0.875 | params: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 200}
gb   | CV acc: 0.657 | Train acc: 0.918 | params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500, 'subsample': 1.0}
lr   | CV acc: 0.542 | Train acc: 0.537 | params: {'C': np.float64(0.001), 'penalty': 'l1'}
svc  | CV acc: 0.539 | Train acc: 0.561 | params: {'C': 100, 'gamma': 'scale'}
knn  | CV acc: 0.589 | Train acc: 0.747 | params: {'n_neighbors': 5, 'p': 1, 'weights': 'uniform'}
adb  | CV acc: 0.651 | Train acc: 0.768 | params: {'learning_rate': 1.0, 'n_estimators': 50}
et   | CV acc: 0.676 | Train acc: 0.978 | params: {'max_depth': 10, 'max_features': 'sqrt', 'n_estimators': 500}
bag  | CV acc: 0.673 | Train acc: 0.970 | params: {'max_features': 1.0, 'max_samples': 0.5, 'n_estimators': 100}
cat  | CV acc: 0.673 | Train acc: 0.741 | params: {'border_count': 32, 'l2_leaf_reg': 3, 'learning_rate': 0.01}
gnb  | CV acc: 0.624 | Train acc: 

In [52]:
from sklearn.ensemble import VotingClassifier

ensemble = VotingClassifier(
    estimators=[(n, est) for n, est in best_estimators.items()],
    voting='soft',
    n_jobs=-1
)
ensemble.fit(X_train_zgon, y_train_zgon)

for split, (X_, y_) in [
    ('Train', (X_train_zgon, y_train_zgon)),
    ('Test',  (X_test_zgon, y_test_zgon))
]:
    acc = accuracy_score(y_, ensemble.predict(X_))
    print(f"{split:5s} accuracy: {acc:.3f}")

Train accuracy: 0.828
Test  accuracy: 0.685


Tylko wybrane

In [55]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

selected = ['cat', 'knn', 'lda', 'lr', 'qda']
estimators = [(name, best_estimators[name]) for name in selected]

ensemble = VotingClassifier(
    estimators=estimators,
    voting='soft',
    n_jobs=-1
)
ensemble.fit(X_train_zgon, y_train_zgon)

for split, (X_, y_) in [
    ('Train', (X_train_zgon, y_train_zgon)),
    ('Valid',  (X_test_zgon, y_test_zgon))
]:
    acc = accuracy_score(y_, ensemble.predict(X_))
    print(f"{split:5s} accuracy: {acc:.3f}")

Train accuracy: 0.760
Valid accuracy: 0.750


In [56]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
import random

random.seed(42)
model_names = list(best_estimators.keys())
combos = set()
while len(combos) < 500:
    size = random.randint(2, 7)
    combos.add(tuple(sorted(random.sample(model_names, size))))

# CV on train for each combo
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []
for combo in combos:
    ens = VotingClassifier(
        estimators=[(n, best_estimators[n]) for n in combo],
        voting='soft', n_jobs=-1
    )
    scores = cross_val_score(
        ens, X_train_zgon, y_train_zgon,
        cv=cv, scoring='accuracy', n_jobs=-1
    )
    cv_results.append((combo, scores.mean(), scores.std()))

# Sort by mean CV accuracy
cv_results.sort(key=lambda x: x[1], reverse=True)

# Display top 10
for combo, mean_acc, std_acc in cv_results[:10]:
    print(f"{combo} | CV mean: {mean_acc:.3f} ± {std_acc:.3f}")

('adb', 'rf') | CV mean: 0.676 ± 0.055
('lda', 'rf') | CV mean: 0.673 ± 0.023
('adb', 'bag', 'cat', 'et', 'gb', 'lr', 'rf') | CV mean: 0.670 ± 0.041
('bag', 'lda') | CV mean: 0.670 ± 0.028
('adb', 'bag', 'gb', 'rf') | CV mean: 0.668 ± 0.046
('bag', 'cat', 'et') | CV mean: 0.668 ± 0.036
('cat', 'lda', 'svc') | CV mean: 0.668 ± 0.019
('bag', 'rf') | CV mean: 0.665 ± 0.032
('bag', 'cat', 'gnb', 'lr', 'rf') | CV mean: 0.665 ± 0.031
('cat', 'lr') | CV mean: 0.665 ± 0.051


In [61]:
# Re‐fit the best ensemble on full train and evaluate on test
best_combo, _, _ = cv_results[0]
best_ens = VotingClassifier(
    estimators=[(n, best_estimators[n]) for n in best_combo],
    voting='soft', n_jobs=-1
)

best_ens.fit(X_train_zgon, y_train_zgon)
print("Final Test accuracy:",
      accuracy_score(y_test_zgon, best_ens.predict(X_test_zgon)))

Final Test accuracy: 0.75
