Dieses Notebook orientiert sich an diesem [Beispiel](https://www.kaggle.com/code/arthurtok/introduction-to-ensembling-stacking-in-python) zu Ensembling und Stacking in Python.

In [1]:
import pandas as pd
# Funktionen
def transform_data(df: pd.DataFrame) -> pd.DataFrame:
    """Derive name- and family-based features and drop the raw columns.

    The function works on a copy, so the frame passed in by the caller is
    left untouched.

    Added columns:
        LastName   -- part of ``Name`` before the first comma.
        SalutForm  -- part of the remaining name before the first period.
        FirstName  -- everything after that first period.
        FamilySize -- ``SibSp + Parch + 1``.
        Family     -- placeholder column filled with ``None``.

    Dropped columns: PassengerId, Name, Cabin, SibSp, Parch, Ticket and the
    temporary ``FirstName_tmp``.

    Args:
        df: Frame containing at least the columns Name, SibSp, Parch,
            PassengerId, Cabin and Ticket.

    Returns:
        A new DataFrame with the derived columns added and the raw columns
        removed.
    """
    # Work on a copy so the helper columns never leak into the caller's frame.
    df = df.copy()

    # Separate last name from the rest. n=1 splits on the first comma only,
    # so a name containing further commas still yields exactly two parts.
    # No whitespace is stripped: each part keeps whatever follows the delimiter.
    df[["LastName", "FirstName_tmp"]] = df["Name"].str.split(",", n=1, expand=True)

    # Separate the salutation form from the first name(s) at the first period.
    df[["SalutForm", "FirstName"]] = df["FirstName_tmp"].str.split(".", n=1, expand=True)

    # Family size: siblings/spouses + parents/children + the passenger.
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # Placeholder for the family label; it is not filled in by this function.
    df["Family"] = None

    return df.drop(
        columns=[
            "PassengerId",
            "FirstName_tmp",
            "Name",
            "Cabin",
            "SibSp",
            "Parch",
            "Ticket",
        ]
    )

def group_families_fam_size(df: pd.DataFrame):
    """Label each row with a family id built from last name and family size.

    For every distinct ``LastName`` the distinct ``FamilySize`` values are
    numbered 0, 1, 2, ... in the order in which they first appear, and all
    rows sharing both the last name and the size receive the label
    ``"<LastName>_<number>"`` in the ``Family`` column.

    The frame is modified in place; nothing is returned.

    Args:
        df: Frame with the columns ``LastName`` and ``FamilySize``.
    """
    for last_name in df["LastName"].unique():
        # Rows carrying this last name; the mask stays valid for the whole
        # inner loop because only the "Family" column is written below.
        in_family = df["LastName"] == last_name
        sizes_seen = df.loc[in_family, "FamilySize"].unique()

        for idx, fam_size in enumerate(sizes_seen):
            same_size = df["FamilySize"] == fam_size
            df.loc[same_size & in_family, "Family"] = f"{last_name}_{idx}"

In [2]:
from sklearn import compose, impute, linear_model, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

def prepare_data(data_dir: str = "../data", test_size: float = 0.2, random_state=42):
    """Load, transform and preprocess the training and Kaggle test data.

    Both CSV files are transformed with ``transform_data`` and
    ``group_families_fam_size``, concatenated, and pushed through one
    ``ColumnTransformer`` (imputation + scaling for numerical columns,
    imputation + one-hot encoding for categorical columns). Afterwards the
    Kaggle rows are separated again and the labelled rows are split into a
    local training and test set.

    NOTE(review): the preprocessor is fitted on the training rows and the
    Kaggle test rows together, so imputation values, scaling statistics and
    one-hot categories are derived from both. This is kept as in the original
    design (the one-hot encoder is configured with handle_unknown="error"),
    but it means the local test split is not fully independent of the fit.

    Args:
        data_dir: Directory containing ``train.csv`` and ``test.csv``.
        test_size: Fraction of the labelled rows held out as local test set.
        random_state: Seed forwarded to ``train_test_split`` so that the split
            is reproducible; pass ``None`` for a different split on each call.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test, X_test_kaggle_pipe,
        _X_test_kaggle)`` where the first four items are the local split, the
        fifth holds the preprocessed Kaggle test features and the last is an
        unmodified copy of the raw Kaggle test file.
    """
    # Load the labelled training set.
    train = pd.read_csv(f"{data_dir}/train.csv")

    # Load the Kaggle test set. It is processed together with the training
    # set so that both end up with the same one-hot columns.
    X_test_kaggle = pd.read_csv(f"{data_dir}/test.csv")
    _X_test_kaggle = X_test_kaggle.copy()

    X_test_kaggle["Survived"] = 0  # dummy label so both frames share the same columns
    X_test_kaggle["IsKaggleTestData"] = True

    # Transform both frames and add a flag column that is used further down
    # to pull the Kaggle rows back out.
    train = transform_data(train)
    group_families_fam_size(train)
    X_test_kaggle = transform_data(X_test_kaggle)
    group_families_fam_size(X_test_kaggle)
    train["IsKaggleTestData"] = False

    # Combine both data sets so a single preprocessor sees every category.
    df = pd.concat([train, X_test_kaggle], axis=0)

    # Column roles.
    num_cols = ["Age", "Fare", "FamilySize"]
    cat_cols = ["Pclass", "Sex", "Embarked", "SalutForm", "Family"]
    dist_col = ["IsKaggleTestData"]

    # Preprocessing pipelines per column role.
    numerical_preprocessor = Pipeline(steps=[
        ("imputer", impute.SimpleImputer(strategy="mean")),
        ("scaler", preprocessing.StandardScaler()),
    ])

    categorical_preprocessor = Pipeline(steps=[
        ("imputer", impute.SimpleImputer(strategy="most_frequent")),
        ("onehot", preprocessing.OneHotEncoder(handle_unknown="error", sparse_output=False)),
    ])

    preprocessor = compose.ColumnTransformer(
        transformers=[
            ("numerical", numerical_preprocessor, num_cols),
            ("categorical", categorical_preprocessor, cat_cols),
            ("passthrough", "passthrough", dist_col),
        ]
    )

    # Separate labels from features; the flag column stays in both so each
    # can be filtered independently.
    y = df[["Survived", "IsKaggleTestData"]]
    X = df.drop(columns=["Survived"])

    # Preprocess the features, keeping pandas output so columns stay named.
    preprocessor.set_output(transform="pandas")
    X_pipe = preprocessor.fit_transform(X)

    flag_col = "passthrough__IsKaggleTestData"
    is_kaggle_row = X_pipe[flag_col] == True  # noqa: E712 - element-wise comparison

    # Kaggle test features, without the flag column.
    X_test_kaggle_pipe = X_pipe.loc[is_kaggle_row].drop(columns=[flag_col])

    # Labelled training features, without the flag column.
    X_train_full_pipe = X_pipe.loc[~is_kaggle_row].drop(columns=[flag_col])

    # Real labels only (drops the dummy labels of the Kaggle rows).
    y_train_full_pipe = y.loc[y["IsKaggleTestData"] == False]  # noqa: E712
    y_train_full_pipe = y_train_full_pipe.drop(columns=["IsKaggleTestData"])

    # Hold out part of the labelled data as a local test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_full_pipe,
        y_train_full_pipe,
        test_size=test_size,
        random_state=random_state,
    )

    return X_train, X_test, y_train, y_test, X_test_kaggle_pipe, _X_test_kaggle

In [3]:
# Build the local train/test split, the preprocessed Kaggle test features
# and an untouched copy of the raw Kaggle test file.
X_train, X_test, y_train, y_test, X_test_kaggle_pipe, _X_test_kaggle = prepare_data()

In [4]:
# Summary of the training features (index, column count, dtypes, memory).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 7 to 695
Columns: 948 entries, numerical__Age to categorical__Family_van Melkebeke_0
dtypes: float64(948)
memory usage: 5.2 MB


# Baue die verschiedenen Modelle

In [5]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [6]:
# Hyperparameters for the models

# Random Forest
# warm_start is intentionally not set: with warm_start=True a second call to
# fit() with an unchanged n_estimators grows no new trees, so re-running the
# training cell would silently keep the previously fitted forest.
rf_params = {
    "n_jobs": -1,
    "n_estimators": 500,
    # alternative (disabled): "max_features": 0.2,
    "max_depth": 6,
    "min_samples_leaf": 2,
    "max_features": "sqrt",
    "verbose": 0
}

# Extra Trees
et_params = {
    "n_jobs": -1,
    "n_estimators": 500,
    # alternative (disabled): "max_features": 0.5,
    "max_depth": 8,
    "min_samples_leaf": 2,
    "verbose": 0
}

# AdaBoost
ada_params = {
    "n_estimators": 500,
    "learning_rate": 0.75
}

# Gradient Boosting
gb_params = {
    "n_estimators": 500,
    # alternative (disabled): "max_features": 0.2,
    "max_depth": 5,
    "min_samples_leaf": 2,
    "verbose": 0
}

# Support Vector Classifier
svc_params = {
    "kernel": "linear",
    "C": 0.025
}

# Bagging around a default-configured SVC (does not use svc_params).
bag_svc_params = {
    "estimator": SVC(),
    "n_estimators": 500,
    "max_samples": 50,
    "bootstrap": True,
    "n_jobs": -1
}

# Bagging around a default-configured random forest.
bag_rf_params = {
    "estimator": RandomForestClassifier(),
    "n_estimators": 100,
    "max_samples": 200,
    "bootstrap": True,
    "n_jobs": -1
}

# Bagging around a default-configured decision tree.
bag_dt_params = {
    "estimator": DecisionTreeClassifier(),
    "n_estimators": 100,
    "max_samples": 200,
    "bootstrap": True,
    "n_jobs": -1
}

# Parameter grid for the grid search (see further below)
param_grid = {
    "n_estimators": [50, 100, 200, 300, 350, 500],
    "max_samples": [50, 100, 150, 200, 250, 350],
    "bootstrap": [False, True]
}

In [7]:
# Instantiate the eight estimators from their hyperparameter dictionaries:
# five stand-alone models and three bagging ensembles.
rf = RandomForestClassifier(**rf_params)
et = ExtraTreesClassifier(**et_params)
ada = AdaBoostClassifier(**ada_params)
gb = GradientBoostingClassifier(**gb_params)
svc = SVC(**svc_params)
bag_svc = BaggingClassifier(**bag_svc_params)
bag_rf = BaggingClassifier(**bag_rf_params)
bag_dt = BaggingClassifier(**bag_dt_params)

In [14]:
# Grid search
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid` for the bagged decision trees,
# scored by accuracy with 10-fold cross-validation.
grid_search = GridSearchCV(
    bag_dt,
    param_grid,
    scoring="accuracy",
    cv=10,
    verbose=1,
)

In [9]:
# Registry of all estimators, keyed by the display name used in the reports.
# "GridSearchBest" holds the GridSearchCV object at this point.
models = {
    "RandomForest": rf,
    "ExtraTrees": et,
    "AdaBoost": ada,
    "GradientBoosting": gb,
    "SVC": svc,
    "Bagging_SVC": bag_svc,
    "Bagging_RandomForest": bag_rf,
    "Bagging_DecisionTree": bag_dt,
    "GridSearchBest": grid_search,
}

In [10]:
# Train all of them on the local training split.
# The label frame is flattened to a 1-D array once, before the loop.
train_labels = y_train.to_numpy().ravel()
for model in models.values():
    model.fit(X_train, train_labels)

Fitting 10 folds for each of 72 candidates, totalling 720 fits
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.2s
[CV] END ...bootstrap=False, max_samples=50, n_estimators=50; total time=   0.1s
[CV] END ..bootstrap=False, max_samples=50, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=False, max_samples=50, n_

[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=350; total time=   0.4s
[CV] END .bootstrap=False, max_samples=100, n_estimators=500; total time=   0.6s
[CV] END .bootstrap=False, max_samples=100, n_estimators=500; total time=   0.6s
[CV] END .bootstrap=False, max_samples=100, n_estimators=500; total time=   0.6s
[CV] END .bootstrap=False, m

[CV] END .bootstrap=False, max_samples=200, n_estimators=200; total time=   0.3s
[CV] END .bootstrap=False, max_samples=200, n_estimators=200; total time=   0.3s
[CV] END .bootstrap=False, max_samples=200, n_estimators=200; total time=   0.3s
[CV] END .bootstrap=False, max_samples=200, n_estimators=200; total time=   0.3s
[CV] END .bootstrap=False, max_samples=200, n_estimators=200; total time=   0.3s
[CV] END .bootstrap=False, max_samples=200, n_estimators=200; total time=   0.3s
[CV] END .bootstrap=False, max_samples=200, n_estimators=200; total time=   0.3s
[CV] END .bootstrap=False, max_samples=200, n_estimators=300; total time=   0.4s
[CV] END .bootstrap=False, max_samples=200, n_estimators=300; total time=   0.4s
[CV] END .bootstrap=False, max_samples=200, n_estimators=300; total time=   0.4s
[CV] END .bootstrap=False, max_samples=200, n_estimators=300; total time=   0.4s
[CV] END .bootstrap=False, max_samples=200, n_estimators=300; total time=   0.4s
[CV] END .bootstrap=False, m

[CV] END ..bootstrap=False, max_samples=350, n_estimators=50; total time=   0.1s
[CV] END ..bootstrap=False, max_samples=350, n_estimators=50; total time=   0.2s
[CV] END ..bootstrap=False, max_samples=350, n_estimators=50; total time=   0.2s
[CV] END ..bootstrap=False, max_samples=350, n_estimators=50; total time=   0.2s
[CV] END ..bootstrap=False, max_samples=350, n_estimators=50; total time=   0.1s
[CV] END .bootstrap=False, max_samples=350, n_estimators=100; total time=   0.2s
[CV] END .bootstrap=False, max_samples=350, n_estimators=100; total time=   0.2s
[CV] END .bootstrap=False, max_samples=350, n_estimators=100; total time=   0.2s
[CV] END .bootstrap=False, max_samples=350, n_estimators=100; total time=   0.2s
[CV] END .bootstrap=False, max_samples=350, n_estimators=100; total time=   0.2s
[CV] END .bootstrap=False, max_samples=350, n_estimators=100; total time=   0.2s
[CV] END .bootstrap=False, max_samples=350, n_estimators=100; total time=   0.2s
[CV] END .bootstrap=False, m

[CV] END ...bootstrap=True, max_samples=50, n_estimators=350; total time=   0.4s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=350; total time=   0.4s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=350; total time=   0.4s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, max_samples=50, n_estimators=500; total time=   0.5s
[CV] END ...bootstrap=True, 

[CV] END ..bootstrap=True, max_samples=150, n_estimators=200; total time=   0.3s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.4s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.4s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.4s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.3s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.4s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.4s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.4s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.3s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.4s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=300; total time=   0.4s
[CV] END ..bootstrap=True, max_samples=150, n_estimators=350; total time=   0.4s
[CV] END ..bootstrap=True, m

[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=100; total time=   0.2s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=200; total time=   0.3s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=200; total time=   0.3s
[CV] END ..bootstrap=True, max_samples=250, n_estimators=200; total time=   0.3s
[CV] END ..bootstrap=True, m

[CV] END ..bootstrap=True, max_samples=350, n_estimators=500; total time=   0.7s
[CV] END ..bootstrap=True, max_samples=350, n_estimators=500; total time=   0.7s
[CV] END ..bootstrap=True, max_samples=350, n_estimators=500; total time=   0.7s
[CV] END ..bootstrap=True, max_samples=350, n_estimators=500; total time=   0.7s
[CV] END ..bootstrap=True, max_samples=350, n_estimators=500; total time=   0.7s
[CV] END ..bootstrap=True, max_samples=350, n_estimators=500; total time=   0.7s
[CV] END ..bootstrap=True, max_samples=350, n_estimators=500; total time=   0.7s


In [11]:
# Replace the grid search object by the best model it found
models["GridSearchBest"] = grid_search.best_estimator_
print("Beste Parameter:", grid_search.best_params_)

Beste Parameter: {'bootstrap': False, 'max_samples': 250, 'n_estimators': 100}


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from datetime import datetime
from pathlib import Path

# Evaluate every model: cross-validated accuracy on the training split plus
# accuracy / precision / recall / F1 on the local test split. Each report is
# printed and the collected text is written to a timestamped file.

# Flatten the label frame once instead of on every iteration.
y_train_flat = y_train.to_numpy().ravel()

result_text = ""
for name, model in models.items():
    # cross_val_score fits its own copies of the model on the training split.
    result = cross_val_score(model, X_train, y_train_flat, scoring="accuracy")
    mean = sum(result) / len(result)
    # The test-set metrics use the already fitted model from the registry.
    y_test_predict = model.predict(X_test)

    string = "\n".join([
        f"Modell: {name}",
        f"Durc. Accuracy: {mean} (cross val. auf Trainingsdatensatz)",
        f"Accuracy: \t{accuracy_score(y_test, y_test_predict)} (Testdatensatz)",
        f"Precision: \t{precision_score(y_test, y_test_predict)} (Testdatensatz)",
        f"Recall: \t{recall_score(y_test, y_test_predict)} (Testdatensatz)",
        f"F1 Score: \t{f1_score(y_test, y_test_predict)} (Testdatensatz)",
    ])
    print(string, "\n")
    result_text = f"{result_text}\n{string}"

# Make sure the target directory exists so the write cannot fail after the
# whole evaluation has already run.
submissions_dir = Path("../submissions")
submissions_dir.mkdir(parents=True, exist_ok=True)
report_path = submissions_dir / f"{datetime.today().strftime('%Y-%m-%d %H_%M_%S')}_last values.txt"

# The with-block closes the file; no explicit close() is needed.
with open(report_path, "w") as file:
    file.write(result_text)


Modell: RandomForest
Durc. Accuracy: 0.7429331232148134 (cross val. auf Trainingsdatensatz)
Accuracy: 	0.7262569832402235 (Testdatensatz)
Precision: 	0.9354838709677419 (Testdatensatz)
Recall: 	0.3815789473684211 (Testdatensatz)
F1 Score: 	0.5420560747663552 (Testdatensatz) 

Modell: ExtraTrees
Durc. Accuracy: 0.8132079188417217 (cross val. auf Trainingsdatensatz)
Accuracy: 	0.7653631284916201 (Testdatensatz)
Precision: 	0.7833333333333333 (Testdatensatz)
Recall: 	0.618421052631579 (Testdatensatz)
F1 Score: 	0.6911764705882353 (Testdatensatz) 

Modell: AdaBoost
Durc. Accuracy: 0.8413079877868611 (cross val. auf Trainingsdatensatz)
Accuracy: 	0.7877094972067039 (Testdatensatz)
Precision: 	0.7878787878787878 (Testdatensatz)
Recall: 	0.6842105263157895 (Testdatensatz)
F1 Score: 	0.732394366197183 (Testdatensatz) 

Modell: GradientBoosting
Durc. Accuracy: 0.8272234807446075 (cross val. auf Trainingsdatensatz)
Accuracy: 	0.7932960893854749 (Testdatensatz)
Precision: 	0.782608695652174 (Test

# => Bagging (RandomForest) und Bagging (SVC) sind vielversprechend

In [13]:
# Predictions for Kaggle

name = "Bagging_RandomForest"

# Fail early if the chosen name is not a registered model
if name not in models:
    raise KeyError(f"Das Modell {name} ist nicht im Dictionary!")

# Predict on the preprocessed Kaggle features and pair the predictions with
# the passenger ids from the untouched raw test file.
preds_kaggle = models[name].predict(X_test_kaggle_pipe)
output = pd.DataFrame(
    {
        "PassengerId": _X_test_kaggle["PassengerId"],
        "Survived": preds_kaggle,
    }
)
output_path = f"../submissions/{datetime.today().strftime('%Y-%m-%d %H_%M_%S')} {name}.csv"
# output.to_csv(output_path, index=False)