Dieses Notebook orientiert sich an dieser [Quelle](https://www.kaggle.com/code/wissams/titanic-competition-step-by-step-using-xgboost).

**Die Quelle enthält außerdem eine gute explorative Datenanalyse!**

In [1]:
import pandas as pd

# Display every column and every row, no matter how large the DataFrame is
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

# Funktionen
def transform_data(df: pd.DataFrame) -> pd.DataFrame:
    """Derive the engineered passenger features and drop the raw source columns.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    df : pd.DataFrame
        Passenger data containing at least the columns ``PassengerId``,
        ``Name``, ``Sex``, ``SibSp``, ``Parch``, ``Ticket`` and ``Cabin``.
        ``Name`` is split at its first comma (last name) and the remainder at
        its first period (salutation).

    Returns
    -------
    pd.DataFrame
        A new frame with the added columns ``LastName``, ``SalutForm``,
        ``Title``, ``FamilySize``, ``CabinLetter``, ``IsMale`` and
        ``GroupSize``, and without ``PassengerId``, ``Name``, ``Cabin``,
        ``SibSp``, ``Parch``, ``Sex`` and ``Ticket``.
    """
    # Work on a copy so the caller's frame does not silently gain helper columns
    df = df.copy()

    # Separate the last name from the rest of the name. n=1 keeps the result at
    # exactly two columns even when the remainder contains another comma.
    df[["LastName", "FirstName_tmp"]] = df["Name"].str.split(",", n=1, expand=True)

    # Separate the salutation from the first names at the first period
    df[["SalutForm", "FirstName"]] = df["FirstName_tmp"].str.split(".", n=1, expand=True)
    # Remove all spaces, so " the Countess" becomes the key "theCountess"
    df["SalutForm"] = df["SalutForm"].str.replace(" ", "", regex=False)

    # Collapse the salutations into a handful of titles; salutations missing
    # from this mapping end up as NaN in "Title"
    title_mapping = {
        "Mr": "Mr",
        "Mrs": "Mrs",
        "Miss": "Miss",
        "Master": "Master",
        "Don": "Rare",
        "Rev": "Rare",
        "Dr": "Rare",
        "Mme": "Mrs",
        "Ms": "Miss",
        "Major": "Rare",
        "Lady": "Rare",
        "Sir": "Rare",
        "Mlle": "Miss",
        "Col": "Rare",
        "Capt": "Rare",
        "theCountess": "Rare",
        "Jonkheer": "Rare",
        "Dona": "Rare",
    }
    df["Title"] = df["SalutForm"].map(title_mapping)

    # Family size: siblings/spouses + parents/children + the passenger
    df["FamilySize"] = df["SibSp"] + df["Parch"] + 1

    # First letter of the cabin; "U" (unknown) when no cabin is recorded.
    # Assigning the filled result avoids the chained in-place fillna, which
    # does not update the frame under pandas Copy-on-Write.
    df["CabinLetter"] = df["Cabin"].str[:1].fillna("U")

    # Binary "IsMale" flag so that "Sex" does not need one-hot encoding
    df["IsMale"] = (df["Sex"] == "male").astype("int64")

    # Group size: number of passengers that share the same ticket number
    df["GroupSize"] = df.groupby("Ticket")["PassengerId"].transform("count")

    # Drop the columns that are no longer needed
    df = df.drop(
        columns=[
            "PassengerId",
            "FirstName_tmp",
            "Name",
            "Cabin",
            "SibSp",
            "Parch",
            "FirstName",
            "Sex",
            "Ticket",
        ]
    )

    return df

def group_families_fam_size(df: pd.DataFrame):
    """Label rows that share a last name and a family size with a common family id.

    For every distinct ``LastName`` the distinct ``FamilySize`` values are
    numbered in order of first appearance, and each matching row gets the
    label ``"<LastName>_<number>"`` in the column ``Family``.

    The frame is modified in place; nothing is returned.
    """
    last_names = df["LastName"]
    family_sizes = df["FamilySize"]

    for surname in last_names.unique():
        has_surname = last_names == surname

        # Number the family sizes that occur under this surname, starting at 0
        for counter, fam_size in enumerate(family_sizes[has_surname].unique()):
            members = has_surname & (family_sizes == fam_size)
            df.loc[members, "Family"] = f"{surname}_{counter}"

In [2]:
from sklearn import compose, impute, linear_model, preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

def prepare_data(data_dir: str = "../data", test_size: float = 0.2, random_state=42):
    """Load, feature-engineer and preprocess the training and Kaggle test data.

    Both CSV files are concatenated, passed through ``transform_data`` and a
    scikit-learn ``ColumnTransformer`` (mean imputation + scaling for numeric
    columns, most-frequent imputation + one-hot encoding for categorical
    columns), and then separated again. The training part is additionally
    split into a local train and test set.

    Parameters
    ----------
    data_dir : str
        Directory containing ``train.csv`` and ``test.csv``.
    test_size : float
        Share of the training rows held out as the local test set.
    random_state : int or None
        Seed passed to ``train_test_split`` so the local split is
        reproducible across runs. Pass ``None`` for a different split on
        every call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, X_test_kaggle_pipe,
        _X_test_kaggle, df_for_exploration)``: the local split of the
        preprocessed training data, the preprocessed Kaggle test features,
        an unmodified copy of the raw Kaggle test frame, and a copy of the
        feature-engineered (not yet imputed/scaled/encoded) combined frame.
    """
    # Local import: pathlib is only needed here to build the CSV paths
    from pathlib import Path

    data_path = Path(data_dir)

    # Load the training data and flag its rows
    train = pd.read_csv(data_path / "train.csv")
    train["IsKaggleTestData"] = False

    # Load the Kaggle test data. It has to be processed together with the
    # training data, otherwise some columns would be missing for the Kaggle
    # test set.
    X_test_kaggle = pd.read_csv(data_path / "test.csv")
    _X_test_kaggle = X_test_kaggle.copy()

    X_test_kaggle["Survived"] = 0  # dummy label so both frames share the same columns
    X_test_kaggle["IsKaggleTestData"] = True

    # Combine both data sets. The row labels are not reset, so they repeat
    # across the two parts; every selection below therefore uses a boolean mask.
    df = pd.concat([train, X_test_kaggle], axis=0)

    # Engineer the features and keep a copy for exploration
    df = transform_data(df)
    df_for_exploration = df.copy()

    # Column roles.
    # NOTE(review): "Title" and "GroupSize" are created by transform_data but
    # are not listed here, so they never reach the model - confirm this is intended.
    num_cols = ["Age", "Fare", "FamilySize"]
    cat_cols = ["Pclass", "Embarked", "SalutForm", "CabinLetter"]
    dist_col = ["IsKaggleTestData", "IsMale"]

    # Preprocessing pipelines
    numerical_preprocessor = Pipeline(steps=[
        ("imputer", impute.SimpleImputer(strategy="mean")),
        ("scaler", preprocessing.StandardScaler()),
    ])

    categorical_preprocessor = Pipeline(steps=[
        ("imputer", impute.SimpleImputer(strategy="most_frequent")),
        ("onehot", preprocessing.OneHotEncoder(handle_unknown="error", sparse_output=False)),
    ])

    preprocessor = compose.ColumnTransformer(
        transformers=[
            ("numerical", numerical_preprocessor, num_cols),
            ("categorical", categorical_preprocessor, cat_cols),
            ("passthrough", "passthrough", dist_col),
        ]
    )

    # Separate the features from the label
    X = df.drop(["Survived"], axis=1)

    # Preprocess the features.
    # NOTE(review): the preprocessor is fitted on all rows (training and Kaggle
    # test) before the local split, so imputation and scaling statistics also
    # include the rows that later serve as test data.
    preprocessor.set_output(transform="pandas")
    X_pipe = preprocessor.fit_transform(X)

    # Split the preprocessed features back into Kaggle test rows and training
    # rows, removing the helper flag from both
    flag_col = "passthrough__IsKaggleTestData"
    is_kaggle = X_pipe[flag_col].astype(bool)
    X_test_kaggle_pipe = X_pipe.loc[is_kaggle].drop(columns=[flag_col])
    X_train_full_pipe = X_pipe.loc[~is_kaggle].drop(columns=[flag_col])

    # Keep only the real training labels (the Kaggle rows carry the dummy label)
    y_train_full_pipe = df.loc[~df["IsKaggleTestData"], ["Survived"]]

    # Hold out part of the training data as a local test set
    X_train, X_test, y_train, y_test = train_test_split(
        X_train_full_pipe,
        y_train_full_pipe,
        test_size=test_size,
        random_state=random_state,
    )

    # Return the local train/test sets, the Kaggle test features, the raw
    # Kaggle test frame and the exploration frame
    return X_train, X_test, y_train, y_test, X_test_kaggle_pipe, _X_test_kaggle, df_for_exploration

In [3]:
# Build the local train/test split, the preprocessed Kaggle test features,
# the raw Kaggle test frame and the feature-engineered frame used for exploration
X_train, X_test, y_train, y_test, X_test_kaggle_pipe, _X_test_kaggle, df_data_ex = prepare_data()

In [4]:
# Keep only the training rows for exploration and remove the helper flag.
# The guard makes this cell safe to re-run: after the first run the flag
# column is gone, and filtering on it again would raise a KeyError.
if "IsKaggleTestData" in df_data_ex.columns:
    df_data_ex = (
        df_data_ex
        .loc[~df_data_ex["IsKaggleTestData"]]
        .drop(columns=["IsKaggleTestData"])
    )

In [5]:
# Preview the first rows of the exploration frame
df_data_ex.head()

Unnamed: 0,Survived,Pclass,Age,Fare,Embarked,LastName,SalutForm,Title,FamilySize,CabinLetter,IsMale,GroupSize
0,0,3,22.0,7.25,S,Braund,Mr,Mr,2,U,1,1
1,1,1,38.0,71.2833,C,Cumings,Mrs,Mrs,2,C,0,2
2,1,3,26.0,7.925,S,Heikkinen,Miss,Miss,1,U,0,1
3,1,1,35.0,53.1,S,Futrelle,Mrs,Mrs,2,C,0,2
4,0,3,35.0,8.05,S,Allen,Mr,Mr,1,U,1,1


In [6]:
# Frequency of each cabin letter; "U" is the placeholder transform_data assigns when no cabin is recorded
df_data_ex["CabinLetter"].value_counts()

U    687
C     59
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: CabinLetter, dtype: int64