In [2]:
from pathlib import Path
import pandas as pd

train_set = pd.read_csv(Path("data/train.csv"))
test_set = pd.read_csv(Path("data/test.csv"))

In [3]:
train_set.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [4]:
working_set = train_set.copy()
working_set_labels = working_set["Survived"].copy()
working_set.drop(columns=["Survived"], inplace=True)

In [176]:
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FunctionTransformer, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

embarked_pipeline = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(drop="first", sparse_output=False))

age_pipeline = make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(
            lambda X: pd.DataFrame({
                    # "Age": pd.Series(X[:, 0]), 
                    "IsBaby": pd.cut(pd.Series(X[:, 0]), bins=[0, 2, float('inf')], labels=[1, 0]),
                }),
            feature_names_out=lambda X1, X2: ["IsBaby"])
        )

family_fare_pipeline = make_pipeline(
        FunctionTransformer(
            lambda X: pd.DataFrame({
                "FamilySize": X.iloc[:, 0] + X.iloc[:, 1] + 1,
                "FarePerPerson": X.iloc[:, 2] / (X.iloc[:, 0] + X.iloc[:, 1] + 1),
                # "IsAlone": (X.iloc[:, 0] + X.iloc[:, 1] == 0).astype(int)
            }),
            feature_names_out=lambda X1, X2: ["FamilySize", "FarePerPerson"]
        ),
        # ColumnTransformer(
        #     transformers=[
        #         ("scale_fare", StandardScaler(), ["FarePerPerson"]),
        #         ("passthrough", "passthrough", ["FamilySize"])
        #     ]
        # )
)

def cabin_transform(X):
    return pd.DataFrame(X).map(lambda x: 'F' if x[0] == 'T' else x[0])

cabin_pipeline = make_pipeline(
        SimpleImputer(strategy="constant", fill_value="G"),
        FunctionTransformer(
            cabin_transform,
            feature_names_out=lambda X1, X2: ["Cabin"]),
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
        # OrdinalEncoder(categories=[list("ABCDEFG")], dtype=int)
        )

name_pipeline = make_pipeline(
        FunctionTransformer(
            lambda X: pd.DataFrame({
                "IsMaster": X.iloc[:, 0].str.extract(r"([A-Za-z]+)\.")[0] == "Master",
            }),
            feature_names_out=lambda X1, X2: ["IsMaster"]
        ),
    )

class_pipeline = make_pipeline(
        OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    )
preprocessor = ColumnTransformer(
    transformers=[
        ("age", age_pipeline, ["Age"]),
        ("family_fare", family_fare_pipeline, ["SibSp", "Parch", "Fare"]),
        # ("impute_onehot", embarked_pipeline, ["Embarked"]),
        # ("cabin", cabin_pipeline, ["Cabin"]),
        ("sex_encoder", OrdinalEncoder(categories=[["male", "female"]]), ["Sex"]),
        ("class", "passthrough", ["Pclass"]),
        # ("name", name_pipeline, ["Name"]),
    ]
)

working_set_pp = pd.DataFrame(preprocessor.fit_transform(working_set), columns=preprocessor.get_feature_names_out())
working_set_pp.head(10)

Unnamed: 0,age__IsBaby,family_fare__FamilySize,family_fare__FarePerPerson,sex_encoder__Sex,class__Pclass
0,0.0,2.0,3.625,0.0,3.0
1,0.0,2.0,35.64165,1.0,1.0
2,0.0,1.0,7.925,1.0,3.0
3,0.0,2.0,26.55,1.0,1.0
4,0.0,1.0,8.05,0.0,3.0
5,0.0,1.0,8.4583,0.0,3.0
6,0.0,1.0,51.8625,0.0,1.0
7,1.0,5.0,4.215,0.0,3.0
8,0.0,3.0,3.7111,1.0,3.0
9,0.0,2.0,15.0354,1.0,2.0


In [175]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score

# Define the Random Forest model
rf_clf = RandomForestClassifier(random_state=42)

# Set up k-fold cross-validation
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(rf_clf, working_set_pp, working_set_labels, cv=kfold, scoring='accuracy')

# Print the accuracy for each fold and the mean accuracy
print(f"Cross-Validation Accuracies: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean()}")

Cross-Validation Accuracies: [0.83333333 0.83146067 0.78651685 0.79775281 0.78651685 0.82022472
 0.83146067 0.7752809  0.80898876 0.85393258]
Mean Accuracy: 0.8125468164794007


In [177]:
from sklearn.svm import SVC

svc = SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

cv_scores = cross_val_score(rf_clf, working_set_pp, working_set_labels, cv=kfold, scoring='accuracy')

print(f"Cross-Validation Accuracies: {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean()}")


Cross-Validation Accuracies: [0.83333333 0.80898876 0.78651685 0.78651685 0.79775281 0.83146067
 0.85393258 0.78651685 0.84269663 0.86516854]
Mean Accuracy: 0.8192883895131086


In [147]:
rf_clf.fit(working_set_pp, working_set_labels)

In [148]:
test_set_pp = pd.DataFrame(preprocessor.transform(test_set), columns=preprocessor.get_feature_names_out())

test_set_predict = rf_clf.predict(test_set_pp)

submission = pd.DataFrame({
    "PassengerId": test_set["PassengerId"],
    "Survived": test_set_predict
})

submission.to_csv("data/submission.csv", index=False)
