In [206]:
import os
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Build a Pipeline

## Central methods to call on the data

In [207]:
def load_titanic_data():
    return pd.read_csv(os.path.join('data', 'train.csv'))

In [208]:
def split_data(df):
    stratify_by = df["Pclass"]
    return train_test_split(df, test_size=0.2, stratify=stratify_by, random_state=42)

In [209]:
def drop_columns(df):
    columns_to_drop = ["PassengerId", "Name", "Ticket", "Cabin"]
    return df.drop(columns_to_drop, axis=1)

In [210]:
titanic = load_titanic_data()
# we call the train set just "titanic"
titanic, titanic_test_set = split_data(titanic)
titanic = drop_columns(titanic)

In [211]:
titanic.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
820,1,1,female,52.0,1,1,93.5,S
439,0,2,male,31.0,0,0,10.5,S
821,1,3,male,27.0,0,0,8.6625,S
403,0,3,male,28.0,1,0,15.85,S
343,0,2,male,25.0,0,0,13.0,S
514,0,3,male,24.0,0,0,7.4958,S
40,0,3,female,40.0,1,0,9.475,S
101,0,3,male,,0,0,7.8958,S
93,0,3,male,26.0,1,2,20.575,S
81,1,3,male,29.0,0,0,9.5,S


## Now we create Pipelines for the different columns

In [212]:
def transform_sex(df):
    # we are going to use an OrdinalEncoder to make numerical data of the sex
    sex_pipeline = Pipeline([
        ("encode", OrdinalEncoder())
    ])
    sex_pipeline.fit(df)
    return pd.DataFrame(sex_pipeline.transform(df), columns=sex_pipeline.get_feature_names_out(), index=df.index)

In [213]:
def transform_age(df):
    # since there are lots of null values we are going to impute them
    age_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="median"))
    ])
    age_pipeline.fit(df)
    return pd.DataFrame(age_pipeline.transform(df), columns=age_pipeline.get_feature_names_out(), index=df.index)


In [214]:
def transform_sipsp_parch(df):
    # from SibSp and Parch, we create a new column "Alone"
    # Create a mask for the conditions
    mask = (df["SibSp"] == 0) & (df["Parch"] == 0)

    # Create a new column, initialized with 1
    df.loc[:, "Alone"] = 1
    df.loc[~mask, "Alone"] = 0 # set 0 where the condition is not met
    df = df.drop(["SibSp", "Parch"], axis=1)
    return df

In [215]:
def transform_embarked(df):
    # we impute the null values with the most frequent and afterward encode it
    embarked_pipeline = Pipeline([
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("encode", OrdinalEncoder())
    ])
    embarked_pipeline.fit(df)
    return pd.DataFrame(embarked_pipeline.transform(df), columns=embarked_pipeline.get_feature_names_out(), index=df.index)


In [216]:
titanic["Sex"] = transform_sex(titanic[["Sex"]])
titanic["Age"] = transform_age(titanic[["Age"]])
titanic["Alone"] = transform_sipsp_parch(titanic[["SibSp", "Parch"]])
titanic["Embarked"] = transform_embarked(titanic[["Embarked"]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, "Alone"] = 1


In [217]:
titanic.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Alone
820,1,1,0.0,52.0,1,1,93.5,2.0,0
439,0,2,1.0,31.0,0,0,10.5,2.0,1
821,1,3,1.0,27.0,0,0,8.6625,2.0,1
403,0,3,1.0,28.0,1,0,15.85,2.0,0
343,0,2,1.0,25.0,0,0,13.0,2.0,1
514,0,3,1.0,24.0,0,0,7.4958,2.0,1
40,0,3,0.0,40.0,1,0,9.475,2.0,0
101,0,3,1.0,28.0,0,0,7.8958,2.0,1
93,0,3,1.0,26.0,1,2,20.575,2.0,0
81,1,3,1.0,29.0,0,0,9.5,2.0,1


In [218]:
titanic.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Alone       0
dtype: int64

# Train the data

## KNeighborsRegressor

In [219]:
X = titanic.drop(["Survived"], axis=1)
y = titanic["Survived"]

In [220]:
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X, y)
print("The score of our model is ", knn_model.score(X, y))

The score of our model is  0.39983862433862416


In [221]:
cv_results = cross_validate(knn_model, X, y, cv=5, verbose=5)
print("The score of our model is ", knn_model.score(X, y))

[CV] END ......................................., score=0.045 total time=   0.0s
[CV] END ......................................., score=0.229 total time=   0.0s
[CV] END ......................................., score=0.193 total time=   0.0s
[CV] END ......................................., score=0.027 total time=   0.0s
[CV] END ......................................., score=0.010 total time=   0.0s
The score of our model is  0.39983862433862416


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished


In [222]:
grid = GridSearchCV(estimator=KNeighborsRegressor(),
                   param_grid={"n_neighbors": range(1, 20)},
                   return_train_score=True)
grid.fit(X, y)

In [223]:
print(f"Best param: {grid.best_params_}, best score: {grid.best_score_}")

Best param: {'n_neighbors': 11}, best score: 0.16042832603314933


In [224]:
best_model = grid.best_estimator_
print("The score of our model is ", best_model.score(X, y))

The score of our model is  0.3063475665748392


## DecisionTreeRegressor

In [225]:
dtr = DecisionTreeRegressor()
dtr.fit(X, y)
print("The score of our model is ", dtr.score(X, y))

The score of our model is  0.9587472442680776


In [226]:
search_space = {
    'max_depth': [None, 1,2,3],
    'splitter': ["best", "random"],
    'min_samples_split': [2,3,4]
}   # these are possibilities how the DecisionTreeRegressor can be configured
grid = GridSearchCV(estimator=DecisionTreeRegressor(),
                    param_grid=search_space,
                    return_train_score=True,
                    )
grid.fit(X, y)
results = grid.cv_results_
results = pd.DataFrame(results)
print("Best estimator: ", grid.best_estimator_)
print("Score of best estimator: ", grid.best_estimator_.score(X, y))

Best estimator:  DecisionTreeRegressor(max_depth=3)
Score of best estimator:  0.4904980977472344


In [227]:
best_model = grid.best_estimator_
print("The score of our model is ", best_model.score(X, y))

The score of our model is  0.4904980977472344
