In [210]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import pickle

In [211]:
train_data = pd.read_csv("titanic_data.csv")
test_data = pd.read_csv("test.csv")
train_labels = train_data["Survived"]
train_data.drop(columns=["Survived"], inplace=True)
train_data[:10]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [212]:
def create_treatment_column(single_data, add_miss = False):
        """This function create a treatment's list:
        1 - Miss.
        2 - Mr.
        3 - Mrs.
        4 - Master
        5 - Others"""
        if re.findall(r"(.+(M|m)iss.+)|(.+(M|m)ile.+)|(.+(M|m)s.+)", single_data) and add_miss:
            return "Miss"
        if re.findall(r"(.+(M|m)rs.+)|(.+(M|m)me.+)", single_data):
            return "Mrs"
        elif re.findall(r".+(M|m)r.+", single_data):
            return "Mr"
        elif re.findall(r".+(M|m)aster.+", single_data):
            return "Master"
        return "Others"

In [213]:
def add_treatment(X, add_miss = False):
    X["Treatment"] = X["Name"].apply(create_treatment_column, add_miss)
    return X

In [214]:
def create_social_group(X):
    group = X["SibSp"] + X["Parch"] + 1
    X["Social_group"] = np.where(group == 1, 0, np.where(group == 2, 1, np.where(group <=4, 2, 3)))
    return X

In [215]:
def create_features_cabin(X, cabin_name = True):
    if cabin_name:
        X["Cabin_name"] = X["Cabin"].apply(lambda x: str(x)[0])
    return X

In [216]:
def new_features(X, cabin_name = True, add_miss = False):
    X = add_treatment(X, add_miss)
    X = create_social_group(X)
    X = create_features_cabin(X, cabin_name)
    return X

In [217]:
std_cols = ["Age", "Fare"]

std_encoder = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler())
])

hot_encoder = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="N")),
    ("imputer2", SimpleImputer(missing_values = "T", strategy="constant", fill_value="n")),
    ("1hot_encoder", OneHotEncoder(sparse = False))
])

partial_pipeline = ColumnTransformer([
    ("std_encoder", std_encoder, std_cols),
    ("sex_binarizer", OrdinalEncoder(), ["Sex"]),
    ("pclass", OrdinalEncoder(), ["Pclass"]),
    ("1hot_encoder", hot_encoder, ["Treatment", "Cabin_name", "Embarked"])
])

full_pipeline = Pipeline([
    ("new_features", FunctionTransformer(new_features, validate=False, kw_args={"cabin_name": True, "add_miss": False})),
    ("partial_pipeline", partial_pipeline)
])

In [218]:
import warnings
warnings.simplefilter(action='ignore', category=Warning)

clf = Pipeline([
    ("full_pipeline", full_pipeline),
    ("clf", GradientBoostingClassifier(random_state=42))
])

clf.fit(train_data, train_labels)


score_train = clf.score(train_data, train_labels)
score_val = cross_val_score(clf, train_data, train_labels, cv=4)

print("Score Train-set: {}\nScore Val-set: {} +/ {}".format(score_train, score_val.mean(), score_val.std()))

Score Train-set: 0.9023569023569024
Score Val-set: 0.8328011557762118 +/ 0.018289064356790875


In [219]:
param_grid = {"full_pipeline__new_features__cabin_name": [True, False], 
              "full_pipeline__new_features__add_miss": [True, False]}

t = GridSearchCV(clf, param_grid, cv=4)
t.fit(train_data, train_labels)

ValueError: Invalid parameter add_miss for estimator FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function new_features at 0x000001BC914EC9D8>,
          inv_kw_args=None, inverse_func=None,
          kw_args={'cabin_name': True, 'add_miss': False},
          pass_y='deprecated', validate=False). Check the list of available parameters with `estimator.get_params().keys()`.