## 0. Modules and Custom Functions

In [1]:
from numpy import nan
from pandas import read_csv, Categorical, cut
from matplotlib.pyplot import style, rcParams

In [2]:
# Plot defaults for the notebook: ggplot theme and a wide figure size.
style.use("ggplot")
rcParams["figure.figsize"] = (18, 8)
# Keep the active theme's colour cycle at hand for manual colouring.
ggcolors = rcParams["axes.prop_cycle"].by_key()["color"]

In [3]:
def get_features(df):
    """Group the column names of ``df`` by dtype.

    Returns a 4-tuple of lists ``(numerical, categorical, dates, other)``:
    int64/float64 columns, object/category columns, datetime columns, and
    every column that fell into none of those groups (as returned by
    ``Index.difference``).
    """

    def names_of(*dtypes):
        # Column names whose dtype matches any of the given selectors.
        return df.select_dtypes(include=list(dtypes)).columns.tolist()

    numerical = names_of("int64", "float64")
    categorical = names_of("object", "category")
    dates = names_of("datetime", "datetime64[ns]")

    already_grouped = numerical + categorical + dates
    other = df.columns.difference(already_grouped).tolist()

    return numerical, categorical, dates, other

In [4]:
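# NOTE(review): earlier draft of the title extraction, reconstructed from a truncated
# comment; the live logic is in create_title_features. Function/variable names are a best guess.
# def get_title(df):
#     lastname_removed = df["name"].str.split(",").apply(lambda x: x[1]).copy()
#     title = lastname_removed.str.split(".").apply(lambda x: x[0]).copy()
#     clean_title = title.str.lower().str.strip().copy()
#     out = clean_title.apply(lambda t: "other" if t not in ["mr", "mrs", "ms", "miss"] else t).copy()
#     return out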


def get_women_marital_status(gender, title):
    """Derive a married flag for women from gender and title.

    Returns ``True`` when ``gender`` is "female" and ``title`` is "mrs" or
    "ms", ``None`` when ``gender`` is "male", and ``False`` in every other
    case.
    """
    if gender == "female" and title in ("mrs", "ms"):
        return True
    if gender == "male":
        # The flag only applies to women.
        return None
    return False


def was_adult(age):
    """Tell whether ``age`` is strictly greater than 18.

    Parameters
    ----------
    age : number or None
        Age in years. ``None`` and NaN both mean the age is unknown.

    Returns
    -------
    bool or None
        ``None`` when the age is unknown, otherwise the result of
        ``18 < age`` (an age of exactly 18 is therefore not counted as adult).
    """
    # NaN is the only value that is not equal to itself. Without this check a
    # missing age stored as NaN would evaluate ``18 < nan`` and be reported as
    # "not adult" instead of "unknown".
    if age is None or age != age:
        return None
    return 18 < age
    

## 1. Get Data

In [5]:
# Read the raw training set, lower-case every column name and index the rows
# by passenger id.
data = (
    read_csv("../data/raw/train.csv")
    .rename(columns=str.lower)
    .set_index("passengerid")
)

## 2. Data Transformation

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

In [7]:
def create_category_features(df):
    """Return a copy of ``df`` with categorical class, age and fare columns.

    * ``pclass`` is replaced by an ordered categorical ``low < mid < high``
      (3 -> "low", 2 -> "mid", 1 -> "high").
    * ``age_group`` bins ``age`` into right-closed 10-wide intervals from 0 to 80.
    * ``fare_group`` bins ``fare`` into left-closed 50-wide intervals from 0 to 550.

    Values outside the bin edges become missing in the corresponding group
    column. The input frame is not modified.
    """
    df_aux = df.copy()
    # Plain column assignment replaces the whole column, so the ordered
    # categorical dtype survives. ``df_aux.loc[:, "pclass"] = ...`` would write
    # the labels into the existing integer column instead and lose that dtype.
    df_aux["pclass"] = Categorical(
        df_aux["pclass"].map({3: "low", 2: "mid", 1: "high"}),
        categories=["low", "mid", "high"],
        ordered=True,
    )
    df_aux["age_group"] = cut(df_aux["age"], range(0, 90, 10))
    df_aux["fare_group"] = cut(
        df_aux["fare"], range(0, 600, 50), include_lowest=True, right=False
    )
    return df_aux


def create_title_features(df):
    """Return a copy of ``df`` with ``title`` and ``married_female`` columns.

    ``title`` is taken from ``name``: the text after the first comma, up to
    the first period, lower-cased and stripped. Anything other than "mr",
    "mrs", "ms" or "miss" (including names from which no title can be
    extracted) becomes "other".

    ``married_female`` is the result of ``get_women_marital_status`` applied
    to each row's ``sex`` and ``title``.

    The input frame is not modified.
    """
    known_titles = ["mr", "mrs", "ms", "miss"]
    df_aux = df.copy()

    # The vectorised ``.str`` accessors yield a missing value, rather than
    # raising, when a name has no comma or is itself missing.
    after_lastname = df_aux["name"].str.split(",").str[1]
    clean_title = after_lastname.str.split(".").str[0].str.lower().str.strip()
    df_aux["title"] = clean_title.where(clean_title.isin(known_titles), "other")

    # Pair the two columns directly instead of a row-wise DataFrame.apply;
    # this also behaves sensibly on an empty frame.
    df_aux["married_female"] = [
        get_women_marital_status(sex, title)
        for sex, title in zip(df_aux["sex"], df_aux["title"])
    ]
    return df_aux


def create_ticket_features(df):
    """Return a copy of ``df`` with a boolean ``travel_alone`` column.

    A row is flagged ``True`` when its ``ticket`` value appears only once in
    ``df``. The flag is therefore relative to the rows passed in.

    The input frame is left untouched, so running this step does not alter the
    caller's data.
    """
    shares_ticket = df.duplicated(subset=["ticket"], keep=False)
    return df.assign(travel_alone=~shares_ticket)


def create_age_features(df):
    """Return a copy of ``df`` with an ``adult`` column derived from ``age``.

    ``adult`` is ``True`` when ``age`` is strictly greater than 18, ``False``
    when it is not, and NaN when ``age`` is missing.

    The input frame is left untouched. The column is built in one step rather
    than by writing booleans into a float NaN column, which relies on an
    implicit dtype upcast.
    """
    age = df["age"]
    # Compare first, then blank out the rows whose age is unknown. The object
    # dtype lets True/False and NaN share one column.
    is_adult = age.gt(18).astype(object).where(age.notna(), nan)
    return df.assign(adult=is_adult)

In [8]:
# Wrap each feature builder in a FunctionTransformer so it can be used as a
# pipeline step.
get_categories, get_titles, get_tickets, get_ages = (
    FunctionTransformer(builder, validate=False)
    for builder in (
        create_category_features,
        create_title_features,
        create_ticket_features,
        create_age_features,
    )
)

In [9]:
# Feature-engineering pipeline. Steps run in the order listed; the ticket and
# age steps are currently disabled.
pipe = Pipeline(
    verbose=True,
    steps=[
        ("categories", get_categories),
        ("titles", get_titles),
        # ("tickets", get_tickets),
        # ("ages", get_ages)
    ],
)

In [10]:
# Group the predictor columns by dtype; the target column is excluded first.
numerical, categorical, dates, other = get_features(data.drop(columns="survived"))

In [11]:
df_model = data.copy()

# Predictors are the numerical and categorical columns; the target is `survived`.
X = df_model[numerical + categorical]
y = df_model["survived"]
# A fixed random_state makes the 70/30 split identical on every
# Restart & Run All; without it each run yields a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [12]:
# Fit the feature pipeline on the training split. As the cell's last
# expression, the fitted pipeline is also displayed.
pipe.fit(X_train, y=y_train)

[Pipeline] ........ (step 1 of 2) Processing categories, total=   0.0s
[Pipeline] ............ (step 2 of 2) Processing titles, total=   0.0s


Pipeline(steps=[('categories',
                 FunctionTransformer(func=<function create_category_features at 0x7fea5fb58af0>)),
                ('titles',
                 FunctionTransformer(func=<function create_title_features at 0x7fea5fb69790>))],
         verbose=True)