In [51]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the Titanic training and test sets from the local data/ directory.
# Only the training frame is re-indexed by PassengerId; the test frame keeps
# PassengerId as an ordinary column on the default integer index.
train = pd.read_csv("data/train.csv").set_index(["PassengerId"])
test = pd.read_csv("data/test.csv")

train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Parte 1: Selección de Características

En esta sección se seleccionan las características que se utilizarán en el modelo, se transforman las existentes y se crean otras nuevas de cara al entrenamiento y la prueba del modelo.

### 1.1. Name

In [52]:
# Extract the honorific ("Mr", "Mrs", ...) from each passenger name: the first
# run of ASCII letters that follows a space and is terminated by a period.
# The pattern is a raw string so that `\.` reaches the regex engine verbatim
# instead of being parsed as an (invalid) Python string escape.
train_test_data = [train, test]
for df in train_test_data:
    df["title"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False)

In [53]:
# Frequency of each extracted title in the training set, most common first.
train["title"].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Col           2
Major         2
Ms            1
Jonkheer      1
Countess      1
Don           1
Mme           1
Lady          1
Sir           1
Capt          1
Name: title, dtype: int64

In [54]:
# Collapse the raw titles into three coarse tiers ("high" / "mid" / "low").
# The titles are listed per tier and then inverted into a title -> tier lookup.
titles_by_tier = {
    "high": ["Mlle", "Ms", "Mme", "Sir", "Lady", "Countess", "Mrs", "Miss"],
    "mid": ["Master", "Major", "Col", "Dr"],
    "low": ["Mr", "Rev", "Dona", "Jonkheer", "Don", "Capt"],
}
title_mapping = {
    title: tier for tier, titles in titles_by_tier.items() for title in titles
}

# The "title" column is overwritten with its tier; any title that is not a key
# of the lookup becomes NaN.
# NOTE(review): because the column is replaced in place, re-running this cell
# without re-running the extraction cell maps the tier labels themselves.
for dataset in train_test_data:
    dataset["title"] = dataset["title"].map(title_mapping)

In [55]:
# Preview the training frame; "title" now holds the tier label.
train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low


### 1.2. Pclass

In [56]:
# Translate the numeric ticket class into a text label stored in a new "class"
# column (1 -> "high", 2 -> "mid", 3 -> "low"); Pclass itself is left untouched.
pclass_mapping = dict(zip((1, 2, 3), ("high", "mid", "low")))

for dataset in train_test_data:
    class_labels = dataset["Pclass"].map(pclass_mapping)
    dataset["class"] = class_labels

In [57]:
# Preview the training frame with the new "class" column.
train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,class
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low,low
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high,high
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high,low
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high,high
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low,low


### 1.3. Sex

In [58]:
# Binary-encode the passenger's sex into a new lowercase "sex" column
# (male -> 0, female -> 1); the original "Sex" column is kept.
sex_mapping = dict(male=0, female=1)
for dataset in train_test_data:
    encoded_sex = dataset['Sex'].map(sex_mapping)
    dataset['sex'] = encoded_sex

In [59]:
# Preview the training frame with the new numeric "sex" column.
train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,class,sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low,low,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high,high,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high,low,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high,high,1
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low,low,0


### 1.4. Age

In [60]:
# Right-closed age bins: (-inf, 5], (5, 11], (11, 18], (18, 25], (25, 60], (60, inf).
# The edges are contiguous so every non-missing age receives exactly one label
# (a 12-year-old falls in "teen").
AGE_BINS = [-np.inf, 5, 11, 18, 25, 60, np.inf]
AGE_LABELS = ["little_infant", "infant", "teen", "young", "adult", "old"]

# Rebuild the list from the current frames so the loop always acts on the
# objects that `train` and `test` refer to right now.
train_test_data = [train, test]

for dataset in train_test_data:
    # Impute missing ages with the median age of passengers that share the same
    # "title" value, computed within each frame separately. The result is
    # assigned back to the column instead of using fillna(inplace=True) on a
    # column selection, which does not reliably update the parent frame.
    median_age_by_title = dataset.groupby("title")["Age"].transform("median")
    dataset["Age"] = dataset["Age"].fillna(median_age_by_title)

    # Discretise into the named age groups; ages that are still missing stay NaN.
    # astype(object) keeps plain string labels rather than a categorical dtype.
    dataset["age"] = pd.cut(
        dataset["Age"], bins=AGE_BINS, labels=AGE_LABELS
    ).astype(object)

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,class,sex,age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low,low,0,young
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high,high,1,adult
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high,low,1,adult
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high,high,1,adult
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low,low,0,adult


### 1.5. SibSp & Parch

In [61]:
# Right-closed family-size bins: (-inf, 1], (1, 3], (3, 6], (6, inf).
FAMILY_BINS = [-np.inf, 1, 3, 6, np.inf]
FAMILY_LABELS = ["alone", "small", "mid", "giant"]

for dataset in train_test_data:
    # Head count of the travelling party: siblings/spouses + parents/children
    # + the passenger. Kept as a local Series so no temporary column has to be
    # written to the frame and dropped again (dropping would rebind train/test
    # and leave train_test_data pointing at outdated copies).
    family_members = dataset["SibSp"] + dataset["Parch"] + 1

    # Each row receives a plain string label; astype(object) avoids a
    # categorical dtype.
    dataset["family_size"] = pd.cut(
        family_members, bins=FAMILY_BINS, labels=FAMILY_LABELS
    ).astype(object)

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,class,sex,age,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low,low,0,young,small
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high,high,1,adult,small
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high,low,1,adult,alone
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high,high,1,adult,small
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low,low,0,adult,alone


### 1.6. Ticket

In [62]:
# The Ticket column is not used as a feature, so it is removed from both frames.
# errors="ignore" makes the cell safe to re-run once the column is already gone.
train = train.drop(columns=["Ticket"], errors="ignore")
test = test.drop(columns=["Ticket"], errors="ignore")

# drop() returns new frames, so refresh the shared list; otherwise any later
# loop over train_test_data would modify the pre-drop objects instead.
train_test_data = [train, test]
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,class,sex,age,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S,low,low,0,young,small
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,high,high,1,adult,small
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S,high,low,1,adult,alone
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,high,high,1,adult,small
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S,low,low,0,adult,alone


In [63]:
# Preview the test frame to check it carries the same engineered columns.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,class,sex,age,family_size
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,,Q,low,low,0,adult,alone
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,,S,high,low,1,adult,small
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,,Q,low,mid,0,old,alone
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,,S,low,low,0,adult,alone
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,,S,high,low,1,young,small


### 1.7. Fare

In [64]:
# Right-closed fare bins: (-inf, 8], (8, 15], (15, 31], (31, inf).
FARE_BINS = [-np.inf, 8.0, 15.0, 31.0, np.inf]
FARE_LABELS = ["low", "mid", "high", "highest"]

# Fill missing fares with the training-set median. The result is assigned back
# to the column; fillna(inplace=True) on a column selection does not reliably
# update the parent frame.
train["Fare"] = train["Fare"].fillna(train["Fare"].median())

# Discretise the fare into a new lowercase "fare" column of plain string labels.
train["fare"] = pd.cut(
    train["Fare"], bins=FARE_BINS, labels=FARE_LABELS
).astype(object)
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,class,sex,age,family_size,fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S,low,low,0,young,small,low
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,high,high,1,adult,small,highest
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S,high,low,1,adult,alone,low
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,high,high,1,adult,small,highest
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S,low,low,0,adult,alone,mid


In [65]:
# Right-closed fare bins: (-inf, 8], (8, 15], (15, 31], (31, inf) -- the same
# thresholds that are applied to the training set.
FARE_BINS = [-np.inf, 8.0, 15.0, 31.0, np.inf]
FARE_LABELS = ["low", "mid", "high", "highest"]

# Fill missing fares with the test-set median. The result is assigned back to
# the column; fillna(inplace=True) on a column selection does not reliably
# update the parent frame.
# NOTE(review): the fill value is computed from the test set itself rather than
# from the training set -- confirm this is intended.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())

# Discretise the fare into a new lowercase "fare" column of plain string labels.
test["fare"] = pd.cut(
    test["Fare"], bins=FARE_BINS, labels=FARE_LABELS
).astype(object)
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,class,sex,age,family_size,fare
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,,Q,low,low,0,adult,alone,low
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,,S,high,low,1,adult,small,low
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,,Q,low,mid,0,old,alone,mid
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,,S,low,low,0,adult,alone,mid
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,,S,high,low,1,young,small,mid


### 1.8. Cabin

In [66]:
# Cabin is removed from both frames. The drop is done in place, so `train` and
# `test` keep referring to the same DataFrame objects.
drop = ['Cabin']
train.drop(columns=drop, inplace=True)
test.drop(columns=drop, inplace=True)

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,title,class,sex,age,family_size,fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,low,low,0,young,small,low
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,high,high,1,adult,small,highest
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,high,low,1,adult,alone,low
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,high,high,1,adult,small,highest
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,low,low,0,adult,alone,mid


In [67]:
# Preview the test frame after the Cabin column has been removed.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,title,class,sex,age,family_size,fare
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,low,low,0,adult,alone,low
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,high,low,1,adult,small,low
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,low,mid,0,old,alone,mid
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,low,low,0,adult,alone,mid
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,high,low,1,young,small,mid


### 1.9. Embarked

In [69]:
# Expose the port of embarkation under the lowercase "embarked" name used for
# the engineered features; the values are copied unchanged from "Embarked".
for _frame in (train, test):
    _frame['embarked'] = _frame['Embarked']
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,title,class,sex,age,family_size,fare,embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,low,low,0,young,small,low,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,high,high,1,adult,small,highest,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,high,low,1,adult,alone,low,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,high,high,1,adult,small,highest,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,low,low,0,adult,alone,mid,S


In [70]:
# Preview the test frame with the new "embarked" column.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,title,class,sex,age,family_size,fare,embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,Q,low,low,0,adult,alone,low,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,S,high,low,1,adult,small,low,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,Q,low,mid,0,old,alone,mid,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,S,low,low,0,adult,alone,mid,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,S,high,low,1,young,small,mid,S
