In [1]:
# Importing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Opening Datasets
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

train.set_index(["PassengerId"], inplace=True)
train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Part 1: Feature Engineering

En esta sección se eligen las características que se utilizarán para trabajar el modelo, se transformarán y se crearán otras nuevas de cara a la ejecución y prueba del modelo.

### 1.1. Name

In [2]:
train_test_data = [train, test]
for df in train_test_data:
    df["title"] = df["Name"].str.extract(" ([A-Za-z]+)\.", expand=False)

In [3]:
train["title"].value_counts()

Mr          517
Miss        182
Mrs         125
Master       40
Dr            7
Rev           6
Mlle          2
Col           2
Major         2
Ms            1
Capt          1
Mme           1
Sir           1
Lady          1
Jonkheer      1
Countess      1
Don           1
Name: title, dtype: int64

In [4]:
title_mapping = {"Mlle": "high", "Ms": "high", "Mme": "high", 
                 "Sir": "high", "Lady": "high", "Countess": "high", 
                 "Mrs": "high", "Miss": "high", "Master": "mid", 
                 "Major": "mid", "Col": "mid", "Dr": "mid", 
                 "Mr": "low", "Rev": "low", "Dona": "low", 
                 "Jonkheer": "low", "Don": "low", "Capt": "low" }

for dataset in train_test_data:
    dataset["title"] = dataset["title"].map(title_mapping)

In [5]:
train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low


### 1.2. Pclass

In [6]:
pclass_mapping = {1: "high", 2: "mid", 3: "low"}

for dataset in train_test_data:
    dataset["pclass"] = dataset["Pclass"].map(pclass_mapping)

In [7]:
train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,pclass
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low,low
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high,high
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high,low
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high,high
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low,low


### 1.3. Sex

In [8]:
sex_mapping = { 'male': 0, 'female': 1 }
for dataset in train_test_data:
    dataset['sex'] = dataset['Sex'].map(sex_mapping)

In [9]:
train.head(5)

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,pclass,sex
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low,low,0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high,high,1
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high,low,1
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high,high,1
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low,low,0


### 1.4. Age

In [10]:
train["Age"].fillna(train.groupby("title")["Age"].transform("median"), inplace=True)
test["Age"].fillna(test.groupby("title")["Age"].transform("median"), inplace=True)

dataset = [train, test]

for dataset in train_test_data:
    dataset.loc[dataset["Age"] <= 5, "age"] = "little_infant"
    dataset.loc[(dataset["Age"] > 5) & (dataset["Age"] <= 11), "age"] = "infant"
    dataset.loc[(dataset["Age"] > 12) & (dataset["Age"] <= 18), "age"] = "teen"
    dataset.loc[(dataset["Age"] > 18) & (dataset["Age"] <= 25), "age"] = "young"
    dataset.loc[(dataset["Age"] > 25) & (dataset["Age"] <= 60), "age"] = "adult"
    dataset.loc[ dataset["Age"] > 60, "age"] = "old"

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,pclass,sex,age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low,low,0,young
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high,high,1,adult
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high,low,1,adult
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high,high,1,adult
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low,low,0,adult


### 1.5. SibSp & Parch

In [11]:
for dataset in train_test_data:
    dataset["family_tmp"] = dataset["SibSp"]+dataset["Parch"]+1
    dataset.loc[dataset["family_tmp"] <= 1, "family_size"] = "alone",
    dataset.loc[(dataset["family_tmp"] > 1) & (dataset["family_tmp"] <= 3), "family_size"] = "small",
    dataset.loc[(dataset["family_tmp"] > 3) & (dataset["family_tmp"] <= 6), "family_size"] = "mid",
    dataset.loc[ dataset["family_tmp"] > 6, "family_size"] = "giant"

train = train.drop(['family_tmp'], axis=1)
test = test.drop(['family_tmp'], axis=1)

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,title,pclass,sex,age,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low,low,0,young,small
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high,high,1,adult,small
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,high,low,1,adult,alone
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high,high,1,adult,small
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,low,low,0,adult,alone


### 1.6. Ticket

In [12]:
train = train.drop(["Ticket"], axis=1)
test = test.drop(["Ticket"], axis=1)
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,pclass,sex,age,family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S,low,low,0,young,small
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,high,high,1,adult,small
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S,high,low,1,adult,alone
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,high,high,1,adult,small
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S,low,low,0,adult,alone


In [13]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,pclass,sex,age,family_size
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,,Q,low,low,0,adult,alone
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,,S,high,low,1,adult,small
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,,Q,low,mid,0,old,alone
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,,S,low,low,0,adult,alone
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,,S,high,low,1,young,small


### 1.7. Fare

In [14]:
train["Fare"].fillna(train["Fare"].median(), inplace=True)
train.loc[train["Fare"] <= 8.0, "fare"] = "low"
train.loc[(train["Fare"] > 8.0) & (train["Fare"] <= 15.0), "fare"] = "mid"
train.loc[(train["Fare"] > 15.0) & (train["Fare"] <= 31.0), "fare"] = "high"
train.loc[train["Fare"] > 31.0, "fare"] = "highest"
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,pclass,sex,age,family_size,fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,,S,low,low,0,young,small,low
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,high,high,1,adult,small,highest
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,,S,high,low,1,adult,alone,low
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,high,high,1,adult,small,highest
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,,S,low,low,0,adult,alone,mid


In [15]:
test["Fare"].fillna(test["Fare"].median(), inplace=True)
test.loc[test["Fare"] <= 8.0, "fare"] = "low"
test.loc[(test["Fare"] > 8.0) & (test["Fare"] <= 15.0), "fare"] = "mid"
test.loc[(test["Fare"] > 15.0) & (test["Fare"] <= 31.0), "fare"] = "high"
test.loc[test["Fare"] > 31.0, "fare"] = "highest"
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,pclass,sex,age,family_size,fare
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,,Q,low,low,0,adult,alone,low
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,,S,high,low,1,adult,small,low
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,,Q,low,mid,0,old,alone,mid
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,,S,low,low,0,adult,alone,mid
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,,S,high,low,1,young,small,mid


### 1.8. Cabin

In [16]:
train["Cabin"].fillna('U', inplace=True)
test["Cabin"].fillna('U', inplace=True)

train['cabin'] = train['Cabin'].apply(lambda x:x[0])
test['cabin'] = test['Cabin'].apply(lambda x:x[0])

train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,pclass,sex,age,family_size,fare,cabin
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,U,S,low,low,0,young,small,low,U
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,high,high,1,adult,small,highest,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,U,S,high,low,1,adult,alone,low,U
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,high,high,1,adult,small,highest,C
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,U,S,low,low,0,adult,alone,mid,U


In [17]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,pclass,sex,age,family_size,fare,cabin
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,U,Q,low,low,0,adult,alone,low,U
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,U,S,high,low,1,adult,small,low,U
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,U,Q,low,mid,0,old,alone,mid,U
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,U,S,low,low,0,adult,alone,mid,U
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,U,S,high,low,1,young,small,mid,U


### 1.9. Embarked

In [18]:
train['embarked'] = train['Embarked']
test['embarked'] = test['Embarked']
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,pclass,sex,age,family_size,fare,cabin,embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,U,S,low,low,0,young,small,low,U,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C85,C,high,high,1,adult,small,highest,C,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,U,S,high,low,1,adult,alone,low,U,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,C123,S,high,high,1,adult,small,highest,C,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,U,S,low,low,0,adult,alone,mid,U,S


In [19]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,title,pclass,sex,age,family_size,fare,cabin,embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,7.8292,U,Q,low,low,0,adult,alone,low,U,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,7.0,U,S,high,low,1,adult,small,low,U,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,9.6875,U,Q,low,mid,0,old,alone,mid,U,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,8.6625,U,S,low,low,0,adult,alone,mid,U,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,12.2875,U,S,high,low,1,young,small,mid,U,S


### 1.10. Feature Cleaning

In [20]:
drop = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Cabin', 'Fare', 'Embarked']
train.drop(drop, axis=1,inplace=True)
test.drop(drop, axis=1, inplace=True)

train.head()

Unnamed: 0_level_0,Survived,title,pclass,sex,age,family_size,fare,cabin,embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,low,low,0,young,small,low,U,S
2,1,high,high,1,adult,small,highest,C,C
3,1,high,low,1,adult,alone,low,U,S
4,1,high,high,1,adult,small,highest,C,S
5,0,low,low,0,adult,alone,mid,U,S


In [21]:
test.head()

Unnamed: 0,PassengerId,title,pclass,sex,age,family_size,fare,cabin,embarked
0,892,low,low,0,adult,alone,low,U,Q
1,893,high,low,1,adult,small,low,U,S
2,894,low,mid,0,old,alone,mid,U,Q
3,895,low,low,0,adult,alone,mid,U,S
4,896,high,low,1,young,small,mid,U,S


### 1.11. Final Features

In [22]:
title_dummies = pd.get_dummies(train.title, prefix="title")
pclass_dummies = pd.get_dummies(train.pclass, prefix="class")
age_dummies = pd.get_dummies(train.age, prefix="age")
family_size_dummies = pd.get_dummies(train.family_size, prefix="family_size")
fare_dummies = pd.get_dummies(train.fare, prefix="fare")
embarked_dummies = pd.get_dummies(train.embarked, prefix="embarked")
cabin_dummies = pd.get_dummies(train.cabin, prefix="cabin")

target = train['Survived']
drop = ['Survived','title','pclass', 'age', 'family_size', 'fare', 'cabin','embarked']
train.drop(drop, axis=1,inplace=True)

train = pd.concat([train, title_dummies, pclass_dummies, 
                   age_dummies, family_size_dummies, 
                   fare_dummies, cabin_dummies, 
                   embarked_dummies], axis=1)
train.head()

Unnamed: 0_level_0,sex,title_high,title_low,title_mid,class_high,class_low,class_mid,age_adult,age_infant,age_little_infant,...,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T,cabin_U,embarked_C,embarked_Q,embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,1,1,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0
3,1,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1
4,1,1,0,0,1,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
5,0,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [23]:
title_dummies = pd.get_dummies(test.title, prefix="title")
pclass_dummies = pd.get_dummies(test.pclass, prefix="class")
age_dummies = pd.get_dummies(test.age, prefix="age")
family_size_dummies = pd.get_dummies(test.family_size, prefix="family_size")
fare_dummies = pd.get_dummies(test.fare, prefix="fare")
embarked_dummies = pd.get_dummies(test.embarked, prefix="embarked")
cabin_dummies = pd.get_dummies(test.cabin, prefix="cabin")

drop = ['title','pclass', 'age', 'family_size', 'fare', 'cabin','embarked']
test.drop(drop, axis=1,inplace=True)

test = pd.concat([test, title_dummies, pclass_dummies, 
                  age_dummies, family_size_dummies, 
                  fare_dummies, cabin_dummies,
                  embarked_dummies], axis=1)
test.head()

Unnamed: 0,PassengerId,sex,title_high,title_low,title_mid,class_high,class_low,class_mid,age_adult,age_infant,...,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_U,embarked_C,embarked_Q,embarked_S
0,892,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,1,0
1,893,1,1,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,1
2,894,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,0
3,895,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,1
4,896,1,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,1


## Part 2: Model Selection

En esta sección se llevan las características al modelo y se comienzan a probar predicciones.

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np

  from numpy.core.umath_tests import inner1d


### 2.1 K-Fold Cross Validation

In [35]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
k_fold = KFold(n_splits=10, shuffle=True, random_state=0)

### 2.2.- K-Nearest Neighboors

In [45]:
clf = KNeighborsClassifier(n_neighbors = 5)
scoring = 'accuracy'
score = cross_val_score(clf, train, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.82222222 0.86516854 0.7752809  0.82022472 0.84269663 0.83146067
 0.86516854 0.83146067 0.75280899 0.82022472]


In [46]:
# Score's Mean
print("Score's mean: ",round(np.mean(score)*100, 2))

Score's mean:  82.27


### 2.3- Decision Trees

In [47]:
clf = DecisionTreeClassifier()
scoring = 'accuracy'
score = cross_val_score(clf, train, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.82222222 0.83146067 0.78651685 0.78651685 0.85393258 0.80898876
 0.85393258 0.84269663 0.7752809  0.80898876]


In [48]:
# Score's Mean
print("Score's mean: ",round(np.mean(score)*100, 2))

Score's mean:  81.71


### 2.4.- Random Forest

In [49]:
clf = RandomForestClassifier(n_estimators=100)
scoring = 'accuracy'
score = cross_val_score(clf, train, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.83333333 0.84269663 0.78651685 0.82022472 0.84269663 0.82022472
 0.85393258 0.83146067 0.82022472 0.80898876]


In [50]:
# Score's Mean
print("Score's mean: ",round(np.mean(score)*100, 2))

Score's mean:  82.6


### 2.5.- Naive Bayes

In [51]:
clf = GaussianNB()
scoring = 'accuracy'
score = cross_val_score(clf, train, target, cv=k_fold, n_jobs=1, scoring=scoring)
print(score)

[0.75555556 0.68539326 0.61797753 0.71910112 0.65168539 0.82022472
 0.69662921 0.70786517 0.78651685 0.80898876]


In [52]:
# Score's Mean
print("Score's mean: ",round(np.mean(score)*100, 2))

Score's mean:  72.5


### 2.6.- Support Vector Machines

In [53]:
cs = [0.1, 1, 10, 100, 1000]
kernel = 'rbf'
scoring = 'accuracy'
for c in cs:
    clf = SVC(kernel=kernel, C=c)
    score = cross_val_score(clf, train, target, cv=k_fold, n_jobs=1, scoring=scoring)
    print(score)
    print("Score para c=", c)
    print(round(np.mean(score)*100,2)) 
    print("----------------------------------------------")

[0.8        0.7752809  0.78651685 0.75280899 0.78651685 0.78651685
 0.78651685 0.79775281 0.79775281 0.80898876]
Score para c= 0.1
78.79
----------------------------------------------
[0.81111111 0.78651685 0.80898876 0.7752809  0.79775281 0.80898876
 0.83146067 0.82022472 0.83146067 0.84269663]
Score para c= 1
81.14
----------------------------------------------
[0.82222222 0.78651685 0.82022472 0.82022472 0.82022472 0.80898876
 0.84269663 0.84269663 0.83146067 0.83146067]
Score para c= 10
82.27
----------------------------------------------
[0.82222222 0.83146067 0.80898876 0.79775281 0.84269663 0.83146067
 0.83146067 0.86516854 0.79775281 0.79775281]
Score para c= 100
82.27
----------------------------------------------
[0.82222222 0.85393258 0.78651685 0.82022472 0.82022472 0.80898876
 0.84269663 0.84269663 0.7752809  0.80898876]
Score para c= 1000
81.82
----------------------------------------------
