# Titanic: Machine Learning from Disaster

Autor: Diego López

Ultima Actualización: 2016-11-27

Kaggle link: [Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic)

El objetivo de este concurso es predecir si una persona sobrevive o no al accidente del titanic basado en distinas variables demográficas y de detalle ticket.

## Datos

Se cuenta solo con dos archivos train y test que tienen el siguiente diccionario de datos:

- survival:        Survival (0 = No; 1 = Yes)
- pclass:          Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
- name:            Name
- sex:             Sex
- age:             Age
- sibsp:           Number of Siblings/Spouses Aboard
- parch:           Number of Parents/Children Aboard
- ticket:          Ticket Number
- fare:            Passenger Fare
- cabin:           Cabin
- embarked:        Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

## Solución

In [1]:
# Leyendo información
import pandas as pd
import numpy as np
train = pd.read_csv("train.csv", index_col= "PassengerId")
test = pd.read_csv("test.csv" , index_col= "PassengerId")

In [2]:
# Juntamos ambos train y test y verificamos como van los nulls
full = pd.concat([train, test], ignore_index=True)
full.isnull().sum()

# Como vemos hay buen porcentaje de personas con la Edad y Cabina en Nulo.
# Para la edad utilizaremos modelo predictivo en cambio para cabina llenaremos con valores Z0

Age          263
Cabin       1014
Embarked       2
Fare           1
Name           0
Parch          0
Pclass         0
Sex            0
SibSp          0
Survived     418
Ticket         0
dtype: int64

In [125]:
# Completamos los valores vacios 2 para Embarked y uno para Fare basado en class y embarked
# Analisis visual PENDIENTE para justificar
_ = train.set_value(train.Embarked.isnull(), 'Embarked', 'C')
_ = test.set_value(test.Fare.isnull(), 'Fare', 8.05)

In [152]:
from sklearn import preprocessing 

# Creamos la variable titulo y controlamos los casos extraños
full["Title"] = full["Name"].str.replace('(.*, )|(\..*)','')
list_title_mr = ["Rev","Master","Dr","Don","Capt","Jonkheer","Major","Sir","Col"]
list_title_miss = ["Ms","Mlle", "Lady"]
list_title_mrs = ["Dona","Mme", "the Countess"]
full.loc[full['Title'].isin(list_title_mr), 'Title'] = 'Mr'
full.loc[full['Title'].isin(list_title_miss), 'Title'] = 'Miss'
full.loc[full['Title'].isin(list_title_mrs), 'Title'] = 'Mrs'

# Creamos la variable Group Size
full['Family_size'] = full['SibSp'] + full['Parch'] + 1
_ = full.set_value(full.Family_size == 1, 'Group_size', 0)
_ = full.set_value((1 < full.Family_size) &  (full.Family_size < 5), 'Group_size', 1)
_ = full.set_value( full.Family_size > 4, 'Group_size', 2)

# Creamos variable de agrupación de costo del ticket
_ = full.set_value(full.Fare <= 50, 'Fare_group', 0)
_ = full.set_value((full.Fare > 50) &  (full.Fare <= 150), 'Fare_group', 1)
_ = full.set_value(full.Fare > 150, 'Fare_group', 2)

# Cortamos la variable cabina en letra y numero
full['Cabin'] = full['Cabin'].fillna('Z0')
full['Cabin_letter'] = full.Cabin.str.extract('([a-zA-Z])')
full['Cabin_number'] = full.Cabin.str.extract('[a-zA-Z]?(\d+)')

# Transformamos los strings en numeros
le = preprocessing.LabelEncoder()
full.Sex = le.fit_transform(full.Sex)
full.Embarked = le.fit_transform(full.Embarked)
full.Title = le.fit_transform(full.Title)
full.Cabin_letter = le.fit_transform(full.Cabin_letter)



In [None]:
# Utilizaremos esta función de GridSearchCV para tunear nuestros modelos de regresion
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

def get_reg_model(estimator, parameters, X_train, y_train, scoring):  
    model = GridSearchCV(estimator, param_grid=parameters, scoring=scoring)
    model.fit(X_train, y_train)
    return model.best_estimator_

In [None]:
# Campos que utilizaremos para la predicción de edad
full_noAge = full[~full.Age.isnull()]
fields = ['Pclass','Embarked','Fare','Sex','Title','Family_size','Cabin_letter','Cabin_number'];
y = np.asarray(full_noAge.Age,dtype='i')

In [None]:
# Aplicamos gridsearch a xgboost para determinar parametros optimos
XGB = xgb.XGBRegressor(seed= 42)
scoring = make_scorer(mean_absolute_error, greater_is_better=False)
parameters = {'max_depth':np.arange(2,8), 'learning_rate': np.linspace(0.01,0.3,10),'reg_alpha':np.linspace(0.1,1.0,5), 'reg_lambda': np.linspace(1.0,3.0,5)}
reg_xgb = get_model(XGB, parameters, full[fields], y, scoring)
reg_xgb

In [None]:
# Aplicamos gridsearch a random forest para determinar parametros optimos
rf = RandomForestRegressor()
scoring = make_scorer(mean_absolute_error, greater_is_better=False)
parameters = {'max_depth':np.arange(3,8), 'n_estimators': np.arange(10,110,10), 'max_features': np.arange(1,len(fields) + 1)}
reg_rf = get_model(rf, parameters, full[fields], y, scoring)
reg_rf

In [166]:
# Teniendo todas las variables preparadas realizamos una predicción de edad utilizando 3 métodos: Regresión Lineal
# Random Forest y XGBoost con parametros optimizados

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.cross_validation import cross_val_score
from sklearn import metrics

clf1 = LinearRegression()
clf3 = RandomForestRegressor(max_depth = 7, n_estimators = 50, max_features = 3)
clf4 = xgb.XGBRegressor(max_depth=4, learning_rate = 0.042,reg_alpha=0.325,reg_lambda=2.0)


res_reglin,res_sgd,res_rf,res_xgb = [],[],[],[]

for i in range(10,15):
    res_reglin = np.append(res_reglin,cross_val_score(clf1, full_noAge[fields], y, cv=i, scoring='neg_mean_absolute_error'))
    res_rf = np.append(res_rf,cross_val_score(clf3, full_noAge[fields], y, cv=i, scoring='neg_mean_absolute_error'))
    res_xgb = np.append(res_xgb,cross_val_score(clf4, full_noAge[fields], y, cv=i, scoring='neg_mean_absolute_error'))

print(pd.Series(-res_reglin).describe())
print(pd.Series(-res_rf).describe())
print(pd.Series(-res_xgb).describe())

# Vemos que XGBoost ofrece un menor error aunque su variación estándar esta un poco debajo de random Forest
# Los modelos solo tienen un error promedio de 7.5 años lo cual es aceptable

count    60.000000
mean      8.023727
std       0.545866
min       6.884487
25%       7.619737
50%       8.040971
75%       8.373985
max       9.176355
dtype: float64
count    60.000000
mean      7.420226
std       0.465088
min       6.537685
25%       7.070930
50%       7.353834
75%       7.805982
max       8.466641
dtype: float64
count    60.000000
mean      7.271161
std       0.503940
min       6.242401
25%       6.919829
50%       7.191699
75%       7.578059
max       8.247223
dtype: float64


In [167]:
# Utilizamos XGBoost para predecir la edad y actualizamos valores en full
clf4.fit(full_noAge[fields], y)
X_predict = full[full.Age.isnull()]
pred = clf4.predict(X_predict[fields])
full.set_value(full.Age.isnull(), 'Age', pred)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_size,Group_size,Fare_group,Person,Cabin_letter,Cabin_number
0,22.0,Z0,2,7.25,"Braund, Mr. Owen Harris",0,3,1,1,0.0,A/5 21171,1,2,1.0,0.0,,8,0
1,38.0,C85,0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,0,1,1.0,PC 17599,2,2,1.0,1.0,,2,85
2,26.0,Z0,2,7.925,"Heikkinen, Miss. Laina",0,3,0,0,1.0,STON/O2. 3101282,0,1,0.0,0.0,,8,0
3,35.0,C123,2,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,0,1,1.0,113803,2,2,1.0,1.0,,2,123
4,35.0,Z0,2,8.05,"Allen, Mr. William Henry",0,3,1,0,0.0,373450,1,1,0.0,0.0,,8,0


In [171]:
# Ya que tenemos las edades seteadas, creamos una variable Persona que indica que si eres menor de 18 eres "niño"
# en caso contrario se setea tu sexo
full['Person'] = full.Sex
_ = full.set_value(full.Age < 18, 'Person', 2)
full

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,Family_size,Group_size,Fare_group,Person,Cabin_letter,Cabin_number
0,22.000000,Z0,2,7.2500,"Braund, Mr. Owen Harris",0,3,1,1,0.0,A/5 21171,1,2,1.0,0.0,1,8,0
1,38.000000,C85,0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,0,1,1.0,PC 17599,2,2,1.0,1.0,0,2,85
2,26.000000,Z0,2,7.9250,"Heikkinen, Miss. Laina",0,3,0,0,1.0,STON/O2. 3101282,0,1,0.0,0.0,0,8,0
3,35.000000,C123,2,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,0,1,1.0,113803,2,2,1.0,1.0,0,2,123
4,35.000000,Z0,2,8.0500,"Allen, Mr. William Henry",0,3,1,0,0.0,373450,1,1,0.0,0.0,1,8,0
5,26.042585,Z0,1,8.4583,"Moran, Mr. James",0,3,1,0,0.0,330877,1,1,0.0,0.0,1,8,0
6,54.000000,E46,2,51.8625,"McCarthy, Mr. Timothy J",0,1,1,0,0.0,17463,1,1,0.0,1.0,1,4,46
7,2.000000,Z0,2,21.0750,"Palsson, Master. Gosta Leonard",1,3,1,3,0.0,349909,1,5,2.0,0.0,2,8,0
8,27.000000,Z0,2,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,3,0,0,1.0,347742,2,3,1.0,0.0,0,8,0
9,14.000000,Z0,0,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,2,0,1,1.0,237736,2,2,1.0,0.0,2,8,0


In [173]:
# Preparamos los datos para la prediccion de sobrevivencia
n_train = full[~full.Survived.isnull()]
n_test = full[full.Survived.isnull()]

In [None]:
# Gridsearch para optimizar el XGB CLassifier
XGB = xgb.XGBClassifier(seed= 42)
scoring = make_scorer(accuracy_score, greater_is_better=True)
parameters = {'max_depth':np.arange(2,8), 'learning_rate': np.linspace(0.01,0.3,7), 'reg_alpha':np.linspace(0.1,1.0,5), 'reg_lambda': np.linspace(1.0,3.0,5)}
reg_xgb = get_model(XGB, parameters, n_train[fields], y, scoring)
reg_xgb

In [None]:
# Gridsearch para optimizar el RForest CLassifier
rf = RandomForestClassifier()
scoring = make_scorer(accuracy_score, greater_is_better=True)
parameters = {'max_features':np.arange(2,len(fields) + 1), 'n_estimators' : [50,75,100,125,150], 'min_samples_split': [50,75,100,125,150], 'max_depth': np.arange(2,8)}
reg_rf = get_model(rf, parameters, n_train[fields], y, scoring)
rf

In [174]:
# Aplicamos Regresion Logistica, RandomForest y XGBoost como clasificadores para determinar si usuario sobrevivió
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

clf1 = LogisticRegression()
clf3 = RandomForestClassifier(max_depth=3, max_features=3, min_samples_split=75, n_estimators=75)
clf4 = xgb.XGBClassifier(learning_rate = 0.3, max_depth = 2,reg_alpha=0.1,reg_lambda = 2.5)

fields = ['Pclass','Embarked','Fare_group','Person','Title','Family_size','Cabin_letter','Cabin_number'];
y = np.asarray(n_train.Survived,dtype='i')
res_reglin,res_sgd,res_rf,res_xgb = [],[],[],[]

for i in range(10,15):
    res_reglin = np.append(res_reglin,cross_val_score(clf1, n_train[fields], y, cv=i, scoring='accuracy'))
    res_rf = np.append(res_rf,cross_val_score(clf3, n_train[fields], y, cv=i, scoring='accuracy'))
    res_xgb = np.append(res_xgb,cross_val_score(clf4, n_train[fields], y, cv=i, scoring='accuracy'))

print(pd.Series(res_reglin).describe())
print(pd.Series(res_rf).describe())
print(pd.Series(res_xgb).describe())

# XGBoost Obtiene los mejores resultados

Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x0000024AB214A828>>
Traceback (most recent call last):
  File "C:\Users\Diego\Anaconda3\lib\site-packages\xgboost-0.6-py3.5.egg\xgboost\core.py", line 337, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'


count    60.000000
mean      0.700168
std       0.069527
min       0.569231
25%       0.647548
50%       0.705784
75%       0.750702
max       0.857143
dtype: float64
count    60.000000
mean      0.794167
std       0.044838
min       0.700000
25%       0.761905
50%       0.790123
75%       0.825099
max       0.897059
dtype: float64
count    60.000000
mean      0.802630
std       0.040838
min       0.698413
25%       0.777154
50%       0.801235
75%       0.838235
max       0.873016
dtype: float64


In [181]:
# Entrenamos y predecimos para finalmente enviarlo a Kaggle
clf3.fit(n_train[fields], y)
pred = clf3.predict(n_test[fields])

submission = pd.DataFrame({
        "PassengerId": test.index.values,
        "Survived": pred
    })
submission.to_csv('predictionn2.csv', index=False)