# Competición titanic con keras 


## importar librerias y datos

In [93]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

## Unir los dataframes.

In [94]:
data = pd.concat([train_data, test_data])

## Ingeniera de datos

In [95]:
y_train = data[:891].Survived

# Borrar columnas innesesarias.
features_to_remove = ['PassengerId', 'Ticket', 'Cabin',  'Survived']
data = data.drop(features_to_remove , axis=1)
#  Reemplazar datos categóricos por  de sexo0s y  por 1s.
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
data['Embarked'] = LabelEncoder().fit_transform(data['Embarked'])

### Generación de nuevos atributos

A partir de ``SibSp`` y ``Parch`` generamos y reemplazamos lo anterior con un nuevo atributo para verificar si el pasajero arrivó solo o con algún pariente: ``IsAlone``.

In [96]:
data['IsAlone'] = 0
data.loc[ (data['SibSp'] + data['Parch']) == 0, 'IsAlone'] = 1 
data.drop(['SibSp', 'Parch'], axis=1, inplace=True)

A partir de Los prefijos de los nombres del los pasajeros, se genera un nuevo atributo que representa el prefijo de los nombres: ``Title``.


In [97]:
# Extraer prefijo del nombre de las personas.
data['Title'] = data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)

# Reemplazar los titulos raros con uno mas abarcativo: 'Rare'.
data['Title'] = data['Title'].replace(['Lady', 'Countess','Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
data['Title'] = data['Title'].replace(['Mlle','Ms'], 'Miss')
data['Title'] = data['Title'].replace('Mme', 'Mrs')
data['Title'] = LabelEncoder().fit_transform(data['Title'])
data.drop(['Name'], axis=1, inplace=True)

Rellenamos ``Age`` faltantes según la relación con ``Sex`` y ``Pclass``.

In [98]:
guess_ages = np.zeros((2,3))

for i in range(0, 2): # itera con => 0,1 para sexo
    for j in range(0, 3):  # itera con => 1,2,3 para pclass
        guess_df = data[(data['Sex'] == i) & (data['Pclass'] == j+1)]['Age'].dropna()

        # Convierte el numero decimal al .5 mas cercano
        guess_ages[i,j] = round( guess_df.mean() *2 ) / 2

        data.loc[ (data.Age.isnull()) & (data.Sex == i) & (data.Pclass == j+1),'Age'] = guess_ages[i,j]

data["AgeCat"]= pd.cut(data["Age"], bins=[0,14.9,30,45,60,max(data["Age"]+1)], labels=['1','2','3','4','5'])
data.drop(['Age'], axis=1, inplace=True)

Rellenamos ``Fare`` con su mediana.

In [99]:
data['Fare'].fillna(data['Fare'].dropna().median(), inplace=True)
#data.drop(['Fare'],axis=1,inplace=True)

rellamos ``Embarked`` con el valor mas comun

In [100]:
data['Embarked'].fillna('S', inplace=True)

Convertimos columna de datos categóricos a variables dummys.

In [101]:
data = pd.get_dummies(data, columns=['Pclass', 'Embarked','Title'], prefix=['Pclass','Embarked','Title'])#, drop_first=True)

Split y visualización de ``data``.

In [156]:
X_train = data[:891]
test = data[891:]

In [155]:
data['AgeCat'] = LabelEncoder().fit_transform(data['AgeCat'])

In [157]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   Pclass    891 non-null    int64
 1   Sex       891 non-null    int32
 2   Embarked  891 non-null    int32
 3   IsAlone   891 non-null    int64
 4   Title     891 non-null    int32
 5   AgeCat    891 non-null    int64
dtypes: int32(3), int64(3)
memory usage: 38.3 KB


## Definición de modelo

In [115]:
def build_model():
  model = keras.Sequential([
    keras.layers.Dense(10, activation='relu', input_shape=[len(X_train.keys())]),
    #keras.layers.Dropout(0.2),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(10, activation='relu'),
    #keras.layers.Dropout(0.2),
   # keras.layers.Dense(10, activation='relu'),
    #keras.layers.Dense(10, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid') 
  ])

  model.compile(loss='binary_crossentropy',
                optimizer=Adam(learning_rate=0.003),
                metrics=['accuracy'])

  return model

model = build_model()
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_35 (Dense)            (None, 10)                70        
                                                                 
 dense_36 (Dense)            (None, 100)               1100      
                                                                 
 dense_37 (Dense)            (None, 100)               10100     
                                                                 
 dense_38 (Dense)            (None, 100)               10100     
                                                                 
 dense_39 (Dense)            (None, 1)                 101       
                                                                 
Total params: 21,471
Trainable params: 21,471
Non-trainable params: 0
_________________________________________________________________


## Entrenamiento del modelo


In [158]:
history = model.fit(
    X_train,
    y_train,
    verbose=2, epochs=90, batch_size = 5 , validation_split=0.2)

Epoch 1/90
143/143 - 1s - loss: 0.5533 - accuracy: 0.7346 - val_loss: 0.5087 - val_accuracy: 0.8101 - 1s/epoch - 10ms/step
Epoch 2/90
143/143 - 0s - loss: 0.4993 - accuracy: 0.7809 - val_loss: 0.3933 - val_accuracy: 0.8268 - 410ms/epoch - 3ms/step
Epoch 3/90
143/143 - 0s - loss: 0.4879 - accuracy: 0.7823 - val_loss: 0.3952 - val_accuracy: 0.8324 - 407ms/epoch - 3ms/step
Epoch 4/90
143/143 - 0s - loss: 0.4669 - accuracy: 0.7992 - val_loss: 0.4009 - val_accuracy: 0.8380 - 389ms/epoch - 3ms/step
Epoch 5/90
143/143 - 0s - loss: 0.4592 - accuracy: 0.8104 - val_loss: 0.3845 - val_accuracy: 0.8436 - 359ms/epoch - 3ms/step
Epoch 6/90
143/143 - 0s - loss: 0.4411 - accuracy: 0.8020 - val_loss: 0.3873 - val_accuracy: 0.8436 - 451ms/epoch - 3ms/step
Epoch 7/90
143/143 - 0s - loss: 0.4430 - accuracy: 0.8048 - val_loss: 0.4215 - val_accuracy: 0.8212 - 363ms/epoch - 3ms/step
Epoch 8/90
143/143 - 0s - loss: 0.4535 - accuracy: 0.8188 - val_loss: 0.3924 - val_accuracy: 0.8547 - 400ms/epoch - 3ms/step
Ep

## Evaluar con data 100% accuracy
para evitar tener que estar subiendo constantemente la submission a kaggle, lo comparamos con el resultado final de la competición que se encuentra en un repo en github.

In [159]:
y_test = pd.read_csv('data/submission_100accuracy.csv')['Survived']

model.evaluate(test, y_test)



[0.7925856113433838, 0.7751196026802063]

## Predicciones

In [19]:
preds = model.predict(test)

preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0



## Exportar predicciones

In [20]:
prediction = pd.DataFrame({'PassengerId': test_data.PassengerId.values, 'Survived': preds.ravel().astype('int64')})
prediction.to_csv('data/results.csv', index=False)

In [21]:
prediction.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
