# Competición titanic con keras 


## importar librerias y datos

In [2]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

## Unir los dataframes.

In [3]:
data = pd.concat([train_data, test_data])

## Ingeniera de datos

In [4]:
y_train = data[:891].Survived

# Borrar columnas innesesarias.
features_to_remove = ['PassengerId', 'Name', 'Ticket', 'Cabin',  'Survived']
data = data.drop(features_to_remove , axis=1)
#  Reemplazar datos categóricos por 0s y 1s.
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
# Imputar valores vacios con el valor medio de la edad.
data['Age'] = data['Age'].fillna(data['Age'].mean())
# convertir columna de datos categóricos a varias categorias ordinales para cada categoria.
data = pd.get_dummies(data, columns=['Pclass', 'Embarked'], prefix=['Pclass','Embarked'])

### Generación de nuevo atributo
A partir de ``SibSp`` y ``Parch`` generamos y reemplazamos lo anterior con un nuevo atributo para verificar si el pasajero arrivó solo o con algún pariente: ``IsAlone``.


In [5]:
data['IsAlone'] = 0
data.loc[ (data['SibSp'] + data['Parch']) == 0, 'IsAlone'] = 1 
data.drop(['SibSp', 'Parch'], axis=1, inplace=True)
### Split de datos en train y test
X_train = data[:891]
test = data[891:]

In [6]:
data

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,IsAlone
0,1,22.000000,7.2500,0,0,1,0,0,1,0
1,0,38.000000,71.2833,1,0,0,1,0,0,0
2,0,26.000000,7.9250,0,0,1,0,0,1,1
3,0,35.000000,53.1000,1,0,0,0,0,1,0
4,1,35.000000,8.0500,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...
413,1,29.881138,8.0500,0,0,1,0,0,1,1
414,0,39.000000,108.9000,1,0,0,1,0,0,1
415,1,38.500000,7.2500,0,0,1,0,0,1,1
416,1,29.881138,8.0500,0,0,1,0,0,1,1


## Definicion de modelo

In [16]:
def build_model():
  model = keras.Sequential([
    keras.layers.Dense(512, activation='relu', input_shape=[len(X_train.keys())]),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(512, activation='relu'),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid') 
  ])

  model.compile(loss='binary_crossentropy',
                optimizer=Adam(learning_rate=0.01),
                metrics=['accuracy'])

  return model

model = build_model()
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 512)               5632      
                                                                 
 dropout_4 (Dropout)         (None, 512)               0         
                                                                 
 dense_7 (Dense)             (None, 512)               262656    
                                                                 
 dropout_5 (Dropout)         (None, 512)               0         
                                                                 
 dense_8 (Dense)             (None, 512)               262656    
                                                                 
 dropout_6 (Dropout)         (None, 512)               0         
                                                                 
 dense_9 (Dense)             (None, 1)                

## Entrenamiento del modelo


In [17]:
history = model.fit(
    X_train,
    y_train,
    verbose=2, epochs=100, validation_split=0.2)

Epoch 1/100
23/23 - 4s - loss: 3.1937 - accuracy: 0.6124 - val_loss: 0.6248 - val_accuracy: 0.6425 - 4s/epoch - 154ms/step
Epoch 2/100
23/23 - 1s - loss: 0.6423 - accuracy: 0.6629 - val_loss: 0.5375 - val_accuracy: 0.6816 - 542ms/epoch - 24ms/step
Epoch 3/100
23/23 - 1s - loss: 0.6007 - accuracy: 0.6559 - val_loss: 0.4850 - val_accuracy: 0.7263 - 542ms/epoch - 24ms/step
Epoch 4/100
23/23 - 1s - loss: 0.5975 - accuracy: 0.6924 - val_loss: 0.4880 - val_accuracy: 0.7709 - 589ms/epoch - 26ms/step
Epoch 5/100
23/23 - 0s - loss: 0.5702 - accuracy: 0.7135 - val_loss: 0.4761 - val_accuracy: 0.8045 - 473ms/epoch - 21ms/step
Epoch 6/100
23/23 - 0s - loss: 0.5747 - accuracy: 0.7388 - val_loss: 0.5280 - val_accuracy: 0.7765 - 457ms/epoch - 20ms/step
Epoch 7/100
23/23 - 1s - loss: 0.5618 - accuracy: 0.7500 - val_loss: 0.4285 - val_accuracy: 0.7877 - 752ms/epoch - 33ms/step
Epoch 8/100
23/23 - 1s - loss: 0.5463 - accuracy: 0.7374 - val_loss: 0.4378 - val_accuracy: 0.8045 - 644ms/epoch - 28ms/step
Ep

## Predicciones

In [18]:
preds = model.predict(test)

preds[preds > 0.5] = 1
preds[preds <= 0.5] = 0



## Exportar predicciones

In [19]:
prediction = pd.DataFrame({'PassengerId': test_data.PassengerId.values, 'Survived': preds.ravel().astype('int64')})
prediction.to_csv('data/results.csv', index=False)

In [20]:
prediction.head(5)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
