 # Competición Titanic con Keras


 ## Importar librerías y datos

In [143]:
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the train and test CSV files from the local data folder.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

 ## Unir los dataframes.

In [144]:
# Stack train and test so every preprocessing step is applied to both at once.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the test rows reuse the train labels (0, 1, 2, ...) and any label-based
# lookup on the combined frame becomes ambiguous.
data = pd.concat([train_data, test_data], ignore_index=True)


 ## Ingeniería de datos

In [145]:
# The training rows were concatenated first, so they occupy the first
# len(train_data) positions of `data`. Deriving the split point avoids a
# hard-coded row count that silently breaks if the input file changes.
n_train = len(train_data)
y_train = data.iloc[:n_train]['Survived']

# Drop the columns that are not used as model inputs (the target included,
# now that it has been saved in y_train).
features_to_remove = ['PassengerId', 'Ticket', 'Cabin', 'Survived', 'Fare']
data = data.drop(columns=features_to_remove)

# Replace the categorical Sex values with integer codes (0 and 1).
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

Verificamos si hay valores nulos.

In [146]:
# Count the missing values in each remaining column.
data.isna().sum()

Pclass        0
Name          0
Sex           0
Age         263
SibSp         0
Parch         0
Embarked      2
dtype: int64

 ### Generación de nuevos atributos

 A partir de ``SibSp`` y ``Parch`` generamos un nuevo atributo, ``IsAlone``, que indica si el pasajero viajaba solo o con algún pariente, y que reemplaza a las dos columnas originales.

In [147]:
# A passenger is flagged as alone (1) when SibSp + Parch is zero, otherwise 0.
relatives_aboard = data['SibSp'] + data['Parch']
data['IsAlone'] = (relatives_aboard == 0).astype('int64')

# The derived flag replaces the two source columns.
data = data.drop(columns=['SibSp', 'Parch'])


 A partir de los prefijos de los nombres de los pasajeros se genera un nuevo atributo que representa el título de cada persona: ``Title``.


In [148]:
# Extraer prefijo del nombre de las personas.
# Extract the title from each name: the run of letters that follows a space
# and ends with a period. The pattern is a raw string so that `\.` reaches the
# regex engine as an escaped dot instead of being an invalid Python string
# escape (which triggers a DeprecationWarning/SyntaxWarning).
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse the uncommon titles into a single 'Rare' bucket and map spelling
# variants onto their common form, all in one replace call.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
               'Sir', 'Jonkheer', 'Dona']
title_aliases = {title: 'Rare' for title in rare_titles}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})
data['Title'] = data['Title'].replace(title_aliases)

# Integer-encode the resulting titles, then drop the raw name column.
data['Title'] = LabelEncoder().fit_transform(data['Title'])
data = data.drop(columns=['Name'])


 Rellenamos los valores faltantes de ``Age`` según su relación con ``Sex`` y ``Pclass``.

In [149]:
# Impute each missing Age with the mean age of the rows that share the same
# Sex and Pclass, rounded to the nearest 0.5. A grouped transform computes the
# per-row group mean in one pass instead of looping over every (sex, class)
# pair by hand.
group_mean_age = data.groupby(['Sex', 'Pclass'])['Age'].transform('mean')
guess_ages = (group_mean_age * 2).round() / 2

# np.where on the raw arrays works positionally, so no index alignment is
# involved when picking between the known age and the group estimate.
data['Age'] = np.where(data['Age'].isna().to_numpy(),
                       guess_ages.to_numpy(),
                       data['Age'].to_numpy())

# Bucket the ages into five labelled ranges. The last edge is one above the
# largest age so the maximum falls inside the final (right-closed) bin.
# Series.max() is used rather than the builtin max(), which iterates the
# Series element by element.
age_upper_edge = data['Age'].max() + 1
data['AgeCat'] = pd.cut(data['Age'],
                        bins=[0, 15, 30, 45, 60, age_upper_edge],
                        labels=[1, 2, 3, 4, 5])
data = data.drop(columns=['Age'])


 Rellenamos ``Embarked`` con el valor más común.

In [150]:
# Fill the missing Embarked values with the most frequent one.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the `data['Embarked']` selection is chained assignment, which pandas
# deprecates and which leaves `data` unchanged under Copy-on-Write.
most_common_embarked = data['Embarked'].mode()[0]
data['Embarked'] = data['Embarked'].fillna(most_common_embarked)


 Convertimos las columnas de datos categóricos a variables dummy.

In [151]:
# One-hot encode the categorical columns; each indicator column is named
# after its source column followed by the encoded value.
categorical_columns = ['Pclass', 'Embarked', 'Title']
data = pd.get_dummies(
    data,
    columns=categorical_columns,
    prefix=categorical_columns,
)


 Split y visualización de ``data``.

In [152]:
# Split the processed frame back into its train and test parts. The training
# rows were concatenated first, so the boundary is len(train_data) rather than
# a hard-coded row count.
n_train = len(train_data)
X_train = data.iloc[:n_train]
test = data.iloc[n_train:]


In [153]:
# Display the processed training features.
X_train


Unnamed: 0,Sex,IsAlone,AgeCat,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Title_0,Title_1,Title_2,Title_3,Title_4
0,1,0,2,0,0,1,0,0,1,0,0,1,0,0
1,0,0,3,1,0,0,1,0,0,0,0,0,1,0
2,0,1,2,0,0,1,0,0,1,0,1,0,0,0
3,0,0,3,1,0,0,0,0,1,0,0,0,1,0
4,1,1,3,0,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,1,1,2,0,1,0,0,0,1,0,0,0,0,1
887,0,1,2,1,0,0,0,0,1,0,1,0,0,0
888,0,0,2,0,0,1,0,0,1,0,1,0,0,0
889,1,1,2,1,0,0,1,0,0,0,0,1,0,0


 ## Definición de modelo

In [160]:
def build_model(n_features=None, hidden_units=10, learning_rate=0.00251):
  """Build and compile a small fully connected binary classifier.

  The network has three ReLU hidden layers of equal width followed by a single
  sigmoid output unit. It is compiled with binary cross-entropy loss, the Adam
  optimizer, and accuracy as the reported metric.

  Args:
    n_features: Number of input columns. When None (the default) it is read
      from the notebook-level ``X_train`` frame, which keeps a bare
      ``build_model()`` call working exactly as before.
    hidden_units: Number of units in each of the three hidden layers.
    learning_rate: Learning rate passed to the Adam optimizer.

  Returns:
    The compiled ``keras.Sequential`` model.
  """
  if n_features is None:
    # Fall back to the training frame defined earlier in the notebook.
    n_features = len(X_train.keys())

  model = keras.Sequential([
    keras.layers.Dense(hidden_units, activation='relu', input_shape=[n_features]),
    keras.layers.Dense(hidden_units, activation='relu'),
    keras.layers.Dense(hidden_units, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
  ])

  model.compile(loss='binary_crossentropy',
                optimizer=Adam(learning_rate=learning_rate),
                metrics=['accuracy'])

  return model

# Set the random seed before the model is built so that repeated runs of this
# cell start from the same state.
keras.utils.set_random_seed(0)

# Build the network with its default settings and print the layer summary.
model = build_model()
model.summary()


Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_48 (Dense)            (None, 10)                150       
                                                                 
 dense_49 (Dense)            (None, 10)                110       
                                                                 
 dense_50 (Dense)            (None, 10)                110       
                                                                 
 dense_51 (Dense)            (None, 1)                 11        
                                                                 
Total params: 381
Trainable params: 381
Non-trainable params: 0
_________________________________________________________________


 ## Entrenamiento del modelo


In [161]:
# Train for 70 epochs with a batch size of 5, holding out 20% of the training
# rows for validation. verbose=2 prints one summary line per epoch.
history = model.fit(
    X_train,
    y_train,
    epochs=70,
    batch_size=5,
    validation_split=0.2,
    verbose=2,
)


Epoch 1/70
143/143 - 2s - loss: 0.5628 - accuracy: 0.7360 - val_loss: 0.4180 - val_accuracy: 0.8101 - 2s/epoch - 17ms/step
Epoch 2/70
143/143 - 1s - loss: 0.4571 - accuracy: 0.7879 - val_loss: 0.3848 - val_accuracy: 0.8380 - 515ms/epoch - 4ms/step
Epoch 3/70
143/143 - 1s - loss: 0.4413 - accuracy: 0.8062 - val_loss: 0.3854 - val_accuracy: 0.8492 - 519ms/epoch - 4ms/step
Epoch 4/70
143/143 - 0s - loss: 0.4414 - accuracy: 0.8034 - val_loss: 0.3798 - val_accuracy: 0.8436 - 421ms/epoch - 3ms/step
Epoch 5/70
143/143 - 0s - loss: 0.4329 - accuracy: 0.8090 - val_loss: 0.3774 - val_accuracy: 0.8380 - 447ms/epoch - 3ms/step
Epoch 6/70
143/143 - 0s - loss: 0.4357 - accuracy: 0.8104 - val_loss: 0.3663 - val_accuracy: 0.8436 - 472ms/epoch - 3ms/step
Epoch 7/70
143/143 - 0s - loss: 0.4311 - accuracy: 0.8244 - val_loss: 0.3720 - val_accuracy: 0.8492 - 431ms/epoch - 3ms/step
Epoch 8/70
143/143 - 0s - loss: 0.4308 - accuracy: 0.8090 - val_loss: 0.3709 - val_accuracy: 0.8659 - 458ms/epoch - 3ms/step
Ep

 ## Evaluar con datos de 100% de accuracy
 Para evitar tener que subir constantemente la submission a Kaggle, comparamos las predicciones con el resultado final de la competición, que se encuentra en un repositorio en GitHub.

In [162]:
# Reference labels for the test rows, read from a local CSV file.
reference_submission = pd.read_csv('data/submission_100accuracy.csv')
y_test = reference_submission['Survived']

# Last expression of the cell, so the [loss, accuracy] pair is displayed.
model.evaluate(test, y_test)




[0.5450281500816345, 0.7942583560943604]

 ## Predicciones

In [163]:
# Raw model outputs for the test rows (one sigmoid value per row).
survival_probs = model.predict(test)

# Turn the probabilities into hard 0/1 labels with a single comparison at 0.5.
# This replaces two in-place mask assignments whose result depended on the
# order they ran in and which overwrote the probabilities themselves.
preds = (survival_probs > 0.5).astype(survival_probs.dtype)




 ## Exportar predicciones

In [164]:
# Assemble the submission table: the test PassengerId values next to the
# flattened predictions cast to integers.
passenger_ids = test_data.PassengerId.values
survived_labels = preds.ravel().astype('int64')
prediction = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': survived_labels})

# Write the table to disk without the index column.
prediction.to_csv('data/results.csv', index=False)


In [None]:
# Preview the first five rows of the exported predictions.
prediction.head(5)


