## Importação de Dados e Bibliotecas

In [1]:
# Import the Pandas library
import pandas as pd
import numpy as np
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.optimizers import SGD, RMSprop, Adam
from keras.layers import Dense, Activation, Dropout
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, cross_val_score
import matplotlib.pylab as plt
from keras import regularizers

# Load the train and test datasets to create two DataFrames
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
train = pd.read_csv(train_url, index_col = 0)
train["teste"] = 0
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
test = pd.read_csv(test_url, index_col = 0)
test["teste"] = 1

Using TensorFlow backend.


## Análise dos Dados Importados

In [2]:
print('\nHeader - Test:')
print(test.head())
print('\n\nHeader - Train:')
print(train.head())


print('\n\nShape - Test:')
print(test.shape)
print('\n\nShape - Train:')
print(train.shape)

print('\n\nDados estatísticos - Test:')
print(test.describe())

print('\n\nDados estatísticos - Train:')
print(train.describe())


Header - Test:
             Pclass                                          Name     Sex  \
PassengerId                                                                 
892               3                              Kelly, Mr. James    male   
893               3              Wilkes, Mrs. James (Ellen Needs)  female   
894               2                     Myles, Mr. Thomas Francis    male   
895               3                              Wirz, Mr. Albert    male   
896               3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

              Age  SibSp  Parch   Ticket     Fare Cabin Embarked  teste  
PassengerId                                                              
892          34.5      0      0   330911   7.8292   NaN        Q      1  
893          47.0      1      0   363272   7.0000   NaN        S      1  
894          62.0      0      0   240276   9.6875   NaN        Q      1  
895          27.0      0      0   315154   8.6625   NaN        S      1  


Nota-se que temos:

*PassengerId:* valor único para cada passageiro

*Survived:* 0 se morreu, 1 se sobreviveu

*Pclass:* 1, 2, 3 - 1a, 2a e 3a classe

*Name:* nome - único para cada passageiro (possível descobrir tamanho da família e o sexo)
    Formato: Sobrenome, Título Nome
    -- Utilizaremos apenas o Título

*Sex:* female, male

*Age:* int
    -- Dividiremos entre criança, adulto e idoso. Como apresentado na média, desvio padrão e mediana, a maior parte das pessoas são adultas. Então todos aqueles que não tiverem idade informada, consideraremos adultos.

*Sibsp (Siblings Spouse):* quantidade de irmãos e esposa

*Parch (Parent Children):* quantidade de pais e filhos

*Ticket:* único para cada passageiro
    -- Desconsideraremos este dado

*Fare:* quantidade paga pelo ticket (possível determinar a classe - possível que primeira classe tenha sido salva antes da terceira

*Cabin:* algumas são mais próximas à porta
    -- Devido ao grande número de NaN, desconsideraremos este dado

*Embarked:* portão de embarque

## Definição das Funções

In [4]:
def get_title_last_name(name):
    full_name = name['Name'].str.split(', ', n=0, expand=True)
    last_name = full_name[0]
    titles = full_name[1].str.split('.', n=0, expand=True)
    titles = titles[0]
    return(titles)

def normalize(series):
    return (series - series.min()) / (series.max() - series.min())

def build_model():
    m = Sequential([
        Dense(100, activation='relu', input_dim=feature_count),
        Dropout(0.3),
        Dense(30, activation='relu'),
        Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.03)),
    ])
    m.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])
    return m

def fix_age(titanic):
    #logica idade:
    # <18 - child (0)
    # >18<50 - adult (1)
    # > 50 - elderly (2)

    # Create the column Age_Cat and assign to '1' (default)
    titanic['Age_Cat'] = 'Adult'
    # Assign 0 to passengers under 18, 2 to those who are 50 or older.
    titanic['Age_Cat'][titanic['Age'] < 18] = 'Child'
    titanic["Age_Cat"][titanic["Age"] >= 50] = 'Elderly'    
    titanic = titanic.drop('Age', axis=1)
    return titanic

def normaliza_num(titanic):
    titanic['Parch'] = normalize(titanic['Parch'])
    titanic['SibSp'] = normalize(titanic['SibSp'])
    titanic['Fare'] = normalize(titanic['Fare'])
    return titanic

def encod_cat(titanic):
    titanic = titanic.join(pd.get_dummies(titanic['Sex'], 'Sex')).drop('Sex', axis=1)
    titanic = titanic.join(pd.get_dummies(titanic['Embarked'], 'Embarked')).drop('Embarked', axis=1)
    titanic = titanic.join(pd.get_dummies(titanic['Age_Cat'], 'Age_Cat')).drop('Age_Cat', axis=1)
    titanic = titanic.join(pd.get_dummies(titanic['Title'], 'Title')).drop('Title', axis=1)
    titanic = titanic.join(pd.get_dummies(titanic['Pclass'], 'Pclass')).drop('Pclass', axis=1)
    return titanic


In [5]:
pd.options.mode.chained_assignment = None  # default='warn'
titanic = pd.concat((train, test), axis=0)
titanic['Title'] = get_title_last_name(titanic)

titanic = fix_age(titanic) # como tem muito NaN, iremos dar um fix
titanic = encod_cat(titanic) # one_hot nos atributos categoricos
titanic = normaliza_num(titanic) # normalizacao nos atributos numericos
titanic = titanic.drop(['Name', 'Cabin', 'Ticket'], axis=1) # descarta Nome: so consideraremos o titulo, cabine: muitos NaN, e Ticket
test_data = titanic[titanic['teste'] == 1]
titanic = titanic.drop(['teste'], axis=1)
print(titanic)

titanic = titanic.as_matrix()

feature_count = titanic.shape[1]

                 Fare     Parch  SibSp  Survived  Sex_female  Sex_male  \
PassengerId                                                              
1            0.014151  0.000000  0.125       0.0           0         1   
2            0.139136  0.000000  0.125       1.0           1         0   
3            0.015469  0.000000  0.000       1.0           1         0   
4            0.103644  0.000000  0.125       1.0           1         0   
5            0.015713  0.000000  0.000       0.0           0         1   
6            0.016510  0.000000  0.000       0.0           0         1   
7            0.101229  0.000000  0.000       0.0           0         1   
8            0.041136  0.111111  0.375       0.0           0         1   
9            0.021731  0.222222  0.000       1.0           1         0   
10           0.058694  0.000000  0.125       1.0           1         0   
11           0.032596  0.111111  0.125       1.0           1         0   
12           0.051822  0.000000  0.000

## 10 cross fold validation

In [6]:
model = KerasClassifier(build_fn=build_model, nb_epoch=500, batch_size=30, verbose=2)
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1000)
results = cross_val_score(model, titanic [:len(train)], train['Survived'], cv=cv, n_jobs=-1)


Epoch 1/10
Epoch 1/10
Epoch 1/10
Epoch 1/10
Epoch 1/10
Epoch 1/10
Epoch 1/10
Epoch 1/10
0s - loss: 0.5532 - acc: 0.8416
Epoch 2/10
Epoch 1/10
Epoch 1/10
0s - loss: 0.3546 - acc: 0.9002
Epoch 3/10
0s - loss: 0.2253 - acc: 0.9651
Epoch 4/10
1s - loss: 0.5575 - acc: 0.8317
Epoch 2/10
1s - loss: 0.5585 - acc: 0.8354
Epoch 2/10
1s - loss: 0.5574 - acc: 0.8352
Epoch 2/10
0s - loss: 0.1424 - acc: 0.9938
Epoch 5/10
0s - loss: 0.3562 - acc: 0.9040
0s - loss: 0.3589 - acc: 0.9040
Epoch 3/10
Epoch 3/10
0s - loss: 0.3526 - acc: 0.9039
Epoch 3/10
0s - loss: 0.1021 - acc: 1.0000
Epoch 6/10
0s - loss: 0.2243 - acc: 0.9613
Epoch 4/10
0s - loss: 0.2235 - acc: 0.9676
Epoch 4/10
1s - loss: 0.5594 - acc: 0.8292
Epoch 2/10
0s - loss: 0.5581 - acc: 0.8182
Epoch 2/10
0s - loss: 0.0792 - acc: 1.0000
Epoch 7/10
0s - loss: 0.2246 - acc: 0.9538
0s - loss: 0.1490 - acc: 0.9900
0s - loss: 0.3488 - acc: 0.9127
Epoch 3/10
Epoch 4/10
Epoch 5/10
0s - loss: 0.1444 - acc: 0.9938
Epoch 5/10
0s - loss: 0.3517 - acc: 0.911

## Build and Predict Model

In [7]:
model = build_model()
model.fit(titanic [:len(train)], train['Survived'], epochs=500, batch_size=10, verbose=2)

Epoch 1/500
0s - loss: 0.4368 - acc: 0.8788
Epoch 2/500
0s - loss: 0.1712 - acc: 0.9787
Epoch 3/500
0s - loss: 0.0786 - acc: 0.9989
Epoch 4/500
0s - loss: 0.0470 - acc: 1.0000
Epoch 5/500
0s - loss: 0.0269 - acc: 1.0000
Epoch 6/500
0s - loss: 0.0155 - acc: 1.0000
Epoch 7/500
0s - loss: 0.0084 - acc: 1.0000
Epoch 8/500
0s - loss: 0.0051 - acc: 1.0000
Epoch 9/500
0s - loss: 0.0034 - acc: 1.0000
Epoch 10/500
0s - loss: 0.0025 - acc: 1.0000
Epoch 11/500
0s - loss: 0.0018 - acc: 1.0000
Epoch 12/500
0s - loss: 0.0014 - acc: 1.0000
Epoch 13/500
0s - loss: 0.0011 - acc: 1.0000
Epoch 14/500
0s - loss: 8.5199e-04 - acc: 1.0000
Epoch 15/500
0s - loss: 6.9279e-04 - acc: 1.0000
Epoch 16/500
0s - loss: 5.5748e-04 - acc: 1.0000
Epoch 17/500
0s - loss: 4.5304e-04 - acc: 1.0000
Epoch 18/500
0s - loss: 3.7469e-04 - acc: 1.0000
Epoch 19/500
0s - loss: 3.2411e-04 - acc: 1.0000
Epoch 20/500
0s - loss: 2.7922e-04 - acc: 1.0000
Epoch 21/500
0s - loss: 2.5284e-04 - acc: 1.0000
Epoch 22/500
0s - loss: 2.2389e-

<keras.callbacks.History at 0x7f7f5acc8a20>

In [8]:
p_survived = model.predict_classes(titanic [:len(test)], verbose=2)

## Build CSV for Submission

In [9]:
submission = pd.DataFrame()
submission['PassengerId'] = test_data.index
submission['Survived'] = p_survived

submission.to_csv('my_titanic_neuralnetwork.csv', index=False)