In [1]:
import pandas as pd
import re
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

pd.options.display.max_columns = None
#pd.set_printoptions(max_columns=500)

TRAIN = pd.read_csv("../../../datasources/titanic/train.csv")
TEST = pd.read_csv("../../../datasources/titanic/test.csv")
FULL = pd.concat([TRAIN, TEST]) # Assemble les deux jeux de données

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [2]:
###################################
# Fonction globale de préparation : Retourne le DataFrame préparé
###################################
def featureEngineering(data):
    fulldim = data.copy()
    
    # Sexe
    sexe = pd.get_dummies(fulldim['Sex'], prefix='sex')
    fulldim = fulldim.join(sexe)

    # Cabine, récupération du pont (on remplace le pont T proche du pont A)
    pont = pd.get_dummies(fulldim['Cabin'].fillna('X').str[0].replace('T', 'A'), prefix='Cabin')
    fulldim = fulldim.join(pont)
    
    # Cabine à Babord ou Tribord ?
    fulldim['CabinNumber'] = 0
    fulldim['CabinBaTri'] = 0
    fulldim['Cabin'] = fulldim['Cabin'].fillna('X0')
    for idx, cab in enumerate(fulldim['Cabin']):
        cb = cab.split(" ")[0]
        if cb[1:].isnumeric():
            numcab = int(cb[1:])
        else:
            numcab = 0
        # Récupère le numéro de cabine
        fulldim.loc[idx, 'CabinNumber']= numcab
        # Affecte pair (0) ou Impair (1) / Babord ou Tribord
        if (numcab > 0): 
            fulldim.loc[idx, 'CabinBaTri'] = int(numcab % 2)
        else:
            fulldim.loc[idx, 'CabinBaTri'] = 2 # inconnu !
    
    # Titre et Rang
    fulldim['Titre'] = fulldim.Name.map(lambda x : x.split(",")[1].split(".")[0]).fillna('X')
    fulldim['Rang'] = 0
    vip = ['Don','Sir', 'Major', 'Col', 'Jonkheer', 'Dr']
    femmeenfant = ['Miss', 'Mrs', 'Lady', 'Mlle', 'the Countess', 'Ms', 'Mme', 'Dona', 'Master']
    for idx, titre in enumerate(fulldim['Titre']): 
        if (titre.strip() in femmeenfant) :
            fulldim.loc[idx, 'Rang'] = 'FE'
        elif (titre.strip() in vip) :
            fulldim.loc[idx, 'Rang'] = 'VIP'
        else :
            fulldim.loc[idx, 'Rang'] = 'Autres'
    rg = pd.get_dummies(fulldim['Rang'], prefix='Rang')
    fulldim = fulldim.join(rg)
    
    # Age et catégories d'age
    age = fulldim['Age'].fillna(fulldim['Age'].median()) # rempl. NaN par Age médian
    catAge = []
    for i in range(fulldim.shape[0]) :
        if age[i] <= 3:
            catAge.append("bebe")
        elif age[i] > 3 and age[i] <= 16:
            catAge.append("enfant")
        elif age[i] > 16 and age[i] < 60:
            catAge.append("adulte") 
        else:
            catAge.append("vieux")
    catage = pd.DataFrame(catAge, columns = ['catAge'])
    # Force la catégorie d'age pour les Master (jeunes hommes)
    for idx, titre in enumerate(fulldim['Titre']): 
        if titre.strip() == 'Master':
            catage.loc[idx, 'catAge'] = 'enfant'      
    dumAge = pd.get_dummies(pd.DataFrame(catage, columns = ['catAge']), prefix='catAge')
    fulldim = fulldim.join(dumAge)
    
    # Embarquement ! est-ce nécessaire ???
    emb = pd.get_dummies(fulldim['Embarked'], prefix='emb')
    #fulldim = fulldim.join(emb)
    
    # Prix unitaire - Ticket, Prépartion d'un DF (TicketCounts) contenant les ticket avec leur nb d'occurence
    TicketCounts = pd.DataFrame(fulldim['Ticket'].value_counts().head())
    TicketCounts['TicketCount'] = TicketCounts['Ticket'] # renomme la colonne Ticket
    TicketCounts['Ticket'] = TicketCounts.index # rajoute une colonne Ticket pour le merge (jointure)
    # reporte le résultat dans le dataframe test (jointure des datasets)
    fin = pd.merge(fulldim, TicketCounts, how='left', on='Ticket')
    fin['PrixUnitaire'] = fin['Fare'] / fin['TicketCount'].fillna(1)
    prxunit = pd.DataFrame(fin['PrixUnitaire'])
    # Prix moyen 3eme classe (pour le passager de 3eme qui n'a pas de prix) ... on aurait pu faire une fonction ici ;-)
    prx3eme = fulldim.loc[fulldim['Pclass'] == 3]['Fare'].mean()
    prxunit = prxunit['PrixUnitaire'].fillna(prx3eme)
    fulldim = fulldim.join(prxunit)
    
    # Classe
    #pc = pd.DataFrame(MinMaxScaler().fit_transform(data[['Pclass']]), columns = ['Classe'])
    pc = pd.get_dummies(fulldim['Pclass'], prefix='Classe')
    fulldim = fulldim.join(pc)
    
    # family count
    fulldim['familyCount'] = fulldim['SibSp'] + fulldim['Parch']
    
    # Supprime les colonnes d'origine et de travail (temporaires)
    columns = ['Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Pclass', 'Cabin', 'Embarked',
               'Titre', 'Rang', 'CabinNumber']
    fulldim.drop(columns, inplace=True, axis=1)
    
    return fulldim

In [3]:
Xtrain = featureEngineering(TRAIN)
del Xtrain['Survived']
Xtest = featureEngineering(TEST)
ytrain = TRAIN.Survived

In [4]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(units=Xtrain.shape[1], activation='relu', input_dim=Xtrain.shape[1]))
model.add(Dense(activation="relu", units=100, kernel_initializer="uniform"))
model.add(Dense(activation="sigmoid", units=1, kernel_initializer="uniform"))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
model.fit(Xtrain, ytrain, epochs=50, batch_size=30)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f50c380a588>

In [5]:
scores = model.evaluate(Xtrain, ytrain)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
predictions = model.predict(Xtest)
print([1 if x >= 0.5 else 0 for x in predictions])


acc: 82.04%
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,