# Kaggle Titanic: aller plus loin

In [1]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler

train = pd.read_csv("../datasources/titanic/train.csv")
test = pd.read_csv("../datasources/titanic/test.csv")
full = pd.concat([train, test]) # Assemble les deux jeux de données

# Travail sur le ticket

In [2]:
noticket = []
full['Ticket'].fillna('X')
for ticketnn in full['Ticket']:
    if (ticketnn == 'X'):
        noticket.append(1)
    else:
        noticket.append(0)
pd.DataFrame(noticket)[0].value_counts()

0    1309
Name: 0, dtype: int64

# Tout le monde a un ticket !

In [3]:
test['Ticket'].value_counts().head()

PC 17608    5
CA. 2343    4
113503      4
347077      3
PC 17483    3
Name: Ticket, dtype: int64

#  Par contre les tickets ne sont pas uniques ! il va donc falloir diviser le prix par nombre de ticket

In [4]:
# Inutile / juste pour un exemple de groupby, on va utiliser le count à la place ;-)
#listunit = pd.DataFrame(test.groupby(['Ticket'])['Fare'].mean())
#listunit['Ticket'] = listunit.index

# Prépartion d'un DF (TicketCounts) contenant les ticket avec leur nb d'occurence
TicketCounts = pd.DataFrame(test['Ticket'].value_counts().head())
TicketCounts['TicketCount'] = TicketCounts['Ticket'] # renomme la colonne Ticket
TicketCounts['Ticket'] = TicketCounts.index # rajoute une colonne Ticket pour le merge (jointure)

# Reporte le résultat dans le dataframe test (jointure des datasets)
fin = pd.merge(test, TicketCounts, how='left', on='Ticket')
fin['PrixUnitaire'] = fin['Fare'] / fin['TicketCount'].fillna(1)

#prxunit = pd.DataFrame(fin['PrixUnitaire'])
display(fin.head(3))

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,TicketCount,PrixUnitaire
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,,7.8292
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,,7.0
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,,9.6875


# Attention on a un passager qui n'a pas de prix de ticket

In [5]:
import numpy as np
test.loc[np.isnan(test['Fare'])]

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
152,1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S


In [6]:
# prennons la moyenne des embarqués à S... !
test.loc[test['Pclass'] == 3]['Fare'].mean()

12.459677880184334

# Travail sur le nom

In [None]:
familynames = []
for noms in full["Name"]:
    familynames.append(re.search('([A-Za-z0-9]*),\ ([A-Za-z0-9 ]*)\. (.*)', noms).group(1))
pdfamilynames = pd.DataFrame(familynames, columns = ['familynames'])

# Créé une liste des noms de famille avec plus de 2 occurences
famsurv = full.join(pdfamilynames)
famCount = famsurv['familynames'].value_counts()
pdfamCounts = pd.DataFrame(famCount, columns = ['familynames'])
pdfamCounts['famCount'] = pdfamCounts['familynames']
pdfamCounts['familynames'] = pdfamCounts.index
pdfamCounts = pdfamCounts[pdfamCounts['famCount'] >= 2]
print(pdfamCounts)

# Fonction ajoutant les colonnes noms famille dans un DF
def addColumnFamilyName(data):
    # ajoute les colonnes nulles avec les noms de famille
    for family in pdfamCounts['familynames']:
        data[family] = 0
    # récupère le nom de famille dans le DF
    for idx, f in enumerate(data["Name"]):
        # Modifie les colonnes dummies du nom de famille en 1 ou 0 selon le nom de famille
        iNom = re.search('([A-Za-z0-9]*),\ ([A-Za-z0-9 ]*)\. (.*)', f).group(1)
        for col in data.columns:
            if (col == iNom):
                data.loc[idx, col] = 1


# Travail sur le titre

In [26]:
full['Titre'] = full.Name.map(lambda x : x.split(",")[1].split(".")[0])
full['NomFamille'] = full.Name.map(lambda x : x.split(",")[0])
titre = pd.DataFrame(full['Titre'])
full['Titre'].value_counts() # affiche tous les titres possible

 Mr              757
 Miss            260
 Mrs             197
 Master           61
 Dr                8
 Rev               8
 Col               4
 Mlle              2
 Major             2
 Ms                2
 Dona              1
 Jonkheer          1
 the Countess      1
 Don               1
 Mme               1
 Capt              1
 Lady              1
 Sir               1
Name: Titre, dtype: int64

In [27]:
X = test
X['Rang'] = 0
X['Titre'] = X.Name.map(lambda x : x.split(",")[1].split(".")[0])
vip = ['Don','Sir', 'Major', 'Col', 'Jonkheer', 'Dr', 'Rev']
femmeenfant = ['Miss', 'Mrs', 'Lady', 'Mlle', 'the Countess', 'Ms', 'Mme', 'Dona', 'Master']
for idx, titre in enumerate(X['Titre']): 
    if (titre.strip() in femmeenfant) :
        X.loc[idx, 'Rang'] = 'FE'
    elif (titre.strip() in vip) :
        X.loc[idx, 'Rang'] = 'VIP'
    else :
        X.loc[idx, 'Rang'] = 'Autres'
X['Rang'].value_counts()

Autres    240
FE        173
VIP         5
Name: Rang, dtype: int64

# Travail sur l'age

In [45]:
age = X['Age'].fillna(X['Age'].mean())
catAge = []
for i in range(X.shape[0]) :
    if age[i] < 3:
        catAge.append("bebe")
    elif age[i] >= 3 and age[i] < 15:
        catAge.append("enfant")
    elif age[i] >= 15 and age[i] < 60:
        catAge.append("adulte") 
    else:
        catAge.append("vieux")
print(pd.DataFrame(catAge, columns = ['catAge'])['catAge'].value_counts())
cat = pd.get_dummies(pd.DataFrame(catAge, columns = ['catAge']), prefix='catAge')
cat.head(3)

adulte    373
enfant     21
vieux      14
bebe       10
Name: catAge, dtype: int64


Unnamed: 0,catAge_adulte,catAge_bebe,catAge_enfant,catAge_vieux
0,1,0,0,0
1,1,0,0,0
2,0,0,0,1


# Création d'une fonction de préparation

In [46]:
def dataprep(data):
    # Sexe
    sexe = pd.get_dummies(data['Sex'], prefix='sex')
    
    # Cabine, récupération du pont (on remplace le pont T proche du pont A)
    cabin = pd.get_dummies(data['Cabin'].fillna('X').str[0].replace('T', 'A'), prefix='Cabin')
    
    # Age et catégories d'age
    age = data['Age'].fillna(data['Age'].mean())
    catAge = []
    for i in range(data.shape[0]) :
        if age[i] < 3:
            catAge.append("bebe")
        elif age[i] >= 3 and age[i] < 15:
            catAge.append("enfant")
        elif age[i] >= 15 and age[i] < 60:
            catAge.append("adulte") 
        else:
            catAge.append("vieux")
    catage = pd.get_dummies(pd.DataFrame(catAge, columns = ['catAge']), prefix='catAge')
    
    # Titre et Rang
    data['Titre'] = data.Name.map(lambda x : x.split(",")[1].split(".")[0]).fillna('X')
    data['Rang'] = 0
    vip = ['Don','Sir', 'Major', 'Col', 'Jonkheer', 'Dr']
    femmeenfant = ['Miss', 'Mrs', 'Lady', 'Mlle', 'the Countess', 'Ms', 'Mme', 'Dona', 'Master']
    for idx, titre in enumerate(data['Titre']): 
        if (titre.strip() in femmeenfant) :
            data.loc[idx, 'Rang'] = 'FE'
        elif (titre.strip() in vip) :
            data.loc[idx, 'Rang'] = 'VIP'
        else :
            data.loc[idx, 'Rang'] = 'Autres'
    rg = pd.get_dummies(data['Rang'], prefix='Rang')
    
    # Embarquement
    emb = pd.get_dummies(data['Embarked'], prefix='emb')
    
    # Prix unitaire - Ticket, Prépartion d'un DF (TicketCounts) contenant les ticket avec leur nb d'occurence
    TicketCounts = pd.DataFrame(data['Ticket'].value_counts().head())
    TicketCounts['TicketCount'] = TicketCounts['Ticket'] # renomme la colonne Ticket
    TicketCounts['Ticket'] = TicketCounts.index # rajoute une colonne Ticket pour le merge (jointure)
    # reporte le résultat dans le dataframe test (jointure des datasets)
    fin = pd.merge(data, TicketCounts, how='left', on='Ticket')
    fin['PrixUnitaire'] = fin['Fare'] / fin['TicketCount'].fillna(1)
    prxunit = pd.DataFrame(fin['PrixUnitaire'])
    # Prix moyen 3eme classe (pour le passager de 3eme qui n'a pas de prix) ... on aurait pu faire une fonction ici ;-)
    prx3eme = data.loc[data['Pclass'] == 3]['Fare'].mean()
    prxunit = prxunit['PrixUnitaire'].fillna(prx3eme)
    
    # Classe
    pc = pd.DataFrame(MinMaxScaler().fit_transform(data[['Pclass']]), columns = ['Classe'])
    
    dp = data[['SibSp', 'Parch', 'Name']].join(pc).join(sexe).join(emb).join(prxunit).join(cabin).join(age).join(catage).join(rg)
    addColumnFamilyName(dp)
    del dp['Name']
    
    return dp

In [47]:
Xtrain = dataprep(train)
Xtest = dataprep(test)

In [48]:
Xtrain.describe(include='all')

Unnamed: 0,SibSp,Parch,Classe,sex_female,sex_male,emb_C,emb_Q,emb_S,PrixUnitaire,Cabin_A,...,Robbins,Vovk,Yousseff,Brocklebank,Lemore,Badt,Keefe,Hodges,Dantcheff,Potter
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,...,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,0.523008,0.381594,0.654321,0.352413,0.647587,0.188552,0.08642,0.722783,30.658908,0.017957,...,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122,0.001122
std,1.102743,0.806057,0.418036,0.47799,0.47799,0.391372,0.281141,0.447876,49.724754,0.132871,...,0.033501,0.033501,0.033501,0.033501,0.033501,0.033501,0.033501,0.033501,0.033501,0.033501
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,7.8958,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,13.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,27.9,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8.0,6.0,1.0,1.0,1.0,1.0,1.0,1.0,512.3292,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


# Entrainement du modèle sur un Random Forest

In [49]:
y = train.Survived

#clf = LinearSVC(random_state=4)
#clf.fit(Xtrain, y)
#p_tr = clf.predict(Xtrain)
#print ("Score Train -->", round(clf.score(Xtrain, y) *100,4), " %")

rf = RandomForestClassifier(n_estimators=100, random_state=1, max_features=1)
rf.fit(Xtrain, y)
p_tr = rf.predict(Xtrain)
print ("Score Train -->", round(rf.score(Xtrain, y) *100,4), " %")

Score Train --> 99.8878  %


# Formattage des résultats pour Kaggle ;-)

In [50]:
p_test = clf.predict(Xtest)
result = pd.DataFrame(test['PassengerId'])
pred = pd.DataFrame(p_test, columns=['Survived'])
result = result.join(pred)

In [51]:
result.describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.964115
std,120.810458,0.186227
min,892.0,0.0
25%,996.25,1.0
50%,1100.5,1.0
75%,1204.75,1.0
max,1309.0,1.0


In [52]:
result.to_csv("./data/result.csv", columns=["PassengerId", "Survived"], index=False)

# Score Kaggle : 0.76555