In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn import tree

In [None]:
gender = pd.read_csv("../data/raw/titanic/gender_submission.csv")
gender.info()
gender.head()

In [2]:
train = pd.read_csv("../data/raw/titanic/train.csv")
train.info()
train.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
test = pd.read_csv("../data/raw/titanic/test.csv") # dataset used by Kaggle to evaluate our model performance egal to everyone
test.info()
test.head()

In [3]:
train["Sex"].unique()

array(['male', 'female'], dtype=object)

In [4]:
train["Pclass"].unique()

array([3, 1, 2])

In [5]:
train["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [None]:
fig = px.histogram(train, x = "Sex", color = "Survived", barmode = "group", title = "Proportions of men and women who survived and died")
fig.show()

In [None]:
fig = px.histogram(train, x = "Sex", color = "Survived", barmode = "group", facet_col = "Pclass",
                  title = "Proportions of men and women who survived and died according to their social class")
fig.show()

In [None]:
fig = px.histogram(train, x = "Embarked", color = "Survived", barmode = "group", facet_col = "Pclass",
                  title = "Proportions of people who survived and died according to the embarked place and their social class")
fig.show()

In [None]:
fig = px.histogram(train, x = "Embarked", color = "Sex", barmode = "group", facet_col = "Pclass",
                  title="Repartition of men and women in different embarked places according to their social class")
fig.show()

In [6]:
# Splitting the train dataset, to evaluate my model's performance
# Good practice: splitting the earlier as possible, to avoid that data from dataset could be found in the test dataset
# ex: filling missing data with mean of all data of the whole "Age" column: calculate the mean in the train dataset and report in
# in the test one
# ex: Embarked: missing values at 0 => to verify the impact on the dataset
X_train, X_test = train_test_split(
    train, test_size = 0.33, random_state = 42
)

In [None]:
X_train

In [None]:
X_test

In [7]:
def apply_male_int(sex):
    if sex == "male":
        sex = 1
    else:
        sex = 0
    return sex

In [8]:
X_train["male"] = X_train["Sex"].apply(apply_male_int)
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Name         596 non-null    object 
 4   Sex          596 non-null    object 
 5   Age          478 non-null    float64
 6   SibSp        596 non-null    int64  
 7   Parch        596 non-null    int64  
 8   Ticket       596 non-null    object 
 9   Fare         596 non-null    float64
 10  Cabin        134 non-null    object 
 11  Embarked     595 non-null    object 
 12  male         596 non-null    int64  
dtypes: float64(2), int64(6), object(5)
memory usage: 65.2+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1
718,719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q,1
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C,1
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C,1
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,0


In [9]:
def apply_Pclass_int(Pclass):
    if Pclass != 1:
        Pclass = 0
    return Pclass

In [10]:
X_train["Pclass1"] = X_train["Pclass"].apply(apply_Pclass_int)
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Name         596 non-null    object 
 4   Sex          596 non-null    object 
 5   Age          478 non-null    float64
 6   SibSp        596 non-null    int64  
 7   Parch        596 non-null    int64  
 8   Ticket       596 non-null    object 
 9   Fare         596 non-null    float64
 10  Cabin        134 non-null    object 
 11  Embarked     595 non-null    object 
 12  male         596 non-null    int64  
 13  Pclass1      596 non-null    int64  
dtypes: float64(2), int64(7), object(5)
memory usage: 69.8+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Pclass1
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1
718,719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q,1,0
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C,1,0
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C,1,0
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,0,0


In [11]:
def apply_Pclass(Pclass):
    if Pclass != 2:
        Pclass = 0
    return Pclass

In [12]:
X_train["Pclass2"] = X_train["Pclass"].apply(apply_Pclass)
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Name         596 non-null    object 
 4   Sex          596 non-null    object 
 5   Age          478 non-null    float64
 6   SibSp        596 non-null    int64  
 7   Parch        596 non-null    int64  
 8   Ticket       596 non-null    object 
 9   Fare         596 non-null    float64
 10  Cabin        134 non-null    object 
 11  Embarked     595 non-null    object 
 12  male         596 non-null    int64  
 13  Pclass1      596 non-null    int64  
 14  Pclass2      596 non-null    int64  
dtypes: float64(2), int64(8), object(5)
memory usage: 74.5+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Pclass1,Pclass2
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1,0
718,719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q,1,0,0
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C,1,0,2
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C,1,0,0
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,0,0,0


In [13]:
def apply_Pclass_last(Pclass):
    if Pclass != 3:
        Pclass = 0
    return Pclass

In [14]:
X_train["Pclass3"] = X_train["Pclass"].apply(apply_Pclass_last)
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Name         596 non-null    object 
 4   Sex          596 non-null    object 
 5   Age          478 non-null    float64
 6   SibSp        596 non-null    int64  
 7   Parch        596 non-null    int64  
 8   Ticket       596 non-null    object 
 9   Fare         596 non-null    float64
 10  Cabin        134 non-null    object 
 11  Embarked     595 non-null    object 
 12  male         596 non-null    int64  
 13  Pclass1      596 non-null    int64  
 14  Pclass2      596 non-null    int64  
 15  Pclass3      596 non-null    int64  
dtypes: float64(2), int64(9), object(5)
memory usage: 79.2+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Pclass1,Pclass2,Pclass3
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1,0,0
718,719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q,1,0,0,3
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C,1,0,2,0
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C,1,0,0,3
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,0,0,0,3


In [15]:
# replacing NaN values by the mean of the column in Age one
X_train["Age"].unique()

array([54.  ,   nan, 25.  , 26.  , 22.  , 31.  , 49.  , 19.  , 24.  ,
       16.  , 39.  , 47.  , 60.  , 27.  , 38.  , 44.  , 21.  , 45.  ,
        8.  , 32.  , 50.  , 23.  , 15.  , 28.  , 41.  , 33.  , 52.  ,
       20.  ,  9.  , 43.  , 17.  , 37.  , 62.  , 46.  , 29.  , 56.  ,
       59.  , 58.  ,  3.  , 30.  , 28.5 , 36.  ,  0.75, 35.  , 55.  ,
       51.  ,  2.  , 14.  , 45.5 ,  4.  , 40.5 , 40.  , 18.  , 12.  ,
       11.  ,  1.  ,  7.  , 70.5 , 34.  , 70.  , 42.  , 48.  , 80.  ,
       55.5 , 14.5 , 10.  , 53.  , 32.5 , 74.  , 64.  ,  6.  ,  5.  ,
       24.5 ,  0.42, 61.  ,  0.67, 13.  ,  0.83])

In [16]:
mean_age = X_train["Age"].mean()
mean_age

29.525983263598327

In [17]:
X_train["Age"].fillna(mean_age, inplace = True)
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Name         596 non-null    object 
 4   Sex          596 non-null    object 
 5   Age          596 non-null    float64
 6   SibSp        596 non-null    int64  
 7   Parch        596 non-null    int64  
 8   Ticket       596 non-null    object 
 9   Fare         596 non-null    float64
 10  Cabin        134 non-null    object 
 11  Embarked     595 non-null    object 
 12  male         596 non-null    int64  
 13  Pclass1      596 non-null    int64  
 14  Pclass2      596 non-null    int64  
 15  Pclass3      596 non-null    int64  
dtypes: float64(2), int64(9), object(5)
memory usage: 79.2+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Pclass1,Pclass2,Pclass3
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1,0,0
718,719,0,3,"McEvoy, Mr. Michael",male,29.525983,0,0,36568,15.5,,Q,1,0,0,3
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C,1,0,2,0
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C,1,0,0,3
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,0,0,0,3


In [18]:
def apply_embarked_S(embarked):
    if embarked != "S":
        embarked = 0
    else:
        embarked = 1
    return embarked

In [19]:
X_train["Embarked_S"] = X_train["Embarked"].apply(apply_embarked_S)
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Name         596 non-null    object 
 4   Sex          596 non-null    object 
 5   Age          596 non-null    float64
 6   SibSp        596 non-null    int64  
 7   Parch        596 non-null    int64  
 8   Ticket       596 non-null    object 
 9   Fare         596 non-null    float64
 10  Cabin        134 non-null    object 
 11  Embarked     595 non-null    object 
 12  male         596 non-null    int64  
 13  Pclass1      596 non-null    int64  
 14  Pclass2      596 non-null    int64  
 15  Pclass3      596 non-null    int64  
 16  Embarked_S   596 non-null    int64  
dtypes: float64(2), int64(10), object(5)
memory usage: 83.8+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Pclass1,Pclass2,Pclass3,Embarked_S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1,0,0,1
718,719,0,3,"McEvoy, Mr. Michael",male,29.525983,0,0,36568,15.5,,Q,1,0,0,3,0
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C,1,0,2,0,0
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C,1,0,0,3,0
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,0,0,0,3,1


In [20]:
def apply_embarked_Q(embarked):
    if embarked != "Q":
        embarked = 0
    else:
        embarked = 1
    return embarked

In [21]:
X_train["Embarked_Q"] = X_train["Embarked"].apply(apply_embarked_Q)
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Name         596 non-null    object 
 4   Sex          596 non-null    object 
 5   Age          596 non-null    float64
 6   SibSp        596 non-null    int64  
 7   Parch        596 non-null    int64  
 8   Ticket       596 non-null    object 
 9   Fare         596 non-null    float64
 10  Cabin        134 non-null    object 
 11  Embarked     595 non-null    object 
 12  male         596 non-null    int64  
 13  Pclass1      596 non-null    int64  
 14  Pclass2      596 non-null    int64  
 15  Pclass3      596 non-null    int64  
 16  Embarked_S   596 non-null    int64  
 17  Embarked_Q   596 non-null    int64  
dtypes: float64(2), int64(11), object(5)
memory usage: 88.5+

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Pclass1,Pclass2,Pclass3,Embarked_S,Embarked_Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1,0,0,1,0
718,719,0,3,"McEvoy, Mr. Michael",male,29.525983,0,0,36568,15.5,,Q,1,0,0,3,0,1
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C,1,0,2,0,0,0
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C,1,0,0,3,0,0
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,0,0,0,3,1,0


In [22]:
def apply_embarked_C(embarked):
    if embarked != "C":
        embarked = 0
    else:
        embarked = 1
    return embarked

In [23]:
X_train["Embarked_C"] = X_train["Embarked"].apply(apply_embarked_C)
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Name         596 non-null    object 
 4   Sex          596 non-null    object 
 5   Age          596 non-null    float64
 6   SibSp        596 non-null    int64  
 7   Parch        596 non-null    int64  
 8   Ticket       596 non-null    object 
 9   Fare         596 non-null    float64
 10  Cabin        134 non-null    object 
 11  Embarked     595 non-null    object 
 12  male         596 non-null    int64  
 13  Pclass1      596 non-null    int64  
 14  Pclass2      596 non-null    int64  
 15  Pclass3      596 non-null    int64  
 16  Embarked_S   596 non-null    int64  
 17  Embarked_Q   596 non-null    int64  
 18  Embarked_C   596 non-null    int64  
dtypes: float64(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,male,Pclass1,Pclass2,Pclass3,Embarked_S,Embarked_Q,Embarked_C
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,1,1,0,0,1,0,0
718,719,0,3,"McEvoy, Mr. Michael",male,29.525983,0,0,36568,15.5,,Q,1,0,0,3,0,1,0
685,686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25.0,1,2,SC/Paris 2123,41.5792,,C,1,0,2,0,0,0,1
73,74,0,3,"Chronopoulos, Mr. Apostolos",male,26.0,1,0,2680,14.4542,,C,1,0,0,3,0,0,1
882,883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22.0,0,0,7552,10.5167,,S,0,0,0,3,1,0,0


In [24]:
X_train = X_train.drop(columns = ["Name", "Sex", "Ticket", "Cabin", "Embarked"])
X_train.info()
X_train.head()

<class 'pandas.core.frame.DataFrame'>
Index: 596 entries, 6 to 102
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  596 non-null    int64  
 1   Survived     596 non-null    int64  
 2   Pclass       596 non-null    int64  
 3   Age          596 non-null    float64
 4   SibSp        596 non-null    int64  
 5   Parch        596 non-null    int64  
 6   Fare         596 non-null    float64
 7   male         596 non-null    int64  
 8   Pclass1      596 non-null    int64  
 9   Pclass2      596 non-null    int64  
 10  Pclass3      596 non-null    int64  
 11  Embarked_S   596 non-null    int64  
 12  Embarked_Q   596 non-null    int64  
 13  Embarked_C   596 non-null    int64  
dtypes: float64(2), int64(12)
memory usage: 69.8 KB


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,male,Pclass1,Pclass2,Pclass3,Embarked_S,Embarked_Q,Embarked_C
6,7,0,1,54.0,0,0,51.8625,1,1,0,0,1,0,0
718,719,0,3,29.525983,0,0,15.5,1,0,0,3,0,1,0
685,686,0,2,25.0,1,2,41.5792,1,0,2,0,0,0,1
73,74,0,3,26.0,1,0,14.4542,1,0,0,3,0,0,1
882,883,0,3,22.0,0,0,10.5167,0,0,0,3,1,0,0


In [25]:
# classification pb:

# - logistic regression
# - decision tree
# - random forest
# - support vector machine
# - K Nearest Neighbour
# - Naive Bayes

# Start by decision tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, X_test)

ValueError: Input contains NaN