In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn import tree

In [None]:
# Load gender_submission.csv, then show its schema and first rows.
gender = pd.read_csv(filepath_or_buffer="../data/raw/titanic/gender_submission.csv")
gender.info()
gender.head()

In [None]:
# Load the labelled training data, then show its schema and first rows.
train = pd.read_csv(filepath_or_buffer="../data/raw/titanic/train.csv")
train.info()
train.head()

In [None]:
# Dataset Kaggle uses to evaluate model performance; it is the same for every participant.
test = pd.read_csv(filepath_or_buffer="../data/raw/titanic/test.csv")
test.info()
test.head()

In [None]:
# Distinct values found in the "Sex" column of the training data.
train["Sex"].unique()

In [None]:
# Distinct values found in the "Pclass" column of the training data.
train["Pclass"].unique()

In [None]:
# Distinct values found in the "Embarked" column of the training data.
train["Embarked"].unique()

In [None]:
# Histogram of "Sex", coloured by "Survived", with the bars grouped side by side.
fig = px.histogram(
    train,
    x="Sex",
    color="Survived",
    barmode="group",
    title="Proportions of men and women who survived and died",
)
fig.show()

In [None]:
# Same "Sex" / "Survived" histogram, with one facet column per "Pclass" value.
fig = px.histogram(
    train,
    x="Sex",
    color="Survived",
    barmode="group",
    facet_col="Pclass",
    title="Proportions of men and women who survived and died according to their social class",
)
fig.show()

In [None]:
# Histogram of "Embarked", coloured by "Survived", with one facet column per "Pclass" value.
fig = px.histogram(
    train,
    x="Embarked",
    color="Survived",
    barmode="group",
    facet_col="Pclass",
    title="Proportions of people who survived and died according to the embarked place and their social class",
)
fig.show()

In [None]:
# Histogram of "Embarked", coloured by "Sex", with one facet column per "Pclass" value.
fig = px.histogram(
    train,
    x="Embarked",
    color="Sex",
    barmode="group",
    facet_col="Pclass",
    title="Repartition of men and women in different embarked places according to their social class",
)
fig.show()

In [None]:
# Split the labelled training data so the model's performance can be evaluated locally.
# Good practice: split as early as possible, so that no information from the held-out rows
# leaks into the preprocessing of the training rows.
# e.g. when filling missing "Age" values with the column mean: compute the mean on the
# training split only and reuse that value on the held-out split.
# e.g. "Embarked": missing values end up encoded as 0 => check the impact on the dataset.
X_train, X_test = train_test_split(train, test_size=0.33, random_state=42)

In [None]:
# Display the training split produced by train_test_split.
X_train

In [None]:
# Display the held-out split produced by train_test_split.
X_test

In [None]:
def apply_male_int(sex):
    """Return 1 when ``sex`` equals the string "male", otherwise 0."""
    return 1 if sex == "male" else 0

In [None]:
# Add a "male" column: apply_male_int maps "male" to 1 and any other value to 0.
X_train["male"] = X_train["Sex"].apply(apply_male_int)
X_train.info()
X_train.head()

In [None]:
def apply_Pclass_int(Pclass):
    """First-class indicator: return ``Pclass`` unchanged when it equals 1, otherwise 0."""
    return 0 if Pclass != 1 else Pclass

In [None]:
# Add a "Pclass1" column: apply_Pclass_int keeps 1 for first class and yields 0 otherwise.
X_train["Pclass1"] = X_train["Pclass"].apply(apply_Pclass_int)
X_train.info()
X_train.head()

In [None]:
def apply_Pclass(Pclass):
    """Second-class indicator: return 1 when ``Pclass`` equals 2, otherwise 0."""
    if Pclass != 2:
        return 0
    return 1

In [None]:
# Add a "Pclass2" column: apply_Pclass yields 1 for second class and 0 otherwise.
X_train["Pclass2"] = X_train["Pclass"].apply(apply_Pclass)
X_train.info()
X_train.head()

In [None]:
def apply_Pclass_last(Pclass):
    """Third-class indicator: return 1 when ``Pclass`` equals 3, otherwise 0."""
    return 0 if Pclass != 3 else 1

In [None]:
# Add a "Pclass3" column: apply_Pclass_last yields 1 for third class and 0 otherwise.
X_train["Pclass3"] = X_train["Pclass"].apply(apply_Pclass_last)
X_train.info()
X_train.head()

In [None]:
# Inspect the "Age" values before replacing the NaN entries with the column mean.
X_train["Age"].unique()

In [None]:
# Mean age computed on the training split only; used as the fill value for missing ages.
mean_age = X_train["Age"].mean()
mean_age

In [None]:
# Replace missing ages in the training split with mean_age.
# The result is assigned back to the column rather than calling fillna(inplace=True) on
# X_train["Age"]: the in-place form operates on an intermediate column selection, which
# emits a FutureWarning in pandas 2.x and leaves X_train unchanged under Copy-on-Write.
X_train["Age"] = X_train["Age"].fillna(mean_age)
X_train.info()
X_train.head()

In [None]:
def apply_embarked_S(embarked):
    """Return 1 when ``embarked`` equals "S", otherwise 0 (missing values included)."""
    if embarked != "S":
        return 0
    return 1

In [None]:
# Add an "Embarked_S" column: apply_embarked_S yields 1 for "S" and 0 for anything else.
X_train["Embarked_S"] = X_train["Embarked"].apply(apply_embarked_S)
X_train.info()
X_train.head()

In [None]:
def apply_embarked_Q(embarked):
    """Return 1 when ``embarked`` equals "Q", otherwise 0 (missing values included)."""
    return 0 if embarked != "Q" else 1

In [None]:
# Add an "Embarked_Q" column: apply_embarked_Q yields 1 for "Q" and 0 for anything else.
X_train["Embarked_Q"] = X_train["Embarked"].apply(apply_embarked_Q)
X_train.info()
X_train.head()

In [None]:
def apply_embarked_C(embarked):
    """Return 1 when ``embarked`` equals "C", otherwise 0 (missing values included)."""
    if embarked != "C":
        return 0
    return 1

In [None]:
# Add an "Embarked_C" column: apply_embarked_C yields 1 for "C" and 0 for anything else.
X_train["Embarked_C"] = X_train["Embarked"].apply(apply_embarked_C)
X_train.info()
X_train.head()

In [None]:
# Remove the text columns from the training split.
X_train = X_train.drop(["Name", "Sex", "Ticket", "Cabin", "Embarked"], axis=1)
X_train.info()
X_train.head()

In [None]:
# Apply the same transformations to the held-out split, starting with the "male" column
# (apply_male_int maps "male" to 1 and any other value to 0).
X_test["male"] = X_test["Sex"].apply(apply_male_int)
X_test.info()
X_test.head()

In [None]:
# Add a "Pclass1" column: apply_Pclass_int keeps 1 for first class and yields 0 otherwise.
X_test["Pclass1"] = X_test["Pclass"].apply(apply_Pclass_int)
X_test.info()
X_test.head()

In [None]:
# Add a "Pclass2" column: apply_Pclass yields 1 for second class and 0 otherwise.
X_test["Pclass2"] = X_test["Pclass"].apply(apply_Pclass)
X_test.info()
X_test.head()

In [None]:
# Add a "Pclass3" column: apply_Pclass_last yields 1 for third class and 0 otherwise.
X_test["Pclass3"] = X_test["Pclass"].apply(apply_Pclass_last)
X_test.info()
X_test.head()

In [None]:
# Replace missing ages in the held-out split with mean_age, the mean computed on X_train,
# so that no statistic from the held-out rows is used for imputation.
# The result is assigned back to the column rather than calling fillna(inplace=True) on
# X_test["Age"]: the in-place form operates on an intermediate column selection, which
# emits a FutureWarning in pandas 2.x and leaves X_test unchanged under Copy-on-Write.
X_test["Age"] = X_test["Age"].fillna(mean_age)
X_test.info()
X_test.head()

In [None]:
# Add an "Embarked_S" column: apply_embarked_S yields 1 for "S" and 0 for anything else.
X_test["Embarked_S"] = X_test["Embarked"].apply(apply_embarked_S)
X_test.info()
X_test.head()

In [None]:
# Add an "Embarked_Q" column: apply_embarked_Q yields 1 for "Q" and 0 for anything else.
X_test["Embarked_Q"] = X_test["Embarked"].apply(apply_embarked_Q)
X_test.info()
X_test.head()

In [None]:
# Add an "Embarked_C" column: apply_embarked_C yields 1 for "C" and 0 for anything else.
X_test["Embarked_C"] = X_test["Embarked"].apply(apply_embarked_C)
X_test.info()
X_test.head()

In [None]:
# Remove the text columns from the held-out split, as was done for the training split.
X_test = X_test.drop(["Name", "Sex", "Ticket", "Cabin", "Embarked"], axis=1)
X_test.info()
X_test.head()

In [None]:
# Classification problem; candidate models:
#
# - logistic regression
# - decision tree
# - random forest
# - support vector machine
# - K Nearest Neighbours
# - Naive Bayes

# Start with a decision tree.
# X_train still contains the "Survived" label, so split it into a feature matrix and a
# target vector. fit() expects (features, target); passing X_test as the second argument
# would use the held-out feature frame as the target.
# NOTE(review): "PassengerId" is still among the features; presumably it is only a row
# identifier, so consider dropping it as well.
X_train_features = X_train.drop(columns=["Survived"])
y_train = X_train["Survived"]

# A fixed random_state makes the fitted tree reproducible across runs.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train_features, y_train)