In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Alexisbcook's tutorial

In [2]:
# Load the Titanic training CSV into a DataFrame and preview its first rows.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

In [3]:
# Load the Titanic test CSV into a DataFrame and preview its first rows.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

In [4]:
# "Survived" values of the female passengers, selected with a single .loc call
# (row mask + column label) instead of chained indexing.
women = train_data.loc[train_data.Sex == 'female', "Survived"]
# Vectorised average of the column; same value as sum(women)/len(women).
# Note: this is a fraction in [0, 1], even though the printed label says "%".
rate_women = women.mean()

print("% of women who survived:", rate_women)

In [5]:
# "Survived" values of the male passengers, selected with a single .loc call
# (row mask + column label) instead of chained indexing.
men = train_data.loc[train_data.Sex == 'male', "Survived"]
# Vectorised average of the column; same value as sum(men)/len(men).
# Note: this is a fraction in [0, 1], even though the printed label says "%".
rate_men = men.mean()

print("% of men who survived:", rate_men)



In [6]:
from sklearn.ensemble import RandomForestClassifier

# Target column for the baseline model.
y = train_data["Survived"]

# Features used by the baseline; get_dummies one-hot encodes the string-valued ones.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
# The test set is encoded independently, so align it to the training columns:
# a category present in only one of the two sets would otherwise give the model
# a different column set/order at predict time. Columns missing in the test
# set are filled with 0; columns unknown to the model are dropped.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

# Fixed random_state keeps the forest reproducible across runs.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X, y)
predictions = model.predict(X_test)

# Write the predictions in the two-column submission format.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission_1.csv', index=False)
print("Your submission was successfully saved!")

In [7]:
from sklearn.model_selection import cross_validate
# 5-fold cross-validation of the baseline model on the training features;
# the returned dict is displayed as the cell output.
cv_result = cross_validate(model, X, y, cv=5)
cv_result

In [8]:
# Display the baseline model's predictions for the test set.
predictions

In [9]:
# Summarise the per-fold test scores as mean +/- standard deviation.
scores = cv_result["test_score"]
mean_accuracy, accuracy_spread = scores.mean(), scores.std()
print(f"The mean cross-validation accuracy is: {mean_accuracy:.3f} +/- {accuracy_spread:.3f}")

# My solution

Now that we've tried the example suggested by alexisbcook, let's try to find our own solution.
First of all, let's clean our dataset

In [10]:
# Transposed preview: one row per column name, which is easier to read for wide frames.
train_data.head().T

In [11]:
# Per-column count of missing entries in the training set, largest first.
nan_counts = train_data.isna().sum()
missing_values = (
    nan_counts
    .to_frame(name='Missing Values')
    .sort_values('Missing Values', ascending=False)
)
missing_values

Let's consider Name and PassengerId to be irrelevant columns. Since Cabin has a lot of NaN values, we ignore that column as well.

In [12]:
# Remove the columns judged irrelevant or too sparse; `train_data` is rebound to the result.
cols_to_drop = ['PassengerId', 'Cabin', 'Name']
train_data = train_data.drop(cols_to_drop, axis='columns')

Embarked has only two NaN values, so let's replace them with the most common value in the dataset.

In [13]:
# Frequency of each Embarked value, used to pick the fill value for the missing entries.
train_data.Embarked.value_counts().to_frame()

In [14]:
# Fill missing Embarked entries with the most frequent value, derived from the
# data instead of being hard-coded (the original literal was 'S').
# The result is assigned back to the column explicitly: calling
# `fillna(..., inplace=True)` on `train_data.Embarked` is chained assignment,
# which warns in recent pandas and does not modify `train_data` under Copy-on-Write.
most_common_port = train_data["Embarked"].mode()[0]
train_data["Embarked"] = train_data["Embarked"].fillna(most_common_port)

In [15]:
# Summary statistics of Age, inspected before choosing how to fill its missing values.
train_data.Age.describe()

For the NaN values in the Age column, we'll substitute the median age.

In [16]:
# Fill missing ages with the median age of the training set.
# The result is assigned back to the column explicitly: calling
# `fillna(..., inplace=True)` on `train_data.Age` is chained assignment,
# which warns in recent pandas and does not modify `train_data` under Copy-on-Write.
train_data["Age"] = train_data["Age"].fillna(train_data["Age"].median())

In [17]:
# Transposed preview of the training set after the cleaning steps so far.
train_data.head().T

In [18]:
# Recompute the per-column missing-value counts to check the imputation above.
nan_counts = train_data.isna().sum()
missing_values = (
    nan_counts
    .to_frame(name='Missing Values')
    .sort_values('Missing Values', ascending=False)
)
missing_values

In [19]:
#train_data.Age.round(1)

The Ticket number doesn't seem to carry useful information, considering the number of distinct tickets, so we drop it.

In [20]:
# Frequency of each Ticket value, to see how many distinct tickets there are.
train_data.Ticket.value_counts().to_frame()

In [21]:
# Drop the Ticket column; `train_data` is rebound, so re-running this cell alone
# raises a KeyError because the column is already gone.
train_data = train_data.drop(columns='Ticket')

In [22]:
# Separate the label column from the remaining feature columns.
target_name = "Survived"
data = train_data.drop(target_name, axis='columns')
target = train_data[target_name]

In [23]:
from sklearn.compose import make_column_selector as selector
# Build two selectors keyed on dtype: object-typed columns are treated as
# categorical, every other dtype as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

# Apply the selectors to the feature frame to get the two lists of column names.
numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

In [24]:
# Columns selected as numerical (non-object dtype).
numerical_columns

In [25]:
# Columns selected as categorical (object dtype).
categorical_columns

Sex and Embarked are both categorical features. Let's encode them as integers using OrdinalEncoder; note that they have no natural order, so the resulting integer codes are arbitrary.

In [26]:
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

# Categorical columns are mapped to integer codes; numerical columns are standardised.
# Both use default settings, so the encoder is not configured to tolerate
# categories at predict time that were absent at fit time.
categorical_preprocessor = OrdinalEncoder()
numerical_preprocessor = StandardScaler()

In [27]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its preprocessor. The first step is named after
# the transformer it actually wraps (an OrdinalEncoder, not a one-hot encoder),
# so the pipeline diagram and parameter names are not misleading.
preprocessor = ColumnTransformer([
    ('ordinal_encoder', categorical_preprocessor, categorical_columns),
    ('standard_scaler', numerical_preprocessor, numerical_columns)])

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline


# Chain preprocessing and the classifier into one estimator. This rebinds `model`,
# which earlier in the file referred to the bare baseline RandomForestClassifier.
model = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1))

In [29]:
from sklearn import set_config
# Switch scikit-learn's estimator display to diagram mode, then show the pipeline.
set_config(display='diagram')
model

In [30]:
from sklearn.model_selection import train_test_split

# Hold out part of the labelled data for evaluation; the split proportion is the
# library default and random_state fixes which rows land in each part.
data_train, data_test, target_train, target_test = train_test_split(
    data, target, random_state=42)

In [31]:
# Fit the whole pipeline on the training part; assigning to `_` keeps the
# fitted estimator from being echoed as the cell output.
_ = model.fit(data_train, target_train)

In [32]:
# Score the fitted pipeline on the held-out part of the labelled data.
model.score(data_test, target_test)

On the held-out split of the training data, the model reaches an accuracy of 81.6%.

In [33]:
# Reload the Kaggle test CSV (rebinding `test_data`, which was already loaded
# earlier in the file) and show a transposed preview.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head().T

In [34]:
# Keep the passenger ids for the submission file before removing the column.
test_passenger_id = test_data['PassengerId']
# Drop the same columns that were removed from the training set.
cols_to_drop_test = ['PassengerId', 'Cabin', 'Name', 'Ticket']
test_data = test_data.drop(cols_to_drop_test, axis='columns')

In [35]:
# Transposed preview of the test set after dropping the unused columns.
test_data.head().T

In [36]:
# Show the test-set columns whose names were selected as categorical on the training features.
test_data[categorical_columns]

In [37]:
# Per-column count of missing entries in the test set, largest first.
test_nan_counts = test_data.isna().sum()
test_missing_values = (
    test_nan_counts
    .to_frame(name='Missing Values')
    .sort_values('Missing Values', ascending=False)
)
test_missing_values

In [38]:
# Frequency of each Fare value in the test set.
test_data.Fare.value_counts().to_frame()

In [39]:
# Check the dtype of the Fare column before filling its missing values.
test_data.Fare.dtype

In [40]:
# Fill missing fares with the fixed value 7.75.
# NOTE(review): presumably chosen from the Fare value counts inspected earlier —
# confirm, and consider deriving it from the data instead of hard-coding it.
# The result is assigned back to the column explicitly: calling
# `fillna(..., inplace=True)` on `test_data.Fare` is chained assignment,
# which warns in recent pandas and does not modify `test_data` under Copy-on-Write.
test_data["Fare"] = test_data["Fare"].fillna(7.75)

In [41]:
# Fill missing ages with the median age of the test set.
# The result is assigned back to the column explicitly: calling
# `fillna(..., inplace=True)` on `test_data.Age` is chained assignment,
# which warns in recent pandas and does not modify `test_data` under Copy-on-Write.
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].median())

In [42]:
# Recompute the test-set missing-value counts to check the imputation above.
test_nan_counts = test_data.isna().sum()
test_missing_values = (
    test_nan_counts
    .to_frame(name='Missing Values')
    .sort_values('Missing Values', ascending=False)
)
test_missing_values

In [43]:
# Predict on the cleaned test set with the fitted pipeline (preprocessing is part of
# the pipeline). This rebinds `predictions`, which earlier held the baseline model's output.
predictions = model.predict(test_data)

In [44]:
# Pair the saved passenger ids with the pipeline's predictions and write the submission file.
output = pd.DataFrame({'PassengerId': test_passenger_id}).assign(Survived=predictions)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")