<a href="https://colab.research.google.com/github/edcote/kaggle/blob/main/Titantic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab setup

In [None]:
!pip install kaggle black[jupyter] --quiet
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

# Kaggle setup

Go to the 'Account' tab of your [user profile](https://www.kaggle.com/settings/account) and select 'Create New Token'. This triggers the download of `kaggle.json`, a file containing your API credentials. Run the next cell and use its upload prompt to copy `kaggle.json` into the Colab runtime before continuing.

In [None]:
# The upload helper lives in the `files` module of google.colab.
from google.colab import files

# Opens a file picker for kaggle.json. The trailing semicolon suppresses the
# returned value so the uploaded credentials are not echoed into the cell
# output and saved with the notebook.
files.upload();

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c titanic --force --quiet
!unzip -qf titanic.zip -d /content

# Code formatting

In [None]:
# Run the black formatter on the copy of this notebook stored on the mounted Google Drive.
!black /content/gdrive/MyDrive/'Colab Notebooks'/Titantic_Machine_Learning_from_Disaster.ipynb

# Data cleaning

In [None]:
# Load the training split keyed by passenger id, then in one chained step:
#   - fill missing ages with the median age,
#   - fill missing embarkation ports with 'S',
#   - encode Sex as an integer (female -> 0, male -> 1).
train_data = pd.read_csv('/content/train.csv', index_col='PassengerId').assign(
    Age=lambda frame: frame['Age'].fillna(frame['Age'].median()),
    Embarked=lambda frame: frame['Embarked'].fillna('S'),
    Sex=lambda frame: frame['Sex'].map({'female': 0, 'male': 1}).astype(int),
)

# One boolean indicator column per embarkation port.
for port in ('S', 'C', 'Q'):
    train_data[f'Embarked_{port}'] = train_data['Embarked'] == port

# Bucket Age into 7 equal-width intervals (8 evenly spaced edges) spanning the
# observed range. Each label has the form "<lower edge>_<upper edge>".
age_bins = np.linspace(train_data['Age'].min(),
                       train_data['Age'].max(),
                       num=8,
                       endpoint=True)
age_labels = [f"{lower:.0f}_{upper:.0f}" for lower, upper in zip(age_bins[:-1], age_bins[1:])]
# include_lowest=True closes the first interval on the left. Without it the
# row(s) holding the minimum value fall outside every bin and become NaN.
train_data['Age Group'] = pd.cut(train_data['Age'],
                                 bins=age_bins,
                                 labels=age_labels,
                                 include_lowest=True)

# Same equal-width bucketing for Fare.
fare_bins = np.linspace(train_data['Fare'].min(),
                        train_data['Fare'].max(),
                        num=8,
                        endpoint=True)
fare_labels = [f"{lower:.0f}_{upper:.0f}" for lower, upper in zip(fare_bins[:-1], fare_bins[1:])]
# include_lowest=True keeps the minimum fare inside the first bin (see above).
train_data['Fare Group'] = pd.cut(train_data['Fare'],
                                  bins=fare_bins,
                                  labels=fare_labels,
                                  include_lowest=True)

def title(x):
  """Map an honorific string to its numeric title category.

  Categories: 1 for the "Mr" group, 2 for the "Miss" group, 3 for the "Mrs"
  group, 4 for "Master". Any value not listed in a group returns 5.
  """
  groups = (
      (1, ("Mr", "Rev", "Major", "Capt", "Col", "Sir", "Jonkheer", "Dr", "Don")),
      (2, ("Miss", "Ms", "Lady", "Mlle")),
      (3, ("Mrs", "Mme", "Countess")),
      (4, ("Master",)),
  )
  # First group containing x wins; 5 is the fallback for unlisted values.
  return next((code for code, names in groups if x in names), 5)

# Pull the honorific out of each name (the run of letters that follows a space
# and ends with a period) and encode it with title().
# The pattern is a raw string so "\." reaches the regex engine unchanged rather
# than being read as an invalid string escape, which newer Python versions warn about.
train_data['Title'] = train_data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False).map(title)

# Remove the raw columns: Name, Embarked, Age and Fare were encoded into derived
# columns earlier in this cell; Ticket and Cabin are discarded.
train_data = train_data.drop(['Ticket', 'Name', 'Cabin', 'Embarked', 'Age', 'Fare'], axis=1)
train_data.head()

# TODO: Continue following this tutorial ...
# https://www.kaggle.com/code/niklasdonges/end-to-end-project-with-python#Creating-Categories


# Explore data

In [None]:
# Survival flags of the rows whose Sex code is 0 (female in the encoding used above).
is_female = train_data["Sex"] == 0
women = train_data.loc[is_female, "Survived"]

# Share of survivors among those rows, as a percentage.
survivors_women = sum(women)
rate_women = 100 * survivors_women / len(women)
print(f"{rate_women:.2f}% of women survived")

In [None]:
# Survival flags of the rows whose Sex code is 1 (male in the encoding used above).
is_male = train_data["Sex"] == 1
men = train_data.loc[is_male, "Survived"]

# Share of survivors among those rows, as a percentage.
survivors_men = sum(men)
rate_men = 100 * survivors_men / len(men)
print(f"{rate_men:.2f}% of men survived")
