<a href="https://colab.research.google.com/github/edcote/kaggle/blob/main/Titantic_Machine_Learning_from_Disaster.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Colab setup

In [None]:
!pip install --quiet kaggle black[jupyter]
import pandas as pd
import numpy as np
from google.colab import drive

In [None]:
# Mount Google Drive under /content/gdrive so notebook files stored on Drive are
# reachable from shell cells. force_remount=True requests a fresh mount even if
# one is already present.
drive.mount("/content/gdrive", force_remount=True)

# Kaggle setup

Go to the 'Account' tab of your [user profile](https://www.kaggle.com/settings/account) and select 'Create New Token'. This will trigger the download of `kaggle.json`, a file containing your API credentials. Upload the file to local storage before continuing.

In [None]:
from google.colab import files

# Opens Colab's file picker; select the kaggle.json token downloaded from Kaggle.
# The return value maps each filename to the uploaded bytes, so it is captured
# instead of being left as the cell's last expression -- echoing it would write
# the API credentials into the notebook output.
uploaded = files.upload()

# Show only the names of the uploaded files, never their contents.
list(uploaded)

In [None]:
# Place the uploaded API token at ~/.kaggle/kaggle.json for the kaggle CLI.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
# Download the Titanic competition archive (--force re-downloads an existing
# copy) and unpack it into /content, overwriting earlier extractions (-o).
!kaggle competitions download -c titanic --force --quiet
!unzip -o titanic.zip -d /content

# Code formatting

In [None]:
# Reformat the copy of this notebook stored on the mounted Drive with black.
# Requires the Drive mount and the black[jupyter] install from the setup cells.
!black /content/gdrive/MyDrive/'Colab Notebooks'/Titantic_Machine_Learning_from_Disaster.ipynb

# Data analysis

In [None]:
# Read the training split extracted to /content and key the rows by PassengerId.
train_df = pd.read_csv("/content/train.csv").set_index("PassengerId")

## Does the dataset contain any missing values?

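A: Yes. Age and Cabin have many missing values, and Embarked has a few (it is imputed in the data-cleaning section below).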

In [None]:
# info() prints each column's non-null count and dtype; a column whose non-null
# count is below the total row count contains missing values.
train_df.info()

## How many passengers survived?

In [None]:
# Count the rows on each side of the outcome flag.
survived = train_df.loc[train_df["Survived"] == 1, "Survived"].value_counts()  # 342
perished = train_df.loc[train_df["Survived"] == 0, "Survived"].value_counts()  # 549

# One row per outcome, then a stacked bar chart of the two totals.
survival_df = pd.DataFrame(
    [survived, perished],
    index=["Survived", "Perished"],
)
survival_df.plot.bar(stacked=True)

## Is the likelihood of survival dependent on gender?

In [None]:
# A notebook cell only renders its final expression, so the two summary tables
# are passed to display() (available by default in IPython/Colab kernels)
# instead of being left as bare expressions whose results would be discarded.

# Passenger count based on gender
display(train_df["Sex"].value_counts())  # 577 male, 314 female
# Survival ratio based on gender
display(train_df.groupby("Sex").Survived.mean())  # 18.8% male, 74.2% female

# Gender breakdown within each outcome.
survived = train_df[train_df["Survived"] == 1].Sex.value_counts()
perished = train_df[train_df["Survived"] == 0].Sex.value_counts()

# One row per outcome, one column per gender, drawn as stacked bars.
survival_df = pd.DataFrame([survived, perished], index=["Survived", "Perished"])
survival_df.plot(kind="bar", stacked=True)

## Does passenger class correlate with the probability of survival?

In [None]:
# A notebook cell only renders its final expression, so the two summary tables
# are passed to display() (available by default in IPython/Colab kernels)
# instead of being left as bare expressions whose results would be discarded.

# Passenger count as a function of class (rows: Survived flag, columns: Pclass)
display(
    pd.pivot_table(
        train_df, index="Survived", columns="Pclass", values="Ticket", aggfunc="count"
    )
)
# Survival ratio as a function of class
display(
    train_df.groupby(["Pclass"]).Survived.mean()
)  # 63% 1st class, 47% 2nd class, 24% 3rd class

# Class breakdown within each outcome.
survived = train_df[train_df["Survived"] == 1].Pclass.value_counts()
perished = train_df[train_df["Survived"] == 0].Pclass.value_counts()

# One row per outcome, one column per class, drawn as stacked bars.
survival_df = pd.DataFrame([survived, perished], index=["Survived", "Perished"])
survival_df.plot(kind="bar", stacked=True)

More info: https://anelmusic13.medium.com/how-to-score-top-3-in-kaggles-titanic-machine-learning-from-disaster-competition-13d056e262b1

# Data cleaning

In [None]:
# Impute missing ages with the column median.
train_df["Age"] = train_df["Age"].fillna(train_df["Age"].median())

# Fares: fill gaps with the median, then overwrite zero fares with the median of
# the already-filled column.
train_df["Fare"] = (
    train_df["Fare"]
    .fillna(train_df["Fare"].median())
    .pipe(lambda fares: fares.replace(0, fares.median()))
)

# Embarkation port: default missing entries to "S", then encode as integers.
train_df["Embarked"] = train_df["Embarked"].fillna("S").map({"S": 0, "C": 1, "Q": 2})

# Binary-encode gender.
train_df["Sex"] = train_df["Sex"].map({"female": 0, "male": 1}).astype(int)

# Make sure the target is an integer column with no gaps.
train_df["Survived"] = train_df["Survived"].fillna(0).astype(int)

# Bucket Age into 7 equal-width bins spanning its observed range; each label is
# "<lower edge>_<upper edge>" rounded to whole numbers.
age_bins = np.linspace(train_df["Age"].min(), train_df["Age"].max(), 8)
age_labels = [f"{lo:.0f}_{hi:.0f}" for lo, hi in zip(age_bins[:-1], age_bins[1:])]
train_df["Age Group"] = pd.cut(
    train_df["Age"], age_bins, labels=age_labels, include_lowest=True
)

# Same equal-width bucketing for Fare.
fare_bins = np.linspace(train_df["Fare"].min(), train_df["Fare"].max(), 8)
fare_labels = [f"{lo:.0f}_{hi:.0f}" for lo, hi in zip(fare_bins[:-1], fare_bins[1:])]
train_df["Fare Group"] = pd.cut(
    train_df["Fare"], fare_bins, labels=fare_labels, include_lowest=True
)


def title(x):
    """Map an honorific string to an integer category code.

    Codes:
        1 -- Mr, Rev, Major, Capt, Col, Sir, Jonkheer, Dr, Don
        2 -- Miss, Ms, Lady, Mlle
        3 -- Mrs, Mme, Countess
        4 -- Master
        5 -- anything else (including values that are not recognised titles)

    Args:
        x: The honorific to classify.

    Returns:
        The integer category code for ``x``.
    """
    groups = (
        (1, ("Mr", "Rev", "Major", "Capt", "Col", "Sir", "Jonkheer", "Dr", "Don")),
        (2, ("Miss", "Ms", "Lady", "Mlle")),
        (3, ("Mrs", "Mme", "Countess")),
        (4, ("Master",)),
    )
    # First group containing x wins; fall back to the catch-all code.
    return next((code for code, members in groups if x in members), 5)


# Extract the honorific from each name -- the run of letters that follows a
# space and ends at a period -- and encode it with title(). The pattern is a raw
# string so that "\." reaches the regex engine as an escaped dot instead of
# being an invalid Python string escape (which emits a warning on recent
# Python versions). The pattern text itself is unchanged.
train_df["Title"] = (
    train_df["Name"].str.extract(r" ([A-Za-z]+)\.", expand=False).map(title)
)

# Drop the raw columns: Age, Fare and Name have been replaced by the derived
# "Age Group", "Fare Group" and "Title" columns; Ticket and Cabin are not used.
train_df = train_df.drop(["Ticket", "Name", "Cabin", "Age", "Fare"], axis=1)

train_df.head()

In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split

# Build the feature matrix from every cleaned column except the target.
# drop() returns a new frame, so train_df itself is left untouched.
features = train_df.drop(columns="Survived")
# The binned "Age Group" / "Fare Group" columns are pandas categoricals with
# string labels; replace them with their ordinal integer codes so the
# classifier receives purely numeric input.
for col in features.select_dtypes(include="category").columns:
    features[col] = features[col].cat.codes
target = train_df["Survived"]

# Hold out 20% of the labelled rows for evaluation. Stratifying keeps the
# survived/perished ratio the same in both parts, and the fixed seed makes the
# split reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42, stratify=target
)

# SGD shuffles the data between epochs, so seed it for repeatable results.
sgd = linear_model.SGDClassifier(max_iter=5, tol=None, random_state=42)
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_test)

# Accuracy on the training rows (optimistic); use sgd.score(x_test, y_test)
# for the held-out estimate.
sgd.score(x_train, y_train)