## Imports

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from xgboost import XGBClassifier
import sklearn

# Remove the column limit so DataFrames are displayed with every column.
pd.set_option('display.max_columns', None)

## Functions

In [None]:
def get_clean_cabin_nums(cabin_nums):
    """Normalise one passenger's raw cabin-number value into a list of numbers.

    Args:
        cabin_nums: Either a list of digit strings (the result of splitting a
            cabin string whose letters were removed), a missing value
            (``None``, ``NaN``, ``pd.NA``), or a single numeric scalar.

    Returns:
        A list of cabin numbers. List input is converted to ``int`` with empty
        strings dropped; a missing value gives ``[]``; a numeric scalar is
        returned wrapped in a one-element list.

    Raises:
        TypeError: If the value is none of the supported kinds. Previously
            such values silently produced ``None``.
    """
    if isinstance(cabin_nums, list):
        # Letter-only cabins (e.g. 'F') leave empty strings behind once the
        # letters are stripped; skip them before converting to int.
        return [int(c) for c in cabin_nums if len(c) > 0]

    # Treat every flavour of missing scalar the same way, not only float NaN.
    if pd.api.types.is_scalar(cabin_nums) and pd.isna(cabin_nums):
        return []

    if isinstance(cabin_nums, (int, float, np.number)):
        return [cabin_nums]

    raise TypeError(
        f'Unsupported cabin number value: {cabin_nums!r} '
        f'({type(cabin_nums).__name__})'
    )
                
def get_cabin_nums(df):
    """Extract the cabin numbers of every row in ``df['Cabin']``.

    ASCII letters are removed from each cabin string, the remainder is
    stripped and split on single spaces, and every resulting value is passed
    through ``get_clean_cabin_nums``.

    Returns:
        A Series holding, per row, the value returned by
        ``get_clean_cabin_nums``.
    """
    without_letters = df['Cabin'].str.replace('[a-zA-Z]', '', regex=True)
    number_tokens = without_letters.str.strip().str.split(' ')
    return number_tokens.apply(get_clean_cabin_nums)

def is_any_cabin_within_range(cabin_nums, low_exc, high_inc):
    """Tell whether any cabin number lies in the interval (low_exc, high_inc].

    Args:
        cabin_nums: Sequence of cabin numbers (may be empty).
        low_exc: Lower bound, exclusive.
        high_inc: Upper bound, inclusive.

    Returns:
        The result of ``np.any`` over the per-number range test; an empty
        sequence gives a false result.
    """
    numbers = np.array(cabin_nums)
    above_low = numbers > low_exc
    up_to_high = numbers <= high_inc
    return np.any(above_low & up_to_high)

def get_has_cabin_number_in_range_onehot(df, bins):
    """Build one boolean column per consecutive pair of bin edges.

    For each adjacent pair ``(low, high)`` in ``bins`` the column
    ``has_cabin_number_between_{low + 1}_{high}`` marks the rows that have at
    least one cabin number in the interval ``(low, high]``.

    Args:
        df: DataFrame with a ``Cabin`` column.
        bins: Ordered sequence of bin edges.

    Returns:
        A DataFrame with ``len(bins) - 1`` columns, one per range.
    """
    per_row_numbers = get_cabin_nums(df)

    def rows_in_range(lower, upper):
        # Flag for every row: does any of its cabin numbers fall in (lower, upper]?
        return per_row_numbers.apply(
            lambda nums: is_any_cabin_within_range(nums, lower, upper)
        )

    range_columns = {
        f'has_cabin_number_between_{lower + 1}_{upper}': rows_in_range(lower, upper)
        for lower, upper in zip(bins[:-1], bins[1:])
    }

    return pd.DataFrame.from_dict(range_columns)

def get_cabin_letter_onehot(df):
    """Build one boolean column per cabin letter.

    For each letter in ``'ABCDEFGT'`` the column ``is_cabin_letter_<letter>``
    is True where the row's ``Cabin`` string contains that letter. Missing
    cabins are treated as empty strings, so they are False in every column.

    Returns:
        A DataFrame with eight boolean columns, in the letter order above.
    """
    cabin_text = df['Cabin'].fillna('')
    letter_flags = {
        'is_cabin_letter_' + letter: cabin_text.str.contains(letter)
        for letter in 'ABCDEFGT'
    }
    return pd.DataFrame.from_dict(letter_flags)

def get_input_data(df, mean_age):
    """Assemble the model feature matrix from a raw passenger DataFrame.

    Args:
        df: DataFrame with the columns ``SibSp``, ``Parch``, ``Fare``,
            ``Age``, ``Sex``, ``Pclass``, ``Embarked`` and ``Cabin``.
        mean_age: Value used to fill missing ``Age`` entries.

    Returns:
        A DataFrame made of, in order: the pass-through columns, the filled
        age, the male flag, one-hot ``Pclass`` and ``Embarked``, the cabin
        presence flag, the cabin count, the cabin-letter flags and the
        cabin-number range flags (ranges of width 10 covering 1 to 150).
    """
    cabin = df['Cabin']

    # Cabin-derived scalar features: presence flag and the number of
    # space-separated cabin entries (0 when the cabin is missing).
    has_cabin = ~cabin.isna()
    num_of_cabins = cabin.str.split(' ').str.len().fillna(0)

    feature_parts = [
        # Columns used as-is.
        df[['SibSp', 'Parch', 'Fare']],
        # WARNING: ASSUMPTION - missing ages are replaced by the supplied mean.
        df['Age'].fillna(mean_age).rename('clean_age'),
        (df['Sex'] == 'male').rename('is_male'),
        pd.get_dummies(df['Pclass'], prefix='Pclass'),
        pd.get_dummies(df['Embarked'], prefix='Embarked'),
        has_cabin.rename('has_cabin'),
        num_of_cabins.rename('num_of_cabins'),
        get_cabin_letter_onehot(df),
        get_has_cabin_number_in_range_onehot(df, np.arange(0, 151, 10)),
    ]

    return pd.concat(feature_parts, axis=1)

## Prepare Data

In [None]:
# Load the Titanic training and test sets from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [None]:
# The mean age is computed from the training set only and reused to fill
# missing ages in both the training and the test features.
mean_age = df['Age'].mean()

input_df = get_input_data(df, mean_age)
input_df_test = get_input_data(df_test, mean_age)

In [None]:
# Inspect the training features: first rows, per-column means, summary statistics.
input_df.head()

In [None]:
input_df.mean()

In [None]:
input_df.describe()

In [None]:
# Same inspection for the test features.
input_df_test.head()

In [None]:
input_df_test.mean()

In [None]:
input_df_test.describe()

## Train Model

In [None]:
# Small model: 2 boosted trees of maximum depth 2, learning rate 1,
# binary logistic objective.
bst = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
# Fit on the engineered training features against the 'Survived' column.
bst.fit(input_df, df['Survived'])

## Evaluate Model

In [None]:
y_true = df['Survived']
y_score = bst.predict(input_df)

In [None]:
sklearn.metrics.accuracy_score(y_true, y_score)

In [None]:
sklearn.metrics.average_precision_score(y_true, y_score)

In [None]:
sklearn.metrics.precision_score(y_true, y_score > 0.5)

In [None]:
sklearn.metrics.recall_score(y_true, y_score > 0.5)

## Test

In [None]:
# Predict class labels for the test features.
pred_test = bst.predict(input_df_test)

In [None]:
# Pair each test PassengerId with its predicted label in a 'Survived' column.
# concat(axis=1) aligns on the row index; this assumes df_test still has the
# default 0..n-1 index from read_csv, matching the fresh Series index.
submission_df = pd.concat((df_test['PassengerId'], pd.Series(pred_test).rename('Survived')), axis=1)

In [None]:
submission_df.describe()

In [None]:
# Write the submission file without the index column.
submission_df.to_csv('submission.csv', index=False)

## Appendix - EDA

In [None]:
# First rows of the raw training data.
df.head()

In [None]:
# Fraction of passengers with a recorded cabin.
(~df['Cabin'].isna()).mean()

In [None]:
df['Sex'].unique()

In [None]:
df['Pclass'].unique()

In [None]:
pd.get_dummies(df['Pclass'], prefix='Pclass').head()

In [None]:
df['SibSp'].describe()

In [None]:
df['Parch'].describe()

In [None]:
df['Fare'].describe()

In [None]:
df['Cabin'].unique()

In [None]:
# Number of space-separated cabin entries per passenger (NaN when Cabin is missing).
num_of_cabins = df['Cabin'].str.split(' ').str.len()

In [None]:
num_of_cabins.describe()

In [None]:
# Same statistics, counting a missing cabin as zero cabins.
num_of_cabins.fillna(0).describe()

In [None]:
# Passengers with no cabin recorded.
_df = df[df['Cabin'].isna()]

In [None]:
_df.head()

In [None]:
df['Cabin'].str.replace('\d', '', regex=True).unique()

In [None]:
df_test['Cabin'].str.replace('\d', '', regex=True).unique()

In [None]:
# Sanity check of the "greater than low AND at most high" mask on a small list.
a = [1, 3, 5]
np.any((np.array(a) <= 1) & (np.array(a) > 0))

In [None]:
# Distribution of all cabin numbers in the training set; rows without any
# cabin number become NaN after explode() and are shown as -1.
get_cabin_nums(df).explode().fillna(-1).describe()

In [None]:
# Same distribution for the test set.
get_cabin_nums(df_test).explode().fillna(-1).describe()

In [None]:
pd.get_dummies(df['Embarked'], prefix='Embarked').head()

In [None]:
# Missing-value count per column (training set).
df.isna().sum()

In [None]:
df[df['Age'].isna()].head()

In [None]:
# Missing-value count per column (test set).
df_test.isna().sum()

In [None]:
df_test[df_test['Age'].isna()].head()

In [None]:
# Passengers recorded with a fare of zero.
df[df['Fare'] == 0].head()