## Imports

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from xgboost import XGBClassifier
import sklearn

pd.set_option('display.max_columns', None)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


## Functions

In [2]:
def get_clean_cabin_nums(cabin_nums):
    if isinstance(cabin_nums, float):
        if pd.isna(cabin_nums):
            return []
        else:
            return [cabin_nums]
    elif isinstance(cabin_nums, list):
        if len(cabin_nums) == 0:
            return []
        else:
            return [int(c) for c in cabin_nums if len(c) > 0]
                
def get_cabin_nums(df):
    cabin_nums_list = (
        df['Cabin']
        .str.replace('[a-zA-Z]', '', regex=True)
        .str.strip()
        .str.split(' ')
    )
    
    return cabin_nums_list.apply(get_clean_cabin_nums)

def is_any_cabin_within_range(cabin_nums, low_exc, high_inc):
    cabin_nums = np.array(cabin_nums)
    
    return np.any(
        (cabin_nums > low_exc)
        & (cabin_nums <= high_inc)
    )

def get_has_cabin_number_in_range_onehot(df, bins):
    cabin_nums_list = get_cabin_nums(df)
    
    cabin_number_range_map = {}
    
    for i in range(len(bins) - 1):
        low_exc = bins[i]
        high_inc = bins[i + 1]
        
        cabin_number_range_map[f'has_cabin_number_between_{low_exc + 1}_{high_inc}'] = (
            cabin_nums_list.apply(
                lambda cabin_nums : is_any_cabin_within_range(cabin_nums, low_exc, high_inc)
            )
        )

    return pd.DataFrame.from_dict(cabin_number_range_map)

def get_cabin_letter_onehot(df):
    cabin_letters = {}
    
    for cabin_letter in 'ABCDEFGT':
        cabin_letters[f'is_cabin_letter_' + cabin_letter] = df['Cabin'].fillna('').str.contains(cabin_letter)

    return pd.DataFrame.from_dict(cabin_letters)

def get_input_data(df, mean_age):
    clean_cols = [
        'SibSp',
        'Parch',
        'Fare'
    ]

    # WARNING: ASSUMPTION
    clean_age = df['Age'].fillna(mean_age)
    
    is_male = df['Sex'] == 'male'
    pclass_onehot = pd.get_dummies(df['Pclass'], prefix='Pclass')
    embarked_onehot = pd.get_dummies(df['Embarked'], prefix='Embarked')
    has_cabin = ~df['Cabin'].isna()
    
    num_of_cabins = (
        df['Cabin'].str.split(' ')
        .str.len()
        .fillna(0)
    )
    
    cabin_letters_onehot = get_cabin_letter_onehot(df)
    
    has_cabin_number_in_range_onehot = get_has_cabin_number_in_range_onehot(df, np.arange(0, 151, 10))

    return pd.concat(
        (
            df[clean_cols],
            clean_age.rename('clean_age'),
            is_male.rename('is_male'),
            pclass_onehot,
            # embarked_onehot,
            # has_cabin.rename('has_cabin'),
            # num_of_cabins.rename('num_of_cabins'),
            cabin_letters_onehot,
            # has_cabin_number_in_range_onehot,
        ),
        axis=1
    )

## Prepare Data

In [3]:
df_original = pd.read_csv('/kaggle/input/titanic/train.csv')
df, df_val = sklearn.model_selection.train_test_split(df_original, test_size=0.1, stratify=df_original['Survived'])

df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [4]:
df['Survived'].mean()

0.383270911360799

In [5]:
df_val['Survived'].mean()

0.3888888888888889

In [6]:
mean_age = df['Age'].mean()

input_df = get_input_data(df, mean_age)
input_df_val = get_input_data(df_val, mean_age)
input_df_test = get_input_data(df_test, mean_age)

In [7]:
input_df.head()

Unnamed: 0,SibSp,Parch,Fare,clean_age,is_male,Pclass_1,Pclass_2,Pclass_3,is_cabin_letter_A,is_cabin_letter_B,is_cabin_letter_C,is_cabin_letter_D,is_cabin_letter_E,is_cabin_letter_F,is_cabin_letter_G,is_cabin_letter_T
256,0,0,79.2,29.371336,False,True,False,False,False,False,False,False,False,False,False,False
48,2,0,21.6792,29.371336,True,False,False,True,False,False,False,False,False,False,False,False
678,1,6,46.9,43.0,False,False,False,True,False,False,False,False,False,False,False,False
792,8,2,69.55,29.371336,False,False,False,True,False,False,False,False,False,False,False,False
172,1,1,11.1333,1.0,False,False,False,True,False,False,False,False,False,False,False,False


In [8]:
input_df.mean()

SibSp                 0.523096
Parch                 0.388265
Fare                 32.197762
clean_age            29.371336
is_male               0.650437
Pclass_1              0.242197
Pclass_2              0.204744
Pclass_3              0.553059
is_cabin_letter_A     0.018727
is_cabin_letter_B     0.052434
is_cabin_letter_C     0.064919
is_cabin_letter_D     0.037453
is_cabin_letter_E     0.034956
is_cabin_letter_F     0.014981
is_cabin_letter_G     0.007491
is_cabin_letter_T     0.001248
dtype: float64

In [9]:
input_df.describe()

Unnamed: 0,SibSp,Parch,Fare,clean_age
count,801.0,801.0,801.0,801.0
mean,0.523096,0.388265,32.197762,29.371336
std,1.096485,0.818726,48.571969,12.79135
min,0.0,0.0,0.0,0.42
25%,0.0,0.0,7.925,22.0
50%,0.0,0.0,14.4583,29.371336
75%,1.0,0.0,31.275,35.0
max,8.0,6.0,512.3292,80.0


In [10]:
input_df_test.head()

Unnamed: 0,SibSp,Parch,Fare,clean_age,is_male,Pclass_1,Pclass_2,Pclass_3,is_cabin_letter_A,is_cabin_letter_B,is_cabin_letter_C,is_cabin_letter_D,is_cabin_letter_E,is_cabin_letter_F,is_cabin_letter_G,is_cabin_letter_T
0,0,0,7.8292,34.5,True,False,False,True,False,False,False,False,False,False,False,False
1,1,0,7.0,47.0,False,False,False,True,False,False,False,False,False,False,False,False
2,0,0,9.6875,62.0,True,False,True,False,False,False,False,False,False,False,False,False
3,0,0,8.6625,27.0,True,False,False,True,False,False,False,False,False,False,False,False
4,1,1,12.2875,22.0,False,False,False,True,False,False,False,False,False,False,False,False


In [11]:
input_df_test.mean()

SibSp                 0.447368
Parch                 0.392344
Fare                 35.627188
clean_age            30.087165
is_male               0.636364
Pclass_1              0.255981
Pclass_2              0.222488
Pclass_3              0.521531
is_cabin_letter_A     0.016746
is_cabin_letter_B     0.043062
is_cabin_letter_C     0.083732
is_cabin_letter_D     0.031100
is_cabin_letter_E     0.026316
is_cabin_letter_F     0.019139
is_cabin_letter_G     0.004785
is_cabin_letter_T     0.000000
dtype: float64

In [12]:
input_df_test.describe()

Unnamed: 0,SibSp,Parch,Fare,clean_age
count,418.0,418.0,417.0,418.0
mean,0.447368,0.392344,35.627188,30.087165
std,0.89676,0.981429,55.907576,12.639798
min,0.0,0.0,0.0,0.17
25%,0.0,0.0,7.8958,23.0
50%,0.0,0.0,14.4542,29.371336
75%,1.0,0.0,31.5,35.75
max,8.0,9.0,512.3292,76.0


## Train Model

In [13]:
bst = XGBClassifier(n_estimators=20, max_depth=100, learning_rate=1, objective='binary:logistic')
bst.fit(input_df, df['Survived'])

## Evaluate Model

In [14]:
y_true = df_val['Survived']
y_score = bst.predict(input_df_val)

In [15]:
sklearn.metrics.accuracy_score(y_true, y_score)

0.8555555555555555

In [16]:
sklearn.metrics.average_precision_score(y_true, y_score)

0.7341269841269842

In [17]:
sklearn.metrics.precision_score(y_true, y_score > 0.5)

0.8055555555555556

In [18]:
sklearn.metrics.recall_score(y_true, y_score > 0.5)

0.8285714285714286

## Test

In [19]:
pred_test = bst.predict(input_df_test)

In [20]:
submission_df = pd.concat((df_test['PassengerId'], pd.Series(pred_test).rename('Survived')), axis=1)

In [21]:
submission_df.describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.368421
std,120.810458,0.482954
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


In [22]:
submission_df.to_csv('submission.csv', index=False)

## Appendix - EDA

In [23]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
256,257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C
48,49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
678,679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43.0,1,6,CA 2144,46.9,,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
172,173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S


In [24]:
(~df['Cabin'].isna()).mean()

0.2272159800249688

In [25]:
df['Sex'].unique()

array(['female', 'male'], dtype=object)

In [26]:
df['Pclass'].unique()

array([1, 3, 2])

In [27]:
pd.get_dummies(df['Pclass'], prefix='Pclass').head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
256,True,False,False
48,False,False,True
678,False,False,True
792,False,False,True
172,False,False,True


In [28]:
df['SibSp'].describe()

count    801.000000
mean       0.523096
std        1.096485
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64

In [29]:
df['Parch'].describe()

count    801.000000
mean       0.388265
std        0.818726
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64

In [30]:
df['Fare'].describe()

count    801.000000
mean      32.197762
std       48.571969
min        0.000000
25%        7.925000
50%       14.458300
75%       31.275000
max      512.329200
Name: Fare, dtype: float64

In [31]:
df['Cabin'].unique()

array([nan, 'D', 'B96 B98', 'E49', 'D35', 'E67', 'C47', 'C125', 'C68',
       'C23 C25 C27', 'F38', 'C30', 'B22', 'B42', 'C110', 'F E69', 'A36',
       'A14', 'E24', 'F2', 'D47', 'D36', 'A10', 'B102', 'C82',
       'B57 B59 B63 B66', 'D26', 'C22 C26', 'D7', 'C123', 'B30', 'D45',
       'C2', 'E34', 'E46', 'D28', 'A16', 'B77', 'A24', 'E121', 'A34',
       'B58 B60', 'B86', 'E33', 'G6', 'E40', 'C93', 'C83', 'A7', 'C70',
       'E31', 'D30', 'C45', 'B69', 'C52', 'E101', 'B79', 'B82 B84',
       'C106', 'B80', 'E25', 'E44', 'F33', 'B5', 'C99', 'A26', 'A6',
       'C126', 'F G73', 'C101', 'C54', 'T', 'A31', 'E10', 'C95', 'E77',
       'B51 B53 B55', 'E8', 'B19', 'C78', 'C104', 'C65', 'B94', 'B35',
       'D37', 'B49', 'C103', 'C62 C64', 'C32', 'A23', 'E36', 'E58', 'B18',
       'B101', 'A5', 'C124', 'A19', 'C49', 'B20', 'F4', 'C50', 'B4',
       'F G63', 'C7', 'B38', 'B78', 'E12', 'E68', 'A32', 'D15', 'D33',
       'C128', 'B28', 'D20', 'D17', 'C46', 'E63', 'E17', 'C148', 'C86',
       'D6'

In [32]:
num_of_cabins = df['Cabin'].str.split(' ').str.len()

In [33]:
num_of_cabins.describe()

count    182.000000
mean       1.170330
std        0.513464
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        4.000000
Name: Cabin, dtype: float64

In [34]:
num_of_cabins.fillna(0).describe()

count    801.000000
mean       0.265918
std        0.548133
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        4.000000
Name: Cabin, dtype: float64

In [35]:
_df = df[df['Cabin'].isna()]

In [36]:
_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
256,257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C
48,49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
678,679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43.0,1,6,CA 2144,46.9,,S
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
172,173,1,3,"Johnson, Miss. Eleanor Ileen",female,1.0,1,1,347742,11.1333,,S


In [37]:
df['Cabin'].str.replace('\d', '', regex=True).unique()

array([nan, 'D', 'B B', 'E', 'C', 'C C C', 'F', 'B', 'F E', 'A',
       'B B B B', 'C C', 'G', 'F G', 'T', 'B B B', 'D D'], dtype=object)

In [38]:
df_test['Cabin'].str.replace('\d', '', regex=True).unique()

array([nan, 'B', 'E', 'B B B B', 'A', 'C', 'D', 'C C C', 'F G', 'C C',
       'F', 'G', 'B B', 'F E', 'B B B', 'D D', 'E E'], dtype=object)

In [39]:
a = [1, 3, 5]
np.any((np.array(a) <= 1) & (np.array(a) > 0))

True

In [40]:
get_cabin_nums(df).explode().fillna(-1).describe()

  get_cabin_nums(df).explode().fillna(-1).describe()


count    828.000000
mean      11.688406
std       27.983101
min       -1.000000
25%       -1.000000
50%       -1.000000
75%       -1.000000
max      148.000000
Name: Cabin, dtype: float64

In [41]:
get_cabin_nums(df_test).explode().fillna(-1).describe()

  get_cabin_nums(df_test).explode().fillna(-1).describe()


count    442.000000
mean      11.572398
std       25.794504
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        4.000000
max      132.000000
Name: Cabin, dtype: float64

In [42]:
pd.get_dummies(df['Embarked'], prefix='Embarked').head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
256,True,False,False
48,True,False,False
678,False,False,True
792,False,False,True
172,False,False,True


In [43]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            165
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          619
Embarked         1
dtype: int64

In [44]:
df[df['Age'].isna()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
256,257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C
48,49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
792,793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
578,579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C
29,30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S


In [45]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [46]:
df_test[df_test['Age'].isna()].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
10,902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S
22,914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
29,921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C
33,925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S
36,928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S


In [47]:
df[df['Fare'] == 0].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,,S
806,807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S
815,816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0.0,B102,S
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,,S
822,823,0,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0,,S
