## Imports

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from xgboost import XGBClassifier
import sklearn

pd.set_option('display.max_columns', None)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


## Functions

In [2]:
def get_clean_cabin_nums(cabin_nums):
    if isinstance(cabin_nums, float):
        if pd.isna(cabin_nums):
            return []
        else:
            return [cabin_nums]
    elif isinstance(cabin_nums, list):
        if len(cabin_nums) == 0:
            return []
        else:
            return [int(c) for c in cabin_nums if len(c) > 0]
                
def get_cabin_nums(df):
    cabin_nums_list = (
        df['Cabin']
        .str.replace('[a-zA-Z]', '', regex=True)
        .str.strip()
        .str.split(' ')
    )
    
    return cabin_nums_list.apply(get_clean_cabin_nums)

def is_any_cabin_within_range(cabin_nums, low_exc, high_inc):
    cabin_nums = np.array(cabin_nums)
    
    return np.any(
        (cabin_nums > low_exc)
        & (cabin_nums <= high_inc)
    )

def get_has_cabin_number_in_range_onehot(df, bins):
    cabin_nums_list = get_cabin_nums(df)
    
    cabin_number_range_map = {}
    
    for i in range(len(bins) - 1):
        low_exc = bins[i]
        high_inc = bins[i + 1]
        
        cabin_number_range_map[f'has_cabin_number_between_{low_exc + 1}_{high_inc}'] = (
            cabin_nums_list.apply(
                lambda cabin_nums : is_any_cabin_within_range(cabin_nums, low_exc, high_inc)
            )
        )

    return pd.DataFrame.from_dict(cabin_number_range_map)

def get_cabin_letter_onehot(df):
    cabin_letters = {}
    
    for cabin_letter in 'ABCDEFGT':
        cabin_letters[f'is_cabin_letter_' + cabin_letter] = df['Cabin'].fillna('').str.contains(cabin_letter)

    return pd.DataFrame.from_dict(cabin_letters)

def get_input_data(df, mean_age):
    clean_cols = [
        'SibSp',
        'Parch',
        'Fare'
    ]

    # WARNING: ASSUMPTION
    clean_age = df['Age'].fillna(mean_age)
    
    is_male = df['Sex'] == 'male'
    pclass_onehot = pd.get_dummies(df['Pclass'], prefix='Pclass')
    embarked_onehot = pd.get_dummies(df['Embarked'], prefix='Embarked')
    has_cabin = ~df['Cabin'].isna()
    
    num_of_cabins = (
        df['Cabin'].str.split(' ')
        .str.len()
        .fillna(0)
    )
    
    cabin_letters_onehot = get_cabin_letter_onehot(df)
    
    has_cabin_number_in_range_onehot = get_has_cabin_number_in_range_onehot(df, np.arange(0, 151, 10))

    return pd.concat(
        (
            df[clean_cols],
            clean_age.rename('clean_age'),
            is_male.rename('is_male'),
            pclass_onehot,
            embarked_onehot,
            # has_cabin.rename('has_cabin'),
            num_of_cabins.rename('num_of_cabins'),
            cabin_letters_onehot,
            has_cabin_number_in_range_onehot,
        ),
        axis=1
    )

## Prepare Data

In [3]:
df_original = pd.read_csv('/kaggle/input/titanic/train.csv')
df, df_val = sklearn.model_selection.train_test_split(df_original, test_size=0.1, stratify=df_original['Survived'])

df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [4]:
df['Survived'].mean()

0.383270911360799

In [5]:
df_val['Survived'].mean()

0.3888888888888889

In [6]:
mean_age = df['Age'].mean()

input_df = get_input_data(df, mean_age)
input_df_val = get_input_data(df_val, mean_age)
input_df_test = get_input_data(df_test, mean_age)

In [7]:
input_df.head()

Unnamed: 0,SibSp,Parch,Fare,clean_age,is_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,num_of_cabins,is_cabin_letter_A,is_cabin_letter_B,is_cabin_letter_C,is_cabin_letter_D,is_cabin_letter_E,is_cabin_letter_F,is_cabin_letter_G,is_cabin_letter_T,has_cabin_number_between_1_10,has_cabin_number_between_11_20,has_cabin_number_between_21_30,has_cabin_number_between_31_40,has_cabin_number_between_41_50,has_cabin_number_between_51_60,has_cabin_number_between_61_70,has_cabin_number_between_71_80,has_cabin_number_between_81_90,has_cabin_number_between_91_100,has_cabin_number_between_101_110,has_cabin_number_between_111_120,has_cabin_number_between_121_130,has_cabin_number_between_131_140,has_cabin_number_between_141_150
682,0,0,9.225,20.0,True,False,False,True,False,False,True,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
847,0,0,7.8958,35.0,True,False,False,True,True,False,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
103,0,0,8.6542,33.0,True,False,False,True,False,False,True,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
115,0,0,7.925,21.0,True,False,False,True,False,False,True,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
402,1,0,9.825,21.0,False,False,False,True,False,False,True,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
input_df.mean()

SibSp                                0.514357
Parch                                0.372035
Fare                                32.400987
clean_age                           29.939003
is_male                              0.646692
Pclass_1                             0.249688
Pclass_2                             0.207241
Pclass_3                             0.543071
Embarked_C                           0.196005
Embarked_Q                           0.082397
Embarked_S                           0.719101
num_of_cabins                        0.279650
is_cabin_letter_A                    0.018727
is_cabin_letter_B                    0.053683
is_cabin_letter_C                    0.068664
is_cabin_letter_D                    0.041199
is_cabin_letter_E                    0.041199
is_cabin_letter_F                    0.014981
is_cabin_letter_G                    0.007491
is_cabin_letter_T                    0.001248
has_cabin_number_between_1_10        0.033708
has_cabin_number_between_11_20    

In [9]:
input_df.describe()

Unnamed: 0,SibSp,Parch,Fare,clean_age,num_of_cabins
count,801.0,801.0,801.0,801.0,801.0
mean,0.514357,0.372035,32.400987,29.939003,0.27965
std,1.063064,0.797757,50.245005,12.93149,0.5447
min,0.0,0.0,0.0,0.42,0.0
25%,0.0,0.0,7.925,22.0,0.0
50%,0.0,0.0,14.5,29.939003,0.0
75%,1.0,0.0,31.275,35.0,0.0
max,8.0,6.0,512.3292,80.0,4.0


In [10]:
input_df_test.head()

Unnamed: 0,SibSp,Parch,Fare,clean_age,is_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,num_of_cabins,is_cabin_letter_A,is_cabin_letter_B,is_cabin_letter_C,is_cabin_letter_D,is_cabin_letter_E,is_cabin_letter_F,is_cabin_letter_G,is_cabin_letter_T,has_cabin_number_between_1_10,has_cabin_number_between_11_20,has_cabin_number_between_21_30,has_cabin_number_between_31_40,has_cabin_number_between_41_50,has_cabin_number_between_51_60,has_cabin_number_between_61_70,has_cabin_number_between_71_80,has_cabin_number_between_81_90,has_cabin_number_between_91_100,has_cabin_number_between_101_110,has_cabin_number_between_111_120,has_cabin_number_between_121_130,has_cabin_number_between_131_140,has_cabin_number_between_141_150
0,0,0,7.8292,34.5,True,False,False,True,False,True,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,0,7.0,47.0,False,False,False,True,False,False,True,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,0,0,9.6875,62.0,True,False,True,False,False,True,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,0,0,8.6625,27.0,True,False,False,True,False,False,True,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,1,1,12.2875,22.0,False,False,False,True,False,False,True,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [11]:
input_df_test.mean()

SibSp                                0.447368
Parch                                0.392344
Fare                                35.627188
clean_age                           30.203958
is_male                              0.636364
Pclass_1                             0.255981
Pclass_2                             0.222488
Pclass_3                             0.521531
Embarked_C                           0.244019
Embarked_Q                           0.110048
Embarked_S                           0.645933
num_of_cabins                        0.282297
is_cabin_letter_A                    0.016746
is_cabin_letter_B                    0.043062
is_cabin_letter_C                    0.083732
is_cabin_letter_D                    0.031100
is_cabin_letter_E                    0.026316
is_cabin_letter_F                    0.019139
is_cabin_letter_G                    0.004785
is_cabin_letter_T                    0.000000
has_cabin_number_between_1_10        0.023923
has_cabin_number_between_11_20    

In [12]:
input_df_test.describe()

Unnamed: 0,SibSp,Parch,Fare,clean_age,num_of_cabins
count,418.0,418.0,417.0,418.0,418.0
mean,0.447368,0.392344,35.627188,30.203958,0.282297
std,0.89676,0.981429,55.907576,12.635256,0.628441
min,0.0,0.0,0.0,0.17,0.0
25%,0.0,0.0,7.8958,23.0,0.0
50%,0.0,0.0,14.4542,29.939003,0.0
75%,1.0,0.0,31.5,35.75,0.0
max,8.0,9.0,512.3292,76.0,4.0


## Train Model

In [13]:
bst = XGBClassifier(n_estimators=50, max_depth=5, learning_rate=1, objective='binary:logistic')
bst.fit(input_df, df['Survived'])

## Evaluate Model

In [14]:
y_true = df_val['Survived']
y_score = bst.predict(input_df_val)

In [15]:
sklearn.metrics.accuracy_score(y_true, y_score)

0.8222222222222222

In [16]:
sklearn.metrics.average_precision_score(y_true, y_score)

0.6897099069512863

In [17]:
sklearn.metrics.precision_score(y_true, y_score > 0.5)

0.8275862068965517

In [18]:
sklearn.metrics.recall_score(y_true, y_score > 0.5)

0.6857142857142857

## Test

In [19]:
pred_test = bst.predict(input_df_test)

In [20]:
submission_df = pd.concat((df_test['PassengerId'], pd.Series(pred_test).rename('Survived')), axis=1)

In [21]:
submission_df.describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.366029
std,120.810458,0.482295
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


In [22]:
submission_df.to_csv('submission.csv', index=False)

## Appendix - EDA

In [23]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
682,683,0,3,"Olsvigen, Mr. Thor Anderson",male,20.0,0,0,6563,9.225,,S
847,848,0,3,"Markoff, Mr. Marin",male,35.0,0,0,349213,7.8958,,C
103,104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
115,116,0,3,"Pekoniemi, Mr. Edvard",male,21.0,0,0,STON/O 2. 3101294,7.925,,S
402,403,0,3,"Jussila, Miss. Mari Aina",female,21.0,1,0,4137,9.825,,S


In [24]:
(~df['Cabin'].isna()).mean()

0.24344569288389514

In [25]:
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [26]:
df['Pclass'].unique()

array([3, 1, 2])

In [27]:
pd.get_dummies(df['Pclass'], prefix='Pclass').head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
682,False,False,True
847,False,False,True
103,False,False,True
115,False,False,True
402,False,False,True


In [28]:
df['SibSp'].describe()

count    801.000000
mean       0.514357
std        1.063064
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64

In [29]:
df['Parch'].describe()

count    801.000000
mean       0.372035
std        0.797757
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64

In [30]:
df['Fare'].describe()

count    801.000000
mean      32.400987
std       50.245005
min        0.000000
25%        7.925000
50%       14.500000
75%       31.275000
max      512.329200
Name: Fare, dtype: float64

In [31]:
df['Cabin'].unique()

array([nan, 'D19', 'C86', 'F4', 'B58 B60', 'A36', 'A14', 'A10', 'C126',
       'C123', 'D36', 'C101', 'A20', 'C22 C26', 'C128', 'C62 C64',
       'B96 B98', 'G6', 'E33', 'B28', 'B38', 'B41', 'D6', 'B5', 'B19',
       'F2', 'B102', 'C70', 'B71', 'C2', 'C92', 'E34', 'B4', 'A6', 'B86',
       'E24', 'D', 'D33', 'F33', 'E101', 'C125', 'A34', 'B79', 'C103',
       'C91', 'B94', 'B30', 'C68', 'C95', 'D35', 'C54', 'B39', 'D15',
       'E44', 'C104', 'E68', 'C45', 'D46', 'B69', 'D17', 'A26', 'C52',
       'C65', 'F G63', 'E77', 'D37', 'D47', 'C124', 'B77', 'B101', 'E12',
       'D48', 'C23 C25 C27', 'E63', 'E25', 'C83', 'D7', 'E36', 'A5',
       'B57 B59 B63 B66', 'C90', 'C118', 'C50', 'B51 B53 B55', 'E10',
       'D26', 'B49', 'B20', 'D11', 'C32', 'E67', 'C111', 'E121', 'F38',
       'C93', 'C99', 'B50', 'A16', 'E46', 'D9', 'D56', 'E8', 'F E69',
       'C30', 'C87', 'E50', 'E58', 'C47', 'B37', 'E31', 'D28', 'D21', 'T',
       'E17', 'D10 D12', 'C7', 'D45', 'B42', 'E38', 'F G73', 'A31', 'A24',

In [32]:
num_of_cabins = df['Cabin'].str.split(' ').str.len()

In [33]:
num_of_cabins.describe()

count    195.000000
mean       1.148718
std        0.469081
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        4.000000
Name: Cabin, dtype: float64

In [34]:
num_of_cabins.fillna(0).describe()

count    801.00000
mean       0.27965
std        0.54470
min        0.00000
25%        0.00000
50%        0.00000
75%        0.00000
max        4.00000
Name: Cabin, dtype: float64

In [35]:
_df = df[df['Cabin'].isna()]

In [36]:
_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
682,683,0,3,"Olsvigen, Mr. Thor Anderson",male,20.0,0,0,6563,9.225,,S
847,848,0,3,"Markoff, Mr. Marin",male,35.0,0,0,349213,7.8958,,C
103,104,0,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S
115,116,0,3,"Pekoniemi, Mr. Edvard",male,21.0,0,0,STON/O 2. 3101294,7.925,,S
402,403,0,3,"Jussila, Miss. Mari Aina",female,21.0,1,0,4137,9.825,,S


In [37]:
df['Cabin'].str.replace('\d', '', regex=True).unique()

array([nan, 'D', 'C', 'F', 'B B', 'A', 'C C', 'G', 'E', 'B', 'F G',
       'C C C', 'B B B B', 'B B B', 'F E', 'T', 'D D'], dtype=object)

In [38]:
df_test['Cabin'].str.replace('\d', '', regex=True).unique()

array([nan, 'B', 'E', 'B B B B', 'A', 'C', 'D', 'C C C', 'F G', 'C C',
       'F', 'G', 'B B', 'F E', 'B B B', 'D D', 'E E'], dtype=object)

In [39]:
a = [1, 3, 5]
np.any((np.array(a) <= 1) & (np.array(a) > 0))

True

In [40]:
get_cabin_nums(df).explode().fillna(-1).describe()

  get_cabin_nums(df).explode().fillna(-1).describe()


count    827.000000
mean      12.431681
std       28.678752
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        5.000000
max      148.000000
Name: Cabin, dtype: float64

In [41]:
get_cabin_nums(df_test).explode().fillna(-1).describe()

  get_cabin_nums(df_test).explode().fillna(-1).describe()


count    442.000000
mean      11.572398
std       25.794504
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        4.000000
max      132.000000
Name: Cabin, dtype: float64

In [42]:
pd.get_dummies(df['Embarked'], prefix='Embarked').head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
682,False,False,True
847,True,False,False
103,False,False,True
115,False,False,True
402,False,False,True


In [43]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            159
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          606
Embarked         2
dtype: int64

In [44]:
df[df['Age'].isna()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
547,548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C
643,644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S
47,48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q
48,49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
490,491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S


In [45]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [46]:
df_test[df_test['Age'].isna()].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
10,902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S
22,914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
29,921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C
33,925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S
36,928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S


In [47]:
df[df['Fare'] == 0].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
806,807,0,1,"Andrews, Mr. Thomas Jr",male,39.0,0,0,112050,0.0,A36,S
822,823,0,1,"Reuchlin, Jonkheer. John George",male,38.0,0,0,19972,0.0,,S
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,,S
815,816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0.0,B102,S
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S
