## Todos
- Split train and validation data.

## Imports

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from xgboost import XGBClassifier
import sklearn

pd.set_option('display.max_columns', None)

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


## Functions

In [2]:
def get_clean_cabin_nums(cabin_nums):
    if isinstance(cabin_nums, float):
        if pd.isna(cabin_nums):
            return []
        else:
            return [cabin_nums]
    elif isinstance(cabin_nums, list):
        if len(cabin_nums) == 0:
            return []
        else:
            return [int(c) for c in cabin_nums if len(c) > 0]
                
def get_cabin_nums(df):
    cabin_nums_list = (
        df['Cabin']
        .str.replace('[a-zA-Z]', '', regex=True)
        .str.strip()
        .str.split(' ')
    )
    
    return cabin_nums_list.apply(get_clean_cabin_nums)

def is_any_cabin_within_range(cabin_nums, low_exc, high_inc):
    cabin_nums = np.array(cabin_nums)
    
    return np.any(
        (cabin_nums > low_exc)
        & (cabin_nums <= high_inc)
    )

def get_has_cabin_number_in_range_onehot(df, bins):
    cabin_nums_list = get_cabin_nums(df)
    
    cabin_number_range_map = {}
    
    for i in range(len(bins) - 1):
        low_exc = bins[i]
        high_inc = bins[i + 1]
        
        cabin_number_range_map[f'has_cabin_number_between_{low_exc + 1}_{high_inc}'] = (
            cabin_nums_list.apply(
                lambda cabin_nums : is_any_cabin_within_range(cabin_nums, low_exc, high_inc)
            )
        )

    return pd.DataFrame.from_dict(cabin_number_range_map)

def get_cabin_letter_onehot(df):
    cabin_letters = {}
    
    for cabin_letter in 'ABCDEFGT':
        cabin_letters[f'is_cabin_letter_' + cabin_letter] = df['Cabin'].fillna('').str.contains(cabin_letter)

    return pd.DataFrame.from_dict(cabin_letters)

def get_input_data(df, mean_age):
    clean_cols = [
        'SibSp',
        'Parch',
        'Fare'
    ]

    # WARNING: ASSUMPTION
    clean_age = df['Age'].fillna(mean_age)
    
    is_male = df['Sex'] == 'male'
    pclass_onehot = pd.get_dummies(df['Pclass'], prefix='Pclass')
    embarked_onehot = pd.get_dummies(df['Embarked'], prefix='Embarked')
    has_cabin = ~df['Cabin'].isna()
    
    num_of_cabins = (
        df['Cabin'].str.split(' ')
        .str.len()
        .fillna(0)
    )
    
    cabin_letters_onehot = get_cabin_letter_onehot(df)
    
    has_cabin_number_in_range_onehot = get_has_cabin_number_in_range_onehot(df, np.arange(0, 151, 10))

    return pd.concat(
        (
            df[clean_cols],
            clean_age.rename('clean_age'),
            is_male.rename('is_male'),
            pclass_onehot,
            embarked_onehot,
            has_cabin.rename('has_cabin'),
            num_of_cabins.rename('num_of_cabins'),
            cabin_letters_onehot,
            has_cabin_number_in_range_onehot,
        ),
        axis=1
    )

## Prepare Data

In [3]:
df = pd.read_csv('/kaggle/input/titanic/train.csv')
df_test = pd.read_csv('/kaggle/input/titanic/test.csv')

In [4]:
mean_age = df['Age'].mean()

input_df = get_input_data(df, mean_age)
input_df_test = get_input_data(df_test, mean_age)

In [5]:
input_df.head()

Unnamed: 0,SibSp,Parch,Fare,clean_age,is_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,has_cabin,num_of_cabins,is_cabin_letter_A,is_cabin_letter_B,is_cabin_letter_C,is_cabin_letter_D,is_cabin_letter_E,is_cabin_letter_F,is_cabin_letter_G,is_cabin_letter_T,has_cabin_number_between_1_10,has_cabin_number_between_11_20,has_cabin_number_between_21_30,has_cabin_number_between_31_40,has_cabin_number_between_41_50,has_cabin_number_between_51_60,has_cabin_number_between_61_70,has_cabin_number_between_71_80,has_cabin_number_between_81_90,has_cabin_number_between_91_100,has_cabin_number_between_101_110,has_cabin_number_between_111_120,has_cabin_number_between_121_130,has_cabin_number_between_131_140,has_cabin_number_between_141_150
0,1,0,7.25,22.0,True,False,False,True,False,False,True,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,0,71.2833,38.0,False,True,False,False,True,False,False,True,1.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2,0,0,7.925,26.0,False,False,False,True,False,False,True,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,1,0,53.1,35.0,False,True,False,False,False,False,True,True,1.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,0,0,8.05,35.0,True,False,False,True,False,False,True,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
input_df.mean()

SibSp                                0.523008
Parch                                0.381594
Fare                                32.204208
clean_age                           29.699118
is_male                              0.647587
Pclass_1                             0.242424
Pclass_2                             0.206510
Pclass_3                             0.551066
Embarked_C                           0.188552
Embarked_Q                           0.086420
Embarked_S                           0.722783
has_cabin                            0.228956
num_of_cabins                        0.267116
is_cabin_letter_A                    0.016835
is_cabin_letter_B                    0.052750
is_cabin_letter_C                    0.066218
is_cabin_letter_D                    0.037037
is_cabin_letter_E                    0.037037
is_cabin_letter_F                    0.014590
is_cabin_letter_G                    0.007856
is_cabin_letter_T                    0.001122
has_cabin_number_between_1_10     

In [7]:
input_df.describe()

Unnamed: 0,SibSp,Parch,Fare,clean_age,num_of_cabins
count,891.0,891.0,891.0,891.0,891.0
mean,0.523008,0.381594,32.204208,29.699118,0.267116
std,1.102743,0.806057,49.693429,13.002015,0.547134
min,0.0,0.0,0.0,0.42,0.0
25%,0.0,0.0,7.9104,22.0,0.0
50%,0.0,0.0,14.4542,29.699118,0.0
75%,1.0,0.0,31.0,35.0,0.0
max,8.0,6.0,512.3292,80.0,4.0


In [8]:
input_df_test.head()

Unnamed: 0,SibSp,Parch,Fare,clean_age,is_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,has_cabin,num_of_cabins,is_cabin_letter_A,is_cabin_letter_B,is_cabin_letter_C,is_cabin_letter_D,is_cabin_letter_E,is_cabin_letter_F,is_cabin_letter_G,is_cabin_letter_T,has_cabin_number_between_1_10,has_cabin_number_between_11_20,has_cabin_number_between_21_30,has_cabin_number_between_31_40,has_cabin_number_between_41_50,has_cabin_number_between_51_60,has_cabin_number_between_61_70,has_cabin_number_between_71_80,has_cabin_number_between_81_90,has_cabin_number_between_91_100,has_cabin_number_between_101_110,has_cabin_number_between_111_120,has_cabin_number_between_121_130,has_cabin_number_between_131_140,has_cabin_number_between_141_150
0,0,0,7.8292,34.5,True,False,False,True,False,True,False,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,0,7.0,47.0,False,False,False,True,False,False,True,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,0,0,9.6875,62.0,True,False,True,False,False,True,False,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,0,0,8.6625,27.0,True,False,False,True,False,False,True,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,1,1,12.2875,22.0,False,False,False,True,False,False,True,False,0.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [9]:
input_df_test.mean()

SibSp                                0.447368
Parch                                0.392344
Fare                                35.627188
clean_age                           30.154603
is_male                              0.636364
Pclass_1                             0.255981
Pclass_2                             0.222488
Pclass_3                             0.521531
Embarked_C                           0.244019
Embarked_Q                           0.110048
Embarked_S                           0.645933
has_cabin                            0.217703
num_of_cabins                        0.282297
is_cabin_letter_A                    0.016746
is_cabin_letter_B                    0.043062
is_cabin_letter_C                    0.083732
is_cabin_letter_D                    0.031100
is_cabin_letter_E                    0.026316
is_cabin_letter_F                    0.019139
is_cabin_letter_G                    0.004785
is_cabin_letter_T                    0.000000
has_cabin_number_between_1_10     

In [10]:
input_df_test.describe()

Unnamed: 0,SibSp,Parch,Fare,clean_age,num_of_cabins
count,418.0,418.0,417.0,418.0,418.0
mean,0.447368,0.392344,35.627188,30.154603,0.282297
std,0.89676,0.981429,55.907576,12.636666,0.628441
min,0.0,0.0,0.0,0.17,0.0
25%,0.0,0.0,7.8958,23.0,0.0
50%,0.0,0.0,14.4542,29.699118,0.0
75%,1.0,0.0,31.5,35.75,0.0
max,8.0,9.0,512.3292,76.0,4.0


## Train Model

In [46]:
bst = XGBClassifier(n_estimators=50, max_depth=5, learning_rate=1, objective='binary:logistic')
bst.fit(input_df, df['Survived'])

## Evaluate Model

In [47]:
y_true = df['Survived']
y_score = bst.predict(input_df)

In [48]:
sklearn.metrics.accuracy_score(y_true, y_score)

0.9797979797979798

In [49]:
sklearn.metrics.average_precision_score(y_true, y_score)

0.9632220179673149

In [50]:
sklearn.metrics.precision_score(y_true, y_score > 0.5)

0.9879518072289156

In [51]:
sklearn.metrics.recall_score(y_true, y_score > 0.5)

0.9590643274853801

## Test

In [52]:
pred_test = bst.predict(input_df_test)

In [53]:
submission_df = pd.concat((df_test['PassengerId'], pd.Series(pred_test).rename('Survived')), axis=1)

In [54]:
submission_df.describe()

Unnamed: 0,PassengerId,Survived
count,418.0,418.0
mean,1100.5,0.38756
std,120.810458,0.487777
min,892.0,0.0
25%,996.25,0.0
50%,1100.5,0.0
75%,1204.75,1.0
max,1309.0,1.0


In [55]:
submission_df.to_csv('submission.csv', index=False)

## Appendix - EDA

In [21]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
(~df['Cabin'].isna()).mean()

0.22895622895622897

In [23]:
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [24]:
df['Pclass'].unique()

array([3, 1, 2])

In [25]:
pd.get_dummies(df['Pclass'], prefix='Pclass').head()

Unnamed: 0,Pclass_1,Pclass_2,Pclass_3
0,False,False,True
1,True,False,False
2,False,False,True
3,True,False,False
4,False,False,True


In [26]:
df['SibSp'].describe()

count    891.000000
mean       0.523008
std        1.102743
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        8.000000
Name: SibSp, dtype: float64

In [27]:
df['Parch'].describe()

count    891.000000
mean       0.381594
std        0.806057
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        6.000000
Name: Parch, dtype: float64

In [28]:
df['Fare'].describe()

count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
Name: Fare, dtype: float64

In [29]:
df['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [30]:
num_of_cabins = df['Cabin'].str.split(' ').str.len()

In [31]:
num_of_cabins.describe()

count    204.000000
mean       1.166667
std        0.507740
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        4.000000
Name: Cabin, dtype: float64

In [32]:
num_of_cabins.fillna(0).describe()

count    891.000000
mean       0.267116
std        0.547134
min        0.000000
25%        0.000000
50%        0.000000
75%        0.000000
max        4.000000
Name: Cabin, dtype: float64

In [33]:
_df = df[df['Cabin'].isna()]

In [34]:
_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S


In [35]:
df['Cabin'].str.replace('\d', '', regex=True).unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'C C C', 'B', 'F', 'F G', 'D D',
       'B B', 'F E', 'C C', 'B B B B', 'T', 'B B B'], dtype=object)

In [36]:
df_test['Cabin'].str.replace('\d', '', regex=True).unique()

array([nan, 'B', 'E', 'B B B B', 'A', 'C', 'D', 'C C C', 'F G', 'C C',
       'F', 'G', 'B B', 'F E', 'B B B', 'D D', 'E E'], dtype=object)

In [37]:
a = [1, 3, 5]
np.any((np.array(a) <= 1) & (np.array(a) > 0))

True

In [38]:
get_cabin_nums(df).explode().fillna(-1).describe()

  get_cabin_nums(df).explode().fillna(-1).describe()


count    921.000000
mean      11.896851
std       28.151940
min       -1.000000
25%       -1.000000
50%       -1.000000
75%       -1.000000
max      148.000000
Name: Cabin, dtype: float64

In [39]:
get_cabin_nums(df_test).explode().fillna(-1).describe()

  get_cabin_nums(df_test).explode().fillna(-1).describe()


count    442.000000
mean      11.572398
std       25.794504
min       -1.000000
25%       -1.000000
50%       -1.000000
75%        4.000000
max      132.000000
Name: Cabin, dtype: float64

In [40]:
pd.get_dummies(df['Embarked'], prefix='Embarked').head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,False,False,True
1,True,False,False
2,False,False,True
3,False,False,True
4,False,False,True


In [41]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [42]:
df[df['Age'].isna()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q


In [43]:
df_test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [44]:
df_test[df_test['Age'].isna()].head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
10,902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S
22,914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
29,921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C
33,925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S
36,928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S


In [45]:
df[df['Fare'] == 0].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
179,180,0,3,"Leonard, Mr. Lionel",male,36.0,0,0,LINE,0.0,,S
263,264,0,1,"Harrison, Mr. William",male,40.0,0,0,112059,0.0,B94,S
271,272,1,3,"Tornquist, Mr. William Henry",male,25.0,0,0,LINE,0.0,,S
277,278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0.0,,S
302,303,0,3,"Johnson, Mr. William Cahoone Jr",male,19.0,0,0,LINE,0.0,,S
