In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

In [3]:
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train_data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [7]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [8]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
train_data.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Gustafsson, Mr. Johan Birger",male,347082,G6,S
freq,1,577,7,4,644


In [10]:
train_data = train_data.drop("PassengerId", axis=1)

In [11]:
train_data['Sex'] = train_data['Sex'].map({'male':0,'female':1})
test_data['Sex'] = test_data['Sex'].map({'male':0,'female':1})
train_data.head()
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S


In [12]:
#We don't really care about the name, but their status
def extract_status(name):
    status = name.map(lambda n : n.split(',')[1].split('.')[0].replace(' ',''))
    return status

In [13]:
train_data['Status'] = extract_status(train_data['Name'])
train_data['Status'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
       'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'theCountess',
       'Jonkheer'], dtype=object)

In [14]:
test_data['Status'] = extract_status(test_data['Name'])
test_data['Status'].unique()

array(['Mr', 'Mrs', 'Miss', 'Master', 'Ms', 'Col', 'Rev', 'Dr', 'Dona'],
      dtype=object)

In [15]:
#Now we can drop the column Name
train_data = train_data.drop(columns = ['Name'])
test_data = test_data.drop(columns = ['Name'])

In [16]:
# We have too much categories for status, let's do a reduction
status_dict = {
    'Mr': 'Mr',
    'Miss': 'Woman',
    'Mrs': 'Woman',
    'Master': 'Master',
    'Dr': 'Rare',
    'Rev': 'Rare',
    'Major': 'Rare',
    'Col': 'Rare',
    'Mlle': 'Woman',
    'theCountess': 'Rare',
    'Don': 'Rare',
    'Mme': 'Woman',
    'Jonkheer': 'Rare',
    'Sir': 'Rare',
    'Ms': 'Woman',
    'Lady': 'Rare',
    'Capt': 'Rare',
    'Dona': 'Rare'
}

In [17]:
def reduce_status(status):
    new_status = status.map(lambda s : status_dict[s])
    return new_status

In [18]:
train_data['Status'] = reduce_status(train_data['Status'])
test_data['Status'] = reduce_status(test_data['Status'])
print(train_data['Status'].unique())
print(test_data['Status'].unique())

['Mr' 'Woman' 'Master' 'Rare']
['Mr' 'Woman' 'Master' 'Rare']


In [19]:
def b_or_f(data):
    boy = (data.Status == 'Master') | ((data.Sex == 0) & (data.Age<13))
    female = data.Sex == 1
    boy_or_female = boy | female
    data["BorF"] = boy_or_female.map(lambda x : 1 if x else 0)
    return data["BorF"]
train_data["BorF"] = b_or_f(train_data)
test_data["BorF"] = b_or_f(test_data)

In [20]:
# Now we try to correct the missing values in both datasets
reference_age = train_data.groupby(['Pclass','Sex','Status']).mean().reset_index()[['Pclass','Sex','Status','Age']]
reference_age['Age'] = reference_age['Age'].map(lambda a : math.ceil(a))
reference_age

Unnamed: 0,Pclass,Sex,Status,Age
0,1,0,Master,6
1,1,0,Mr,42
2,1,0,Rare,49
3,1,1,Rare,44
4,1,1,Woman,35
5,2,0,Master,3
6,2,0,Mr,33
7,2,0,Rare,42
8,2,1,Woman,29
9,3,0,Master,6


In [21]:
def correct_data(r,reference,var):
    return reference[(reference['Pclass']==r['Pclass']) & 
                    (reference['Status']==r['Status']) &
                    (reference['Sex']==r['Sex'])][var].values[0]

In [22]:
train_data['Age'] = train_data.apply(lambda r : correct_data(r,reference_age,'Age') if np.isnan(r['Age']) else r['Age'], axis=1 )
test_data['Age'] = test_data.apply(lambda r : correct_data(r,reference_age,'Age') if np.isnan(r['Age']) else r['Age'], axis=1 )

In [23]:
reference_fare = train_data.groupby(['Pclass','Sex','Status']).mean().reset_index()[['Pclass','Sex','Status','Fare']]
reference_fare

Unnamed: 0,Pclass,Sex,Status,Fare
0,1,0,Master,117.802767
1,1,0,Mr,68.034385
2,1,0,Rare,47.375
3,1,1,Rare,50.6764
4,1,1,Woman,107.9538
5,2,0,Master,27.306022
6,2,0,Mr,19.252564
7,2,0,Rare,16.796875
8,2,1,Woman,21.970121
9,3,0,Master,28.177232


In [24]:
train_data['Fare'] = train_data.apply(lambda r : correct_data(r,reference_fare,'Fare') if np.isnan(r['Fare']) else r['Fare'], axis=1 )
test_data['Fare'] = test_data.apply(lambda r : correct_data(r,reference_fare,'Fare') if np.isnan(r['Fare']) else r['Fare'], axis=1 )

In [25]:
# S is the most frequent value of the variable Embarked, 
# so we gonna correct all missing values of Embarked by S
train_data['Embarked'].fillna('S', inplace=True)

In [26]:
def fill_cabin(data):
    data.fillna('U', inplace=True)
    data = data.map(lambda c: c[0])
    return data
train_data['Cabin'] = fill_cabin(train_data['Cabin'])
test_data['Cabin'] = fill_cabin(test_data['Cabin'])

In [27]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Status,BorF
0,0,3,0,22.0,1,0,A/5 21171,7.25,U,S,Mr,0
1,1,1,1,38.0,1,0,PC 17599,71.2833,C,C,Woman,1
2,1,3,1,26.0,0,0,STON/O2. 3101282,7.925,U,S,Woman,1
3,1,1,1,35.0,1,0,113803,53.1,C,S,Woman,1
4,0,3,0,35.0,0,0,373450,8.05,U,S,Mr,0


In [28]:
def cleanTicket(ticket):
    #extract ticket prefix, if not 'XXX'
    ticket = ticket.replace('.', '')
    ticket = ticket.replace('/', '')
    ticket = ticket.split()
    ticket = map(lambda t : t.strip(), ticket)
    ticket = list(filter(lambda t : not t.isdigit(), ticket))
    if len(ticket) > 0:
        return ticket[0]
    else: 
        return 'XXX'
train_data['Ticket'] = train_data['Ticket'].map(cleanTicket)
test_data['Ticket'] = test_data['Ticket'].map(cleanTicket)

In [29]:
train_data['Ticket'].unique()

array(['A5', 'PC', 'STONO2', 'XXX', 'PP', 'CA', 'SCParis', 'SCA4', 'A4',
       'SP', 'SOC', 'WC', 'SOTONOQ', 'WEP', 'STONO', 'C', 'SCPARIS',
       'SOP', 'Fa', 'LINE', 'FCC', 'SWPP', 'SCOW', 'PPP', 'SC', 'SCAH',
       'AS', 'SOPP', 'FC', 'SOTONO2', 'CASOTON'], dtype=object)

In [30]:
#Look pretty clean now
print(train_data.isnull().sum())
print(test_data.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Ticket      0
Fare        0
Cabin       0
Embarked    0
Status      0
BorF        0
dtype: int64
PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Status         0
BorF           0
dtype: int64


In [31]:
train_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Status,BorF
0,0,3,0,22.0,1,0,A5,7.25,U,S,Mr,0
1,1,1,1,38.0,1,0,PC,71.2833,C,C,Woman,1
2,1,3,1,26.0,0,0,STONO2,7.925,U,S,Woman,1
3,1,1,1,35.0,1,0,XXX,53.1,C,S,Woman,1
4,0,3,0,35.0,0,0,XXX,8.05,U,S,Mr,0


In [32]:
train_data['FamilySize'] = train_data.Parch + train_data.SibSp + 1
test_data['FamilySize'] = train_data.Parch + train_data.SibSp + 1
train_data = train_data.drop(columns = ['Parch','SibSp'])
test_data = test_data.drop(columns = ['Parch','SibSp'])

In [33]:
train_data['FamilySize'].value_counts()

1     537
2     161
3     102
4      29
6      22
5      15
7      12
11      7
8       6
Name: FamilySize, dtype: int64

In [34]:
def family_type(row):
    if row['FamilySize'] == 1:
        return 'alone'
    elif row['FamilySize'] <=3:
        return 'small'
    elif row['FamilySize'] <=6:
        return 'medium'
    else:
        return 'big'
    
train_data['FamilyType'] = train_data.apply(lambda row: family_type(row), axis=1)
test_data['FamilyType'] = test_data.apply(lambda row: family_type(row), axis=1)
train_data = train_data.drop(columns = ['FamilySize'])
test_data = test_data.drop(columns = ['FamilySize'])

In [35]:
train_data['FamilyType'].value_counts()

alone     537
small     263
medium     66
big        25
Name: FamilyType, dtype: int64

In [36]:
pd.qcut(train_data['Age'],q=5).to_frame().value_counts()

Age          
(0.419, 20.0]    183
(26.0, 30.0]     182
(20.0, 26.0]     182
(30.0, 40.0]     173
(40.0, 80.0]     171
dtype: int64

In [37]:
pd.qcut(train_data['Fare'],q=5).to_frame().value_counts()

Fare             
(7.854, 10.5]        184
(21.679, 39.688]     180
(-0.001, 7.854]      179
(39.688, 512.329]    176
(10.5, 21.679]       172
dtype: int64

In [38]:
def fare_type(row):
    if row['Fare'] <= 7.854:
        return '1'
    elif row['Fare'] <= 10.5:
        return '2'
    elif row['Fare'] <= 21.679:
        return '3'
    elif row['Fare'] <= 39.688:
        return '4'
    else :
        return '5'

train_data['FareType'] = train_data.apply(lambda row: fare_type(row), axis=1)
test_data['FareType'] = test_data.apply(lambda row: fare_type(row), axis=1)
train_data = train_data.drop(columns = ['Fare'])
test_data = test_data.drop(columns = ['Fare'])

In [39]:
def age_type(row):
    if row['Age'] <= 19:
        return 0
    elif row['Age'] <= 26:
        return 1
    elif row['Age'] <= 30:
        return 2
    elif row['Age'] <= 40.5:
        return 3
    else :
        return 4

train_data['AgeType'] = train_data.apply(lambda row: age_type(row), axis=1)
test_data['AgeType'] = test_data.apply(lambda row: age_type(row), axis=1)
train_data = train_data.drop(columns = ['Age'])
test_data = test_data.drop(columns = ['Age'])

In [40]:
print(train_data.Cabin.unique())
print(test_data.Cabin.unique())


['U' 'C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']
['U' 'B' 'E' 'A' 'C' 'D' 'F' 'G']


In [41]:
X_train = train_data.drop('Survived',axis = 1)
Y_train = train_data['Survived']
X_test = test_data.drop("PassengerId", axis=1).copy()
print('X_train shape :',X_train.shape)
print('X_test shape :',X_test.shape)
print('Y_train shape :',Y_train.shape)

X_train shape : (891, 10)
X_test shape : (418, 10)
Y_train shape : (891,)


In [42]:
X_train

Unnamed: 0,Pclass,Sex,Ticket,Cabin,Embarked,Status,BorF,FamilyType,FareType,AgeType
0,3,0,A5,U,S,Mr,0,small,1,1
1,1,1,PC,C,C,Woman,1,small,5,3
2,3,1,STONO2,U,S,Woman,1,alone,2,1
3,1,1,XXX,C,S,Woman,1,small,5,3
4,3,0,XXX,U,S,Mr,0,alone,2,3
...,...,...,...,...,...,...,...,...,...,...
886,2,0,XXX,U,S,Rare,0,alone,3,2
887,1,1,XXX,B,S,Woman,1,alone,4,0
888,3,1,WC,U,S,Woman,1,medium,4,1
889,1,0,XXX,C,C,Mr,0,alone,4,1


In [43]:
# since we don't have the same value for train.cabin and test.cabin 
# and train.ticket and test.ticket, we have to combine this 2 dataset to one-hot encode
combined = X_train.append(X_test)
combined = combined.drop(columns = 'Sex')
combined.head()

Unnamed: 0,Pclass,Ticket,Cabin,Embarked,Status,BorF,FamilyType,FareType,AgeType
0,3,A5,U,S,Mr,0,small,1,1
1,1,PC,C,C,Woman,1,small,5,3
2,3,STONO2,U,S,Woman,1,alone,2,1
3,1,XXX,C,S,Woman,1,small,5,3
4,3,XXX,U,S,Mr,0,alone,2,3


In [44]:
categorical_features = ['Pclass', 'Embarked', 'Status', 'FamilyType', 'FareType', 'AgeType','Ticket' ,'Cabin']
print(categorical_features)

['Pclass', 'Embarked', 'Status', 'FamilyType', 'FareType', 'AgeType', 'Ticket', 'Cabin']


In [45]:
pipeline = ColumnTransformer([
    ("cat", OneHotEncoder(), categorical_features) # Perform One-Hot encoding on the category attributes
])

In [46]:
combined = pipeline.fit_transform(combined)

In [47]:
X_train = combined[:891] 
X_test = combined[891:]

In [48]:
param ={'max_depth' : [4, 6, 8, 10, None],
        'n_estimators': [50, 100, 150],
        'max_features': ['sqrt', 'auto', 'log2'],
        'min_samples_split': [2, 3, 10],
        'min_samples_leaf': [1, 3, 10],
        'bootstrap': [True, False]}

In [49]:
'''
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
rnd_search = RandomizedSearchCV(RandomForestClassifier(), param, cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42),scoring='accuracy')
rnd_search.fit(X_train,Y_train)
print(rnd_search.best_params_)
print(rnd_search.best_score_)
'''

"\nfrom sklearn.model_selection import RandomizedSearchCV\nfrom sklearn.model_selection import RepeatedStratifiedKFold\nrnd_search = RandomizedSearchCV(RandomForestClassifier(), param, cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=2, random_state=42),scoring='accuracy')\nrnd_search.fit(X_train,Y_train)\nprint(rnd_search.best_params_)\nprint(rnd_search.best_score_)\n"

In [50]:
parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 150, 
                  'min_samples_split': 10, 'max_features': 'auto', 'max_depth': 10}
best_model = RandomForestClassifier(**parameters)
best_model.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=False, max_depth=10, min_samples_leaf=3,
                       min_samples_split=10, n_estimators=150)

In [51]:
Y_pred = best_model.predict(X_test)
best_model.score(X_train, Y_train)

0.8664421997755332

In [52]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(best_model, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.78888889 0.83146067 0.74157303 0.87640449 0.8988764  0.85393258
 0.85393258 0.79775281 0.83146067 0.85393258]
Mean: 0.8328214731585518
Standard Deviation: 0.04369168702061913


In [53]:
output = pd.DataFrame({'PassengerId': test_data['PassengerId'], 'Survived': Y_pred})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [54]:
output.Survived.value_counts()

0    276
1    142
Name: Survived, dtype: int64