In [69]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
plt.style.use("ggplot")
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

import xgboost as xgb
from xgboost import XGBClassifier

In [10]:
import warnings

warnings.filterwarnings('ignore')

# Import data 

In [23]:
# Read the labelled and unlabelled passenger tables from the working directory.
train = pd.read_csv(filepath_or_buffer="train.csv")
test = pd.read_csv(filepath_or_buffer="test.csv")

# Show (rows, columns) of the training frame.
train.shape

(891, 12)

In [24]:
# (rows, columns) of the test frame.
test.shape

(418, 11)

In [25]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [26]:
# Print each column's non-null count and dtype for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [27]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [28]:
# Number of missing values per column in the training frame.
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [29]:
# Number of missing values per column in the test frame.
test.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

# Explore relationships between fields and clean the data

In [31]:
# Stack the training features (label removed) on top of the test features so
# both receive identical preprocessing. ignore_index=True gives the combined
# frame a fresh 0..n-1 index; without it the test rows reuse the training
# labels and label-based lookups become ambiguous.
X_full = pd.concat(
    [train.drop(columns='Survived'), test],
    axis=0,
    ignore_index=True,
)

In [32]:
# (rows, columns) of the combined feature frame.
X_full.shape

(1309, 11)

In [33]:
# Remove the identifier column from the combined feature frame.
X_full = X_full.drop(columns='PassengerId')

In [34]:
# Number of missing values per column in the combined frame.
X_full.isna().sum()

Pclass         0
Name           0
Sex            0
Age          263
SibSp          0
Parch          0
Ticket         0
Fare           1
Cabin       1014
Embarked       2
dtype: int64

In [35]:
# Hypothesis: passengers missing both Age and Cabin probably died, which is
# why their data could not be collected. Count the rows missing both.
X_full[['Age', 'Cabin']].isna().all(axis=1).sum()

240

In [39]:
# Overall survival rate in the training set.
train['Survived'].mean()

0.3838383838383838

In [38]:
# Fraction of training rows that have a recorded cabin.
train['Cabin'].notna().mean()

0.22895622895622897

In [40]:
# Fraction of all training rows that both lack a cabin and did not survive.
(train['Cabin'].isna() & train['Survived'].eq(0)).mean()

0.5398428731762065

In [41]:
# Mask of training rows where both Cabin and Age are missing.
selector = train[['Cabin', 'Age']].isna().all(axis=1)

# Survival rate inside that subset.
train.loc[selector, 'Survived'].mean()

0.25949367088607594

In [42]:
# Mask of training rows with no recorded cabin.
selector = train['Cabin'].isna()

# Survival rate inside that subset.
train.loc[selector, 'Survived'].mean()

0.29985443959243085

In [43]:
# A missing Cabin goes with lower survival, and missing both Cabin and Age
# with lower still, so record how many of the two fields are missing (0, 1 or 2).
X_full['Nulls'] = X_full[['Cabin', 'Age']].isna().sum(axis=1).astype('int')

In [44]:
# Split the cabin field further by its first letter. Rows with no cabin
# become the string 'nan' after astype(str), so their letter is 'n'.
cabin_letter = X_full['Cabin'].astype(str).str[0]

# Give each letter an integer code in order of first appearance.
cabin_dict = {letter: code for code, letter in enumerate(cabin_letter.unique())}

# Assign the coded column directly. `X_full.loc[:, col] = ...` writes in place
# on pandas >= 2.0 and can leave the column with object dtype.
X_full['Cabin_mapped'] = cabin_letter.map(cabin_dict)
cabin_dict

{'n': 0, 'C': 1, 'E': 2, 'G': 3, 'D': 4, 'A': 5, 'B': 6, 'F': 7, 'T': 8}

In [45]:
# Column labels of the combined frame after adding the engineered features.
X_full.columns

Index(['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare',
       'Cabin', 'Embarked', 'Nulls', 'Cabin_mapped'],
      dtype='object')

In [46]:
# Age and Cabin are now represented by Nulls / Cabin_mapped; drop the raw columns.
X_full = X_full.drop(columns=['Age', 'Cabin'])

In [47]:
# Mean fare among third-class passengers, used as the fill value for missing Fare.
# NOTE(review): presumably the row with the missing Fare is third class — confirm.
fare_mean = X_full.loc[X_full['Pclass'] == 3, 'Fare'].mean()

# Assign the filled column back. Calling fillna(inplace=True) on a column
# selection is deprecated and does not update the frame under Copy-on-Write.
X_full['Fare'] = X_full['Fare'].fillna(fare_mean)

In [48]:
# Re-check missing values per column after filling Fare.
X_full.isna().sum()

Pclass          0
Name            0
Sex             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        2
Nulls           0
Cabin_mapped    0
dtype: int64

In [49]:
# Show the rows whose port of embarkation is missing.
X_full.loc[X_full['Embarked'].isna()]

Unnamed: 0,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked,Nulls,Cabin_mapped
61,1,"Icard, Miss. Amelie",1,0,0,113572,80.0,,0,6
829,1,"Stone, Mrs. George Nelson (Martha Evelyn)",1,0,0,113572,80.0,,0,6


In [50]:
# Distribution of embarkation ports among first-class passengers.
X_full.loc[X_full['Pclass'].eq(1), 'Embarked'].value_counts()

S    177
C    141
Q      3
Name: Embarked, dtype: int64

In [51]:
# Replace missing Embarked values with 'S', the most common port among
# first-class passengers. Assign the result back: calling
# fillna(inplace=True) on a column selection is deprecated and does not
# update the frame under Copy-on-Write.
X_full['Embarked'] = X_full['Embarked'].fillna('S')

In [52]:
# Confirm that no missing values remain.
X_full.isna().sum()

Pclass          0
Name            0
Sex             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Embarked        0
Nulls           0
Cabin_mapped    0
dtype: int64

In [53]:
# Drop the free-text Name and Ticket columns from the feature frame.
X_full = X_full.drop(columns=['Name', 'Ticket'])

In [56]:
# Column dtypes before one-hot encoding.
X_full.dtypes

Pclass            int64
Sex              object
SibSp             int64
Parch             int64
Fare            float64
Embarked         object
Nulls             int32
Cabin_mapped      int64
dtype: object

In [57]:
# One-hot encode the categorical columns, dropping the first level of each.
# NOTE(review): the recorded output lists `Sex_1` next to `Sex_female` and
# `Sex_male`, so `Sex` held non-string values when this ran. No visible cell
# creates them — verify on a fresh kernel (Restart & Run All).
X_dummies = pd.get_dummies(
    data=X_full,
    columns=['Sex', 'Nulls', 'Cabin_mapped', 'Embarked'],
    drop_first=True,
)
X_dummies.dtypes

Pclass              int64
SibSp               int64
Parch               int64
Fare              float64
Sex_1               uint8
Sex_female          uint8
Sex_male            uint8
Nulls_1             uint8
Nulls_2             uint8
Cabin_mapped_1      uint8
Cabin_mapped_2      uint8
Cabin_mapped_3      uint8
Cabin_mapped_4      uint8
Cabin_mapped_5      uint8
Cabin_mapped_6      uint8
Cabin_mapped_7      uint8
Cabin_mapped_8      uint8
Embarked_Q          uint8
Embarked_S          uint8
dtype: object

# Train the models

In [58]:
# The first len(train) rows are the labelled passengers; the remainder are the
# test passengers to predict.
X = X_dummies.iloc[:len(train)]
new_X = X_dummies.iloc[len(train):]

# Target column from the original training frame.
y = train['Survived']

In [60]:
# Hold out 30% of the labelled rows for validation, stratified on the target
# and seeded so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    random_state=5,
    test_size=0.3,
)

In [62]:
# Random-forest baseline. random_state pins the bootstrap and feature sampling
# so the reported hold-out accuracy is the same on every run.
RF = RandomForestClassifier(random_state=5)
RF.fit(X_train, y_train)

# Mean accuracy on the held-out split.
RF.score(X_test, y_test)

0.7835820895522388

In [65]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)

In [66]:
xgb.score(X_test, y_test)

0.8097014925373134

In [68]:
# Logistic-regression baseline with default settings; LogisticRegression is
# already imported in the notebook's first cell.
# NOTE(review): warnings are silenced globally earlier in the notebook, so a
# convergence warning from this fit would go unnoticed — confirm it converges.
LG = LogisticRegression()
LG.fit(X_train, y_train)

# Mean accuracy on the held-out split.
LG.score(X_test, y_test)

0.7985074626865671

In [70]:
# Search space for the XGBoost classifier's hyper-parameters.
gbm_param_grid = {
    'n_estimators': range(8, 20),
    'max_depth': range(6, 10),
    'learning_rate': [.4, .45, .5, .55, .6],
    'colsample_bytree': [.6, .7, .8, .9, 1]
}

# Base classifier; n_estimators is overridden by each sampled candidate.
gbm = XGBClassifier(n_estimators=10)

# Randomized search: 50 sampled candidates, 4-fold cross-validation, scored by
# accuracy. random_state fixes which candidates are drawn, so the best
# parameters and score are reproducible across runs.
xgb_random = RandomizedSearchCV(param_distributions=gbm_param_grid,
                                estimator=gbm, scoring="accuracy",
                                verbose=1, n_iter=50, cv=4,
                                random_state=5)


# Fit the search on the full labelled training set.
xgb_random.fit(X, y)

# Report the best parameter combination and its mean cross-validated accuracy.
print("Best parameters found: ", xgb_random.best_params_)
print("Best accuracy found: ", xgb_random.best_score_)

Fitting 4 folds for each of 50 candidates, totalling 200 fits
Best parameters found:  {'n_estimators': 10, 'max_depth': 6, 'learning_rate': 0.55, 'colsample_bytree': 1}
Best accuracy found:  0.8193148305255928


In [71]:
# Predict survival for the unlabelled passengers with the refitted best model.
xgb_pred = xgb_random.best_estimator_.predict(new_X)

In [72]:
# Build the submission frame directly from the ids and the prediction array.
# Predictions are attached by position, so the result does not depend on the
# test frame's index lining up with a fresh RangeIndex.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Survived": xgb_pred,
})

In [73]:
# Label the two submission columns.
submission = submission.set_axis(["PassengerId", "Survived"], axis="columns")

In [74]:
# Write the submission file with a header row and no index column.
submission.to_csv(
    path_or_buf='titanic_submission.csv',
    index=False,
    header=True,
)