# Titanic Survival Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the Titanic CSV that the rest of the notebook works on.
DATA_URL = "https://raw.githubusercontent.com/sumathi16/Datasets/master/titanic.csv"
df = pd.read_csv(filepath_or_buffer=DATA_URL)
# Preview the first rows to confirm the load worked.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Derive a Family column as the element-wise sum of the SibSp and Parch columns.
df['Family'] = df['SibSp'].add(df['Parch'])
# Show the first two rows to check the new column.
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,1


In [None]:
# Ticket --> treated as an arbitrary identifier, so it is left out of the analysis.
# Re-assign instead of using inplace=True, and ignore an already-missing column, so the
# cell can be re-run without raising KeyError once Ticket has been removed.
df = df.drop(columns=['Ticket'], errors='ignore')

In [None]:
## Fraction of missing values per column, largest first (this only reports; nothing is imputed here)
df.isna().mean().sort_values(ascending=False)

Cabin          0.771044
Age            0.198653
Embarked       0.002245
Family         0.000000
Fare           0.000000
Parch          0.000000
SibSp          0.000000
Gender         0.000000
Name           0.000000
Pclass         0.000000
Survived       0.000000
PassengerId    0.000000
dtype: float64

In [None]:
## Share of missing entries in each column, sorted from most to least missing
missing_share = df.isna().mean()
missing_share.sort_values(ascending=False)

Cabin          0.771044
Age            0.198653
Embarked       0.002245
Family         0.000000
Fare           0.000000
Parch          0.000000
SibSp          0.000000
Gender         0.000000
Name           0.000000
Pclass         0.000000
Survived       0.000000
PassengerId    0.000000
dtype: float64

In [None]:
# Median imputation: NaN entries in Age are replaced with the median of the observed ages.
# The result is assigned back to the column. Calling fillna(..., inplace=True) on the
# attribute accessor df.Age is chained assignment, which pandas warns about and which
# leaves df unchanged once Copy-on-Write is active.
df['Age'] = df['Age'].fillna(df['Age'].median())
# Number of missing ages that remain.
df['Age'].isnull().sum()

0

In [None]:
## Imputing the Embarked column ---> mode imputation (most frequent value)
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on df.Embarked: that chained in-place call triggers a pandas
# warning and does not modify df once Copy-on-Write is active.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
# Number of missing Embarked values that remain.
df['Embarked'].isnull().sum()

0

In [None]:
# Missing-value count per column, checked after the Age and Embarked imputation cells.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           0
Age              0
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         0
Family           0
dtype: int64

In [None]:
# Inspect the dtype of every column.
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Gender          object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Cabin           object
Embarked        object
Family           int64
dtype: object

In [None]:
## Remove PassengerId
# Re-assign instead of using inplace=True, and ignore an already-missing column, so
# re-running this cell does not raise KeyError.
df = df.drop(columns=['PassengerId'], errors='ignore')

In [None]:
# Remove the Name column. Re-assigning (rather than inplace=True) with errors='ignore'
# keeps the cell safe to re-run after the column is already gone.
df = df.drop(columns=['Name'], errors='ignore')

In [None]:
# Number of distinct values in each column.
df.nunique()

Survived      2
Pclass        3
Gender        2
Age          88
SibSp         7
Parch         7
Fare        248
Cabin       147
Embarked      3
Family        9
dtype: int64

In [None]:
## Separating the input and output data: the inputs are every column except Survived
X = df.drop(columns=['Survived'])
X.shape

(891, 9)

In [None]:
# One-hot encode the non-numeric columns of X.
# NOTE(review): Cabin has 147 distinct values and is still mostly missing at this point,
# so it accounts for most of the new columns — consider dropping or grouping it.
X = pd.get_dummies(data=X)
X.shape

(891, 158)

In [None]:
## Output column
# Select the target by name: a positional df.iloc[:, 0] only returns Survived as long as
# every column ahead of it has already been dropped.
y = df['Survived']
y.shape

(891,)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the same rows
# land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=23,
)

In [None]:
# Print the shape of each split in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(668, 158)
(223, 158)
(668,)
(223,)


In [None]:
from sklearn.linear_model import LogisticRegression

# With the default iteration cap the solver stopped before converging on these features
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the cap is raised as the warning advises.
# Scaling the inputs is the other remedy the warning suggests.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
# Predictions on the training rows, scored in the next cell.
y_train_pred = log_reg.predict(X_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
## accuracy_score ---> fraction of predictions that match the true labels; used here as
## the overall performance measure of the model on the training rows
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_train, y_pred=y_train_pred)

0.8203592814371258

In [None]:
# AdaBoost with its default settings
from sklearn.ensemble import AdaBoostClassifier

# Build the classifier and train it in one step (fit returns the estimator itself)
adc = AdaBoostClassifier().fit(X_train, y_train)
# Predict on the held-out test rows
y_pred = adc.predict(X_test)

In [None]:
# Test accuracy. Arguments follow the (y_true, y_pred) order of the signature; accuracy is
# symmetric so the value is unchanged, but the order matters for other metrics.
accuracy_score(y_test, y_pred)

0.7937219730941704

In [None]:
# Training accuracy of the fitted AdaBoost model.
accuracy_score(y_true=y_train, y_pred=adc.predict(X_train))

0.8532934131736527

In [None]:
# Display the fitted classifier's repr, which lists its hyper-parameters.
adc

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

# Try it with different parameters
Example: learning_rate = 0.5, n_estimators = 25

In [None]:
# Importing the package
from sklearn.ensemble import AdaBoostClassifier
# Instance creation with explicit hyper-parameters
adc = AdaBoostClassifier(learning_rate=1, n_estimators=75)
# Train the model
adc.fit(X_train, y_train)
# Prediction on the held-out test rows
y_pred = adc.predict(X_test)
# accuracy_score takes (y_true, y_pred); both calls now use that order. Accuracy is
# symmetric, so the printed values do not change.
print("test accuarcy", accuracy_score(y_test, y_pred))
print("train accuarcy", accuracy_score(y_train, adc.predict(X_train)))

test accuarcy 0.8116591928251121
train accuarcy 0.8622754491017964


# With base learner

In [None]:
# Importing the packages
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# Instance creation with a logistic-regression base estimator
# NOTE(review): a default LogisticRegression hit its iteration limit on these features
# earlier in the notebook — consider raising max_iter on the base learner as well.
adc = AdaBoostClassifier(base_estimator=LogisticRegression())
# Train the model
adc.fit(X_train, y_train)
# Prediction on the held-out test rows
y_pred = adc.predict(X_test)
# accuracy_score takes (y_true, y_pred); both calls now use that order. Accuracy is
# symmetric, so the printed values do not change.
print("test accuarcy", accuracy_score(y_test, y_pred))
print("train accuarcy", accuracy_score(y_train, adc.predict(X_train)))

test accuarcy 0.7623318385650224
train accuarcy 0.8083832335329342


In [None]:
# Import the algorithm
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with its default settings
gdc = GradientBoostingClassifier()
gdc.fit(X_train, y_train)
# Prediction on the held-out test rows
y_pred = gdc.predict(X_test)
# accuracy_score takes (y_true, y_pred); both calls now use that order. Accuracy is
# symmetric, so the printed values do not change.
print("test accuarcy", accuracy_score(y_test, y_pred))
print("train accuarcy", accuracy_score(y_train, gdc.predict(X_train)))

test accuarcy 0.8340807174887892
train accuarcy 0.8967065868263473


# Scores with different parameters
- Sai Rohit (learning_rate=1)
  - test accuracy: 0.8116591928251121
  - train accuracy: 0.9835329341317365
- Venkat
  - train accuracy: 0.905688622754491
  - test accuracy: 0.8026905829596412
- SAI (n_estimators=40)
  - test accuracy: 0.820627802690583
  - train accuracy: 0.8622754491017964

In [None]:
### XGBoost

In [None]:
pip install xgboost

Collecting xgboost
  Using cached xgboost-1.1.1-py3-none-win_amd64.whl (54.4 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
from xgboost import XGBClassifier
# The name xgb holds the classifier (the display cell below relies on it), so the module
# is not also imported under that alias, where it would be shadowed immediately.
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
# Make predictions on the held-out test rows
preds = xgb.predict(X_test)
# accuracy_score takes (y_true, y_pred); both calls now use that order. Accuracy is
# symmetric, so the printed values do not change.
print("test accuarcy", accuracy_score(y_test, preds))
print("train accuarcy", accuracy_score(y_train, xgb.predict(X_train)))

test accuarcy 0.7937219730941704
train accuarcy 0.9730538922155688


In [None]:
# Display the fitted XGBoost classifier's repr with its parameter values.
xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)