# Titanic; Logistic Regression 

In [1]:
#importing libraries 
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
#import datasets and study it 
train = pd.read_csv("train.csv")
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
trainset_len = len(train)

In [4]:
test = pd.read_csv("test.csv")
test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


save PassengerId and drop from the datasets since it does not have any effect on the data and it will be used later on when saving the prediction output 

In [5]:
PassengerId = test["PassengerId"]

drop passengerids in both datasets 

In [6]:
train = train.drop("PassengerId", axis = 1)
test = test.drop("PassengerId", axis  = 1)

the main aim of this is to predict the survival on the titanic. therefore survival is the target varaible. First, performing summary statistics.

In [7]:
train["Survived"].describe()

count    891.000000
mean       0.383838
std        0.486592
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        1.000000
Name: Survived, dtype: float64

From the summary statistics performed, it can be seen that the mean of the survived is 0.383838 with mean of zero and maximum of 1. It is also known that, females and children have a higher chances of surviving so as the higher class. 
Finding correlation of the other variables and the target variable to know the effect they have on the target variable

In [8]:
#finding the relationship between the survived and other variables 
correlation = train.corr()
correlation.sort_values(["Survived"])
correlation.Survived

Survived    1.000000
Pclass     -0.338481
Age        -0.077221
SibSp      -0.035322
Parch       0.081629
Fare        0.257307
Name: Survived, dtype: float64

from the above, it can be seen that Pclass has higher correlation to survived compared to the rest. therefore the higher the class(1), the higher the change of surviving(1). that is, if the class has a high number 3, the lower the chances of surviving 

In [9]:
#concating the trainset and the testset 
data = pd.concat((train, test))

In [10]:
data.tail()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
413,,,S,8.05,"Spector, Mr. Woolf",0,3,male,0,,A.5. 3236
414,39.0,C105,C,108.9,"Oliva y Ocana, Dona. Fermina",0,1,female,0,,PC 17758
415,38.5,,S,7.25,"Saether, Mr. Simon Sivertsen",0,3,male,0,,SOTON/O.Q. 3101262
416,,,S,8.05,"Ware, Mr. Frederick",0,3,male,0,,359309
417,,,C,22.3583,"Peter, Master. Michael J",1,3,male,1,,2668


In [11]:
#identifying missing values 
data.isnull().values.any()

True

In [12]:
#check for missing values
data.isnull().sum()

Age          263
Cabin       1014
Embarked       2
Fare           1
Name           0
Parch          0
Pclass         0
Sex            0
SibSp          0
Survived     418
Ticket         0
dtype: int64

In [13]:
len(data)

1309

out of 1309 entires, there 1014 missing cabin values, imputing this variable and using it in the prediction must not be advisable. Hence it will be ignored and dropped.

In [14]:
dataset = data.drop("Cabin", axis = 1)

for the age, median would be used to impute the missing values

In [15]:
median = dataset["Age"].median()
dataset["Age"].fillna(median, inplace = True)

Same would be done for fare 

In [16]:
median = dataset["Fare"].median()
dataset["Fare"].fillna(median, inplace = True)

for embarked, the mode would be used to fill out the missing values 

In [17]:
dataset["Embarked"].mode()

0    S
dtype: object

In [18]:
dataset["Embarked"].fillna("S", inplace = True)

In [19]:
#checking if all the missing values have been imputed 
dataset.isnull().values.any()

True

In [20]:
dataset.sample(5)

Unnamed: 0,Age,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
6,54.0,S,51.8625,"McCarthy, Mr. Timothy J",0,1,male,0,0.0,17463
689,15.0,S,211.3375,"Madill, Miss. Georgette Alexandra",1,1,female,0,1.0,24160
610,39.0,S,31.275,"Andersson, Mrs. Anders Johan (Alfrida Konstant...",5,3,female,1,0.0,347082
337,41.0,C,134.5,"Burns, Miss. Elizabeth Margaret",0,1,female,0,1.0,16966
94,59.0,S,7.25,"Coxon, Mr. Daniel",0,3,male,0,0.0,364500


In [21]:
#finding out which sex has the higher chances of suviving 
dataset.groupby(["Sex"]).mean()

Unnamed: 0_level_0,Age,Fare,Parch,Pclass,SibSp,Survived
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,28.572082,46.198097,0.633047,2.154506,0.652361,0.742038
male,30.017888,26.140721,0.247924,2.372479,0.413998,0.188908


In [22]:
#finding which class has the hugher chances of surviving 
dataset.groupby(["Pclass"]).mean()

Unnamed: 0_level_0,Age,Fare,Parch,SibSp,Survived
Pclass,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,37.812446,87.508992,0.365325,0.436533,0.62963
2,29.419675,21.179196,0.368231,0.393502,0.472826
3,25.750353,13.304513,0.400564,0.568406,0.242363


the upper class(1st) has the higher chances of surviving. It is inline with the above correlation found. where by when the class is lower that is 3, the chances of surviving it less.

from the above, females have a higher chances of surviving as compared to males with a survival rate of 0.74

In [23]:
#creating a new variable to tell if the passenger was traveling alone or with either a parent or a sibling 
dataset["Alone"]=np.where((dataset["SibSp"]+dataset["Parch"]) >0, 0, 1)

In [24]:
#finding out the chances of surviving if the passenger traveled alone 
dataset.groupby(["Alone"]).mean()

Unnamed: 0_level_0,Age,Fare,Parch,Pclass,SibSp,Survived
Alone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,27.798979,50.831801,0.971098,2.142582,1.258189,0.50565
1,30.622785,21.750932,0.0,2.394937,0.0,0.303538


there is a 50.6% chances that the passenger will survival if traveling alone.

In [25]:
#creating a new feature using the titles of the individual 
dataset["Title"] = dataset.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
dataset.head()

Unnamed: 0,Age,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Alone,Title
0,22.0,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171,0,Mr
1,38.0,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599,0,Mrs
2,26.0,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282,1,Miss
3,35.0,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803,0,Mrs
4,35.0,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,373450,1,Mr


In [26]:
#making some adjustments to the titles 
for i in dataset:
    dataset["Title"] = dataset["Title"].replace("Mlle", "Miss")
    dataset["Title"] = dataset["Title"].replace("Ms", "Miss")
    dataset["Title"] = dataset["Title"].replace("Mme", "Mrs")
    dataset["Title"] = dataset["Title"].replace(["Lady", "Sir", "Countess", "Jonkheer"], "Royalty")
    dataset["Title"] = dataset["Title"].replace(["Capt", "Col", "Don", "Dona", "Major", 
                                                 "Dr","Rev", "Master"], "High rank")

In [27]:
#finding out the chances of surviving based on the title of the individual 
dataset[["Title","Survived"]].groupby(["Title"]).mean()

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
High rank,0.474576
Miss,0.702703
Mr,0.156673
Mrs,0.793651
Royalty,0.75


Royalties have higher chances of surviving as compared to the other titles.

In [28]:
#dropping the name column and the ticket column
dataset = dataset.drop(["Name", "Ticket"], axis = 1)
dataset.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Alone,Title
0,22.0,S,7.25,0,3,male,1,0.0,0,Mr
1,38.0,C,71.2833,0,1,female,1,1.0,0,Mrs
2,26.0,S,7.925,0,3,female,0,1.0,1,Miss
3,35.0,S,53.1,0,1,female,1,1.0,0,Mrs
4,35.0,S,8.05,0,3,male,0,0.0,1,Mr


In [29]:
#age is a categorical data not numerical in this case. therefore, grouping the into children, youngster ... and 
#converting them into proper numerical values 
dataset["Age"].astype(int)
dataset.loc[ dataset['Age'] <= 11, 'Age'] = 0                             #Children
dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1       #Teens
dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2       #Youngsters
dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3      #Young Adults
dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4      #Adults
dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5      #Middle Age
dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6      #Senior
dataset.loc[ dataset['Age'] > 66, 'Age'] = 7                             #Retired

In [30]:
#for fare
dataset.loc[ dataset['Fare'] <= 7.91, 'Fare'] = 0                                 #Extremely low
dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454), 'Fare'] = 1  #very low
dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31), 'Fare']   = 2   #low
dataset.loc[(dataset['Fare'] > 31) & (dataset['Fare'] <= 99), 'Fare']   = 3        #high
dataset.loc[(dataset['Fare'] > 99) & (dataset['Fare'] <= 250), 'Fare']   = 4       #very high
dataset.loc[ dataset['Fare'] > 250, 'Fare'] = 5                                     #extremely high

In [31]:
dataset.head()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Alone,Title
0,2.0,S,0.0,0,3,male,1,0.0,0,Mr
1,5.0,C,3.0,0,1,female,1,1.0,0,Mrs
2,3.0,S,1.0,0,3,female,0,1.0,1,Miss
3,5.0,S,3.0,0,1,female,1,1.0,0,Mrs
4,5.0,S,1.0,0,3,male,0,0.0,1,Mr


In [32]:
dataset = dataset.drop("Survived", axis = 1)

Changing categorical data into numerical data

In [33]:
dataset=pd.get_dummies(dataset, columns=["Pclass","Embarked","Sex", "Title", "Age", "Fare"])

drop one of the columns for sex and rename the other column since it is depicts the sex of male and female 

In [34]:
dataset.head()

Unnamed: 0,Parch,SibSp,Alone,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S,Sex_female,...,Age_4.0,Age_5.0,Age_6.0,Age_7.0,Fare_0.0,Fare_1.0,Fare_2.0,Fare_3.0,Fare_4.0,Fare_5.0
0,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
1,0,1,0,1,0,0,1,0,0,1,...,0,1,0,0,0,0,0,1,0,0
2,0,0,1,0,0,1,0,0,1,1,...,0,0,0,0,0,1,0,0,0,0
3,0,1,0,1,0,0,0,0,1,1,...,0,1,0,0,0,0,0,1,0,0
4,0,0,1,0,0,1,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0


## Fitting the Model

In [35]:
y = train["Survived"]
x_train = dataset[:trainset_len]
x_test = dataset[trainset_len:]

In [36]:
from sklearn import linear_model
logistic = linear_model.LogisticRegression()
logistic.fit(x_train, y)
y_pred_log = logistic.predict(x_test)
R_squared = logistic.score(x_train, y)
print("score of R_squared :", R_squared)

score of R_squared : 0.8260381593714927




In [37]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf.fit(x_train, y)
y_pred_rf = rf.predict(x_test)
R_squared = rf.score(x_train, y)
print("score of R_squared :", R_squared)

score of R_squared : 0.9079685746352413




In [38]:
# Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(x_train, y)
y_pred_gbc = gbc.predict(x_test)
R_squared = gbc.score(x_train, y)
print("score of R_squared :", R_squared)

score of R_squared : 0.867564534231201


In [39]:
#decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train, y)
y_pred_dt = dt.predict(x_test)
R_squared = dt.score(x_train, y)
print("score of R_squared :", R_squared)

score of R_squared : 0.9124579124579124


In [40]:
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
adaboost = AdaBoostClassifier()
adaboost.fit(x_train, y)
y_pred_adaboost= adaboost.predict(x_test)
R_squared = adaboost.score(x_train, y)
print("score of R_squared :", R_squared)

score of R_squared : 0.8237934904601572


In [41]:
from sklearn.ensemble import BaggingClassifier
bag = BaggingClassifier()
bag.fit(x_train, y)
y_pred_bag = bag.predict(x_test)
R_squared = bag.score(x_train, y)
print("score of R_squared :", R_squared)

score of R_squared : 0.9068462401795735


In [42]:
from sklearn.linear_model import RidgeClassifier
rcl = RidgeClassifier()
rcl.fit(x_train, y)
y_pred_rcl = rcl.predict(x_test)
R_squared = rcl.score(x_train, y)
print("score of R_squared :", R_squared)

score of R_squared : 0.8215488215488216


In [43]:
from sklearn.ensemble import ExtraTreesClassifier
ext = ExtraTreesClassifier()
ext.fit(x_train, y)
y_pred_ext = ext.predict(x_test)
R_squared = ext.score(x_train, y)
print("score of R_squared :", R_squared)

score of R_squared : 0.9124579124579124




In [45]:
#Survived