In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics

In [2]:
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

## Data pre-processing/cleaning

In [3]:
#Removing un-necessary attributes
test = test.drop(["Name", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)
train = train.drop(["Name", "Ticket", "Fare", "Cabin", "Embarked"], axis=1)

In [4]:
#Getting Nan values
pd.isnull(test).sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
dtype: int64

In [5]:
#Getting Nan values
pd.isnull(train).sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
dtype: int64

In [6]:
#Handling null values. 
#Age is one of the survival factors we will fill median values in it. 
test["Age"] = test["Age"].fillna(test['Age'].median())
train["Age"] = train["Age"].fillna(test['Age'].median())

In [7]:
#Feature Engineering on Siblings and Parents
def f(row):
    if row['SibSp'] > 0 and row['Parch'] > 0:
        val = 'Joint family'
    elif row['SibSp'] == 0 and row['Parch'] == 0:
        val = 'Single'
    else:
        val = 'Family'
    return val

In [8]:
train['Family'] = train.apply(f, axis =1)
test['Family'] = test.apply(f, axis =1)

In [9]:
#dropping Sib and Parents
train = train.drop(["SibSp", "Parch"], axis=1)
test = test.drop(["SibSp", "Parch"], axis=1)

In [10]:
train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Family
0,1,0,3,male,22.0,Family
1,2,1,1,female,38.0,Family
2,3,1,3,female,26.0,Single
3,4,1,1,female,35.0,Family
4,5,0,3,male,35.0,Single
...,...,...,...,...,...,...
886,887,0,2,male,27.0,Single
887,888,1,1,female,19.0,Single
888,889,0,3,female,27.0,Joint family
889,890,1,1,male,26.0,Single


## Data Analysis

### Survival as per Gender(Sex)

In [11]:
gender_survival = train[['Sex', 'Survived']]
gender_survival

Unnamed: 0,Sex,Survived
0,male,0
1,female,1
2,female,1
3,female,1
4,male,0
...,...,...
886,male,0
887,female,1
888,female,0
889,male,1


In [12]:
gender_survival.groupby(['Sex', 'Survived']).Survived.count()

Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64

### Survival by Pclass

In [13]:
Class_survival = train[['Pclass', 'Survived']]
Class_survival

Unnamed: 0,Pclass,Survived
0,3,0
1,1,1
2,3,1
3,1,1
4,3,0
...,...,...
886,2,0
887,1,1
888,3,0
889,1,1


In [14]:
Class_survival.groupby(['Pclass', 'Survived']).Survived.count()

Pclass  Survived
1       0            80
        1           136
2       0            97
        1            87
3       0           372
        1           119
Name: Survived, dtype: int64

### Survival by Family

In [15]:
Family_Survival = train[['Family', 'Survived']]
Family_Survival

Unnamed: 0,Family,Survived
0,Family,0
1,Family,1
2,Single,1
3,Family,1
4,Single,0
...,...,...
886,Single,0
887,Single,1
888,Joint family,0
889,Single,1


In [16]:
Family_Survival.groupby(['Family', 'Survived']).Survived.count()

Family        Survived
Family        0            95
              1           117
Joint family  0            80
              1            62
Single        0           374
              1           163
Name: Survived, dtype: int64

In [17]:
from sklearn.preprocessing import LabelEncoder

sex_encoder = LabelEncoder()
sex_encoder.fit(train['Sex'])
sex_encoder.fit(test['Sex'])
new_train = sex_encoder.transform(train['Sex'])
new_test = sex_encoder.transform(test['Sex'])
train['Sex'] = new_train
test['Sex'] = new_test

family_encoder = LabelEncoder()
family_encoder.fit(train['Family'])
family_encoder.fit(test['Family'])
new_tr = family_encoder.transform(train['Family'])
new_ts = family_encoder.transform(test['Family'])
train['Family'] = new_tr
test['Family'] = new_ts

train

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Family
0,1,0,3,1,22.0,0
1,2,1,1,0,38.0,0
2,3,1,3,0,26.0,2
3,4,1,1,0,35.0,0
4,5,0,3,1,35.0,2
...,...,...,...,...,...,...
886,887,0,2,1,27.0,2
887,888,1,1,0,19.0,2
888,889,0,3,0,27.0,1
889,890,1,1,1,26.0,2


In [18]:
y = train.Survived

In [19]:
features = ['Pclass', 'Sex', 'Age', 'Family']
X = train[features]

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test= train_test_split(X,y,test_size=0.2 , random_state=1)

In [22]:
classifier = LogisticRegression(random_state = 1)
classifier.fit(X_train, y_train)
print("Logistic Regression score: ", classifier.score(X_test,y_test))

Logistic Regression score:  0.7877094972067039


In [23]:
y_lpred=classifier.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_lpred))

Accuracy: 0.7877094972067039


Types of Naive Bayes Classifiers
Naive Bayes Classifiers are classified into three categories —

i) Gaussian Naive Bayes

This classifier is employed when the predictor values are continuous and are expected to follow a Gaussian distribution.

ii) Bernoulli Naive Bayes

When the predictors are boolean in nature and are supposed to follow the Bernoulli distribution, this classifier is utilized.

iii) Multinomial Naive Bayes

This classifier makes use of a multinomial distribution and is often used to solve issues involving document or text classification.

In [24]:
 from sklearn.naive_bayes import GaussianNB

In [25]:
nb = GaussianNB()
nb.fit(X_train, y_train)

GaussianNB()

In [26]:
print("Naive Bayes score: ",nb.score(X_test, y_test))

Naive Bayes score:  0.7821229050279329


In [27]:
y_nbpred=nb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_nbpred))

Accuracy: 0.7821229050279329


In [28]:
from sklearn.tree import DecisionTreeClassifier
model=DecisionTreeClassifier(criterion="entropy")
model.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy')

In [29]:
print("Decision Tree accuracy: ",model.score(X_test, y_test))

Decision Tree accuracy:  0.7374301675977654


In [30]:
y_mpred=model.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_mpred))

Accuracy: 0.7374301675977654


In [31]:
# Training the K-NN model on the Training set
from sklearn.neighbors import KNeighborsClassifier
knnclassifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knnclassifier.fit(X_train, y_train)

print("KNN score: ",knnclassifier.score(X_test, y_test))

KNN score:  0.7206703910614525


In [32]:
y_knnpred=y_knnpred=knnclassifier.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_knnpred))

Accuracy: 0.7206703910614525


In [33]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, y_train)

SVC(kernel='linear')

In [34]:
print("Support Vector Machine score: ",clf.score(X_test, y_test))

Support Vector Machine score:  0.776536312849162


In [35]:
y_svmpred=clf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_svmpred))

Accuracy: 0.776536312849162


In [36]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
rclf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
rclf.fit(X_train,y_train)

print("Random Forest score: ",rclf.score(X_test, y_test))

Random Forest score:  0.7374301675977654


In [37]:
y_rfpred=rclf.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_rfpred))

Accuracy: 0.7374301675977654


In [38]:
print("Logistic Regression score %: ", classifier.score(X_test,y_test)*100)
print("Naive Bayes score %: ",nb.score(X_test, y_test)*100)
print("Decision Tree score %: ",model.score(X_test, y_test)*100)
print("KNN score %: ",knnclassifier.score(X_test, y_test)*100)
print("Support Vector Machine score %: ",clf.score(X_test, y_test)*100)
print("Random Forest score: ",rclf.score(X_test, y_test)*100)

Logistic Regression score %:  78.77094972067039
Naive Bayes score %:  78.2122905027933
Decision Tree score %:  73.74301675977654
KNN score %:  72.06703910614524
Support Vector Machine score %:  77.6536312849162
Random Forest score:  73.74301675977654


Highest score of Logistic Regression, so using it. 

In [39]:
test.columns

Index(['PassengerId', 'Pclass', 'Sex', 'Age', 'Family'], dtype='object')

In [40]:
ptest = test[['Pclass', 'Sex', 'Age', 'Family']]

In [41]:
prediction = classifier.predict(ptest)

In [42]:
final_pred = pd.DataFrame(prediction)

In [43]:
sub = pd.DataFrame()
sub['PassengerId']= test['PassengerId']
sub['Survival'] = final_pred
sub.to_csv('SurvivalPrediction.csv',index=False)