In [83]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression

from sklearn import svm
from sklearn.svm import SVC, LinearSVC

from sklearn import ensemble
from sklearn.neighbors import KNeighborsClassifier

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.linear_model import SGDClassifier

from sklearn import naive_bayes
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

import warnings
# Suppress every warning for the rest of the session. This also hides
# deprecation warnings emitted by pandas / seaborn / scikit-learn.
# NOTE(review): both calls install a catch-all 'ignore' filter, so the second
# one appears redundant — confirm before removing.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')


In [84]:
# Load both CSV files from the relative `data/` directory into DataFrames:
# `train` first, then `test`.
train, test = (
    pd.read_csv('data/train.csv'),
    pd.read_csv('data/test.csv'),
)

In [85]:
# Preview the first rows of the raw training frame (all original columns).
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [86]:
# Preview the first rows of the raw test frame; it has no `Survived` column.
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Performing Data Cleaning and Analysis
<!-- #### 1. Understanding meaning of each column: -->
<!-- <br>Data Dictionary: -->
<br>**Variable - Description**<br>
1. Survived	- Survived (1) or died (0)
2. Pclass -	Passenger’s class (1 = 1st, 2 = 2nd, 3 = 3rd)
3. Name	- Passenger’s name
4. Sex -	Passenger’s sex
5. Age	- Passenger’s age
6. SibSp -	Number of siblings/spouses aboard
7. Parch -	Number of parents/children aboard (Some children travelled only with a nanny, therefore parch=0 for them.)
8. Ticket -	Ticket number
9. Fare -	Fare
10. Cabin -	Cabin
11. Embarked -	Port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)


In [87]:
"""
Deleting the columns not required for determining the survival of a person
"""

# Remove the unused columns in a single call. `errors='ignore'` keeps this
# cell re-runnable: executing it a second time no longer raises KeyError for
# columns that have already been removed.
train = train.drop(
    columns=['PassengerId', 'Ticket', 'Fare', 'Cabin', 'Name'],
    errors='ignore',
)


In [88]:
# Check which columns remain in `train` after the column removal.
train.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [89]:
# (rows, columns) of the reduced training frame.
train.shape

(891, 7)

In [90]:
# Summary statistics for the numeric columns; a `count` lower than the row
# total indicates missing values in that column.
train.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
count,891.0,891.0,714.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594
std,0.486592,0.836071,14.526497,1.102743,0.806057
min,0.0,1.0,0.42,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0
50%,0.0,3.0,28.0,0.0,0.0
75%,1.0,3.0,38.0,1.0,0.0
max,1.0,3.0,80.0,8.0,6.0


In [91]:
# Number of missing values in each column of `train`.
train.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Embarked      2
dtype: int64

In [92]:
"""Since there are ONLY 2 rows whose Embarked data is not known,
therefore we can neglect those 2 rows as they will not make much of a difference"""


"""
Similarly, deleting the columns not required for determining the survival of a person for the data stored in test.csv
"""

# NOTE(review): the note at the top of this cell says the 2 rows with a missing
# `Embarked` value can be neglected, but no statement in this cell removes
# them from `train`.

# Remove the unused columns from the test frame; `PassengerId` is kept here.
# `errors='ignore'` keeps the cell re-runnable: a second execution no longer
# raises KeyError for columns that have already been removed.
test = test.drop(columns=['Ticket', 'Fare', 'Cabin', 'Name'], errors='ignore')

test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked
0,892,3,male,34.5,0,0,Q
1,893,3,female,47.0,1,0,S
2,894,2,male,62.0,0,0,Q
3,895,3,male,27.0,0,0,S
4,896,3,female,22.0,1,1,S


In [93]:
# Summary statistics for the numeric columns of the reduced test frame.
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch
count,418.0,418.0,332.0,418.0,418.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344
std,120.810458,0.841838,14.181209,0.89676,0.981429
min,892.0,1.0,0.17,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0
50%,1100.5,3.0,27.0,0.0,0.0
75%,1204.75,3.0,39.0,1.0,0.0
max,1309.0,3.0,76.0,8.0,9.0


In [94]:
# Number of missing values in each column of `test`.
test.isna().sum()

PassengerId     0
Pclass          0
Sex             0
Age            86
SibSp           0
Parch           0
Embarked        0
dtype: int64

In [95]:
# A new "Gender" column is derived from the "Sex" column using the helper
# below: 'male' is encoded as '1' and 'female' as '2'.

def getNum(str):
    """Translate a sex label into its gender code.

    Returns the string '1' for 'male' and the string '2' for 'female'.
    Any other value falls through and yields None.

    Note: the codes are strings, not integers, and the parameter name
    shadows the built-in `str` inside this function.
    """
    if str == 'male':
        return '1'
    return '2' if str == 'female' else None

# Encode `Sex` into a new `Gender` column on both frames first.
train['Gender'] = train['Sex'].apply(getNum)
test['Gender'] = test['Sex'].apply(getNum)

# Plain-text preview of `train` followed by a 50-character '#' divider; the
# `test` preview is the cell's final expression and is rendered as a table.
print(train.head(), "#"*50, sep='\n')
test.head()

   Survived  Pclass     Sex   Age  SibSp  Parch Embarked Gender
0         0       3    male  22.0      1      0        S      1
1         1       1  female  38.0      1      0        C      2
2         1       3  female  26.0      0      0        S      2
3         1       1  female  35.0      1      0        S      2
4         0       3    male  35.0      0      0        S      1
##################################################


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Embarked,Gender
0,892,3,male,34.5,0,0,Q,1
1,893,3,female,47.0,1,0,S,2
2,894,2,male,62.0,0,0,Q,1
3,895,3,male,27.0,0,0,S,1
4,896,3,female,22.0,1,1,S,2


In [96]:
# The text `Sex` column has been encoded into `Gender`, so remove it from both
# frames. `errors='ignore'` keeps the cell re-runnable: a second execution no
# longer raises KeyError once the column is gone.
train = train.drop(columns=['Sex'], errors='ignore')
test = test.drop(columns=['Sex'], errors='ignore')

## Survival rate by Pclass, Gender, SibSp and Parch


In [102]:
# Survival rate (mean of the 0/1 `Survived` flag) per passenger class,
# sorted from lowest to highest.
train.groupby('Pclass')[['Survived']].mean().sort_values('Survived')

Unnamed: 0_level_0,Survived
Pclass,Unnamed: 1_level_1
3,0.242363
2,0.472826
1,0.62963


### Insight:
    Passengers in 1st class had the highest survival rate, 62.96%, compared with the other classes.

In [103]:

# Survival rate (mean of the 0/1 `Survived` flag) per `Gender` code,
# sorted from lowest to highest.
train.groupby('Gender')[['Survived']].mean().sort_values('Survived')

Unnamed: 0_level_0,Survived
Gender,Unnamed: 1_level_1
1,0.188908
2,0.742038


## Insight
    Females (Gender = 2) had the higher survival rate, 74.20%, compared with their male counterparts (18.89%).

In [104]:
# Survival rate (mean of the 0/1 `Survived` flag) per `Parch` value,
# sorted from lowest to highest.
train.groupby('Parch')[['Survived']].mean().sort_values('Survived')

Unnamed: 0_level_0,Survived
Parch,Unnamed: 1_level_1
4,0.0
6,0.0
5,0.2
0,0.343658
2,0.5
1,0.550847
3,0.6


## Insight:
    Passengers travelling with 3 parents/children (Parch = 3) had the highest survival rate, at 60%.

In [105]:
# Survival rate (mean of the 0/1 `Survived` flag) per `SibSp` value,
# sorted from lowest to highest.
train.groupby('SibSp')[['Survived']].mean().sort_values('Survived')

Unnamed: 0_level_0,Survived
SibSp,Unnamed: 1_level_1
5,0.0
8,0.0
4,0.166667
3,0.25
0,0.345395
2,0.464286
1,0.535885


## Insight:
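    Passengers with 1 sibling/spouse aboard (SibSp = 1) had the highest survival rate, at 53.59%, while none of the passengers with 5 or 8 siblings/spouses survived.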

## Analyzing Data by Visualization

In order to understand who would have had a better probability of survival, we should visualize the passengers who survived based on age, passenger class, etc.


In [None]:
# Bar chart of the number of passengers in each `Survived` class.
# The column is named via `x=` with `data=` because seaborn expects plot
# variables as keyword arguments; a lone positional argument is not
# interpreted as `x` in current releases.
sns.countplot(x='Survived', data=train)

In [None]:
# One histogram of `Age` per value of `Survived`, side by side in columns.
age_hist = sns.FacetGrid(train, col='Survived').map(plt.hist, 'Age')
# Label the y-axes; this stays the final expression so the cell output is unchanged.
age_hist.set_ylabels('Number')