![](https://blog.socialcops.com/wp-content/uploads/2016/07/OG-MachineLearning-Python-Titanic-Kaggle.png)

In [305]:
import numpy as np
import pandas as pd

import  matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")

In [306]:
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [307]:
df_train.drop(['Name'],1,inplace =True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,male,35.0,0,0,373450,8.05,,S


In [308]:
df_test.drop(["Name"],1, inplace =True)
df_test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,male,34.5,0,0,330911,7.8292,,Q
1,893,3,female,47.0,1,0,363272,7.0,,S
2,894,2,male,62.0,0,0,240276,9.6875,,Q
3,895,3,male,27.0,0,0,315154,8.6625,,S
4,896,3,female,22.0,1,1,3101298,12.2875,,S


In [309]:
print("Shape Of Training Data:", df_train.shape)
print("Shape Of Test Data :", df_test.shape)

Shape Of Training Data: (891, 11)
Shape Of Test Data : (418, 10)


In [310]:
print("Missing Values In Training Dataset: ", df_train.isnull().sum())
print("-"*50)
print("Missing Values In Test Dataset : ", df_test.isnull().sum())

Missing Values In Training Dataset:  PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
--------------------------------------------------
Missing Values In Test Dataset :  PassengerId      0
Pclass           0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


#### Removing Missing Values In "Embarked"
> Data can have missing values for a number of reasons such as observations that were not recorded and data corruption.
Handling missing data is important as many machine learning algorithms do not support data with missing values.

In [311]:
df_train.drop(['Cabin'], 1 ,inplace =True)
df_test.drop(['Cabin'],1 , inplace =True)

In [312]:
df_train.drop(['Ticket'], 1, inplace = True)
df_test.drop(['Ticket'], 1, inplace = True)

In [313]:
df_train.dropna(subset=['Embarked'],inplace =True)
df_test.dropna(subset=['Fare'],inplace =True)

In [314]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [315]:
df_train['Age'].describe()

count    712.000000
mean      29.642093
std       14.492933
min        0.420000
25%       20.000000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [316]:
df_train.groupby('Survived')['Survived'].value_counts()

Survived  Survived
0         0           549
1         1           340
Name: Survived, dtype: int64

In [317]:
df_train.groupby('Survived')['Age'].mean()

Survived
0    30.626179
1    28.193299
Name: Age, dtype: float64

In [318]:
df_train.groupby(['Survived', 'Sex'])['Age'].mean()

Survived  Sex   
0         female    25.046875
          male      31.618056
1         female    28.630769
          male      27.276022
Name: Age, dtype: float64

In [319]:
df_train.groupby(['Survived','Sex','Pclass'])['Age'].mean()

Survived  Sex     Pclass
0         female  1         25.666667
                  2         36.000000
                  3         23.818182
          male    1         44.581967
                  2         33.369048
                  3         27.255814
1         female  1         34.562500
                  2         28.080882
                  3         19.329787
          male    1         36.248000
                  2         16.022000
                  3         22.274211
Name: Age, dtype: float64

In [320]:
#Training Data
age_groupby_train = df_train.groupby(['Survived','Sex','Embarked','Pclass'])['Age'].mean()

In [321]:
#Test Data
age_groupby_test = df_test.groupby(['Sex','Embarked','Pclass'])['Age'].mean()

## Train Data Age Missing Values Replacement By Mean Values Basis On
* Sex
* Embarked
* Pclass

In [322]:
df_train['Age'].fillna(value = -1,inplace =True)

In [323]:
for row in range(len(age_groupby_train.index)):
    df_train.loc[(df_train['Survived'] == age_groupby_train.index[row][0]) &
           (df_train['Sex']== age_groupby_train.index[row][1]) &
           (df_train['Embarked']== age_groupby_train.index[row][2])&
           (df_train['Pclass']== age_groupby_train.index[row][3])&
           (df_train['Age']==-1),'Age']=age_groupby_train.values[row]

## Test Data Age Missing Values Replacement By Mean Values Basis On
* Sex
* Embarked
* Pclass

In [324]:
df_test['Age'].fillna(value = -1,inplace =True)

In [325]:
for row in range(len(age_groupby_test.index)):
    df_test.loc[(df_test['Sex']== age_groupby_test.index[row][0]) &
           (df_test['Embarked']== age_groupby_test.index[row][1])&
           (df_test['Pclass']== age_groupby_test.index[row][2])&
           (df_test['Age']==-1),'Age']=age_groupby_test.values[row]

In [326]:
print("Missing Values of Age in Training Dataset :", df_train.isnull().sum() )
print("Missing Values of Age in Test Dataset :", df_test.isnull().sum() )

Missing Values of Age in Training Dataset : PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64
Missing Values of Age in Test Dataset : PassengerId    0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64


In [327]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


### Label Enconder 

In [328]:
from sklearn.preprocessing import LabelEncoder

### Label Encoding

###### Male   ->  1

###### Female ->  0

In [329]:
le = LabelEncoder()
le.fit(df_train["Sex"])
df_train.loc[:,'Sex'] = le.transform(df_train['Sex'])

le.fit(df_train["Embarked"])
df_train.loc[:,'Embarked'] = le.transform(df_train['Embarked'])

le = LabelEncoder()
le.fit(df_test["Sex"])
df_test.loc[:,'Sex'] = le.transform(df_test['Sex'])

le.fit(df_test["Embarked"])
df_test.loc[:,'Embarked'] = le.transform(df_test['Embarked'])

In [330]:
le = LabelEncoder()
le.fit(df_test["Sex"])
df_test.loc[:,'Sex'] = le.transform(df_test['Sex'])

In [331]:
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


### Split Data In X ,y 

In [332]:
train_y = df_train["Survived"]

In [333]:
df_train.drop(['Survived'],1,inplace =True)

In [334]:
train_x = df_train

In [335]:
from sklearn.linear_model import LogisticRegression

In [336]:
lr = LogisticRegression()

In [341]:
lr.fit(train_x,train_y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [346]:
preds_y = lr.predict(df_test)

In [357]:
PassengerId =df_test['PassengerId']

In [358]:
submission = pd.DataFrame({'PassengerId':PassengerId,'Survived':preds_y})

#Visualize the first 5 rows
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [359]:
filename = 'Titanic Predictions 1.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: Titanic Predictions 1.csv
