In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed.
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Plotting libraries used by the charts further down; figures are rendered inline in the notebook.
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Input data files are available in the "../input/" directory.
# Running this cell (by clicking run or pressing Shift+Enter) lists the files in that directory.
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Read the training data (the file that contains the Survived column) and preview its first five rows.
train = pd.read_csv(filepath_or_buffer="../input/train.csv")
train.head(n=5)

In [None]:
# Read the data set that will be scored at the end of the notebook and preview its first five rows.
test_df = pd.read_csv(filepath_or_buffer="../input/test.csv")
test_df.head(n=5)

In [None]:
# Response variable "Survived": absolute class counts (left panel) and class shares (right panel).
fig, axarr = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))

survived = train['Survived']
class_counts = survived.value_counts()
class_counts.plot.bar(ax=axarr[0])
(class_counts / len(survived)).plot.bar(ax=axarr[1])


In [None]:
# Print the describe() summary of the training frame, then the number of missing values per column.
train_summary = train.describe()
missing_per_column = train.isna().sum()
print(train_summary)
print(missing_per_column)

In [None]:
# Missing-value imputation for the training data.

# Age: fill each missing value with the mean Age of the passenger's Sex.
# transform("mean") returns a Series aligned with train's own index, so the fill
# cannot be misaligned the way a groupby(...).apply(...) result (which may carry the
# group key as an extra index level) can be.
age_mean_by_sex = train.groupby("Sex")["Age"].transform("mean")
train["Age"] = train["Age"].fillna(age_mean_by_sex)
# A Sex group with no observed ages has a NaN mean; fall back to the overall mean.
train["Age"] = train["Age"].fillna(train["Age"].mean())

# Cabin has 687/891 missing values, so it is excluded from further analysis.
# errors="ignore" keeps this cell safe to re-run once Cabin is already gone.
train = train.drop(columns=["Cabin"], errors="ignore")

# Two observations have no Embarked value; drop them
# (this removes every row that still contains a missing value).
train = train.dropna()

In [None]:
# Survival rate (in percent) for each level of the categorical predictors,
# one panel per predictor, drawn top to bottom in the order listed here.
categorical_predictors = ["Sex", "Pclass", "Embarked", "Parch", "SibSp"]


def survival_rate_pct(values):
    """Return the percentage of entries in `values` that are equal to 1."""
    return sum(values == 1) / len(values) * 100


fig, ax = plt.subplots(len(categorical_predictors), 1, figsize=(8, 17))
for panel, predictor in zip(ax, categorical_predictors):
    sns.barplot(x=predictor, y="Survived", data=train,
                estimator=survival_rate_pct, ci=0, ax=panel)
    # The bar height is a percentage, not the raw 0/1 Survived value.
    panel.set_ylabel("Survival rate (%)")
# Keep the five stacked panels from overlapping each other's axis labels.
fig.tight_layout()

In [None]:
# Box plots of the continuous predictors, split by the Survived outcome:
# Age in the upper panel, Fare in the lower panel.
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(8, 10))
sns.boxplot(data=train, x='Survived', y='Age', ax=ax[0])
sns.boxplot(data=train, x='Survived', y='Fare', ax=ax[1])

In [None]:
# Outlier check for the continuous variables: print selected quantiles
# (min, 10%, 33%, median, 67%, 90%, max) of Age and then of Fare.
quantile_levels = [0, 0.1, .33, 0.5, 0.67, 0.9, 1.]
for column in ("Age", "Fare"):
    print(train[column].quantile(quantile_levels))

# Author's reading of the output: there seem to be no outliers in the data.

In [None]:
# One-hot encode the categorical predictors; the remaining columns are carried over as they are.
categorical_columns = ['Pclass', 'Sex', 'Embarked']
train2 = pd.get_dummies(data=train, columns=categorical_columns)
train2.head(n=5)

In [None]:
# Remove the columns that are not used as model features.
# They are dropped by name rather than by position (previously positions 0, 2 and 6,
# which assumes the standard Kaggle Titanic column order — TODO confirm), so the
# result does not depend on column order, and errors="ignore" makes a second run of
# this cell a no-op instead of silently removing three different columns.
unused_columns = ["PassengerId", "Name", "Ticket"]
train2 = train2.drop(columns=unused_columns, errors="ignore")
train2.head()

In [None]:
# Heat map of the pairwise correlations in train2, used to look for multicollinearity
# among the independent variables.
correlation_matrix = train2.corr()
sns.heatmap(data=correlation_matrix)
plt.show()

In [None]:
# Separate the target column from the predictor columns and preview the predictors.
y = train2["Survived"]
X = train2.drop(columns="Survived")
X.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test partitions (fixed seed for repeatability).
split_parts = train_test_split(X, y, random_state=0)
X_train, X_test, y_train, y_test = split_parts
# Check that the training partition is large enough.
X_train.shape

In [None]:
# Model 1: logistic regression, fitted on the training partition.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
classifier = LogisticRegression(random_state=0).fit(X_train, y_train)
classifier

In [None]:
# Predict on the training partition and print the confusion matrix and accuracy.
from sklearn.metrics import confusion_matrix

y_pred = classifier.predict(X_train)
# Keep the result under its own name: rebinding `confusion_matrix` would shadow the
# sklearn function and break any later call made without re-importing it.
train_cm = confusion_matrix(y_train, y_pred)
print(train_cm)
# Accuracy
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(classifier.score(X_train, y_train)))

In [None]:
# Predict on the held-out test partition and print the confusion matrix and accuracy.
from sklearn.metrics import confusion_matrix

y_pred = classifier.predict(X_test)
# Keep the result under its own name: rebinding `confusion_matrix` would shadow the
# sklearn function and break any later call made without re-importing it.
test_cm = confusion_matrix(y_test, y_pred)
print(test_cm)
# Accuracy
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

In [None]:
# Model 2: Decision Tree
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Training partition: confusion matrix and accuracy.
# The matrices are printed directly instead of being assigned to `confusion_matrix`,
# which would shadow the sklearn function of the same name.
y_pred_train = classifier.predict(X_train)
print(confusion_matrix(y_train, y_pred_train))
print('Accuracy of decision tree classifier on train set: {:.2f}'.format(classifier.score(X_train, y_train)))

# Test partition: confusion matrix and accuracy.
y_pred_test = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred_test))
print('Accuracy of decision tree classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

In [None]:
# Model 3: Random Forest
# `classifier` stays bound to this model afterwards, so it is the one used by any
# later cell that calls classifier.predict(...).
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

classifier = RandomForestClassifier(random_state=0)
classifier.fit(X_train, y_train)

# Training partition: confusion matrix and accuracy.
# The matrices are printed directly instead of being assigned to `confusion_matrix`,
# which would shadow the sklearn function of the same name.
y_pred_train = classifier.predict(X_train)
print(confusion_matrix(y_train, y_pred_train))
print('Accuracy of random forest classifier on train set: {:.2f}'.format(classifier.score(X_train, y_train)))

# Test partition: confusion matrix and accuracy.
y_pred_test = classifier.predict(X_test)
print(confusion_matrix(y_test, y_pred_test))
print('Accuracy of random forest classifier on test set: {:.2f}'.format(classifier.score(X_test, y_test)))

In [None]:
# Before applying the model to test_df, inspect it: summary statistics and missing values.
# (A bare `test_df.head()` in the middle of a cell is never displayed, so it is omitted here.)
print(test_df.describe())
print(test_df.isnull().sum())  # total null values per column

In [None]:
# Missing-value imputation for test_df.

# Age: fill each missing value with the mean Age of the passenger's Sex.
# transform("mean") returns a Series aligned with test_df's own index, which avoids
# the index mismatch a groupby(...).apply(...) result can introduce.
test_age_mean_by_sex = test_df.groupby("Sex")["Age"].transform("mean")
test_df["Age"] = test_df["Age"].fillna(test_age_mean_by_sex)
# A Sex group with no observed ages has a NaN mean; fall back to the overall mean.
test_df["Age"] = test_df["Age"].fillna(test_df["Age"].mean())

# Cabin is not used; errors="ignore" keeps this cell safe to re-run.
test_df = test_df.drop(columns=["Cabin"], errors="ignore")

# Fare: fill missing values with the column mean.
test_df["Fare"] = test_df["Fare"].fillna(test_df["Fare"].mean())

In [None]:
# One-hot encode the same categorical predictors that were encoded for the training data.
test_categorical_columns = ['Pclass', 'Sex', 'Embarked']
test_df2 = pd.get_dummies(data=test_df, columns=test_categorical_columns)
test_df2.head(n=5)

In [None]:
# Remove the columns that are not used as model features.
# They are dropped by name rather than by position (previously positions 0, 1 and 5,
# which assumes the standard Kaggle Titanic column order — TODO confirm), so the
# result does not depend on column order, and errors="ignore" makes a second run of
# this cell a no-op instead of silently removing three different columns.
unused_test_columns = ["PassengerId", "Name", "Ticket"]
test_df2 = test_df2.drop(columns=unused_test_columns, errors="ignore")
test_df2.head()

In [None]:
# Predict labels for test_df2 with the most recently fitted `classifier`.
# The columns are first aligned with the training matrix X: same names, same order,
# with any dummy column that is absent from test_df2 filled with 0. When the two
# frames already match this is a no-op.
test_features = test_df2.reindex(columns=X.columns, fill_value=0)
y_pred = classifier.predict(test_features)

In [None]:
# Display the predictions produced for test_df2.
y_pred

In [None]:
# Read gender_submission.csv from the input directory and preview its first five rows.
submission = pd.read_csv(filepath_or_buffer="../input/gender_submission.csv")
submission.head(n=5)

In [None]:
# Wrap the predictions in a one-column frame whose column is named "predicted".
predicted = pd.DataFrame(y_pred, columns=["predicted"])
predicted.head(n=5)

In [None]:
# Put the predictions next to the columns read from gender_submission.csv (rows matched by index).
frames_side_by_side = [submission, predicted]
submission_final = pd.concat(frames_side_by_side, axis="columns")
submission_final.head(n=5)

In [None]:
# Cross-tabulate the Survived column read from gender_submission.csv against our predictions.
# NOTE(review): gender_submission.csv is presumably a sample/baseline submission rather
# than ground truth, in which case this table shows agreement with that baseline, not
# model accuracy — confirm.
# The table gets its own name so the sklearn `confusion_matrix` function is not shadowed.
prediction_crosstab = pd.crosstab(submission_final.Survived, submission_final.predicted)
print(prediction_crosstab)

In [None]:
# Discard the Survived column that came from gender_submission.csv; the predictions remain.
submission_final = submission_final.drop(columns="Survived")
submission_final.head(n=5)

In [None]:
# Rename the second column (the predictions) to "Survived".
second_column = submission_final.columns[1]
submission_final = submission_final.rename(columns={second_column: "Survived"})
submission_final.head(n=5)

In [None]:
# Write the final frame to submission.csv in the working directory, without the index column.
submission_final.to_csv(path_or_buf="submission.csv", index=False)