In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#ML
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import *
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in 0.20.
# Prefer the `model_selection` location and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
from scipy import stats
from sklearn.metrics import *
from sklearn.preprocessing import MinMaxScaler

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.
'''
When people ask what the problem is ?
- Talk about what and why you want to predict it
'''

# 1. Importing the train and test data 


In [None]:
# Load the train and test CSV files from the input directory.
trainData = pd.read_csv("../input/train.csv")
testData = pd.read_csv("../input/test.csv")
# Keep both frames in one list; index 0 is the training frame, index 1 the test frame.
combine = [trainData, testData]

In [None]:
trainData.head()

 2. Analyse the data 
===============================

2.1. How many numerical, categorical, ordinal, and mixed features are present?  
Based on the previous cells, we see there are:  
    a. Numerical: Age, Fare, SibSp, Parch, *(Survived)  
    b. Categorical: Sex, Embarked  
    c. Mixed: Cabin, Ticket  
    d. Ordinal: Pclass  


In [None]:
#Information about categorical data. 
#Observation: 
# - There are only 204 cabin values -> Discard it ?
# - 2 missing values for Embarked
trainData.describe(include=['O'])


# 3. Assumptions made about Data:
a. Correlation: Features that can possibly be correlated to Survival -> Age, PClass, Fare  
b. Completion: Age and Embarked should be completed  
c. Correct: - Discard Cabin - too little data  
            - Ticket can be dropped: it has so many unique values that it is unlikely to follow a pattern.
            - Passenger Id not needed for survival
            - Name not required -> unique
d. Converting: Do we need to convert existing data to a new format?  
e. Create: Do we need to create new data? -> Things that can be suitably made into a range  
            - Age groups instead of Age?
            - Fare range instead of fare?
            - Combining Parch and SibSp as they are both family based values ?

# 4. Confirming our assumptions
Observation: Pclass seems to be correlated with Survival  
Taking mean of ordinal value doesn't mean anything  


In [None]:
#SPEARMAN CORRELATION
# Restrict to numeric columns explicitly: recent pandas versions no longer drop
# string columns (Name, Sex, Ticket, ...) silently inside corr() and raise instead.
trainData.select_dtypes(include='number').corr(method='spearman').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)


In [None]:
# PEARSON CORRELATION
# Restrict to numeric columns explicitly: recent pandas versions no longer drop
# string columns silently inside corr() and raise instead.
trainData.select_dtypes(include='number').corr(method='pearson').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

In [None]:
# KENDALL CORRELATION
# Restrict to numeric columns explicitly: recent pandas versions no longer drop
# string columns silently inside corr() and raise instead.
trainData.select_dtypes(include='number').corr(method='kendall').style.format("{:.2}").background_gradient(cmap=plt.get_cmap('coolwarm'), axis=1)

## Visualizing Data

### Fare distribution

In [None]:
plt.subplots(figsize=(20,15))
bplot = sns.boxplot(y='Fare', x='Survived', 
                 data=trainData, 
                 width=0.5,
                 palette="colorblind")

In [None]:
plt.subplots(figsize=(20,15))
ax = sns.violinplot(x="Survived", y="Fare", data=trainData, palette="muted")

### Age Distribution

In [None]:
plt.subplots(figsize=(20,15))
bplot = sns.boxplot(y='Age', x='Survived', 
                 data=trainData, 
                 width=0.5,
                 palette="colorblind")

In [None]:
sns.set(style="darkgrid")
g = sns.jointplot("Age", "Survived", data=trainData, kind="reg",
                  xlim=(0, 100), ylim=(-1, 2), color="m")
g.fig.set_size_inches(20,20)

In [None]:
plt.subplots(figsize=(20,15))
ax = sns.violinplot(x="Survived", y="Age", data=trainData, palette="muted")

### PClass Distribution

In [None]:
sns.factorplot('Survived',data=trainData,kind='count',hue='Pclass')

### Embarked Distribution

In [None]:
sns.factorplot('Survived',data=trainData,kind='count',hue='Embarked')

### Sibsp Distribution

In [None]:
plt.subplots(figsize=(20,15))
ax = sns.violinplot(x="Survived", y="SibSp", data=trainData, palette="muted")

In [None]:
sns.factorplot('Survived',data=trainData,kind='count',hue='SibSp')

### Parch Distribution

In [None]:
sns.factorplot('Survived',data=trainData,kind='count',hue='Parch')

# Feature Engineering
- Feature not needed: PassengerId

### Logistic Regression Model

In [None]:
def logisticRegressionPerformance(trainSet, valSet):
    """Fit a logistic regression on ``trainSet`` and report how it does on ``valSet``.

    Parameters
    ----------
    trainSet, valSet : pandas.DataFrame
        Frames containing the target column ``'Survived'``. Every other column
        is passed to the model as a feature, so ``valSet`` is expected to carry
        the same feature columns as ``trainSet``.

    Returns
    -------
    The fitted ``LogisticRegression`` instance.

    Side effects
    ------------
    Prints the validation accuracy (in percent), the confusion matrix, the
    classification report and the fitted coefficients sorted in descending order.
    """
    Y_train = trainSet['Survived'].copy()
    X_train = trainSet.drop(columns = ['Survived'])
    Y_validation = valSet['Survived'].copy()
    X_validation = valSet.drop(columns = ['Survived'])

    predictionModel = LogisticRegression()
    predictionModel.fit(X_train, Y_train) #Trains the model
    predictions = predictionModel.predict(X_validation) #Predicts the model

    Accuracy = predictionModel.score(X_validation, Y_validation)*100
    matrix = confusion_matrix(Y_validation, predictions)
    report = classification_report(Y_validation, predictions)

    # Label each coefficient with the feature column it was fitted on. Taking
    # "all columns except the first" only works when 'Survived' happens to be
    # the first column and mislabels the coefficients otherwise.
    coeffModel = pd.DataFrame(
        {'Feature': list(X_train.columns), 'Coefficients': predictionModel.coef_[0]},
        columns=['Feature', 'Coefficients'],
    )

    print("Accuracy:" + str(Accuracy))
    print("\n Confusion Matrix:\n" + str(matrix))
    print("\n Precision, Recall and F1-score: \n" + str(report))
    print("\nCoefficients:" + str(coeffModel.sort_values(by='Coefficients', ascending=False)))

    return predictionModel




In [None]:
def RandomForestPerformance(trainSet, valSet):
    """Train a 100-tree random forest on ``trainSet`` and evaluate it on ``valSet``.

    Both frames must contain the target column ``'Survived'``; all remaining
    columns are used as features. Prints the validation accuracy (in percent),
    the confusion matrix and the classification report, then returns the
    fitted classifier.
    """
    target = 'Survived'

    train_labels = trainSet[target].copy()
    train_features = trainSet.drop(columns=[target])
    val_labels = valSet[target].copy()
    val_features = valSet.drop(columns=[target])

    forest = RandomForestClassifier(n_estimators=100)
    forest.fit(train_features, train_labels)
    val_predictions = forest.predict(val_features)

    # Compute every metric first, then print them in one go.
    accuracy_pct = forest.score(val_features, val_labels) * 100
    confusion = confusion_matrix(val_labels, val_predictions)
    summary = classification_report(val_labels, val_predictions)

    print("Accuracy:" + str(accuracy_pct))
    print("\n Confusion Matrix:\n" + str(confusion))
    print("\n Precision, Recall and F1-score: \n" + str(summary))

    return forest




## Model performance on one-hot encoded existing features - NaN values are dropped 
### Model accuracy - 77-80%
### Precision, recall and f1-score 78-80%

In [None]:
# Hold out half of the labelled data for validation. The fixed seed keeps the
# split - and therefore every metric reported below - identical across runs.
modelTrainData,modelValidationData =  train_test_split(combine[0].copy(), test_size=0.5, random_state=42)

In [None]:
simpleModelTrain = pd.get_dummies(modelTrainData, columns=["Sex", "Pclass","Embarked"], prefix=["Sex","Pclass","Embarked"])
simpleModelVal = pd.get_dummies(modelValidationData, columns=["Sex", "Pclass","Embarked"], prefix=["Sex","Pclass","Embarked"])
simpleModelTrain.head()

In [None]:
#Simple model
simpleModelTrain = simpleModelTrain.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
simpleModelVal = simpleModelVal.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()

In [None]:
simpleModel = logisticRegressionPerformance(simpleModelTrain,simpleModelVal)

## Does completing the missing values help ?
## 1. Without completing cabin
### Model Accuracy -  less than 1% increase
### Model Precision, recall, f1- score - less than 1% increase
## 2. With Cabin completed
### Model Accuracy - reduces by at most 1% - Doesn't seem to be helping the model
### Model precision, recall, f1-score - reduces by 1% on average

In [None]:
completeFeaturesTrain = modelTrainData.copy()
completeFeaturesVal = modelValidationData.copy()

# Fill values are learned from the training split only and reused for the
# validation split, so no validation statistics leak into preprocessing.
# Series.mode() returns a Series; passing it straight to fillna() aligns on the
# index and leaves the missing values untouched, so take its first entry.
imputeValues = {
    'Fare': completeFeaturesTrain['Fare'].median(),
    'Age': completeFeaturesTrain['Age'].median(),
    'Embarked': completeFeaturesTrain['Embarked'].mode().iloc[0],
}
# A dict passed to fillna() only touches the listed columns; 'Cabin' keeps its NaNs.
completeFeaturesTrain = completeFeaturesTrain.fillna(value=imputeValues)
completeFeaturesVal = completeFeaturesVal.fillna(value=imputeValues)

completeFeaturesTrain = pd.get_dummies(completeFeaturesTrain, columns=["Sex", "Pclass","Embarked"], prefix=["Sex","Pclass","Embarked"])
completeFeaturesVal = pd.get_dummies(completeFeaturesVal, columns=["Sex", "Pclass","Embarked"], prefix=["Sex","Pclass","Embarked"])
completeFeaturesTrain.describe()

In [None]:
completeFeaturesTrain.describe(include=['O'])

In [None]:
completeFeaturesTrain_M0 = completeFeaturesTrain.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
completeFeaturesVal_M0 = completeFeaturesVal.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()

In [None]:
copmletedFeaturesModel = logisticRegressionPerformance(completeFeaturesTrain_M0,completeFeaturesVal_M0)

## What if we complete 'Cabin' feature as well? 
### Completing the Cabin column seems to be reducing the accuracy

In [None]:
completeFeaturesTrain_cabin = completeFeaturesTrain.copy()
completeFeaturesVal_cabin = completeFeaturesVal.copy()
completeFeaturesTrain_cabin['Cabin'] =  completeFeaturesTrain_cabin['Cabin'].fillna('N')
completeFeaturesVal_cabin['Cabin'] =  completeFeaturesVal_cabin['Cabin'].fillna('N')
completeFeaturesTrain_cabin['Cabin'] =  completeFeaturesTrain_cabin['Cabin'].str[0]
completeFeaturesVal_cabin['Cabin'] =  completeFeaturesVal_cabin['Cabin'].str[0]
completeFeaturesTrain_cabin = pd.get_dummies(completeFeaturesTrain_cabin, columns=["Cabin"], prefix=["Cabin"])
completeFeaturesVal_cabin = pd.get_dummies(completeFeaturesVal_cabin, columns=["Cabin"], prefix=["Cabin"])

In [None]:
# One-hot encoding each split separately can produce different Cabin_* columns
# (a rare deck letter may land entirely in one split). Align them explicitly
# instead of guessing from the column count:
#   1. add to the training frame any column that only the validation frame has,
#   2. reindex the validation frame to the training frame's columns and order.
# Columns missing on either side are filled with 0 (passenger not on that deck).
for missingColumn in completeFeaturesVal_cabin.columns.difference(completeFeaturesTrain_cabin.columns):
    completeFeaturesTrain_cabin[missingColumn] = 0
completeFeaturesVal_cabin = completeFeaturesVal_cabin.reindex(columns=completeFeaturesTrain_cabin.columns, fill_value=0)
completeFeaturesVal_cabin.head()

In [None]:
completeFeaturesTrain_cabin = completeFeaturesTrain_cabin.drop(columns = ['PassengerId','Ticket','Name'])
completeFeaturesVal_cabin = completeFeaturesVal_cabin.drop(columns = ['PassengerId','Ticket','Name'])

In [None]:
copmletedFeaturesModel_cabin = logisticRegressionPerformance(completeFeaturesTrain_cabin,completeFeaturesVal_cabin)

## Feature Set A 
- Adding an 'Alone' Column

In [None]:
M1TrainData = completeFeaturesTrain.copy()
M1ValData = completeFeaturesVal.copy()
featureSet_combine = [M1TrainData,M1ValData]

In [None]:
# 'Alone' is 1 when the passenger has no siblings/spouse (SibSp) and no
# parents/children (Parch) aboard, else 0. Computed column-wise instead of
# iterating over rows one at a time.
Alone = []
for dataset in featureSet_combine:
    hasFamily = (dataset['SibSp'] + dataset['Parch']) > 0
    dataset['Alone'] = (~hasFamily).astype(int)

In [None]:
M1TrainData.head()

In [None]:
M1TrainData = M1TrainData.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
M1ValData = M1ValData.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()

In [None]:
M1model = logisticRegressionPerformance(M1TrainData,M1ValData)

In [None]:
#Collecting all features
finalFeaturesTrain = completeFeaturesTrain.copy()
finalFeaturesVal = completeFeaturesVal.copy()

In [None]:
sns.factorplot('Survived',data=M1TrainData,kind='count',hue='Alone')

## Feature Set B
- Extracting the Title from name


In [None]:
M1TrainData_M2 = completeFeaturesTrain.copy()
M1ValData_M2 = completeFeaturesVal.copy()
featureSet_combine_M2 = [M1TrainData_M2,M1ValData_M2]

In [None]:
Titles = []
for dataset in featureSet_combine_M2:
    # The title is the word directly followed by a period in 'Name'.
    # Raw string literal: '\.' in a normal string is an invalid escape sequence
    # that newer Python versions warn about.
    dataset['Title'] = dataset['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

In [None]:
M1TrainData_M2.head()

In [None]:
for dataset in featureSet_combine_M2:
    dataset['Title'] = dataset['Title'].replace(['Dr','Rev','Col','Major','Mlle','Sir','Jonkheer','Lady','Mme','Countess','Don','Capt','Ms'], 'Other')

In [None]:
finalFeaturesTrain['Title'] = M1TrainData_M2['Title']
finalFeaturesVal['Title'] = M1ValData_M2['Title']

In [None]:
sns.factorplot('Survived',data=M1TrainData_M2,kind='count',hue='Title')

In [None]:
M1TrainData_M2 =  pd.get_dummies(M1TrainData_M2, columns=["Title"], prefix=["Title"])
M1ValData_M2 =  pd.get_dummies(M1ValData_M2, columns=["Title"], prefix=["Title"])

In [None]:
M1TrainData_M2 = M1TrainData_M2.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
M1ValData_M2 = M1ValData_M2.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
M1TrainData_M2.head()

In [None]:
M2model = logisticRegressionPerformance(M1TrainData_M2,M1ValData_M2)

### Conclusion: Titles Mr, Mrs, and Miss seem to have a strong correlation with Survival

## Feature Set C
- Encoding Fare

### Normalizing the fare

In [None]:
featureSet_trainData_normalized = completeFeaturesTrain.copy()
featureSet_testData_normalized = completeFeaturesVal.copy()
featureSet_combine_normalized = [featureSet_trainData_normalized,featureSet_testData_normalized] 

In [None]:
scaler = MinMaxScaler()
# Learn the min/max of Fare on the training split only ...
featureSet_trainData_normalized[['Fare']] = scaler.fit_transform(featureSet_trainData_normalized[['Fare']])
# ... and apply that same scaling to the validation split. Re-fitting here would
# put the two splits on different scales and leak validation statistics.
featureSet_testData_normalized[['Fare']] = scaler.transform(featureSet_testData_normalized[['Fare']])
featureSet_combine_normalized = [featureSet_trainData_normalized,featureSet_testData_normalized] 
featureSet_trainData_normalized.head()

In [None]:
finalFeaturesTrain['Fare'] = featureSet_trainData_normalized['Fare']
finalFeaturesVal['Fare'] = featureSet_testData_normalized['Fare']

In [None]:
# Bucket the scaled 'Fare' of each frame into four ordinal bins (0-3), stored in 'EncodedFare'.
# NOTE(review): the cut points look like quartile edges of the scaled fare taken
# from one particular run - confirm they still match the current split.
for dataset in featureSet_combine_normalized:
    dataset.loc[ dataset['Fare'] <= 0.0154, 'EncodedFare'] = 0
    dataset.loc[(dataset['Fare'] > 0.0154) & (dataset['Fare'] <= 0.0254), 'EncodedFare'] = 1
    dataset.loc[(dataset['Fare'] > 0.0254) & (dataset['Fare'] <= 0.0595), 'EncodedFare']   = 2
    dataset.loc[ dataset['Fare'] > 0.0595, 'EncodedFare'] = 3

In [None]:
featureSet_trainData_normalized.head()

In [None]:
M3TrainData_normalized = featureSet_trainData_normalized.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
M3ValData_normalized = featureSet_testData_normalized.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()

In [None]:
M3model_normalized = logisticRegressionPerformance(M3TrainData_normalized,M3ValData_normalized)

### Not normalized data

In [None]:
featureSet_trainData = completeFeaturesTrain.copy()
featureSet_testData = completeFeaturesVal.copy()
featureSet_combine = [featureSet_trainData,featureSet_testData] 

In [None]:
pd.qcut(featureSet_trainData['Fare'], 4).value_counts()

In [None]:
# Bucket the raw 'Fare' into four ordinal bins (0-3), stored in 'EncodedFare'.
# The bins must be contiguous: with an upper edge of 30.0 and a lower edge of 31,
# fares in (30, 31] matched no bin, stayed NaN and were then removed by dropna().
for dataset in featureSet_combine:
    dataset.loc[ dataset['Fare'] <= 7.925, 'EncodedFare'] = 0
    dataset.loc[(dataset['Fare'] > 7.925) & (dataset['Fare'] <= 14.454), 'EncodedFare'] = 1
    dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31.0), 'EncodedFare']   = 2
    dataset.loc[ dataset['Fare'] > 31.0, 'EncodedFare'] = 3

In [None]:
featureSet_trainData.head()

In [None]:
featureSet_trainData[['EncodedFare', 'Survived']].groupby(['EncodedFare'], as_index=False).mean()

In [None]:
M3TrainData = featureSet_trainData.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
M3ValData = featureSet_testData.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()

In [None]:
M3model = logisticRegressionPerformance(M3TrainData,M3ValData)

In [None]:
sns.factorplot('Survived',data=M3TrainData,kind='count',hue='EncodedFare')

#### One hot encoding of Fare

In [None]:
featureSet_trainData_1hot =  pd.get_dummies(featureSet_trainData.copy(), columns=["EncodedFare"], prefix=["Fare"])
featureSet_testData_1hot =  pd.get_dummies(featureSet_testData.copy(), columns=["EncodedFare"], prefix=["Fare"])

In [None]:
M3aTrainData = featureSet_trainData_1hot.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
M3aValData = featureSet_testData_1hot.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()

In [None]:
M3amodel = logisticRegressionPerformance(M3aTrainData,M3aValData)

### Conclusion: We can definitely see that there is a correlation between Fare and Survival
- Should I be encoding these values? - correlation decreases

## Normalizing Age

In [None]:
age_normalized = completeFeaturesTrain.copy()
age_test_normalized = completeFeaturesVal.copy()
age_combine_normalized = [age_normalized,age_test_normalized] 

In [None]:
scaler = MinMaxScaler()
# Learn the min/max of Age on the training split only ...
age_normalized[['Age']] = scaler.fit_transform(age_normalized[['Age']])
# ... and apply that same scaling to the validation split. Re-fitting here would
# put the two splits on different scales and leak validation statistics.
age_test_normalized[['Age']] = scaler.transform(age_test_normalized[['Age']])
age_combine_normalized = [age_normalized,age_test_normalized] 
age_normalized.head()

In [None]:
pd.qcut(age_normalized['Age'], 4).value_counts()

In [None]:
finalFeaturesTrain['Age'] = age_normalized['Age']
finalFeaturesVal['Age'] = age_test_normalized['Age']


In [None]:
# Bucket the scaled 'Age' into four ordinal bins (0-3), stored in 'EncodedAge'.
# Both edges of every bin must test 'Age'; the middle bins previously compared
# 'Fare' against the upper edge, leaving most of those rows unassigned (NaN).
for dataset in age_combine_normalized:
    dataset.loc[ dataset['Age'] <= 0.302, 'EncodedAge'] = 0
    dataset.loc[(dataset['Age'] > 0.302) & (dataset['Age'] <= 0.388), 'EncodedAge'] = 1
    dataset.loc[(dataset['Age'] > 0.388) & (dataset['Age'] <= 0.488), 'EncodedAge']   = 2
    dataset.loc[ dataset['Age'] > 0.488, 'EncodedAge'] = 3

In [None]:
age_normalized[['EncodedAge', 'Survived']].groupby(['EncodedAge'], as_index=False).mean()

In [None]:
M4TrainData = age_normalized.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
M4ValData = age_test_normalized.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()

In [None]:
M4model = logisticRegressionPerformance(M4TrainData,M4ValData)

## Not normalized Age

In [None]:
ageTrain = completeFeaturesTrain.copy()
ageTest = completeFeaturesVal.copy()
age_combine = [ageTrain,ageTest] 

In [None]:
pd.qcut(ageTrain['Age'], 4).value_counts()

In [None]:
# Bucket the raw 'Age' (years) into four ordinal bins (0-3), stored in 'EncodedAge'.
# Both edges of every bin must test 'Age'; the middle bins previously compared
# 'Fare' against the upper edge, so bin membership depended on the ticket price.
for dataset in age_combine:
    dataset.loc[ dataset['Age'] <=  22.0, 'EncodedAge'] = 0
    dataset.loc[(dataset['Age'] > 22.0) & (dataset['Age'] <= 28.0), 'EncodedAge'] = 1
    dataset.loc[(dataset['Age'] > 28.0) & (dataset['Age'] <= 35.0), 'EncodedAge']   = 2
    dataset.loc[ dataset['Age'] > 35.0, 'EncodedAge'] = 3

In [None]:
M5TrainData = ageTrain.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()
M5ValData = ageTest.drop(columns = ['PassengerId','Ticket','Name','Cabin']).dropna()

In [None]:
M5model = logisticRegressionPerformance(M5TrainData,M5ValData)

## Dropping unused columns

In [None]:
finalFeaturesTrain.tail()

In [None]:
# Drop identifier / free-text columns that are not used as model features.
cleanTrainData = finalFeaturesTrain.drop(columns = ['Name','Ticket','PassengerId','Cabin'])
cleanvalidationData = finalFeaturesVal.drop(columns = ['Name','Ticket','PassengerId','Cabin'])
cleanTrainData =  pd.get_dummies(cleanTrainData, columns=["Title"], prefix=["Title"])
cleanvalidationData =  pd.get_dummies(cleanvalidationData, columns=["Title"], prefix=["Title"])
# The two frames are one-hot encoded independently, so a title that occurs in
# only one split would give them different columns. Force the validation frame
# onto the training frame's columns and order; missing dummies become 0.
cleanvalidationData = cleanvalidationData.reindex(columns=cleanTrainData.columns, fill_value=0)
featureSetCombine = [finalFeaturesTrain,finalFeaturesVal]

# 7. Model and predict

# Baseline Model

In [None]:
"""
Base line model calculations
Calculate the most frequent appearing class and predict that all the time for validation set
"""
# Majority class of the training labels ...
output_values = cleanTrainData['Survived']
baselinePredictionValue = output_values.value_counts().idxmax()
# ... repeated once for every validation row.
baselinePredicted = [baselinePredictionValue] * len(cleanvalidationData)

In [None]:
"""
Performance of baseline model
"""
baselineScore = accuracy_score(cleanvalidationData['Survived'], baselinePredicted)
baselineScore

In [None]:
"""
Confusion Matrix
"""
matrix = confusion_matrix(cleanvalidationData['Survived'], baselinePredicted)
print(matrix)

In [None]:
"""
Precision, recall and F1-score
"""
report = classification_report(cleanvalidationData['Survived'], baselinePredicted)
print(report)

# Logistic Regression Model

In [None]:
#Model
allFeaturesModel = logisticRegressionPerformance(cleanTrainData,cleanvalidationData)

In [None]:
RFModel = RandomForestPerformance(cleanTrainData,cleanvalidationData)

# WHAT ASSUMPTIONS DOES LOGISTIC REGRESSION TAKE INTO ACCOUNT?
 
1. There need not be a linear relationship between the dependent and independent variables  
2. Logistic regression requires the observations to be independent of each other.    
   In other words, the observations should not come from repeated measurements or matched data.  
3. Error terms do not need to be normally distributed.  
4. Independent variables should not be too highly correlated with each other.  
5. Usually requires a large sample size  
 

# Independent vs. Dependent Variables:

Independent variables:  
It is a variable that stands alone and isn't changed by the other variables you are trying to measure.  
For example, someone's age might be an independent variable.   
Other factors (such as what they eat, how much they go to school,  
how much television they watch) aren't going to change a person's age.  

Dependent Variables:  
It is something that depends on other factors.   
For example, a test score could be a dependent variable because it could change depending on several factors such as how much you studied,  
how much sleep you got the night before you took the test, or even how hungry you were when you took it  


# TESTING NEW TECHNIQUES

In [None]:
"""
Testing Recursive Feature Elimination
"""
from sklearn.feature_selection import RFE

# Build the feature / label frames explicitly. X_train, Y_train, X_validation and
# Y_validation only ever existed as locals inside the helper functions, so this
# cell failed with a NameError on a fresh kernel.
X_train = cleanTrainData.drop(columns=['Survived'])
Y_train = cleanTrainData['Survived']
X_validation = cleanvalidationData.drop(columns=['Survived'])
Y_validation = cleanvalidationData['Survived']

model = LogisticRegression()
# Pass n_features_to_select by keyword; recent scikit-learn versions reject it positionally.
rfe = RFE(model, n_features_to_select=8)
fit = rfe.fit(X_train, Y_train)
print(fit.n_features_)
print(fit.support_)
print(fit.ranking_)

In [None]:
print(fit)

In [None]:
# classification_report compares true labels with *predicted labels*; the fitted
# RFE object itself is not a prediction, so predict on the validation features.
# The frames are rebuilt here so the cell does not rely on names that are only
# locals of the helper functions.
rfeValidationFeatures = cleanvalidationData.drop(columns=['Survived'])
rfeValidationLabels = cleanvalidationData['Survived']
report_RFE = classification_report(rfeValidationLabels, fit.predict(rfeValidationFeatures))
print(report_RFE)

In [None]:
# Build the feature / label frames explicitly; they are otherwise only locals of
# the helper functions and undefined at notebook level on a fresh kernel.
X_train = cleanTrainData.drop(columns=['Survived'])
Y_train = cleanTrainData['Survived']
X_validation = cleanvalidationData.drop(columns=['Survived'])

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, Y_train)
y_pred = random_forest.predict(X_validation)

In [None]:
# True validation labels, taken from the cleaned validation frame so this cell
# does not depend on a name that is only a local of the helper functions.
Y_validation = cleanvalidationData['Survived']
report = classification_report(Y_validation, y_pred)
print(report)