# Checkpoint 5

## Part A - Explore Data 

In [None]:
#Import Libraries


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [None]:
# Load the Titanic_Survived.csv file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Consider a path relative to the notebook.
csv_path = r'C:\Users\stesa\Desktop\GoMyCode\Titanic_Survived.csv'
data_frame = pd.read_csv(csv_path)

# Preview the first rows of the loaded data.
data_frame.head()

In [None]:
#Dataset dimensions as a (number of rows, number of columns) tuple
data_frame.shape

In [None]:
#Count how many rows fall under each distinct value of the 'Survived' column
data_frame['Survived'].value_counts()

In [None]:
# Bar chart of row counts per 'Survived' value; bars follow the order returned
# by value_counts() (most frequent value first).
survived_order = data_frame['Survived'].value_counts().index
sns.countplot(x=data_frame['Survived'], order=survived_order)
plt.show()

In [None]:
# Row counts per 'Survived' value, with one bar per 'Pclass' value in each group.
sns.countplot(data=data_frame, x='Survived', hue='Pclass')
plt.show()

In [None]:
# Row counts per 'Survived' value, with one bar per value of the 'male' column in each group.
sns.countplot(data=data_frame, x='Survived', hue='male')
plt.show()

In [None]:
# Row counts per 'Survived' value, with one bar per 'FamilySize' value in each group.
sns.countplot(data=data_frame, x='Survived', hue='FamilySize')
plt.show()

## Part B - Supervised Machine Learning

### 1 - Logistic Regression

## Solution 1

#####      A- Splitting dataset into a training set and a testing set

In [None]:
# Target is the 'Survived' column; every other column is used as a feature.
y = data_frame['Survived']
x = data_frame.drop(columns='Survived')

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=20
)


#####      B- Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler's statistics are learned from the
# training split only and then re-used to transform the test split, so both
# splits are scaled identically and no test-set information leaks into
# preprocessing.
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)  # transform only: do NOT re-fit on test data

#####      C- Build Model

In [None]:
# Build the logistic-regression classifier and fit it on the training split.
model = LogisticRegression(solver='newton-cg').fit(x_train, y_train)

# Predicted class labels for the test split.
pred = model.predict(x_test)
# Continuous decision scores for the test split (used for the ROC curve).
y_pred_logistic = model.decision_function(x_test)

# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, pred))


#####      D- Confusion Matrix

In [None]:
# Cross-tabulate actual vs. predicted labels on the test split.
# Stored as `conf_matrix` so the `confusion_matrix` function imported from
# sklearn.metrics is not overwritten by a DataFrame.
conf_matrix = pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(conf_matrix, annot=True, fmt='d')
print(conf_matrix)
plt.show()


## ROC / AUC

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# ROC curve built from the continuous decision scores, and the area under it.
logistic_fpr, logistic_tpr, threshold = roc_curve(y_test, y_pred_logistic)
auc_logistic = auc(logistic_fpr, logistic_tpr)

plt.figure(figsize=(10,5), dpi=60)
plt.plot(logistic_fpr, logistic_tpr, marker='.', label='Logistic (auc = %0.3f)' % auc_logistic)
plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')
plt.legend()
plt.show()

# AUC computed from the hard class predictions (single default threshold).
# Kept under its own name so the imported `auc` function is not overwritten.
auc_default_threshold = np.round(roc_auc_score(y_test, pred), 3)
print('Default Area Under Cuve : Threshold = 0.5')
print("Auc for our data is {}". format(auc_default_threshold))
# Report the value actually computed above instead of a hard-coded number,
# so the message stays correct whenever the data or the model changes.
print('Auc after using ROC_Curve is %0.3f' % auc_logistic)

## Solution 2

#####      A- Splitting dataset into a training set and a testing set

In [None]:
# Use an explicit list of feature columns for this solution.
features = [
    'Pclass', 'Fare', 'FamilySize',
    'C', 'Q', 'S',
    'female', 'male',
    'Age',
]
y = data_frame['Survived']
x = data_frame[features]

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=20
)


#####      B- Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler's statistics are learned from the
# training split only and then re-used to transform the test split, so both
# splits are scaled identically and no test-set information leaks into
# preprocessing.
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)  # transform only: do NOT re-fit on test data


#####     C- Build Model

In [None]:
# Build the logistic-regression classifier and fit it on the training split.
model = LogisticRegression(solver='newton-cg').fit(x_train, y_train)

# Predicted class labels for the test split.
pred = model.predict(x_test)
# Continuous decision scores for the test split (used for the ROC curve).
y_pred_logistic = model.decision_function(x_test)

# Per-class precision / recall / f1 on the test split.
print(classification_report(y_test, pred))

#####      D- Confusion Matrix

In [None]:
# Cross-tabulate actual vs. predicted labels on the test split.
# Stored as `conf_matrix` so the `confusion_matrix` function imported from
# sklearn.metrics is not overwritten by a DataFrame.
conf_matrix = pd.crosstab(y_test, pred, rownames=['Actual'], colnames=['Predicted'])

# fmt='d' prints the cell counts as plain integers; the default format
# switches to scientific notation for counts of 100 or more.
sns.heatmap(conf_matrix, annot=True, fmt='d')
print(conf_matrix)
plt.show()


## ROC / AUC

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# ROC curve built from the continuous decision scores, and the area under it.
logistic_fpr, logistic_tpr, threshold = roc_curve(y_test, y_pred_logistic)
auc_logistic = auc(logistic_fpr, logistic_tpr)

plt.figure(figsize=(10, 5), dpi=60)
plt.plot(logistic_fpr, logistic_tpr, marker='.', label='Logistic (auc = %0.3f)' % auc_logistic)
plt.xlabel('False Positive Rate -->')
plt.ylabel('True Positive Rate -->')
plt.legend()
plt.show()

# AUC computed from the hard class predictions (single default threshold).
# Kept under its own name so the imported `auc` function is not overwritten.
auc_default_threshold = np.round(roc_auc_score(y_test, pred), 3)
print('Default Area Under Cuve : Threshold = 0.5')
print('Auc For our Data is {}'. format(auc_default_threshold))
# Report the value actually computed above instead of a hard-coded number,
# so the message stays correct whenever the data or the model changes.
print('Auc after using ROC_Curve is %0.3f' % auc_logistic)

### ROC_AUC Definition

In [None]:
## Receiver Operating Characteristics (ROC) :

#The ROC curve is constructed by plotting the true positive rate (TPR) against the false positive rate (FPR) at each classification threshold.
#Classifiers whose curves lie closer to the top-left corner perform better.
#The closer the curve comes to the 45-degree diagonal of the ROC space, the less accurate the classifier.

## The Area Under the Curve (AUC)

#AUC measures the ability of a classifier to distinguish between classes and is used as a summary of the ROC curve.
#An excellent model has an AUC close to 1, which means it has a good measure of separability.
#A poor model has an AUC close to 0.5, which means it has no separability (no better than random guessing); an AUC close to 0 means the model's predictions are systematically inverted.