# Classification Algorithms and Model Evaluation

In this notebook, we will cover:

* Logistic Regression
* Confusion Matrix
* Precision, Recall, Accuracy, F1 Score
* ROC AUC Curve
* Deciding Binary Classifier threshold
* KNN

Importing all necessary packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline

Setting global seed of notebook

In [None]:
# Seed NumPy's global random generator so NumPy-based randomness repeats across runs.
np.random.seed(seed=25)

### Task 1: Load Data from 'titanic_clean.csv'

In [None]:
#write code here
# TODO: replace both placeholders before running; while `df` is None the
# df.head() call below raises AttributeError.
# Presumably `data` holds the raw 'titanic_clean.csv' contents and `df` is the
# working frame derived from it — confirm against the course instructions.
data = None
df = None
df.head()

### One-hot encoding for categorical variables

In [None]:
# Columns treated as categorical and expanded into one indicator column per level.
categorical_cols = ['Pclass', 'Sex', 'Embarked', 'Title', 'GrpSize', 'FareCat', 'AgeCat']
df_OneHot = pd.get_dummies(df, columns=categorical_cols)
df_OneHot.head()

In [None]:
# From here on `df` is an independent copy of the one-hot encoded frame.
# NOTE: because `df` is overwritten, the original categorical columns are no
# longer in it; re-running the get_dummies cell without reloading the data is
# expected to fail.
df=df_OneHot.copy()

### Task 2: Create Independent and Dependent Variables

In [None]:
#write code here
# TODO: X = feature columns of `df`, Y = target column.
# Presumably the target is the 'Survived' column (it is the column written to
# the submission frame later) — confirm.
X = None
Y = None

### Task 3: Split the data into train and test sets

In [None]:
# Import the library
from sklearn.model_selection import train_test_split

In [None]:
#Write the code here
# TODO: replace None with a train_test_split(...) call on X and Y.
# As written, unpacking None into four names raises TypeError.
xtrain, xtest, ytrain, ytest = None
# Sanity check: feature and target splits should have matching row counts.
print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)

In [None]:
# Preview the first rows of the training features.
xtrain.head()

In [None]:
# Preview the first training targets.
ytrain.head()

# 1. Logistic Regression

### Creating Model & Training

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Default-hyperparameter logistic regression; random_state uses the same value (25) as the notebook seed.
lr_model = LogisticRegression(random_state=25)

In [None]:
# Train on the training split only; the test split is held out for evaluation.
lr_model.fit(xtrain, ytrain)

### Evaluation 

In [None]:
# Hard class predictions for the held-out test features.
pred = lr_model.predict(xtest)

In [None]:
# First nine predicted labels.
pred[0:9]

Predicting the probability of **0** and **1**

In [None]:
# Per-class probabilities for each test row (one column per class).
pred_prb = lr_model.predict_proba(xtest)

In [None]:
# First nine rows of class probabilities. The array has one column per class
# (two here), so only the rows need slicing; the old 0:9 column slice was a no-op.
pred_prb[0:9]

In each row of the NumPy array, the first value is the probability of **0** and the second is the probability of **1**

Only predicting and extracting probability values of **1**

In [None]:
# Keep only column 1 of predict_proba: the probability of class 1 for each test row.
lr_pred_prb = lr_model.predict_proba(xtest)[:,1]

### Comparison of Predicted and Actual

In [None]:
# Preview the test features before predictions are attached.
xtest.head()

In [None]:
# Side-by-side view: test features plus predicted label, predicted probability
# of class 1, and the true label. assign() returns a new frame, so xtest is untouched.
xt = xtest.assign(
    pred=pred,
    pred_probability=lr_pred_prb,
    actual=ytest,
)
xt.head()

### Confusion Matrix Playground

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Raw confusion matrix of true labels (ytest) versus predicted labels (pred).
confusion_matrix(ytest, pred)

In [None]:
# Flatten the 2x2 matrix to a 1-D array; the next cell unpacks it as tn, fp, fn, tp.
confusion_matrix(ytest, pred).ravel()

In [None]:
# Unpack the flattened 2x2 matrix and re-present it with the positive class first:
# rows are actual outcomes, columns are predicted outcomes.
tn, fp, fn, tp = confusion_matrix(ytest, pred).ravel()
conf_matrix = pd.DataFrame(
    [[tp, fn],
     [fp, tn]],
    index=["Survived", "Not Survived"],
    columns=["pred_Survived", "pred_Not Survived"],
)
conf_matrix

### Accuracy

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Accuracy = correctly classified rows / all rows.
n_correct = tp + tn
n_total = tp + fp + tn + fn
accuracy = n_correct / n_total
print("Accuracy: {}".format(accuracy))

In [None]:
# Same metric via scikit-learn, to cross-check the manual computation above.
accuracy_lr = accuracy_score(ytest,pred)
print("Accuracy by built-in function: {}".format(accuracy_lr))

### Precision

In [None]:
from sklearn.metrics import precision_score

In [None]:
# Precision = correct predictions of a class / all predictions of that class.
predicted_pos = tp + fp
predicted_neg = tn + fn
precision_1 = tp / predicted_pos
precision_0 = tn / predicted_neg
print("Precision for 1: {}".format(precision_1))
print("Precision for 0: {}".format(precision_0))

In [None]:
# Built-in precision, called with default arguments; compare with precision_1 above.
precision_lr = precision_score(ytest,pred)
print("Precision by built-in function: {}".format(precision_lr))

### Recall

In [None]:
from sklearn.metrics import recall_score

In [None]:
# Recall = correct predictions of a class / all actual members of that class.
actual_pos = tp + fn
actual_neg = tn + fp
recall_1 = tp / actual_pos
recall_0 = tn / actual_neg
print("Recall for 1: {}".format(recall_1))
print("Recall for 0: {}".format(recall_0))

In [None]:
# Built-in recall, called with default arguments; compare with recall_1 above.
recall_lr = recall_score(ytest,pred)
print("Recall by built-in function: {}".format(recall_lr))

### F1 Score

In [None]:
from sklearn.metrics import f1_score

In [None]:
def _f1_from(precision, recall):
    """Return the harmonic mean of precision and recall (the F1 score)."""
    return (2 * precision * recall) / (precision + recall)


f1_1 = _f1_from(precision_1, recall_1)
print("F1 Score for 1: {}".format(f1_1))
f1_0 = _f1_from(precision_0, recall_0)
print("F1 Score for 0: {}".format(f1_0))

In [None]:
# Built-in F1, called with default arguments; compare with f1_1 above.
f1_lr=f1_score(ytest,pred)
print("F1 Score by built-in function: {}".format(f1_lr))

### Class Distribution in Training Data

In [None]:
# Row count per class in the training targets; shows how balanced the classes are.
ytrain.value_counts()

### Classification Report

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall and F1 score for the logistic regression predictions.
print(classification_report(ytest,pred))
# In the printed report, the 'macro avg' row is the unweighted mean of the per-class scores.

In [None]:
# helper_confusion_matrix is a project-local module that is not shown in this notebook.
# Note it receives the class-1 probabilities (lr_pred_prb), not the hard predictions.
# NOTE(review): presumably it shows the confusion matrix as the decision
# threshold varies — confirm against the helper's source.
import helper_confusion_matrix as helper
helper.conf_matrix(ytest,lr_pred_prb)

### ROC AUC Curve

_Receiver Operating Characteristic_ & _Area Under Curve_

In [None]:
# A single ROC point for the current hard predictions:
# TPR is the recall computed above; FPR = FP / (FP + TN).
tpr = recall_lr
fpr = fp / (fp + tn)

In [None]:
# Display the (TPR, FPR) pair.
tpr, fpr

In [None]:
# Equivalent form: recall_0 = TN / (TN + FP), so FPR = 1 - recall_0.
fpr = 1 - recall_0
tpr, fpr

Defining function to plot ROC AUC Curve

In [None]:
from sklearn.metrics import auc,roc_curve,roc_auc_score

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve on a new 8x6 matplotlib figure.

    Parameters
    ----------
    fpr : array-like
        False positive rates, plotted on the x-axis.
    tpr : array-like
        True positive rates, plotted on the y-axis (same length as ``fpr``).
    label : str, optional
        Legend entry for the curve. When omitted, no legend is drawn.
    """
    plt.figure(figsize=(8,6))
    plt.title('ROC Curve')
    plt.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed diagonal as the chance-level reference line.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([-0.005, 1, 0, 1.005])
    # linspace includes the 1.0 endpoint, which np.arange(0, 1, 0.05) left unticked.
    plt.xticks(np.linspace(0, 1, 21), rotation=90)
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    # With no labelled artist, legend() only emits a warning and an empty box.
    if label is not None:
        plt.legend(loc='best')

Getting TPR, FPR values for each threshold on ROC AUC Curve

In [None]:
# ROC points from the class-1 probabilities. This rebinds `fpr` and `tpr`,
# which held single scalar values in the cells above.
fpr,tpr,threshold=roc_curve(ytest,lr_pred_prb)

Calculating AUC score from ytest and predicted probabilities

In [None]:
# Area under the ROC curve, computed from probabilities rather than hard labels.
auc_lr=roc_auc_score(ytest,lr_pred_prb)
auc_lr

Plotting AUC ROC Curve

In [None]:
# set_context changes seaborn's global plotting context, so it also affects later figures.
sns.set_context('poster')
# The AUC, formatted to three decimals, is used as the legend label.
plot_roc_curve(fpr,tpr,label='AUC = %0.3f'% auc_lr)

## Model Complexity

In [None]:
from sklearn.preprocessing import PolynomialFeatures
# Train/test accuracy of logistic regression on polynomial features of degree 1..5.
acc_train=[]
acc_test=[]

for i in range(1,6):
    poly_reg = PolynomialFeatures(degree=i)

    # Fit the transformer on the training split only and reuse it for the test
    # split; preprocessing must never be re-fit on held-out data.
    X_tr_poly = poly_reg.fit_transform(xtrain)
    X_tst_poly = poly_reg.transform(xtest)

    lr_poly = LogisticRegression(random_state=25)
    lr_poly.fit(X_tr_poly, ytrain)

    y_tr_predicted = lr_poly.predict(X_tr_poly)
    y_tst_predict = lr_poly.predict(X_tst_poly)

    acc_train.append(accuracy_score(ytrain, y_tr_predicted))
    acc_test.append(accuracy_score(ytest, y_tst_predict))
    

In [None]:
# Training vs. testing accuracy as a function of polynomial degree (1..5).
plt.figure(figsize=(18,5))
sns.set_context('poster')

sns.lineplot(x=list(range(1,6)), y=acc_train, label='Training')

sns.lineplot(x=list(range(1,6)), y=acc_test, label='Testing')
# Label the axes so the figure stands on its own.
plt.xlabel('Polynomial Degree')
plt.ylabel('Accuracy Score')

# 2. K Nearest Neighbors (KNN)

### Task 4: Create Independent and Dependent Variables

In [None]:
#write code here
# TODO: rebuild the feature matrix X and target Y for the KNN section.
# These names overwrite the X / Y created for logistic regression.
X = None
Y = None

### Task 5: Split the dataset into train and test sets

In [None]:
#write code here
# TODO: replace None with a train_test_split(...) call on X and Y.
# As written, unpacking None into four names raises TypeError.
xtrain, xtest, ytrain, ytest = None
# Sanity check: feature and target splits should have matching row counts.
print(xtrain.shape, ytrain.shape)
print(xtest.shape, ytest.shape)

For KNN, we need to standardize the data first, because distance computations are sensitive to the scale of each feature

In [None]:
from sklearn.preprocessing import StandardScaler 

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(xtrain)
X_train_=scaler.transform(xtrain)
X_test_=scaler.transform(xtest)
# transform() returns plain arrays; rebuild DataFrames with the original column
# names AND row index so they stay aligned with ytrain / ytest in pandas operations.
X_train=pd.DataFrame(data=X_train_, columns=xtrain.columns, index=xtrain.index)
X_test=pd.DataFrame(data=X_test_, columns=xtest.columns, index=xtest.index)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# KNN classifier that uses the 3 nearest neighbours of each query point.
clf_knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit on the standardized training features.
clf_knn.fit(X_train,ytrain)

In [None]:
# Hard predictions and class-1 probabilities on the standardized test features.
knn_pred=clf_knn.predict(X_test)
knn_pred_prb=clf_knn.predict_proba(X_test)[:,1]

In [None]:
# Accuracy on the data the model was fitted on; compare with the test accuracy below.
# NOTE: `accuracy_train` is a single score here; the Model Complexity cell later rebinds it to a list.
accuracy_train=accuracy_score(ytrain,clf_knn.predict(X_train))
print("Accuracy Train: {}".format(accuracy_train))

In [None]:
# Accuracy on the held-out test split.
accuracy_knn = accuracy_score(ytest,knn_pred)
print("Accuracy : {}".format(accuracy_knn))

In [None]:
# Per-class precision, recall and F1 score for the KNN predictions.
print(classification_report(ytest,knn_pred))

In [None]:
# ROC points for KNN. This overwrites the fpr / tpr / threshold computed for logistic regression.
fpr,tpr,threshold=roc_curve(ytest,knn_pred_prb)

In [None]:
# Area under the ROC curve for KNN, computed from the class-1 probabilities.
auc_knn=roc_auc_score(ytest,knn_pred_prb)
auc_knn

In [None]:
# Same plotting context and ROC helper as for logistic regression, now with the KNN curve.
sns.set_context('poster')
plot_roc_curve(fpr,tpr,label='AUC = %0.3f'% auc_knn)

### Model Complexity

In [None]:
# Train/test accuracy of KNN for k = 1..25 neighbours.
accuracy_train=[]
accuracy_test=[]
for i in range(1,26):
    knn_model = KNeighborsClassifier(n_neighbors=i)
    knn_model.fit(X_train,ytrain)

    # Score directly from the per-k model, so the notebook-level knn_pred /
    # knn_pred_prb are not overwritten and no unused probabilities are computed.
    accuracy_train.append(accuracy_score(ytrain,knn_model.predict(X_train)))
    accuracy_test.append(accuracy_score(ytest,knn_model.predict(X_test)))



In [None]:
# Training vs. testing accuracy as a function of the number of neighbours (1..25).
plt.figure(figsize=(18,5))
sns.set_context('poster')

# The x values must cover the same range(1, 26) used to build the accuracy lists.
sns.lineplot(x=list(range(1,26)), y=accuracy_train, label='Training')

sns.lineplot(x=list(range(1,26)), y=accuracy_test, label='Testing')
plt.xlabel('Number of Neighbours')
plt.ylabel('Accuracy Score')

# 3. Submission on Kaggle

### Task 6: Import test data

In [None]:
#write code here
# TODO: load the Kaggle test set.
# Presumably `test` is the raw file and `df_test` the working frame, mirroring
# `data` / `df` in Task 1 — confirm.
test = None
df_test = None

### Task 7: Do One Hot encoding of test data

In [None]:
#write code here


### Task 8: Separate Passenger ID for submission

In [None]:
#Write code here


### Task 9: Do prediction through final model

In [None]:
#write code here
# TODO: assign the chosen final model's predictions for the encoded test data;
# the submission frame below uses this value as its 'Survived' column.
pred_final=None

#### Creating Data Frame for submission

In [None]:
# Two-column submission frame: passenger id and predicted survival.
# NOTE: `PassengerID` is not defined anywhere above; it must be created in Task 8,
# and `pred_final` must be filled in Task 9, or this line raises an error.
submission=pd.DataFrame({'PassengerId':PassengerID,'Survived':pred_final})

In [None]:
# Preview the submission before exporting it.
submission.head()

### Task 10: Export the dataset into csv file

In [None]:
# Write code here
