<a href="https://colab.research.google.com/github/derek-byte/Sololearn-Machine-Learning/blob/main/Sololearn_Model_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Lesson 18.1: Evaluation Metrics

In [None]:
# Accuracy computed from the confusion matrix
# accuracy = (correct predictions) / (all predictions) = 713/887 = 80.38%

Lesson 19.1: Precision and Recall

In [None]:
# Precision is the percentage of positive predictions that are actually positive;
# recall is the percentage of actual positive cases that the model predicts correctly.
# Both are computed from the counts in the confusion matrix.
true_positives = 233
false_positives = 65
false_negatives = 109

# precision = TP / (TP + FP) = positives predicted correctly / all positive predictions
precision = true_positives / (true_positives + false_positives)  # 233 / 298 = 0.7819

# recall = TP / (TP + FN) = positives predicted correctly / all positive cases
recall = true_positives / (true_positives + false_negatives)  # 233 / 342 = 0.6813

# The F1 score is the harmonic mean of precision and recall, giving a single score for the model.
F1 = 2 * (precision * recall) / (precision + recall)  # 0.7281

Lesson 20.1: Calculating Metrics in Scikit-learn

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import pandas as pd

# Load the Titanic data and derive a boolean 'male' feature from the 'Sex' column.
df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'] == 'male'

feature_columns = ['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']
X = df[feature_columns].values
y = df['Survived'].values

# Fit on the whole dataset, then predict those same rows back.
model = LogisticRegression()
model.fit(X, y)
y_pred = model.predict(X)

# Print each metric under its own label, in the order accuracy, precision, recall, f1.
for label, metric in (
    ("accuracy:", accuracy_score),
    ("precision:", precision_score),
    ("recall:", recall_score),
    ("f1 score:", f1_score),
):
    print(label, metric(y, y_pred))

# Rows are the actual classes and columns the predicted classes, negative class first.
print(confusion_matrix(y, y_pred))
# Example output:
# [[475,  70],
#  [103, 239]]
# True Negatives: 475, False Positives: 70, False Negatives: 103, True Positives: 239

Lesson 21.1: Training and Testing 

In [None]:
# A standard breakdown is to put 70-80% of our data in the training set and 20-30% in the test set.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import pandas as pd

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'] == 'male'
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

# An integer test_size is an absolute row count, so exactly 50 rows are held out for testing.
# (Leaving test_size out gives scikit-learn's default split: 75% training, 25% test.)
# random_state fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=27, test_size=50)

print("whole dataset:", X.shape, y.shape)
print("training set:", X_train.shape, y_train.shape)
print("test set:", X_test.shape, y_test.shape)

print(X_train.shape, y_train.shape)
# How to read a shape: X is (rows, features) and y is (rows,).
# A result such as (75, 4) (75,) would mean 75 training rows with 4 feature columns each.
# Here X_train has 6 feature columns, one for each column selected into X above.

# Building a Scikit-learn Model Using a Training Set
# Create the model in this cell instead of relying on one left over from an earlier cell.
model = LogisticRegression()
model.fit(X_train, y_train)
# score() is evaluated on the held-out test rows, which the model did not see while fitting.
print(model.score(X_test, y_test))

Lesson 22.1: Foundations of the ROC Curve

In [None]:
# Counts from the lesson's confusion matrix (887 passengers in total, so
# true negatives = 887 - 233 - 65 - 109 = 480).
true_positives = 233
false_negatives = 109
true_negatives = 480
false_positives = 65

# The sensitivity is another term for the recall, which is the true positive rate:
# sensitivity = recall = positives predicted correctly / positive cases = TP / (TP + FN)
sensitivity = recall = true_positives / (true_positives + false_negatives)  # 233 / 342 = 0.6813

# The specificity is the true negative rate:
# specificity = negatives predicted correctly / negative cases = TN / (TN + FP)
specificity = true_negatives / (true_negatives + false_positives)  # 480 / 545 = 0.8807

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support, recall_score
import pandas as pd

# Sensitivity is the recall of the positive class, so scikit-learn's recall_score is reused as-is.
sensitivity_score = recall_score


def specificity_score(y_true, y_pred):
  """Return the recall of the first label found in the targets.

  precision_recall_fscore_support reports one recall value per label, with
  labels in sorted order. For 0/1 targets the first entry therefore belongs
  to the negative class, which makes it the true negative rate (specificity).
  """
  _, recall_per_class, _, _ = precision_recall_fscore_support(y_true, y_pred)
  return recall_per_class[0]

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'] == 'male'
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit on the training rows only. Fitting on all of X would let the model see the
# test rows it is evaluated on below, which inflates the reported metrics.
model = LogisticRegression()
model.fit(X_train, y_train)

# predict_proba returns one column per class; column 1 is the probability of the positive class.
y_pred_proba = model.predict_proba(X_test)
# Raise the decision threshold from the default 0.5 to 0.75: predict positive only when
# the model is at least 75% confident. This trades sensitivity for specificity.
y_pred = y_pred_proba[:, 1] > 0.75
print("predict_proba", y_pred_proba)

print("sensitivity:", sensitivity_score(y_test, y_pred))
print("specificity:", specificity_score(y_test, y_pred))

Lesson 23.1: The ROC Curve

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'] == 'male'
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Model 1 uses all six features.
model1 = LogisticRegression()
model1.fit(X_train, y_train)

# roc_auc_score needs the positive-class probabilities (column 1), not hard predictions.
y_pred_proba1 = model1.predict_proba(X_test)
print("model 1 AUC score:", roc_auc_score(y_test, y_pred_proba1[:, 1]))

# Model 2 uses only the first two feature columns (Pclass and male), for comparison.
model2 = LogisticRegression()
model2.fit(X_train[:, 0:2], y_train)
y_pred_proba2 = model2.predict_proba(X_test[:, 0:2])
print("model 2 AUC score:", roc_auc_score(y_test, y_pred_proba2[:, 1]))

# Graph: ROC curve of a model trained on all six features.
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred_proba = model.predict_proba(X_test)
# roc_curve returns the false positive rate, the true positive rate and the thresholds used.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba[:,1])

plt.plot(fpr, tpr)
# Dashed diagonal from (0, 0) to (1, 1), drawn as a reference line.
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('1 - specificity')
plt.ylabel('sensitivity')
plt.show()

Lesson 24.1: K-Fold Cross Validation

In [None]:
# This process for creating multiple training and test sets is called k-fold cross validation.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import numpy as np

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'] == 'male'
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

# A single random train/test split; running the cell again gives a different split and
# therefore different scores, which is the motivation for k-fold cross validation.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# building the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluating the Model
# The metrics below compare class labels, so use predict() (0/1 labels) rather than
# predict_proba() (a two-column array of probabilities, which these metrics reject).
y_pred = model.predict(X_test)
# '.5f' prints five decimal places; the previous '5f' only set a minimum width.
print("accuracy {0:.5f}".format(accuracy_score(y_test, y_pred)))
print("precision {0:.5f}".format(precision_score(y_test, y_pred)))
print("recall {0:.5f}".format(recall_score(y_test, y_pred)))
print("f1 score {0:.5f}".format(f1_score(y_test, y_pred)))

Lesson 25.1: KFold Class

In [None]:
from sklearn.model_selection import KFold
import pandas as pd

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')

# Keep only the first six rows so the printed fold indices are easy to follow.
two_features = df[['Age', 'Fare']].values
X = two_features[:6]
y = df['Survived'].values[:6]

# Shuffle the rows before splitting so each fold gets a random selection of indices.
kf = KFold(n_splits=3, shuffle=True)
# Each split yields a pair of index arrays: (training rows, test rows).
for train_idx, test_idx in kf.split(X):
  print(train_idx, test_idx)

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
import pandas as pd

# np.mean is used at the end of this cell; import numpy here so the cell does not
# depend on an earlier cell having imported it.
import numpy as np

df = pd.read_csv('https://sololearn.com/uploads/files/titanic.csv')
df['male'] = df['Sex'] == 'male'
X = df[['Pclass', 'male', 'Age', 'Siblings/Spouses', 'Parents/Children', 'Fare']].values
y = df['Survived'].values

# Shuffle the data so the order of the rows is randomized before splitting into folds.
kf = KFold(n_splits=5, shuffle=True)

# Score a single fold first: take the first (train, test) index pair and build the sets from it.
splits = list(kf.split(X))
train_indices, test_indices = splits[0]
X_train = X[train_indices]
X_test = X[test_indices]
y_train = y[train_indices]
y_test = y[test_indices]

model = LogisticRegression()
model.fit(X_train, y_train)
print(model.score(X_test, y_test))

# Print Scores
# Repeat for every fold: fit a fresh model on that fold's training rows and
# record its score on that fold's test rows.
scores = []
kf = KFold(n_splits=5, shuffle=True)
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    scores.append(model.score(X_test, y_test))
print(scores)
# The mean over the five folds is the cross-validated estimate of the model's score.
print(np.mean(scores))