In [3]:
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, accuracy_score, confusion_matrix
from scipy.special import expit
import pandas as pd
import numpy as np
import seaborn as sns
from fastbook import *
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from dtreeviz.trees import *
from IPython.display import Image, display_svg, SVG


import warnings
warnings.filterwarnings("ignore")

# Create the model

In [4]:
df_train = pd.read_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_train_one.csv')
df_train.drop(columns=['Unnamed: 0'], inplace=True)
df_test = pd.read_csv('/home/jupyter/charliemacuject/research_papers/dme/input/df_test_one.csv')
df_test.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
X_train = df_train.drop(columns=['outcome', 'id'])
y_train = df_train.outcome
X_test = df_test.drop(columns=['outcome', 'id'])
y_test = df_test.outcome

In [6]:
X_train = pd.concat([X_train, pd.get_dummies(X_train.location)], axis=1)
X_train.drop(columns=['location'], inplace=True)
X_test = pd.concat([X_test, pd.get_dummies(X_test.location)], axis=1)
X_test.drop(columns=['location'], inplace=True)

In [7]:
X_train = pd.concat([X_train, pd.get_dummies(X_train.initiation_drug)], axis=1)
X_train.drop(columns=['initiation_drug'], inplace=True)
X_test = pd.concat([X_test, pd.get_dummies(X_test.initiation_drug)], axis=1)
X_test.drop(columns=['initiation_drug'], inplace=True)

In [8]:
X_train.columns

Index(['mean_vision', 'std_vision', 'peak_visual_improvement', 'baseline',
       'boronia', 'box_hill', 'Avastin', 'Eylea', 'IVTA', 'Lucentis',
       'No Injection', 'Ozurdex'],
      dtype='object')

In [9]:
X_test.columns

Index(['mean_vision', 'std_vision', 'peak_visual_improvement', 'baseline',
       'boronia', 'box_hill', 'Avastin', 'Eylea', 'Lucentis', 'No Injection',
       'Ozurdex'],
      dtype='object')

In [10]:
#X_test['Ozurdex'] = 0
X_test['IVTA'] = 0

In [11]:
def accuracy(y_true, y_pred):
    correct_counter = 0
    for yt, yp in zip(y_true, y_pred):
        if yt == yp:
            correct_counter += 1
    return round(correct_counter / len(y_true), 5)

In [12]:
def m_accuracy(m, X, y): 
    return accuracy(m.predict(X), y)

In [40]:
def rf(xs, y, n_estimators=100, max_samples=28,
       max_features=0.5, min_samples_leaf=5, random_state=42, **kwargs):
    return RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features, random_state=42,
        min_samples_leaf=min_samples_leaf, oob_score=True, warm_start=True).fit(xs, y)

In [42]:
m = rf(X_train, y_train)
m_accuracy(m, X_train, y_train), m_accuracy(m, X_test, y_test)

(0.68333, 0.87234)

In [15]:
len(X_train), len(X_test)

(120, 47)

## Metrics

In [1]:
from sklearn import metrics 

l1 = [0,1,1,1,0,0,0,1]
l2 = [0,1,0,1,0,1,0,0]
metrics.accuracy_score(l1, l2)

0.625

In [18]:
y_test.value_counts()

0    28
1    19
Name: outcome, dtype: int64

In [19]:
28 / (28 + 19)

0.5957446808510638

Thus, if we instantiated a model that just predicted a negative OVC, we would have 58%. The distribution of labels is fairly even.

Before learning about precision, we need to define a few terms. Here, we assume that patients with positive OVC are in the positive class (1) and with negative OVC are in the negative class (0).
* __True positive__ (TP): given a patient, if the model predicts they have a positive OVC, and the actual target is 1, is is considered a true positive.
* __True negative__ (TN): given a patient, if the model predicts they have a negative OVC, and the actual target is 0, is is considered a true negative.

In [20]:
def true_positive(y_true, y_pred):
    tp = 0
    for yt, yp in zip(y_true, y_pred):
        if yt == 1 and yp == 1:
            tp += 1
    return tp

def true_negative(y_true, y_pred):
    tn = 0
    for yt, yp in zip(y_true, y_pred):
        if yt == 0 and yp == 0:
            tn += 1
    return tn

def false_positive(y_true, y_pred):
    fp = 0
    for yt, yp in zip(y_true, y_pred):
        if yt == 0 and yp == 1:
            fp += 1
    return fp

def false_negative(y_true, y_pred):
    fn = 0
    for yt, yp in zip(y_true, y_pred):
        if yt == 1 and yp == 0:
            fn += 1
    return fn

We can now re-implement accuracy score using these metrics.

In [26]:
def accuracy_v2(y_true, y_pred):
    tp = true_positive(y_true, y_pred)
    fp = false_positive(y_true, y_pred)
    fn = false_negative(y_true, y_pred)
    tn = true_negative(y_true, y_pred)
    return (tp + tn) / (tp + tn + fp + fn)

In [27]:
accuracy_v2(l1, l2)

0.625

_Precision_ is defined as $TP / (TP + FP)$. Let's say we make a new model on the slightly skewed dataset, and our model correctly identifies 80 negative OVCs out of 90 and 8 positive OVCs out of 10. Thus, we identify 88 out of 100 patients correctly. Accuracy is thus 88%.

However, out of these 100 patients, 10 negative OVCs are misclassified as positive and 2 negatives are misclassified as positive. Thus, $TP=8, TN=80, FP=10, FN=2$. So precision is $8/(8+10)=0.444$. This means our model is correct 44.4% of the time when trying to identify positive OVCs.

Think of precision as being the percentage of correctly predicted positive cases out of the total predicted positive cases. That is, of our patients we predicted were OVC positive, how many actually were?

In [28]:
def precision(y_true, y_pred):
    tp = true_positive(y_true, y_pred)
    fp = false_positive(y_true, y_pred)
    return tp / (tp + fp)

In [29]:
precision(l1, l2)

0.6666666666666666

_Recall_ is defined as $TP / (TP + FN)$. In the above case recall is $8/(8+2)=0.80$. Recall thus calculates how many of the actual positive OVCs our model picks up on.

In [30]:
def recall(y_true, y_pred):
    tp = true_positive(y_true, y_pred)
    fn = false_negative(y_true, y_pred)
    return tp / (tp + fn)

In [31]:
recall(l1, l2)

0.5

It's challenging to choose a value of threshold that gives both good precision and recall values. If the threshold for cutoff is too high, you end up with a smaller number of true positives and a high number of false negatives. This decreases the recall; however, the precision will be high. If the treshold is too low, false positives will increase, and precision will be less.

_F1 score_ is a metric that combines both precision and recall. It is defined as a simple weighted average (harmonic mean) of precision and recall. If we denote precision using P and recall using R, we can represent the F1 score as $F1 = 2PR / (P + R)$. Rearranging, we obtain $F1 = 2TP / (2TP + FP + FN)$.

In [32]:
def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    score = 2* p * r / (p + r)
    return score

In [33]:
f1(l1, l2)

0.5714285714285715

True positive rate is the same as recall, and is also known as _sensitivity_.

In [34]:
def tpr(y_true, y_pred):
    return recall(y_true, y_pred)

In [35]:
def fpr(y_true, y_pred):
    fp = false_positive(y_true, y_pred)
    tn = true_negative(y_true, y_pred)
    return fp / (tn + fp)

## Automating metrics

In [44]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

def model_evaluation(model, X, y_true):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    df = pd.DataFrame({'Accuracy': [accuracy], 'F1_Score': [f1],
                       'Precision': [precision], 'Recall': [recall]})
    return df

In [45]:
model_evaluation(m, X_test, y_test)

Unnamed: 0,Accuracy,F1_Score,Precision,Recall
0,0.87234,0.85,0.809524,0.894737
