# Computing performance metrics from scratch


In [1]:
import numpy as np
import pandas as pd


#### Loading the data

In [24]:
# Read 5_a.csv and keep its 'proba' column as a plain NumPy array.
dt_5_a = pd.read_csv('5_a.csv')
prob = dt_5_a['proba'].to_numpy()

#### Deriving class labels

In [25]:
# Threshold the probabilities at 0.5 in one vectorized step:
# scores below 0.5 become class 0, everything else becomes class 1.
y_pred = np.where(prob < 0.5, 0, 1)
y_obs = np.asarray(dt_5_a['y'])

#### Calculating performance metrics
##### Confusion Matrix, F1 Score and Accuracy Score

In [26]:
def confusion_matrix_F1_Score_and_Accuracy(y_true, y_pred):
    """
    Print the confusion matrix, F1 score and accuracy of binary predictions.

    Parameters
    ----------
    y_true : array-like
        Observed class labels; entries equal to 1 count as positive and
        entries equal to 0 as negative.
    y_pred : array-like
        Predicted class labels (0/1) with the same shape as ``y_true``.

    Returns
    -------
    None
        The results are written to standard output.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.

    Notes
    -----
    Precision, recall and F1 are reported as 0.0 when their denominator is
    zero (no predicted positives, no actual positives). Accuracy is NaN for
    empty input.
    """
    # Coerce to arrays so that plain Python lists are compared element-wise
    # (``[0, 1] == 0`` on a list is a single False, not a mask).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must have the same shape')

    # counting TN, FN, FP, TP
    TN = int(np.sum((y_true == 0) & (y_pred == 0)))
    FN = int(np.sum((y_true == 1) & (y_pred == 0)))
    FP = int(np.sum((y_true == 0) & (y_pred == 1)))
    TP = int(np.sum((y_true == 1) & (y_pred == 1)))

    # rows are predicted classes, columns are actual classes
    d = np.array([[TN, FN], [FP, TP]])

    # precision and recall, guarded against empty denominators
    precision = TP / (FP + TP) if (FP + TP) else 0.0
    recall = TP / (FN + TP) if (FN + TP) else 0.0

    # F1 is the harmonic mean of precision and recall
    if precision + recall:
        F1_Score = 2 * (precision * recall) / (precision + recall)
    else:
        F1_Score = 0.0

    # accuracy is undefined when there are no samples at all
    total = TN + FN + FP + TP
    Accuracy = (TN + TP) / total if total else float('nan')

    # printing the results
    print('Confusion Matrix')
    print(pd.DataFrame(d, columns = ['Actual:0', 'Actual:1'], index = ['Predicted:0', 'Predicted:1']))
    print('\nF1 Score: ', F1_Score)
    print('\nAccuracy: ', Accuracy)
    

# Report the metrics for the observed labels `y_obs` and the predicted labels `y_pred`.
confusion_matrix_F1_Score_and_Accuracy(y_obs, y_pred)

Confusion Matrix
             Actual:0  Actual:1
Predicted:0         0         0
Predicted:1       100     10000

F1 Score:  0.9950248756218906

Accuracy:  0.9900990099009901


##### AUC Score

In [8]:
def auc(y_true, y_score):
    """
    Print the area under the ROC curve of a binary scorer.

    Every distinct score is used as a threshold (a sample is predicted
    positive when its score is >= the threshold). The resulting (FPR, TPR)
    points, starting from the origin (0, 0), are integrated with the
    trapezoidal rule.

    Parameters
    ----------
    y_true : array-like
        Observed class labels; entries equal to 1 count as positive and
        entries equal to 0 as negative.
    y_score : array-like
        Predicted scores or probabilities, one per entry of ``y_true``.

    Returns
    -------
    None
        The result is written to standard output as ``AUC:  <value>``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_score`` do not have the same number of entries.

    Notes
    -----
    The value is NaN when ``y_true`` has no positive or no negative entries,
    because TPR or FPR is then undefined.
    """
    y_true = np.asarray(y_true).ravel()
    y_score = np.asarray(y_score).ravel()
    if y_true.shape != y_score.shape:
        raise ValueError('y_true and y_score must have the same length')

    is_positive = y_true == 1
    is_negative = y_true == 0
    n_positive = int(is_positive.sum())
    n_negative = int(is_negative.sum())

    if n_positive == 0 or n_negative == 0:
        area = float('nan')
    else:
        # Rank samples from highest to lowest score; the cumulative counts
        # then give TP / FP for "predict positive down to this rank".
        order = np.argsort(y_score, kind='stable')[::-1]
        ranked_scores = y_score[order]
        tp_cumulative = np.cumsum(is_positive[order])
        fp_cumulative = np.cumsum(is_negative[order])

        # Tied scores share one threshold, so keep only the last rank of
        # each run of equal scores.
        cut = np.flatnonzero(np.append(ranked_scores[1:] != ranked_scores[:-1], True))

        # Prepend the origin: a threshold above every score predicts nothing.
        TPR_array = np.concatenate(([0.0], tp_cumulative[cut] / n_positive))
        FPR_array = np.concatenate(([0.0], fp_cumulative[cut] / n_negative))

        # Trapezoidal rule written out explicitly, so it does not depend on
        # whether this NumPy version names it `trapz` or `trapezoid`.
        area = float(np.sum(np.diff(FPR_array) * (TPR_array[1:] + TPR_array[:-1]) / 2.0))

    print('AUC: ', area)

    
# Score the probabilities in `prob` against the observed labels in `y_obs`.
auc(y_obs, prob)

AUC:  0.48829900000000004



<pre>
Computing performance metrics for the given data <strong>5_b.csv</strong>
</pre>


<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

#### Loading the data

In [36]:
# Read 5_b.csv and keep its 'proba' column as a plain NumPy array.
dt_5_b = pd.read_csv('5_b.csv')
prob = dt_5_b['proba'].to_numpy()

#### Deriving Class Labels

In [37]:
# Threshold the probabilities at 0.5 in one vectorized step:
# scores below 0.5 become class 0, everything else becomes class 1.
y_pred = np.where(prob < 0.5, 0, 1)
y_obs = np.asarray(dt_5_b['y'])

#### Calculating Performance Metrics
##### Confusion Matrix, F1 Score and Accuracy Score

In [38]:
def confusion_matrix_F1_Score_and_Accuracy(y_true, y_pred):
    """
    Print the confusion matrix, F1 score and accuracy of binary predictions.

    Parameters
    ----------
    y_true : array-like
        Observed class labels; entries equal to 1 count as positive and
        entries equal to 0 as negative.
    y_pred : array-like
        Predicted class labels (0/1) with the same shape as ``y_true``.

    Returns
    -------
    None
        The results are written to standard output.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.

    Notes
    -----
    Precision, recall and F1 are reported as 0.0 when their denominator is
    zero (no predicted positives, no actual positives). Accuracy is NaN for
    empty input.

    This cell re-declares a function of the same name that already appears
    earlier in the notebook; the later definition is the one in effect from
    here on.
    """
    # Coerce to arrays so that plain Python lists are compared element-wise
    # (``[0, 1] == 0`` on a list is a single False, not a mask).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must have the same shape')

    # counting TN, FN, FP, TP
    TN = int(np.sum((y_true == 0) & (y_pred == 0)))
    FN = int(np.sum((y_true == 1) & (y_pred == 0)))
    FP = int(np.sum((y_true == 0) & (y_pred == 1)))
    TP = int(np.sum((y_true == 1) & (y_pred == 1)))

    # rows are predicted classes, columns are actual classes
    d = np.array([[TN, FN], [FP, TP]])

    # precision and recall, guarded against empty denominators
    precision = TP / (FP + TP) if (FP + TP) else 0.0
    recall = TP / (FN + TP) if (FN + TP) else 0.0

    # F1 is the harmonic mean of precision and recall
    if precision + recall:
        F1_Score = 2 * (precision * recall) / (precision + recall)
    else:
        F1_Score = 0.0

    # accuracy is undefined when there are no samples at all
    total = TN + FN + FP + TP
    Accuracy = (TN + TP) / total if total else float('nan')

    # printing the results
    print('Confusion Matrix')
    print(pd.DataFrame(d, columns = ['Actual:0', 'Actual:1'], index = ['Predicted:0', 'Predicted:1']))
    print('\nF1 Score: ', F1_Score)
    print('\nAccuracy: ', Accuracy)

# Report the metrics for the observed labels `y_obs` and the predicted labels `y_pred`.
confusion_matrix_F1_Score_and_Accuracy(y_obs, y_pred)

Confusion Matrix
             Actual:0  Actual:1
Predicted:0      9761        45
Predicted:1       239        55

F1 Score:  0.2791878172588833

Accuracy:  0.9718811881188119


##### AUC Score

In [39]:
def auc(y_true, y_score):
    """
    Print the area under the ROC curve of a binary scorer.

    Every distinct score is used as a threshold (a sample is predicted
    positive when its score is >= the threshold). The resulting (FPR, TPR)
    points, starting from the origin (0, 0), are integrated with the
    trapezoidal rule.

    Parameters
    ----------
    y_true : array-like
        Observed class labels; entries equal to 1 count as positive and
        entries equal to 0 as negative.
    y_score : array-like
        Predicted scores or probabilities, one per entry of ``y_true``.

    Returns
    -------
    None
        The result is written to standard output as ``AUC:  <value>``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_score`` do not have the same number of entries.

    Notes
    -----
    The value is NaN when ``y_true`` has no positive or no negative entries,
    because TPR or FPR is then undefined.

    This cell re-declares a function of the same name that already appears
    earlier in the notebook; the later definition is the one in effect from
    here on.
    """
    y_true = np.asarray(y_true).ravel()
    y_score = np.asarray(y_score).ravel()
    if y_true.shape != y_score.shape:
        raise ValueError('y_true and y_score must have the same length')

    is_positive = y_true == 1
    is_negative = y_true == 0
    n_positive = int(is_positive.sum())
    n_negative = int(is_negative.sum())

    if n_positive == 0 or n_negative == 0:
        area = float('nan')
    else:
        # Rank samples from highest to lowest score; the cumulative counts
        # then give TP / FP for "predict positive down to this rank".
        order = np.argsort(y_score, kind='stable')[::-1]
        ranked_scores = y_score[order]
        tp_cumulative = np.cumsum(is_positive[order])
        fp_cumulative = np.cumsum(is_negative[order])

        # Tied scores share one threshold, so keep only the last rank of
        # each run of equal scores.
        cut = np.flatnonzero(np.append(ranked_scores[1:] != ranked_scores[:-1], True))

        # Prepend the origin: a threshold above every score predicts nothing.
        TPR_array = np.concatenate(([0.0], tp_cumulative[cut] / n_positive))
        FPR_array = np.concatenate(([0.0], fp_cumulative[cut] / n_negative))

        # Trapezoidal rule written out explicitly, so it does not depend on
        # whether this NumPy version names it `trapz` or `trapezoid`.
        area = float(np.sum(np.diff(FPR_array) * (TPR_array[1:] + TPR_array[:-1]) / 2.0))

    print('AUC: ', area)

    
# Score the probabilities in `prob` against the observed labels in `y_obs`.
auc(y_obs, prob)

AUC:  0.9377570000000001


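Computing the best probability threshold (found by scanning candidate thresholds, as in an ROC curve computation), i.e. the one that gives the lowest value of the metric <b>A</b> = 500 × (number of false negatives) + 100 × (number of false positives), for the given data <strong>5_c.csv</strong>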
<br>

The label of each data point is predicted as: $y^{\text{pred}} = \begin{cases} 0 & \text{if } y_{\text{score}} < \text{threshold} \\ 1 & \text{otherwise} \end{cases}$




#### Loading the data

In [30]:
# Read 5_c.csv and keep its 'prob' column as a plain NumPy array.
dt_5_c = pd.read_csv('5_c.csv')
prob = dt_5_c['prob'].to_numpy()

#### Deriving the class

In [31]:
# Threshold the probabilities at 0.5 in one vectorized step:
# scores below 0.5 become class 0, everything else becomes class 1.
y_pred = np.where(prob < 0.5, 0, 1)
y_obs = np.asarray(dt_5_c['y'])

#### Calculating the best threshold probability

In [32]:
def best_threshold(y_true, y_pred, y_score=None, fn_cost=500, fp_cost=100):
    """
    Print the probability threshold(s) that minimise the cost metric
    A = fn_cost * (number of false negatives) + fp_cost * (number of false positives).

    Every distinct score is tried as a threshold; a sample is predicted
    positive when its score is >= the threshold.

    Parameters
    ----------
    y_true : array-like
        Observed class labels; entries equal to 1 count as positive and
        entries equal to 0 as negative.
    y_pred : array-like
        Kept for backward compatibility with existing calls. It is not used:
        hard labels cannot be re-thresholded, the search needs scores.
    y_score : array-like, optional
        Predicted probabilities, one per entry of ``y_true``. When omitted,
        the notebook-level variable ``prob`` is used.
    fn_cost : number, default 500
        Cost charged for each false negative.
    fp_cost : number, default 100
        Cost charged for each false positive.

    Returns
    -------
    None
        The minimising threshold(s) are written to standard output as an
        array; several values are printed when thresholds tie on A.

    Raises
    ------
    ValueError
        If ``y_true`` and the scores differ in length, or if there are no
        scores to search.
    """
    if y_score is None:
        # Existing two-argument calls rely on the notebook-level `prob`.
        y_score = prob

    y_true = np.asarray(y_true).ravel()
    y_score = np.asarray(y_score).ravel()
    if y_true.shape != y_score.shape:
        raise ValueError('y_true and y_score must have the same length')
    if y_score.size == 0:
        raise ValueError('cannot search for a threshold without any scores')

    # Rank samples from highest to lowest score so that cumulative counts
    # describe "predict positive down to this rank".
    order = np.argsort(y_score, kind='stable')[::-1]
    ranked_scores = y_score[order]
    ranked_positive = (y_true == 1)[order]
    ranked_negative = (y_true == 0)[order]

    # Tied scores share one threshold: keep the last rank of each run.
    cut = np.flatnonzero(np.append(ranked_scores[1:] != ranked_scores[:-1], True))
    Thresholds = ranked_scores[cut]

    # Negatives at or above the threshold are false positives; positives
    # below it are false negatives.
    FP_lst = np.cumsum(ranked_negative)[cut]
    FN_lst = ranked_positive.sum() - np.cumsum(ranked_positive)[cut]

    A = fn_cost * FN_lst + fp_cost * FP_lst

    print('Best Threshold of probability: ', Thresholds[A == A.min()])

# The thresholds are searched over the probabilities in `prob`;
# the hard labels in `y_pred` passed here do not affect the result.
best_threshold(y_obs, y_pred)

Best Threshold of probability:  [0.23003903]


<pre>
Computing performance metrics(for regression) for the given data <strong>5_d.csv</strong>

<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

#### Loading the data

In [34]:
# Read 5_d.csv and keep the observed ('y') and predicted ('pred') columns as NumPy arrays.
dt_5_d = pd.read_csv('5_d.csv')
y_obs = dt_5_d['y'].to_numpy()
y_pred = dt_5_d['pred'].to_numpy()

#### Calculating the Performance Metrics for regression

In [35]:
def performance_metrics_regression(y_true, y_pred):
    """
    Print regression metrics: MSE, MAPE and the coefficient of determination.

    MAPE is computed in its aggregate form, sum(|error|) / sum(y_true),
    rather than as a mean of per-sample ratios.

    Parameters
    ----------
    y_true : array-like of numbers
        Observed target values.
    y_pred : array-like of numbers
        Predicted target values, same shape as ``y_true``.

    Returns
    -------
    None
        The three metrics are written to standard output.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape.

    Notes
    -----
    MAPE is NaN when ``sum(y_true)`` is zero, and R squared is NaN when
    ``y_true`` is constant, since the respective denominator is zero.
    """
    # Coerce to float arrays so plain Python lists work as well as ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must have the same shape')

    residual = y_true - y_pred

    # calculating MSE
    MSE = np.square(residual).mean()

    # calculating MAPE (aggregate form)
    total_observed = np.sum(y_true)
    if total_observed != 0:
        MAPE = np.sum(np.abs(residual)) / total_observed
    else:
        MAPE = float('nan')

    # calculating R_Squared
    y_bar = np.mean(y_true)
    SS_total = np.sum(np.square(y_true - y_bar))
    SS_res = np.sum(np.square(residual))
    if SS_total != 0:
        R_Squared = 1 - (SS_res / SS_total)
    else:
        R_Squared = float('nan')

    print('Mean Square Error: ', MSE)
    print('\nMean Absolute Percentage Error: ', MAPE)
    print('\nR_Squared or Coefficient of determination: ', R_Squared)
    
# Report the regression metrics for the observed values `y_obs` and the predictions `y_pred`.
performance_metrics_regression(y_obs, y_pred)

Mean Square Error:  177.16569974554707

Mean Absolute Percentage Error:  0.1291202994009687

R_Squared or Coefficient of determination:  0.9563582786990937
