# Compute performance metrics for the given Y and Y_score without sklearn

In [102]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

## **Section A:**

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points >> number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [103]:
# Load the Section A file; it provides the actual label "y" and the score "proba".
df = pd.read_csv("5_a.csv")

# Apply the 0.5 cut-off: a score below it becomes class 0, anything else class 1.
y_predicted = np.where(df["proba"] < 0.5, 0, 1).tolist()

# Store the derived labels next to the scores in column "y_pred".
df["y_pred"] = y_predicted

# Preview the first rows.
df.head()

Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,1
1,1.0,0.635165,1
2,1.0,0.766586,1
3,1.0,0.724564,1
4,1.0,0.889199,1


In [104]:
# 1. Computing confusion matrix

# Boolean masks over the rows for each actual / predicted class.
actual_is_0 = df["y"] == 0
actual_is_1 = df["y"] == 1
predicted_is_0 = df["y_pred"] == 0
predicted_is_1 = df["y_pred"] == 1

TN = int((actual_is_0 & predicted_is_0).sum())
FN = int((actual_is_1 & predicted_is_0).sum())
FP = int((actual_is_0 & predicted_is_1).sum())
# Every row that is not a TN, FN or FP is counted as a true positive.
TP = len(df) - TN - FN - FP

# Layout: rows are the predicted class (0 then 1), columns the actual class (0 then 1).
confusion_matrix = [[TN, FN], [FP, TP]]

print("Confusion matrix:", confusion_matrix)

Confusion matrix: [[0, 0], [100, 10000]]


In [105]:
# 2. Computing F1 score

# Precision = TP / (TP + FP) and recall = TP / (TP + FN). Either denominator is
# zero when nothing was predicted positive or when there are no actual
# positives; use 0.0 in that case instead of raising ZeroDivisionError.
pr = TP / (TP + FP) if (TP + FP) else 0.0
re = TP / (FN + TP) if (FN + TP) else 0.0

# F1 is the harmonic mean of precision and recall (0.0 when both are 0).
f1_score = 2 * ((pr * re) / (pr + re)) if (pr + re) else 0.0
print("F1 Score:", f1_score)

F1 Score: 0.9950248756218906


In [106]:
# 3. Computing AUC Score

# Calculating FPR and TPR values
def calculate_FPR_TPR_values(df):
    """Return the ROC curve points for the labels and scores stored in ``df``.

    Every distinct value of ``df["proba"]`` is used as a threshold, from the
    highest to the lowest, and a row is predicted positive when its score is
    greater than or equal to the threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column ``"y"`` with the actual labels (0 or 1) and a
        column ``"proba"`` with the scores.

    Returns
    -------
    tuple of (list of float, list of float)
        ``(FPR_values, TPR_values)``. Both lists start with the origin
        ``0.0`` and then hold one value per distinct score, in order of
        decreasing threshold, so they can be passed directly to a
        trapezoidal integration of TPR over FPR.

    Raises
    ------
    ValueError
        If ``df`` has no row with ``y == 1`` or no row with ``y == 0``; the
        rates would require a division by zero.
    """
    labels = df["y"].to_numpy()
    scores = df["proba"].to_numpy()

    is_positive = labels == 1
    is_negative = labels == 0
    positive_total = int(is_positive.sum())
    negative_total = int(is_negative.sum())
    if positive_total == 0 or negative_total == 0:
        raise ValueError(
            "TPR/FPR are undefined: 'y' must contain at least one 0 and one 1"
        )

    # Sort once by decreasing score. The running counts below then give, at
    # each position, the TP and FP obtained when that row's score is the
    # threshold, so the data is not rescanned for every threshold.
    order = np.argsort(scores)[::-1]
    sorted_scores = scores[order]
    TP_running = np.cumsum(is_positive[order])
    FP_running = np.cumsum(is_negative[order])

    # Rows with the same score share one threshold: keep only the last row of
    # each run of equal scores.
    threshold_ends = np.append(
        np.flatnonzero(np.diff(sorted_scores)), sorted_scores.size - 1
    )

    # Start the curve at (0, 0); otherwise the area below the first segment is
    # lost whenever the highest score is shared by rows of both classes.
    FPR_values = [0.0] + (FP_running[threshold_ends] / negative_total).tolist()
    TPR_values = [0.0] + (TP_running[threshold_ends] / positive_total).tolist()

    return (FPR_values, TPR_values)

# Integrate TPR over FPR with the trapezoidal rule to get the area under the ROC curve.
fpr_points, tpr_points = calculate_FPR_TPR_values(df)
print("The AUC value is", np.trapz(tpr_points, fpr_points))

The AUC value is 0.48829900000000004


In [107]:
# 4. Computing Accuracy

# Share of rows whose predicted label equals the actual label.
correct_predictions = TN + TP
all_predictions = TN + FN + FP + TP
print("Accuracy:", correct_predictions / all_predictions)

Accuracy: 0.9900990099009901


## **Section B:**

<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data you can see number of positive points << number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [108]:
# Load the Section B file; it provides the actual label "y" and the score "proba".
df_2 = pd.read_csv("5_b.csv")

# Apply the 0.5 cut-off: a score below it becomes class 0, anything else class 1.
y_predicted = np.where(df_2["proba"] < 0.5, 0, 1).tolist()

# Store the derived labels next to the scores in column "y_pred".
df_2["y_pred"] = y_predicted

# Preview the first rows.
df_2.head()

Unnamed: 0,y,proba,y_pred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0


In [109]:
# 1. Computing confusion matrix

# Boolean masks over the rows for each actual / predicted class.
actual_is_0 = df_2["y"] == 0
actual_is_1 = df_2["y"] == 1
predicted_is_0 = df_2["y_pred"] == 0
predicted_is_1 = df_2["y_pred"] == 1

TN = int((actual_is_0 & predicted_is_0).sum())
FN = int((actual_is_1 & predicted_is_0).sum())
FP = int((actual_is_0 & predicted_is_1).sum())
# Every row that is not a TN, FN or FP is counted as a true positive.
TP = len(df_2) - TN - FN - FP

# Layout: rows are the predicted class (0 then 1), columns the actual class (0 then 1).
confusion_matrix_2 = [[TN, FN], [FP, TP]]

print("Confusion matrix:", confusion_matrix_2)

Confusion matrix: [[9761, 45], [239, 55]]


In [110]:
# 2. Computing F1 score

# Precision = TP / (TP + FP) and recall = TP / (TP + FN). Either denominator is
# zero when nothing was predicted positive or when there are no actual
# positives; use 0.0 in that case instead of raising ZeroDivisionError.
pr = TP / (TP + FP) if (TP + FP) else 0.0
re = TP / (FN + TP) if (FN + TP) else 0.0

# F1 is the harmonic mean of precision and recall (0.0 when both are 0).
f1_score = 2 * ((pr * re) / (pr + re)) if (pr + re) else 0.0
print("F1 Score:", f1_score)

F1 Score: 0.2791878172588833


In [111]:
# 3. Computing AUC Score

# Calculating FPR and TPR values
def calculate_FPR_TPR_values(df):
    """Return the ROC curve points for the labels and scores stored in ``df``.

    Every distinct value of ``df["proba"]`` is used as a threshold, from the
    highest to the lowest, and a row is predicted positive when its score is
    greater than or equal to the threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a column ``"y"`` with the actual labels (0 or 1) and a
        column ``"proba"`` with the scores.

    Returns
    -------
    tuple of (list of float, list of float)
        ``(FPR_values, TPR_values)``. Both lists start with the origin
        ``0.0`` and then hold one value per distinct score, in order of
        decreasing threshold, so they can be passed directly to a
        trapezoidal integration of TPR over FPR.

    Raises
    ------
    ValueError
        If ``df`` has no row with ``y == 1`` or no row with ``y == 0``; the
        rates would require a division by zero.
    """
    labels = df["y"].to_numpy()
    scores = df["proba"].to_numpy()

    is_positive = labels == 1
    is_negative = labels == 0
    positive_total = int(is_positive.sum())
    negative_total = int(is_negative.sum())
    if positive_total == 0 or negative_total == 0:
        raise ValueError(
            "TPR/FPR are undefined: 'y' must contain at least one 0 and one 1"
        )

    # Sort once by decreasing score. The running counts below then give, at
    # each position, the TP and FP obtained when that row's score is the
    # threshold, so the data is not rescanned for every threshold.
    order = np.argsort(scores)[::-1]
    sorted_scores = scores[order]
    TP_running = np.cumsum(is_positive[order])
    FP_running = np.cumsum(is_negative[order])

    # Rows with the same score share one threshold: keep only the last row of
    # each run of equal scores.
    threshold_ends = np.append(
        np.flatnonzero(np.diff(sorted_scores)), sorted_scores.size - 1
    )

    # Start the curve at (0, 0); otherwise the area below the first segment is
    # lost whenever the highest score is shared by rows of both classes.
    FPR_values = [0.0] + (FP_running[threshold_ends] / negative_total).tolist()
    TPR_values = [0.0] + (TP_running[threshold_ends] / positive_total).tolist()

    return (FPR_values, TPR_values)

# Integrate TPR over FPR with the trapezoidal rule to get the area under the ROC curve.
fpr_points, tpr_points = calculate_FPR_TPR_values(df_2)
print("The AUC value is", np.trapz(tpr_points, fpr_points))

The AUC value is 0.9377570000000001


In [112]:
# 4. Computing Accuracy

# Share of rows whose predicted label equals the actual label.
correct_predictions = TN + TP
all_predictions = TN + FN + FP + TP
print("Accuracy:", correct_predictions / all_predictions)

Accuracy: 0.9718811881188119


## **Section C:**

<font color='red'><b>C.</b></font> Compute the best threshold (similarly to ROC curve computation) of probability which gives the lowest value of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [113]:
# Reading data for Section C; the next cell reads the "y" and "prob" columns of this frame
df_3 = pd.read_csv("5_c.csv")

# Checking the data: as the last expression of the cell, this displays the first rows
df_3.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [114]:
# Costs used by metric A = 500 * (false negatives) + 100 * (false positives)
FN_COST = 500
FP_COST = 100

actual_labels = df_3["y"].to_numpy()
scores = df_3["prob"].to_numpy()

# Sort the rows once by decreasing score. With a threshold equal to a given
# score, exactly the rows up to (and including) the last row carrying that
# score are predicted 1, so running counts replace a full rescan per threshold.
order = np.argsort(scores)[::-1]
sorted_scores = scores[order]
sorted_labels = actual_labels[order]

positives_at_or_above = np.cumsum(sorted_labels == 1)
negatives_at_or_above = np.cumsum(sorted_labels == 0)

# Rows with the same score share one threshold: keep the last row of each run
# of equal scores.
threshold_ends = np.append(
    np.flatnonzero(np.diff(sorted_scores)), sorted_scores.size - 1
)

# FN: actual 1 scored below the threshold; FP: actual 0 scored at or above it.
false_negatives = positives_at_or_above[-1] - positives_at_or_above[threshold_ends]
false_positives = negatives_at_or_above[threshold_ends]
metric_A_values = FN_COST * false_negatives + FP_COST * false_positives

# Dictionary with each unique threshold (in decreasing order) as "key" and its
# metric A value as "value"
prob_metA_dict = dict(
    zip(sorted_scores[threshold_ends].tolist(), metric_A_values.tolist())
)

# Finding the minimum value of metric A
lowest_metric_A = min(prob_metA_dict.values())

# Reporting every threshold that reaches the minimum value of metric A
for k, v in prob_metA_dict.items():
    if v == lowest_metric_A:
        print("The best threshold of probability which gives lowest values of metric A is", k)
    

The best threshold of probability which gives lowest values of metric A is 0.230039028


## **Section D:**

<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [115]:
# Reading the data for Section D; the following cells read its first two columns by position
df_4 = pd.read_csv("5_d.csv")

# Checking the data: as the last expression of the cell, this displays the first rows
df_4.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [116]:
# 1. Computing Mean Square Error

# The accumulator has its own name so that Python's built-in sum() is not
# shadowed for the cells that run after this one.
squared_error_total = 0

# Iterating over rows of data-frame: i[1] is the first column, i[2] the second
# (displayed above as "y" and "pred")
for i in df_4.itertuples():
    squared_error_total += (i[1] - i[2]) ** 2

print("The Mean Square Error is", round(squared_error_total / len(df_4), 2))

The Mean Square Error is 177.17


In [117]:
# 2. Computing MAPE

abs_error_total = 0
actual_total = 0

# One pass over the rows: the first column is taken as the actual value, the
# second as the predicted value.
for row in df_4.itertuples(index=False):
    actual, predicted = row[0], row[1]
    abs_error_total += abs(actual - predicted)
    actual_total += actual

# The value reported is (sum of absolute errors) / (sum of actual values) * 100,
# i.e. one aggregate ratio rather than a mean of per-row percentages.
mape_percent = round(float(abs_error_total / actual_total) * 100, 2)
print("The Mean Absolute Percentage Error is {0} %".format(mape_percent))

The Mean Absolute Percentage Error is 12.91 %


In [118]:
# 3. Computing R^2 error

# First pass: mean of the actual values (first column).
sum_y = 0
for row in df_4.itertuples(index=False):
    sum_y += row[0]
y_mean = float(sum_y / len(df_4))

# Second pass: residual sum of squares (actual vs. predicted) and total sum of
# squares (actual vs. mean of actual).
SS_res = 0
SS_tot = 0
for row in df_4.itertuples(index=False):
    actual, predicted = row[0], row[1]
    SS_res += (actual - predicted) ** 2
    SS_tot += (actual - y_mean) ** 2

# R^2 = 1 - SS_res / SS_tot
print("R^2 error is", round(float(1 - (SS_res / SS_tot)), 2))

R^2 error is 0.96


In [None]:
# End