# Compute performance metrics for the given Y and Y_score without sklearn

In [1]:
import numpy as np
import pandas as pd
# other than these two you should not import any other packages

<pre>
<font color='red'><b>A.</b></font> Compute performance metrics for the given data <strong>5_a.csv</strong>
   <b>Note 1:</b> in this data the number of positive points >> the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

### <pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [2]:
# Load the part-A data from a path relative to the notebook
data = pd.read_csv('5_a.csv')
# Placeholder column; it is overwritten later when the scores are thresholded at 0.5
data['y_pred'] = 0
data.head()

Unnamed: 0,y,proba,y_pred
0,1.0,0.637387,0
1,1.0,0.635165,0
2,1.0,0.766586,0
3,1.0,0.724564,0
4,1.0,0.889199,0


In [3]:
def pred_values(data,y,thresh):
    """Turn the scores stored in ``data[y]`` into hard 0/1 labels.

    A score strictly below ``thresh`` becomes 0; every other score becomes 1.
    Returns a plain list with one label per score, in iteration order.
    """
    return [0 if score < thresh else 1 for score in data[y]]

In [4]:
def cal_vals(df,y,y_pred):
    """Count the confusion-matrix cells for binary (0/1) labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both label columns.
    y : str
        Name of the column with the actual labels.
    y_pred : str
        Name of the column with the predicted labels.

    Returns
    -------
    dict
        ``{'TN': ..., 'TP': ..., 'FN': ..., 'FP': ...}`` with plain ``int``
        counts. Rows whose actual or predicted value is neither 0 nor 1 are
        not counted in any cell.
    """
    # Work on the underlying arrays so the result does not depend on the
    # frame's index labels (the frame may have been sorted or filtered).
    actual = df[y].to_numpy()
    predicted = df[y_pred].to_numpy()

    actual_pos, actual_neg = actual == 1, actual == 0
    pred_pos, pred_neg = predicted == 1, predicted == 0

    return {
        'TN': int((pred_neg & actual_neg).sum()),
        'TP': int((pred_pos & actual_pos).sum()),
        'FN': int((pred_neg & actual_pos).sum()),
        'FP': int((pred_pos & actual_neg).sum()),
    }

In [17]:
# Threshold the scores at 0.5 to get hard labels, then count TP/TN/FP/FN
data['y_pred'] = pred_values(data,'proba',0.5)
confusion_matrix = cal_vals(data,'y','y_pred')

In [18]:
# Show the confusion-matrix counts (a dict keyed by 'TN', 'TP', 'FN', 'FP')
print("Confusion matrix is: ",confusion_matrix)

Confusion matrix is:  {'TN': 0, 'TP': 10000, 'FN': 0, 'FP': 100}


In [19]:
# Calculating precision and recall
# P is the number of rows whose actual label y equals 1
x=data.y.value_counts()
P=x[1]

# precision = TP / (TP + FP); recall = TP / P
precision=confusion_matrix['TP']/(confusion_matrix['TP']+confusion_matrix['FP'])
recall=confusion_matrix['TP']/P


In [20]:
# Calculating F1 score: the harmonic mean of precision and recall
F1_score = 2*precision*recall/(precision+recall)
print('F1 score is: ',F1_score)

F1 score is:  0.9950248756218906


In [21]:
# Calculating accuracy: correctly classified rows (TP + TN) over the total row count
Acc = (confusion_matrix['TP']+confusion_matrix['TN'])/data.shape[0]
print('Accuracy is: ',Acc)

Accuracy is:  0.9900990099009901


In [22]:
# AUC Score calculation function
from tqdm import tqdm_notebook      # purpose of import is to just see progress
def auc_calc(df):
    """Area under the ROC curve for the ``'y'`` / ``'proba'`` columns of ``df``.

    Every distinct score is used as a threshold (a row is predicted 1 when
    ``proba >= threshold``). The resulting (FPR, TPR) points, preceded by the
    origin (0, 0), are integrated with the trapezoidal rule.

    The rows may be in any order and ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'y'`` column with 0/1 labels and a ``'proba'`` column
        with numeric scores.

    Returns
    -------
    float
        The trapezoidal area under the TPR-vs-FPR curve.

    Raises
    ------
    ValueError
        If ``df`` has no positive (1) rows or no negative (0) rows, because
        TPR or FPR would then be undefined.
    """
    y_true = df['y'].to_numpy()
    scores = df['proba'].to_numpy()

    is_pos = y_true == 1
    is_neg = y_true == 0
    P = int(is_pos.sum())
    N = int(is_neg.sum())
    if P == 0 or N == 0:
        raise ValueError("AUC needs at least one positive and one negative row in 'y'")
    if scores.size == 0:
        raise ValueError("AUC needs at least one score in 'proba'")

    # Walk the rows from the highest score down. After k rows, the cumulative
    # counts are TP and FP for the threshold equal to the k-th score.
    order = np.argsort(-scores, kind='stable')
    scores_desc = scores[order]
    tp_cum = np.cumsum(is_pos[order])
    fp_cum = np.cumsum(is_neg[order])

    # Rows with the same score flip to "predicted 1" together, so keep only
    # the last row of every run of equal scores.
    run_end = np.r_[np.nonzero(np.diff(scores_desc))[0], scores_desc.size - 1]

    # Prepend (0, 0): with a threshold above every score nothing is predicted 1.
    tpr = np.r_[0.0, tp_cum[run_end] / P]
    fpr = np.r_[0.0, fp_cum[run_end] / N]

    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    return integrate(tpr, fpr)

In [23]:
# Sorting the rows from the highest to the lowest predicted score
data=data.sort_values(by ='proba',ascending=False)
# The result of drop() is not assigned, so this line only displays the sorted frame without y_pred
data.drop(columns = ['y_pred'])

Unnamed: 0,y,proba
1664,1.0,0.899965
2099,1.0,0.899828
1028,1.0,0.899825
9592,1.0,0.899812
8324,1.0,0.899768
...,...,...
8294,1.0,0.500081
1630,1.0,0.500058
7421,1.0,0.500058
805,1.0,0.500047


In [24]:
# Compute and print the AUC score for part A
AUC_score = auc_calc(data)
print ('AUC Score is :',AUC_score)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=10100.0), HTML(value='')))


AUC Score is : 0.48829900000000004


<pre>
<font color='red'><b>B.</b></font> Compute performance metrics for the given data <strong>5_b.csv</strong>
   <b>Note 1:</b> in this data the number of positive points << the number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a></li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [5]:
# Load the part-B data from a path relative to the notebook
data2 = pd.read_csv('5_b.csv')
data2.head()

Unnamed: 0,y,proba
0,0.0,0.281035
1,0.0,0.465152
2,0.0,0.352793
3,0.0,0.157818
4,0.0,0.276648


In [6]:
# Placeholder prediction column; it is overwritten when the scores are thresholded at 0.5
data2['y_pred'] = 0
data2.head()

Unnamed: 0,y,proba,y_pred
0,0.0,0.281035,0
1,0.0,0.465152,0
2,0.0,0.352793,0
3,0.0,0.157818,0
4,0.0,0.276648,0


In [7]:
# Calculating the confusion matrix: threshold the scores at 0.5, then count TP/TN/FP/FN
data2['y_pred'] = pred_values(data2,'proba',0.5)
confusion_matrix2 = cal_vals(data2,'y','y_pred')

In [8]:
# Calculating precision and recall
# P is the number of rows whose actual label y equals 1
x = data2.y.value_counts()
P = x[1]
# precision = TP / (TP + FP); recall = TP / P
precision = confusion_matrix2['TP']/(confusion_matrix2['TP']+confusion_matrix2['FP'])
recall = confusion_matrix2['TP']/P

In [9]:
# Calculating F1 score: the harmonic mean of precision and recall
F1_score = 2*precision*recall/(precision+recall)
print('F1 score is: ',F1_score)

F1 score is:  0.2791878172588833


In [10]:
# Calculating the accuracy score: correctly classified rows (TP + TN) over the total row count
accuracy2 = (confusion_matrix2['TP'] + confusion_matrix2['TN']) / data2.shape[0]
print('Accuracy is :',accuracy2)

Accuracy is : 0.9718811881188119


In [31]:
# Calculating AUC Score
# Order the rows from the highest to the lowest score before computing the AUC
data2 = data2.sort_values(by='proba',ascending=False)
AUC_score2 = auc_calc(data2)
print('AUC Score is: ',AUC_score2)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if __name__ == '__main__':


HBox(children=(FloatProgress(value=0.0, max=10100.0), HTML(value='')))


AUC Score is:  0.9377570000000001


<font color='red'><b>C.</b></font> Compute the best probability threshold (similarly to the ROC curve computation), i.e. the one that gives the lowest value of metric <b>A</b> for the given data <strong>5_c.csv</strong>
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [11]:
# Load the part-C data from a path relative to the notebook
data3 = pd.read_csv('5_c.csv')
data3.head()

Unnamed: 0,y,prob
0,0,0.458521
1,0,0.505037
2,0,0.418652
3,0,0.412057
4,0,0.375579


In [12]:
# Report the frame size, then order the rows from the highest to the lowest score
print(data3.shape)
data3 = data3.sort_values(by='prob',ascending=False)
data3.head()

(2852, 2)


Unnamed: 0,y,prob
2634,1,0.957747
2548,1,0.951437
2447,1,0.948638
2788,1,0.944094
2456,1,0.941113


In [34]:
#data3['y'].value_counts()

In [18]:
def min_mat(data3):
    """Evaluate the cost metric A = 500*FN + 100*FP at every candidate threshold.

    Each value in the ``'prob'`` column is tried as a threshold; a row is
    predicted 1 when ``prob >= threshold`` and 0 otherwise.

    Parameters
    ----------
    data3 : pandas.DataFrame
        Must contain a ``'y'`` column with 0/1 labels and a ``'prob'`` column
        with numeric scores. The frame is not modified.

    Returns
    -------
    dict
        Maps each threshold (``float``) to its cost (``int``), in the row
        order of ``data3``. Repeated scores share a single entry.
    """
    labels = data3['y'].to_numpy()
    probs = data3['prob'].to_numpy()

    pos_sorted = np.sort(probs[labels == 1])
    neg_sorted = np.sort(probs[labels == 0])

    # FN at threshold t: positives scored strictly below t.
    fn = np.searchsorted(pos_sorted, probs, side='left')
    # FP at threshold t: negatives scored at or above t.
    fp = neg_sorted.size - np.searchsorted(neg_sorted, probs, side='left')

    cost = 500 * fn + 100 * fp
    # Plain Python scalars keep the keys and values printing as simple numbers.
    return {float(t): int(c) for t, c in zip(probs, cost)}

In [15]:
from tqdm import tqdm_notebook 

In [19]:
# Cost metric A for every candidate threshold, as a dict keyed by threshold
res = min_mat(data3)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=2852.0), HTML(value='')))




In [22]:
# Lowest value of A over all candidate thresholds, and every threshold that reaches it
temp = min(res.values())
res2 = [threshold for threshold, cost in res.items() if cost == temp]
print('Key:Value pair for min value of the specified metric :',res2,temp)

Key:Value pair for min value of the specified metric : [0.2300390278970873] 141000


<pre>
<font color='red'><b>D.</b></font> Compute performance metrics(for regression) for the given data <strong>5_d.csv</strong>
    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [23]:
# Load the part-D regression data from a path relative to the notebook
data4 = pd.read_csv('5_d.csv')
data4.head()

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [24]:
# Report the (rows, columns) size of the regression data
print(data4.shape)

(157200, 2)


In [25]:
def error(df,c1,c2):
    """Return the row-wise differences ``df[c1] - df[c2]`` as a plain list."""
    return [left - right for left, right in zip(df[c1], df[c2])]

In [26]:
# Absolute error
def absolute_error(df,col):
    """Return the absolute value of every entry of ``df[col]`` as a plain list."""
    return [abs(entry) for entry in df[col]]

In [27]:
def mse(df,col):
    """Mean of the squared values in ``df[col]``.

    The sum of squares comes from ``ss_res``; it is divided by the number of
    entries in the column.
    """
    squared_total = ss_res(df,col)
    count = len(df[col])
    return squared_total / count

In [29]:
# Mean absolute percentage error, computed as a ratio of column totals
def mape(df,c1,c2):
    """Return ``sum(df[c1]) / sum(df[c2])``.

    The notebook calls this with the absolute-error column as ``c1`` and the
    actual-value column as ``c2``.
    """
    numerator = sum(df[c1])
    denominator = sum(df[c2])
    return numerator / denominator

In [30]:
def ss_res(df,col):
    """Return the sum of the squared entries of ``df[col]``.

    The total is accumulated entry by entry, starting from 0.
    """
    total = 0
    for entry in df[col]:
        total += entry * entry
    return total

In [43]:
def ss_tot(df,col):
    """Total sum of squares of ``df[col]`` around its own mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    col : str
        Name of the column to measure.

    Returns
    -------
    The sum over all entries of ``(entry - mean) ** 2``, where ``mean`` is the
    mean of ``df[col]`` itself. Returns 0 for an empty column.
    """
    # Take the mean from the column passed in, not from a notebook global,
    # so the function works for any frame and column name.
    mean_val = df[col].mean()
    val = 0
    for value in df[col]:
        deviation = value - mean_val
        val = val + deviation * deviation
    return val

In [35]:
# Signed error (y - pred) per row, and its absolute value, stored as new columns
data4['error'] = error(data4,'y','pred')
data4['abs_error'] = absolute_error(data4,'error')

In [39]:
# Mean squared error: the mean of the squared entries of the 'error' column
MSE = mse(data4,'error')
print("Mean Squared Error : ", MSE)

Mean Squared Error :  177.16569974554707


In [41]:
# MAPE as a ratio of totals: sum of absolute errors divided by the sum of actual values
MAPE=mape(data4,'abs_error','y')
print('MAPE :', MAPE)

MAPE : 0.1291202994009687


In [44]:
# Coefficient of determination: R^2 = 1 - SS_res / SS_tot
# SS_res is the sum of squared errors; SS_tot is the sum of squared deviations of y from its mean
SS_RES=ss_res(data4,'error')
SS_TOT=ss_tot(data4,'y')
R_square= 1- (SS_RES/SS_TOT)
print('Co-efficient of determination value : ',R_square)

Co-efficient of determination value :  0.9563582786990964
