# Compute performance metrics for the given Y and Y_score without sklearn

In [8]:
import numpy as np
import pandas as pd
 
# other than these two you should not import any other packages


## A. Compute performance metrics for the given data '5_a.csv'
 <pre>  <b>Note 1:</b> in this data you can see number of positive points >> number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_a.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a> Note: it should be numpy.trapz(tpr_array, fpr_array) not numpy.trapz(fpr_array, tpr_array)
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [83]:
# Read the task-A dataset; the later cells use its 'y' (label) and 'proba' (score) columns.
df_a=pd.read_csv('5_a.csv')



In [84]:

# Series.apply runs a Python function on every value of the column:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.apply.html#pandas.Series.apply
def pred(y):
    """Turn one probability score into a hard label: 0 below 0.5, otherwise 1."""
    if y < 0.5:
        return 0
    return 1


df_a['y_pred'] = df_a['proba'].apply(pred)

# Quick sanity check of the frame: its shape, its type and the first three rows.
for summary in (df_a.shape, type(df_a), df_a.head(3)):
    print(summary)

(10100, 3)
<class 'pandas.core.frame.DataFrame'>
     y     proba  y_pred
0  1.0  0.637387       1
1  1.0  0.635165       1
2  1.0  0.766586       1


In [89]:
# write your code here for task A
def confusion_matrix(y_i, y_pred):
    """Count the four confusion-matrix cells for binary 0/1 labels.

    Labels and predictions are paired by position, so the function works for
    lists, NumPy arrays and pandas Series regardless of the Series index
    (e.g. after sorting or filtering a DataFrame).

    Args:
        y_i: array-like of true labels; 1 is the positive class, 0 the negative.
        y_pred: array-like of predicted labels, same length as ``y_i``.

    Returns:
        Tuple of Python ints in the order ``(TP, FP, TN, FN)``.
        Entries whose label or prediction is neither 0 nor 1 are not counted.

    Raises:
        ValueError: if ``y_i`` and ``y_pred`` do not have the same shape.
    """
    actual = np.asarray(y_i)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            "y_i and y_pred must have the same shape, got "
            f"{actual.shape} and {predicted.shape}"
        )

    actual_pos = actual == 1
    actual_neg = actual == 0
    predicted_pos = predicted == 1
    predicted_neg = predicted == 0

    # Vectorised counting replaces the per-element Python loop; int() keeps
    # the return values plain Python ints as before.
    TP = int(np.count_nonzero(actual_pos & predicted_pos))
    FP = int(np.count_nonzero(actual_neg & predicted_pos))
    TN = int(np.count_nonzero(actual_neg & predicted_neg))
    FN = int(np.count_nonzero(actual_pos & predicted_neg))
    return TP, FP, TN, FN

def F1_Score(y_i, y_pred):
    """Return the F1 score of binary predictions.

    Formula: F1 = 2 * (precision * recall) / (precision + recall)

    where precision = TP / (TP + FP)
          recall    = TP / (TP + FN)

    Args:
        y_i: true labels, passed through to ``confusion_matrix``.
        y_pred: predicted labels, passed through to ``confusion_matrix``.

    Returns:
        The F1 score as a float. When there are no true positives the score
        is defined as 0.0 (precision and/or recall are zero or undefined)
        instead of raising ZeroDivisionError.
    """
    TP, FP, TN, FN = confusion_matrix(y_i, y_pred)
    # With TP == 0 at least one denominator below is zero, or precision and
    # recall are both 0; every such case means "no useful positive prediction".
    if TP == 0:
        return 0.0
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return 2 * (precision * recall) / (precision + recall)

def accuracy(y_i, y_pred):
    """Return the fraction of predictions that match the true labels.

    accuracy = (TP + TN) / (TP + TN + FP + FN), with the four counts taken
    from ``confusion_matrix(y_i, y_pred)``.
    """
    TP, FP, TN, FN = confusion_matrix(y_i, y_pred)
    correct = TP + TN
    total = correct + FP + FN
    return correct / total

# refer - https://stackoverflow.com/questions/65748968/how-to-compute-auc-score-manually-without-using-sklearn
# https://www.kaggle.com/code/paulrohan2020/performance-metrics-without-sklearn
# https://numpy.org/doc/stable/reference/generated/numpy.trapz.html
def auc_score(df, label_col="y", score_col="proba"):
    """Compute the area under the ROC curve from labels and probability scores.

    Every distinct score is used as a threshold (prediction is 1 when
    ``score >= threshold``), visited from the highest score to the lowest.
    TPR and FPR at each threshold come from cumulative counts over the sorted
    rows, so the whole curve costs one sort instead of one full confusion
    matrix per row. The curve is anchored at (0, 0) so that tied top scores
    do not lose area.

    Args:
        df: DataFrame holding the labels and the scores. It is not modified.
        label_col: name of the 0/1 label column (1 = positive). Default "y".
        score_col: name of the probability-score column. Default "proba".

    Returns:
        The AUC as a float, integrated as trapezoid(tpr, fpr).

    Raises:
        ValueError: if the labels contain no positive or no negative sample,
            because TPR or FPR is undefined in that case.
    """
    labels = df[label_col].to_numpy()
    scores = df[score_col].to_numpy(dtype=float)

    n_pos = int(np.count_nonzero(labels == 1))
    n_neg = int(np.count_nonzero(labels == 0))
    if n_pos == 0 or n_neg == 0:
        raise ValueError(
            "AUC is undefined: both a positive (1) and a negative (0) label are required"
        )

    # Descending scores; a stable sort keeps the result deterministic on ties.
    order = np.argsort(-scores, kind="stable")
    labels = labels[order]
    scores = scores[order]

    # After including rows 0..i as predicted-positive: TP and FP so far.
    tp_cum = np.cumsum(labels == 1)
    fp_cum = np.cumsum(labels == 0)

    # A threshold equal to a tied score admits the whole tie group at once,
    # so only the last row of each run of equal scores is a real ROC point.
    last_of_run = np.append(np.flatnonzero(np.diff(scores)), scores.size - 1)

    # Prepend the (0, 0) point: threshold above every score, nothing predicted positive.
    tpr = np.concatenate(([0.0], tp_cum[last_of_run] / n_pos))
    fpr = np.concatenate(([0.0], fp_cum[last_of_run] / n_neg))

    # numpy.trapz was renamed numpy.trapezoid in NumPy 2.0; support both.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    return float(trapezoid(tpr, fpr))
    

In [90]:
# Unpack in the order confusion_matrix returns its counts: TP, FP, TN, FN.
TP, FP, TN, FN = confusion_matrix(df_a['y'], df_a['y_pred'])
# Arrange the counts as a 2x2 table: first row is [TP, FN], second row is [FP, TN].
p = [[TP, FN], [FP, TN]]
print(p)

[[10000, 0], [100, 0]]


In [91]:
# F1 first, then accuracy, for the 0.5-threshold predictions on dataset A.
# Both look strong, but the confusion matrix printed above has zero true
# negatives: every point is labelled positive.
print(F1_Score(df_a['y'], df_a['y_pred']))
print(accuracy(df_a['y'], df_a['y_pred']))


0.9950248756218906
0.9900990099009901


In [92]:
# AUC for dataset A, computed from the 'y' and 'proba' columns of df_a.
auc = auc_score(df_a)
print(auc)

0.48829900000000004




## B. Compute performance metrics for the given data '5_b.csv'
<pre>
   <b>Note 1:</b> in this data you can see number of positive points << number of negative points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_b.csv</b>
   <b>Note 3:</b> you need to derive the class labels from given score</pre> $y^{pred}= \text{[0 if y_score < 0.5 else 1]}$

<pre>
<ol>
<li> Compute Confusion Matrix </li>
<li> Compute F1 Score </li>
<li> Compute AUC Score, you need to compute different thresholds and for each threshold compute tpr,fpr and then use               numpy.trapz(tpr_array, fpr_array) <a href='https://stackoverflow.com/q/53603376/4084039'>https://stackoverflow.com/q/53603376/4084039</a>, <a href='https://stackoverflow.com/a/39678975/4084039'>https://stackoverflow.com/a/39678975/4084039</a>
Note- Make sure that you arrange your probability scores in descending order while calculating AUC</li>
<li> Compute Accuracy Score </li>
</ol>
</pre>

In [79]:
# Load the task-B dataset and derive hard labels from the probability scores.
# (The former mid-cell `df_b.head()` was dropped: its result was discarded
# because only a cell's last expression is displayed.)
df_b = pd.read_csv('5_b.csv')


def pred(y):
    """Map one probability score to a class label: 0 below 0.5, otherwise 1."""
    return 0 if y < 0.5 else 1


df_b['y_pred'] = df_b['proba'].apply(pred)

In [80]:
# write your code here for task B
# Unpack in the order confusion_matrix returns its counts: TP, FP, TN, FN.
TP, FP, TN, FN = confusion_matrix(df_b['y'], df_b['y_pred'])
# Arrange the counts as a 2x2 table: first row is [TP, FN], second row is [FP, TN].
p = [[TP, FN], [FP, TN]]
print(p)

[[55, 45], [239, 9761]]


In [81]:
# F1 first, then accuracy, for the 0.5-threshold predictions on dataset B.
# Accuracy stays high while F1 is low: the confusion matrix printed above has
# far more false positives (239) than true positives (55).
print(F1_Score(df_b['y'], df_b['y_pred']))
print(accuracy(df_b['y'], df_b['y_pred']))

0.2791878172588833
0.9718811881188119


In [93]:
# AUC for dataset B; this rebinds the notebook-level name `auc` used for dataset A.
auc = auc_score(df_b)
print(auc)

0.9377570000000001


### C. Compute the best threshold (similarly to ROC curve computation) of probability which gives the lowest value of metric <b>A</b> for the given data 
<br>

you will be predicting label of a data points like this: $y^{pred}= \text{[0 if y_score < threshold  else 1]}$

$ A = 500 \times \text{number of false negatives} + 100 \times \text{number of false positives}$

<pre>
   <b>Note 1:</b> in this data you can see number of negative points > number of positive points
   <b>Note 2:</b> use pandas or numpy to read the data from <b>5_c.csv</b>
</pre>

In [96]:
# Load the task-C dataset and derive 0.5-threshold labels from the 'prob' column.
# (The former mid-cell `df_c.head()` was dropped: its result was discarded
# because only a cell's last expression is displayed.)
df_c = pd.read_csv('5_c.csv')


def pred(y):
    """Map one probability score to a class label: 0 below 0.5, otherwise 1."""
    return 0 if y < 0.5 else 1


df_c['y_pred'] = df_c['prob'].apply(pred)

# Show the column names; note the score column here is 'prob', not 'proba'.
print(df_c.columns)

Index(['y', 'prob', 'y_pred'], dtype='object')


In [99]:
 # write your code for task C
def best_threshold(data, label_col="y", score_col="prob", fn_cost=500, fp_cost=100):
    """Find the score threshold that minimises A = fn_cost * FN + fp_cost * FP.

    Candidate thresholds are the distinct values of the score column, visited
    from highest to lowest; a row is predicted positive when
    ``score >= threshold``. FN and FP for every candidate come from cumulative
    counts over the sorted rows, so the search needs one sort rather than a
    full confusion matrix per candidate.

    Args:
        data: DataFrame holding the labels and the scores. It is not modified.
        label_col: name of the 0/1 label column (1 = positive). Default "y".
        score_col: name of the probability-score column. Default "prob".
        fn_cost: cost of one false negative. Default 500.
        fp_cost: cost of one false positive. Default 100.

    Returns:
        The best threshold as a float. When several thresholds share the
        minimum cost, the highest of them is returned.

    Raises:
        ValueError: if ``data`` has no rows.
    """
    labels = data[label_col].to_numpy()
    scores = data[score_col].to_numpy(dtype=float)
    if scores.size == 0:
        raise ValueError("best_threshold() needs at least one row")

    # Descending scores; a stable sort keeps the result deterministic on ties.
    order = np.argsort(-scores, kind="stable")
    labels = labels[order]
    scores = scores[order]

    # After including rows 0..i as predicted-positive: TP and FP so far.
    tp_cum = np.cumsum(labels == 1)
    fp_cum = np.cumsum(labels == 0)

    # One candidate per distinct score: the last row of each run of equal
    # scores. No sentinel value is needed, so a top score of 0 is not skipped.
    last_of_run = np.append(np.flatnonzero(np.diff(scores)), scores.size - 1)

    false_neg = tp_cum[-1] - tp_cum[last_of_run]  # positives left below the threshold
    false_pos = fp_cum[last_of_run]               # negatives at or above the threshold
    cost = fn_cost * false_neg + fp_cost * false_pos

    # argmin returns the first minimum, i.e. the highest threshold among ties.
    best = int(np.argmin(cost))
    return float(scores[last_of_run[best]])

In [100]:
# Threshold on 'prob' that gives the lowest A = 500*FN + 100*FP for dataset C.
print(best_threshold(df_c))

0.2300390278970873



## D. Compute performance metrics (for regression) for the given data 5_d.csv
<pre>    <b>Note 2:</b> use pandas or numpy to read the data from <b>5_d.csv</b>
    <b>Note 1:</b> <b>5_d.csv</b> has two columns, Y and predicted_Y; both are real-valued features
<ol>
<li> Compute Mean Square Error </li>
<li> Compute MAPE: https://www.youtube.com/watch?v=ly6ztgIkUxk</li>
<li> Compute R^2 error: https://en.wikipedia.org/wiki/Coefficient_of_determination#Definitions </li>
</ol>
</pre>

In [9]:
# Read the task-D regression dataset and display its first rows
# (head() is the cell's last expression, so its result is rendered).
df_d=pd.read_csv('5_d.csv')
df_d.head()
 

Unnamed: 0,y,pred
0,101.0,100.0
1,120.0,100.0
2,131.0,113.0
3,164.0,125.0
4,154.0,152.0


In [10]:
# Pull out the actual and predicted columns used by the regression metrics.
# (The former leading `df_d.head()` was dropped: its result was discarded
# because only a cell's last expression is displayed.)
y_i = df_d['y']
y_pred = df_d['pred']

In [11]:
 # write your code for task 5d
# https://www.geeksforgeeks.org/python-mean-squared-error/
# https://www.geeksforgeeks.org/python-coefficient-of-determination-r2-score/
# https://www.geeksforgeeks.org/how-to-calculate-mape-in-python/
def metrics_Regression(df, y_i, y_pred):
    """Compute MSE, MAPE and R^2 for a set of regression predictions.

    With e_i = |y_i - y_pred_i| and n = len(y_i):

        MSE  = sum(e_i^2) / n
        MAPE = sum(e_i) / sum(y_i)          (returned as a fraction, not a percentage)
        R^2  = 1 - SS_res / SS_tot,  SS_res = sum(e_i^2),
                                     SS_tot = sum((y_i - mean(y_i))^2)

    Args:
        df: kept for backward compatibility only; it is neither read nor
            modified (earlier versions added 'ei', 'mse' and 'sstotal'
            columns to it as a side effect).
        y_i: array-like of actual values.
        y_pred: array-like of predicted values, same length as ``y_i``.

    Returns:
        Tuple ``(mse, mape, rsquared)``.

    Raises:
        ValueError: if ``y_i`` and ``y_pred`` differ in length.
    """
    # Fresh Series built from plain arrays: values are paired by position and
    # the caller's objects are never touched.
    actual = pd.Series(np.asarray(y_i, dtype=float))
    predicted = pd.Series(np.asarray(y_pred, dtype=float))
    if len(actual) != len(predicted):
        raise ValueError(
            f"y_i and y_pred must have the same length, got {len(actual)} and {len(predicted)}"
        )

    n = len(actual)
    abs_err = (actual - predicted).abs()
    sq_err = abs_err * abs_err

    ssres = sq_err.sum()
    mse = ssres / n
    mape = abs_err.sum() / actual.sum()

    mean = actual.sum() / n
    deviation = actual - mean
    sstotal = (deviation * deviation).sum()
    rsquared = 1 - (ssres / sstotal)
    return mse, mape, rsquared
    

In [12]:
# metrics_Regression returns its results in the order (mse, mape, rsquared).
mse, mape, rsquared = metrics_Regression(df_d, y_i, y_pred)


In [13]:
# Report the three regression metrics for dataset D.
print("MSE: ", mse)
# MAPE is printed as a fraction (sum of absolute errors / sum of actual values), not a percentage.
print("MAPE: ", mape)
print("RSQUARED: ", rsquared)

MSE:  177.16569974554707
MAPE:  0.1291202994009687
RSQUARED:  0.9563582786990937
