# MAP and nDCG

In [82]:
import numpy as np
import pandas as pd
import ml_metrics 

 <h3> Part A - Extract top_n ground truth - One-time execution </h3>
<br> 1. Read the tab-separated file with the columns UserId, TopicId, Weight
<br> 2. Sort by weight (highest first) and extract the top_n topics for each user

In [86]:
#one time execution
#change the file location accordingly
topk = 10
file_location = "../1_Data_UserId_100/day31.txt"
# Reuse only the file name for the output, so the input path may contain any
# number of directory levels (a fixed 3-way unpack of split('/') does not).
destination_location = '../101_Extracted_GT/' + file_location.rsplit('/', 1)[-1]

names=['UserId', 'TopicId', 'Weight']
data = pd.read_csv(file_location, sep='\t', names=names)

# user ids that are considered for the ground truth (100 .. 2557 inclusive)
userlist=np.arange(100, 2558,1)
# not used in this cell; kept in case another cell still refers to it
final_gt = []

# For each user keep the top k topics: highest Weight first, ties broken by
# the smaller TopicId. Sorting by UserId first keeps the users in ascending
# order in the output, and groupby().head() preserves that sorted row order.
top_gt = (
    data[data['UserId'].isin(userlist)]
    .sort_values(['UserId', 'Weight', 'TopicId'], ascending=[True, False, True])
    .groupby('UserId', sort=False)
    .head(topk)
)

# mode='w' replaces any existing file, so a single write is enough
# (no need to truncate first and then append once per user).
top_gt.to_csv(destination_location, encoding='utf-8', index=False, sep='\t', mode='w', header=False)

# Calculate nDCG function

In [12]:
def convert_relavant_nonrelavant(gt,pred):
    """
    Mark every predicted topic as relevant (1) or non-relevant (0).

    Input:
    gt is ground truth topics - a single list (topic ids must be hashable)
    pred is predicted topics - a single list
    Output: returns an integer array with one entry per element of pred:
    1 if the predicted topic is in the ground truth, 0 otherwise.

    A topic that occurs more than once in pred is only credited at its first
    position. Otherwise a repeated hit would be counted several times and the
    DCG could become larger than the ideal DCG (nDCG > 1).
    """
    relevant_topics = set(gt)
    already_credited = set()
    binary_relavant = np.zeros(len(pred), dtype=int)
    for position, topic in enumerate(pred):
        if topic in relevant_topics and topic not in already_credited:
            binary_relavant[position] = 1
            already_credited.add(topic)
    return binary_relavant

# print("convert_relavant_nonrelavant: ",convert_relavant_nonrelavant([5,10,15,20,25],[42,25,5,8,10]))

def calculate_dcg(list1, list2):
    """
    Discounted cumulative gain of a predicted list.

    Input:
    list1 is ground truth topics - a single list
    list2 is predicted topics - a single list
    Output: returns the DCG summed over all elements of the predicted list,
    using the gain (2^rel - 1) and the discount log2(position + 1) with
    positions counted from 1.
    """
    # binary relevance (1 = relevant topic, 0 = non-relevant topic) per prediction
    gains = np.power(2, convert_relavant_nonrelavant(list1, list2)) - 1
    # positions 1..p are discounted by log2(2)..log2(p + 1)
    discounts = np.log2(np.arange(2, len(gains) + 2))
    return np.sum(gains / discounts)
    
#calculate_dcg(gt, pre)
# print("calculate_dcg: ",calculate_dcg([5,10,15,20,25],[42,25,5,8,10]))

# Question: while calculating the iDCG, the ideal ranking treats all the top
# ground truth topics as relavant - what happens when some weights are equal?
# Assumption: every ground truth topic passed in is considered relavant
def calculate_IDCG(list1):
    """
    Ideal DCG for a ground truth list.

    Input:
    list1 is ground truth topics - a single list
    Output: returns the ideal value for DCG, i.e. the DCG of a ranking with
    len(list1) positions in which every position holds a relevant topic.
    """
    n_relevant = len(list1)
    # every position has relevance 1, so each gain is 2^1 - 1
    ideal_gains = np.power(2, np.ones(n_relevant, dtype=int)) - 1
    # positions 1..p are discounted by log2(2)..log2(p + 1)
    discounts = np.log2(np.arange(2, n_relevant + 2))
    return np.sum(ideal_gains / discounts)

# print("calculate_IDCG: ",calculate_IDCG([5,10,15,20,25]))


def nDCG(list1, list2):
    """
    Normalised DCG of a predicted list against a ground truth list.

    Input:
    list1 is ground truth topics - a single list
    list2 is predicted topics - a single list
    Output: returns DCG(list1, list2) / IDCG(list1). When the ground truth is
    empty the ideal DCG is 0; in that case 0.0 is returned instead of dividing
    by zero, so that one empty user does not turn an average into NaN.
    """
    ideal = calculate_IDCG(list1)
    if ideal == 0:
        return 0.0
    return calculate_dcg(list1, list2) / ideal

# print("nDCG---> ",nDCG([5,10,15,20,25],[42,25,5,8,10]))

<h3> Compare ground truth and predicted</h3>
<br> STEP - PART B - Extract top_n predicted values
<br> 1. Read the ground truth file and the predicted file
<br> 2. Prepare the data for comparison
<br> Use different methods to compare

In [104]:
topk=5
gt_file = '../101_Extracted_GT/day61.txt'
#please keep the format as is
# Zmatrix59 is for day60
pred_file = 'Z_all_t4_c20_exp1/predicted.csv'

# 1. extract groundtruth file---------------------------------------------------
names=['UserId', 'TopicId', 'Weight']
# keep the raw frame under its own name so this cell can be re-run safely
groundtruth_raw = pd.read_csv(gt_file, sep='\t',names=names)
#extract only userid
gt_userlist = pd.unique(groundtruth_raw['UserId'])
print(gt_userlist)
print("Ground truth is available for ",len(gt_userlist)," users")
# Series indexed by UserId; each value is that user's list of TopicIds in file
# order (the Weight column is not needed any more)
groundtruth = groundtruth_raw.groupby('UserId')['TopicId'].apply(list)
print("Groundtruth")
print(groundtruth.head())

#2. import predicted interests---------------------------------------------------------------
pi = pd.read_csv(pred_file, sep='\t',names=names).drop(columns=['Weight'])

# Series indexed by UserId; each value is that user's list of predicted TopicIds.
# A Series has no columns, so there are no column names to assign here.
predicted_list = pi.groupby('UserId')['TopicId'].apply(list)
print("\n Predictions available for ",len(predicted_list)," users")

#3. remove users from Predicted interests where ground truth is not available in ground truth----
Filtered_predicted  = predicted_list[predicted_list.index.isin(gt_userlist)]
print("Predicted")
print(Filtered_predicted.head())



[ 100  101  102 ... 2555 2556 2557]
Ground truth is available for  2044  users
Groundtruth
UserId
100       [8, 20, 86, 42, 48, 13, 82, 0, 1, 2]
101        [89, 20, 50, 69, 8, 22, 0, 1, 2, 3]
102    [29, 53, 68, 64, 35, 41, 57, 76, 82, 0]
103            [91, 0, 1, 2, 3, 4, 5, 6, 7, 8]
104         [52, 7, 50, 27, 80, 0, 1, 2, 3, 4]
Name: TopicId, dtype: object

 Predictions available for  2458  users
Predicted
UserId
100    [54, 45, 13, 94, 52, 33, 62, 89, 87, 44]
101    [35, 41, 10, 11, 46, 52, 57, 89, 66, 28]
102     [19, 75, 8, 76, 28, 89, 44, 81, 87, 53]
103     [77, 57, 83, 59, 63, 16, 79, 28, 1, 91]
104     [48, 71, 67, 93, 6, 66, 24, 37, 52, 87]
Name: TopicId, dtype: object


# Call the different evaluation functions

In [105]:
# Both metrics below pair ground truth and prediction by POSITION, so the two
# sequences must contain the same users in the same order. Align them on
# UserId; a ground truth user without any prediction gets an empty prediction
# list (which scores 0) instead of shifting every following pair by one.
pred_by_user = Filtered_predicted.to_dict()
gt_lists = groundtruth.tolist()
pred_lists = [pred_by_user.get(uid, []) for uid in groundtruth.index]

#4. call MAP function
print("\nMAP using in built function:")
print(ml_metrics.mapk(gt_lists, pred_lists, topk))

#5 Call nDCG function
# NOTE(review): MAP is evaluated at topk, but nDCG is computed over the full
# lists as stored in the files - confirm whether nDCG should be cut at topk too.
#zip gt and predicted
tuple_list = list(zip(gt_lists, pred_lists))
count=len(tuple_list)
total = sum((nDCG(gt_topics, pred_topics) for gt_topics, pred_topics in tuple_list), 0.0)
# average over all ground truth users; undefined (NaN) when there are none
final_nDCG = total/count if count else float('nan')
print("\n\nnDCG: ",final_nDCG)



MAP using in built function:
0.047426614481409


nDCG:  0.10190536214927012


In [None]:
#True_positive=I predicted  3, top5 has 3
# False positive: I predicted 3, but top5 does not have any 3
#False negative: top5 has 3, but did not predict 3
#True negative:

In [None]:
 result:#day1_t1_c50
<br> 0.1986483134920635