In [1]:
from collections import Counter
from collections import OrderedDict
import numpy as np
import pandas
import heapq

In [17]:
# Popularity counters built from the training sessions: how many times each
# item appears in the "visits" part and in the "purchases" part of a session.
visit_popularity = Counter()
purchase_popularity = Counter()

# Each line has the form "<visits>;<purchases>", where both parts are
# comma-separated item lists and the purchases part may be empty.
with open('coursera_sessions_train.txt', 'r') as f:
    # Iterate over the file object directly: xreadlines() no longer exists in
    # Python 3, and direct iteration is just as lazy on Python 2.
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing on the tuple unpacking below.
            continue
        visits, purchases = line.split(';')
        visit_popularity.update(visits.split(','))
        if purchases != '':
            purchase_popularity.update(purchases.split(','))

In [18]:
def recommend_by_purchase(items, max_count):
    """Return up to ``max_count`` distinct items ranked by purchase count.

    Duplicates in ``items`` are collapsed, keeping the first occurrence of
    each item. Items are then ranked by their count in the module-level
    ``purchase_popularity`` mapping, highest first; items missing from the
    mapping are ranked with a count of 0.
    """
    # OrderedDict keys act as an order-preserving "unique" filter.
    unique_items = OrderedDict.fromkeys(items)

    def purchase_count(item):
        return purchase_popularity.get(item, 0)

    return heapq.nlargest(max_count, unique_items, key=purchase_count)

def recommend_by_visit(items, max_count):
    """Return up to ``max_count`` distinct items ranked by visit count.

    Duplicates in ``items`` are collapsed, keeping the first occurrence of
    each item. Items are then ranked by their count in the module-level
    ``visit_popularity`` mapping, highest first; items missing from the
    mapping are ranked with a count of 0.
    """
    # OrderedDict keys act as an order-preserving "unique" filter.
    candidates = OrderedDict.fromkeys(items)

    def visit_count(item):
        return visit_popularity.get(item, 0)

    return heapq.nlargest(max_count, candidates, key=visit_count)

In [19]:
def calculate_metrics(recommend, max_count, sessions_file):
    """Compute average precision@k and recall@k for k = 1..max_count.

    Only sessions that contain at least one purchase are evaluated; sessions
    with an empty purchases part and blank lines are skipped.

    Parameters
    ----------
    recommend : callable
        Called as ``recommend(visits, max_count)`` with the list of visited
        items of one session; must return an ordered sequence of recommended
        items. Anything beyond the first ``max_count`` entries is ignored.
    max_count : int
        Largest cut-off ``k`` to evaluate.
    sessions_file : str
        Path to a text file with one session per line in the form
        ``<visits>;<purchases>``, both parts being comma-separated item lists.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``k`` with columns ``avg_recall@k`` and
        ``avg_precision@k``, rounded to two decimals. All values are NaN when
        the file contains no session with purchases.
    """
    ranks = np.arange(max_count) + 1
    recall_sum = np.zeros(max_count)
    precision_sum = np.zeros(max_count)
    sessions_count = 0

    with open(sessions_file, 'r') as f:
        # Direct iteration replaces xreadlines(), which was removed in Python 3.
        for line in f:
            line = line.strip()
            if not line:
                continue
            visits, purchases = line.split(';')
            if purchases == '':
                continue

            purchases = set(purchases.split(','))
            rec = list(recommend(visits.split(','), max_count))[:max_count]

            # hits[i] is 1 when the (i+1)-th recommendation was purchased.
            # Positions past the end of a short recommendation list stay 0.
            # A list comprehension is used because map() is lazy on Python 3
            # and cannot be turned into an array of booleans by np.array().
            hits = np.zeros(max_count)
            hits[:len(rec)] = [item in purchases for item in rec]
            cumulative_hits = np.cumsum(hits)

            sessions_count += 1
            recall_sum += cumulative_hits / len(purchases)
            precision_sum += cumulative_hits / ranks

    if sessions_count:
        avg_recall = recall_sum / sessions_count
        avg_precision = precision_sum / sessions_count
    else:
        # Nothing to average over: report NaN explicitly rather than relying
        # on a division by zero (and its runtime warning).
        avg_recall = np.full(max_count, np.nan)
        avg_precision = np.full(max_count, np.nan)

    return pandas.DataFrame({
        'k': ranks,
        'avg_recall@k': [round(x, 2) for x in avg_recall],
        'avg_precision@k': [round(x, 2) for x in avg_precision]
    }).set_index('k')

In [21]:
calculate_metrics(recommend_by_purchase, 5, 'coursera_sessions_test.txt')

Unnamed: 0_level_0,avg_precision@k,avg_recall@k
k,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.53,0.46
2,0.38,0.64
3,0.3,0.73
4,0.25,0.79
5,0.21,0.82
