In [1]:
import numpy as np
import random
import pandas as pd
import math
import sys

In [2]:
class Dataset:
    def __init__(self, path1='../chapter2/movielens_data/ratings.dat', path2='../chapter2/movielens_data/movies.dat'):
        self.data, self.contents = self.load_data(path1, path2)
    
    def load_data(self, path1, path2):
        data = []
        with open(path1) as file:
            for l in file:
                data.append(tuple(map(int, l.strip().split('::')[:2])))
        
        contents = dict()
        with open(path2, 'rb') as file:
            for l in file:
                l = str(l)[2:-1]
                contents[int(l.strip().split('::')[0])] = l.strip().split('::')[-1].split('|')
        
        for item, cont in contents.items():
            cont[-1] = cont[-1].strip('\\n')
        
        return data, contents
    
    def split_data(self, M=10, k=3, seed=1):
        test = []
        train = []
        random.seed(seed)
        
        for user, item in self.data:
            if random.randint(0, M-1) == k:
                test.append((user, item))
            else:
                train.append((user, item))
        
        def convert_dict(data):
            data_dict = {}
            for user, item in data:
                if user not in data_dict:
                    data_dict[user] = set()
                data_dict[user].add(item)
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict
        
        return convert_dict(train), convert_dict(test)

In [66]:
class Metrics:
    def __init__(self, recs, train_data, test_data):
        self.prec = self.precision(recs, train_data, test_data)
        self.rc = self.recall(recs, train_data, test_data)
        self.cov = self.coverage(recs, test_data, train_data)
    
    
    def precision(self, recs, train_data, test_data):
        correct = 0
        total = 0
        for user, items in test_data.items():
            # items : movies which user have interacted
            if user not in recs:
                continue
            rank = recs[user]
            total += len(rank)
            for movie in rank[0]:
                # movie : our recommendation item
                if movie in items:
                    correct += 1
        prec = round((correct / total)*100, 4)
        
        return prec
    
    
    def recall(self, recs, train_data, test_data):
        correct = 0
        total = 0
        for user, items in test_data.items():
            if user not in recs:
                continue
            rank = recs[user]
            total += len(items)
            for movie in rank[0]:
                if movie in items:
                    correct += 1
        rc = round((correct / total)*100, 4)
        
        return rc
    
    def coverage(self, recs, test_data, train_data):
        all_item, recom_item = set(), set()
        for user in test_data:
            if user not in recs:
                continue
            for item in train_data[user]:
                all_item.add(item)
            rank = recs[user]
            for item in rank[0]:
                recom_item.add(item)
        return round(len(recom_item) / len(all_item) * 100, 4)

In [51]:
class ItemContentKNN:
    def __init__(self, train_data, test_data, contents, N):
        self.N = N
        self.item_similar, self.similar = self.train(train_data, contents)
    
    def train(self, train_data, contents):
        # establish word-item table
        word_item = dict()
        for item in contents:
            for word in contents[item]:
                if word not in word_item:
                    word_item[word] = dict()
                word_item[word][item] = 1
                
        for word in word_item:
            for item in word_item[word]:
                word_item[word][item] /= math.log(1 + len(word_item[word]))
        
        item_similar = dict()
        mo = dict()
        for word in word_item:
            for item in word_item[word]:
                if item not in item_similar:
                    item_similar[item] = dict()
                    mo[item] = 0
                mo[item] += word_item[word][item] ** 2
                for other in word_item[word]:
                    if item == other:
                        continue
                    if other not in item_similar[item]:
                        item_similar[item][other] = 0
                    item_similar[item][other] += word_item[word][item] * word_item[word][other]
        
        for u in item_similar:
            for v in item_similar[u]:
                item_similar[u][v] /= math.sqrt(mo[u] * mo[v])
    
        similar = {k: list(sorted(v.items(), reverse=True, key=lambda x:x[1])) for k, v in item_similar.items()}
        
        # item_similar: {item1:{item2:xxx,item3:xxx ...}, item2:{item1:xxx,item3:xxx ...} ...}
        return item_similar, similar
    
    
    def recommend(self, train_data, test_data, K):
        recs = dict()
        items = dict()
        for user in train_data:
            seen_items = set(train_data[user])
            for item in train_data[user]:
                for u, _ in self.similar[item][:K]:
                    if u in seen_items:
                        continue
                    if u not in items:
                        items[u] = 0
                    items[u] += self.item_similar[item][u]
                    
            recs[user] = list(sorted(items.items(), reverse=True, key=lambda x:x[1]))[:self.N]
            items.clear()
            
        return recs

In [7]:
dataset = Dataset()
train_data, test_data = dataset.split_data()

In [61]:
ick = ItemContentKNN(train_data, test_data, dataset.contents, 10)

In [84]:
recs = ick.recommend(train_data, test_data, 12)

In [85]:
metric = Metrics(recs, train_data, test_data)

In [86]:
print(metric.rc, metric.prec, metric.cov)

0.1634 0.2734 37.8517
