In [38]:
import numpy as np
import pandas as pd
import random
import math

In [74]:
class DataSet:
    def __init__(self, path1, path2):
        self.data, self.profile = self.load_data(path1, path2)
        
    def load_data(self, path1, path2):
        data = []
        with open(path1) as file:
            for line in file:
                line = line.strip().split('\t')[0:2]
                data.append(line)
        print('file1 loaded.')
        
        with open(path2) as file:
            profile = dict()
            for line in file:
                user, gender, age, country, signup = line.strip().split('\t')
                if age == '':
                    age = -1
                profile[user] = {
                    'gender' : gender,
                    'age' : int(age),
                    'country' : country
                }
        print('file2 loaded')
        
        # get first 10000 users
        users = set(list(profile.keys())[:10000])
        data = [x for x in data if x[0] in users]
        profile = {k:profile[k] for k in users}
        
        # data : user : singer_id
        # profile : user : gender, age, country
        
        return data, profile
    
    
    def split_data(self):
        train, test = [], []
        random.seed(1)
        for user, item in self.data:
            if random.randint(0, 9) == 1:  
                test.append((user, item))
            else:
                train.append((user, item))
        
        # create user-item tables
        def convert_dict(data):
            data_dict = {}
            for user, item in data:
                if user not in data_dict:
                    data_dict[user] = set()
                data_dict[user].add(item)
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict
        
        # train/test : {user : (singer1, singer2, ...)}

        return convert_dict(train), convert_dict(test), self.profile

In [91]:
class Metrics:
    def __init__(self, recs, train_data, test_data):
        self.prec = self.precision(recs, train_data, test_data)
        self.rc = self.recall(recs, train_data, test_data)
        self.cov = self.coverage(recs, test_data, train_data)
    
    
    def precision(self, recs, train_data, test_data):
        correct = 0
        total = 0
        for user, items in test_data.items():
            # items : movies which user have interacted
            if user not in recs:
                continue
            rank = recs[user]
            total += len(rank)
            for movie in rank:
                # movie : our recommendation item
                if movie in items:
                    correct += 1
        prec = round((correct / total)*100, 4)
        
        return prec
    
    
    def recall(self, recs, train_data, test_data):
        correct = 0
        total = 0
        for user, items in test_data.items():
            if user not in recs:
                continue
            rank = recs[user]
            total += len(items)
            for movie in rank:
                if movie in items:
                    correct += 1
        rc = round((correct / total)*100, 4)
        
        return rc
    
    def coverage(self, recs, test_data, train_data):
        all_item, recom_item = set(), set()
        for user in test_data:
            if user not in recs:
                continue
            for item in train_data[user]:
                all_item.add(item)
            rank = recs[user]
            for item in rank:
                recom_item.add(item)
        return round(len(recom_item) / len(all_item) * 100, 4)

In [84]:
class MostPopular:
    def __init__(self, train_data, N):
        self.N = N
        self.hot_singer = self.train(train_data)
    
    
    def train(self, train_data):
        singers = dict()
        for user in train_data:
            for singer in train_data[user]:
                if singer not in singers:
                    singers[singer] = 0
                singers[singer] += 1
        
        hot_singer = sorted(singers.items(), reverse=True, key=lambda x:x[1])
        
        return list(hot_singer)
    
    def recommend(self, train_data, test_data):
        recs = dict()
        for user in test_data:
            if user in train_data:
                seen_singers = set(train_data[user])
                recs[user] = [x[0] for x in self.hot_singer if x[0] not in seen_singers][:self.N]
            
        return recs 

In [86]:
class GenderMostPopular:
    def __init__(self, train_data, profile, N):
        self.N = N
        self.hot_f, self.hot_m = self.train(train_data, profile)
    
    def train(self, train_data, profile):
        hot_m = dict()
        hot_f = dict()
        for user in train_data:
            for singer in train_data[user]:
                if profile[user]['gender'] == 'm':
                    if singer not in hot_m:
                        hot_m[singer] = 0
                    hot_m[singer] += 1
                elif profile[user]['gender'] == 'f':
                    if singer not in hot_f:
                        hot_f[singer] = 0
                    hot_f[singer] += 1
        
        hot_f = list(sorted(hot_f.items(), reverse=True, key=lambda x:x[1]))
        hot_m = list(sorted(hot_m.items(), reverse=True, key=lambda x:x[1]))
        
        return hot_f, hot_m
    
    
    def recommend(self, train_data, test_data, profile):
        recs = dict()
        for user in test_data:
            if user in train_data:
                seen_singer = set(train_data[user])
                if profile[user]['gender'] == 'm':
                    recs[user] = [x[0] for x in self.hot_m if x[0] not in seen_singer][:self.N]
                elif profile[user]['gender'] == 'f':
                    recs[user] = [x[0] for x in self.hot_f if x[0] not in seen_singer][:self.N]
                    
        return recs

In [None]:
class AgeMostPopular:
    def __init__(self, train_data, profile, N):
        pass

In [77]:
dataset = DataSet('lastfm-dataset-360K/usersha1-artmbid-artname-plays.tsv', 
                 'lastfm-dataset-360K/usersha1-profile.tsv')

file1 loaded.
file2 loaded


In [78]:
train_data, test_data, profile = dataset.split_data()

In [94]:
mp = MostPopular(train_data, 10)
recs1 = mp.recommend(train_data, test_data)

In [95]:
metric1 = Metrics(recs1, train_data, test_data)
print(metric1.rc, metric1.prec, metric1.cov)

4.5484 2.2507 0.0565


In [87]:
gmp = GenderMostPopular(train_data, profile, 10)
recs2 = gmp.recommend(train_data, test_data, profile)

In [92]:
metric2 = Metrics(recs2, train_data, test_data)
print(metric2.rc, metric2.prec, metric2.cov)

4.7806 2.3644 0.0822


In [83]:
metric.cov

0.0565