In [104]:
import random
import math
import numpy as np
import time
import pandas as pd
from tqdm import tqdm, trange
import warnings
import copy
import sys
warnings.filterwarnings('ignore')

In [105]:
class Dataset:
    def __init__(self, path='movielens_data/ratings.dat'):
        self.data = self.load_data(path)
    
    def load_data(self, path):
        data = []
        for l in open(path):
            data.append(tuple(map(int, l.strip().split('::')[:2])))
            
        return data
    
    def split_data(self, M=5, k=0, seed=1):
        test = []
        train = []
        random.seed(seed)
        
        for user, item in self.data:
            if random.randint(0, M-1) == k:
                test.append((user, item))
            else:
                train.append((user, item))
        
        def convert_dict(data):
            data_dict = {}
            for user, item in data:
                if user not in data_dict:
                    data_dict[user] = set()
                data_dict[user].add(item)
            data_dict = {k: list(data_dict[k]) for k in data_dict}
            return data_dict
        
        return convert_dict(train), convert_dict(test)

In [106]:
class Metrics:
    def __init__(self, recs, test_data):
        self.prec = self.precision(recs, test_data)
        self.rc = self.recall(recs, test_data)
    
    
    def precision(self, recs, test_data):
        correct = 0
        total = 0
        for user, items in test_data.items():
            # items : movies which user have interacted
            rank = recs[user]
            total += len(rank)
            for movie in rank:
                # movie : our recommendation item
                if movie in items:
                    correct += 1
        prec = round((correct / total)*100, 4)
        
        return prec
    
    
    def recall(self, recs, test_data):
        correct = 0
        total = 0
        for user, items in test_data.items():
            rank = recs[user]
            total += len(items)
            for movie in rank:
                if movie in items:
                    correct += 1
        rc = round((correct / total)*100, 4)
        
        return rc

In [108]:
class LFM:
    def __init__(self, epoches=5, K=10, alpha=0.05, lambda_r=0.01, 
                 ratio=1, N=10):
        self.epoches = epoches
        self.item_pool = []
        self.alpha = alpha
        self.lambda_r = lambda_r
        self.K = K
        self.ratio = ratio
        self.N = N
        self.items = []
        self.pops = []
    

    def get_itempool(self, train):
        all_items = {}
        for user in train:
            for item in train[user]:
                if item not in all_items:
                    all_items[item] = 0
                all_items[item] += 1
                
        all_items = list(all_items.items())
        
        self.items = [x[0] for x in all_items]
        self.pops = [x[1] for x in all_items]
        
        for user, items in train.items():
            for item in items:
                self.item_pool.append(item)

        '''
        popular = sorted(all_items, reverse=True, key=lambda x:x[1])
        for p in popular:
            self.item_pool.append(p[0])
        '''
    
    
    def collect_samples(self, items):
        # collect positive samples
        labels = dict()
        for movie in items:
            labels[movie] = 1
        
        # collect negative smaples
        n_negative = 0
        i = 0
        
        while (n_negative < len(items)*self.ratio) and (i < len(self.item_pool)-1):
            negative_sample = self.item_pool[np.random.randint(0, len(self.item_pool)-1)]
            #negative_sample = self.item_pool[i]
            i += 1
            if negative_sample in labels:
                continue
            labels[negative_sample] = 0
            n_negative += 1
        
        return labels
    
    def collect_samples2(self, items):
        # collect positive samples
        labels = dict()
        for movie in items:
            labels[movie] = 1
        
        # collect negative smaples
        n_negative = 0
        
        seen = set(items)
        pos_num = len(seen)
        item = np.random.choice(self.items, int(pos_num*self.ratio*3), self.pops)
        item = [x for x in item if x not in seen][:int(pos_num*self.ratio)]
        for i in item:
            labels[i] = 0
        
        return labels
    
    
    def loss(self, train_error):
        C = 0.0
        for user, error in train_error.items():
            C += error
            
        return round(C, 3)
        
    
    
    def predict(self, user, item):
        rank = np.dot(self.P[user], self.Q[item])
        rank = 1.0/(1 + np.exp(-rank))
        
        return rank
    
    
    def init_model(self, train_data):
        self.P = dict()
        self.Q = dict()
        for user in train_data:
            self.P[user] = np.random.random(self.K)
        for item in self.items:
            self.Q[item] = np.random.random(self.K)
      
    
    def train(self, train_data):    
        self.init_model(train_data)
        
        error_dict = dict()
        for user in train_data.keys():
            error_dict[user] = 0
            
        
        for epoch in trange(self.epoches):
            train_error = copy.deepcopy(error_dict)
            for user, items in train_data.items():
                samples = self.collect_samples2(items)
                for item, label in samples.items():
                    error = label - self.predict(user, item)
                    self.P[user] += self.alpha * (error * self.Q[item] - self.lambda_r * self.P[user])
                    self.Q[item] += self.alpha * (error * self.P[user] - self.lambda_r * self.Q[item])

                    train_error[user] += np.sqrt(np.square(error))

            #lr *= 0.9 
            print('loss: {}'.format(self.loss(train_error)))
            
    
    def recommend(self, train_data, test_data):
        recs = dict()
        recommend_score = dict()
        for user, u_items in test_data.items():
            recs[user] = []
            recommend_score.clear()
            for item in self.items:
                if item not in train_data[user]:
                    # recommend_score: item : score
                    recommend_score[item] = np.dot(self.P[user], self.Q[item])
            for movie in sorted(list(recommend_score.items()), reverse=True, key=lambda x:x[1])[0:self.N]:
                recs[user].append(movie[0])
        
        return recs

In [109]:
dataset = Dataset()
train_data, test_data = dataset.split_data()

In [110]:
lfm = LFM()
lfm.get_itempool(train_data)

In [111]:
lfm.train(train_data)


  0%|          | 0/5 [00:00<?, ?it/s][A
 20%|██        | 1/5 [00:44<02:59, 44.77s/it][A

loss: 607411.832



 40%|████      | 2/5 [01:26<02:11, 43.99s/it][A

loss: 499337.765



 60%|██████    | 3/5 [02:11<01:28, 44.13s/it][A

loss: 447994.259



 80%|████████  | 4/5 [02:58<00:45, 45.02s/it][A

loss: 419919.474



100%|██████████| 5/5 [03:43<00:00, 45.09s/it][A

loss: 402508.943


In [112]:
recs = lfm.recommend(train_data, test_data)

In [113]:
metric = Metrics(recs, test_data)

In [114]:
metric.rc

7.4158

In [115]:
metric.prec

24.6179