In [None]:
import numpy as np
import pandas as pd
import warnings
import os
import sys
import math
import copy
warnings.filterwarnings('ignore')

In [None]:
class Metrics:
    """Offline evaluation of top-N recommendations against held-out interactions.

    Parameters
    ----------
    user_item : dict
        Maps each evaluated user to the set of items that user interacted
        with in the held-out data.
    recs : dict
        Maps each user to a ranked list of ``(item, score)`` pairs.
    test_data : optional
        Stored on the instance as ``test_data``; no metric in this class
        reads it. It used to be picked up from a notebook global, which made
        the constructor raise ``NameError`` whenever that global was missing.
    """

    def __init__(self, user_item, recs, test_data=None):
        self.recs = recs
        self.test_data = test_data
        self.user_item = user_item

    def _ranked(self, user):
        """Return the recommendation list of ``user`` (empty if there is none)."""
        return self.recs.get(user, ())

    def _count_hits(self):
        """Count recommended items that appear in the user's held-out set."""
        correct = 0
        for user, items in self.user_item.items():
            # items : movies the user has interacted with
            for movie, _score in self._ranked(user):
                # movie : one of our recommended items
                if movie in items:
                    correct += 1
        return correct

    def precision(self):
        """Return hits / number of recommendations, as a percentage rounded to 4 places.

        Returns 0.0 when nothing was recommended.
        """
        total = sum(len(self._ranked(user)) for user in self.user_item)
        if total == 0:
            return 0.0
        return round((self._count_hits() / total) * 100, 4)

    def recall(self):
        """Return ``(recall, hits)``.

        ``recall`` is hits / number of held-out interactions, as a percentage
        rounded to 4 places; ``hits`` is the raw hit count. Returns
        ``(0.0, 0)`` when there are no held-out interactions.
        """
        total = sum(len(items) for items in self.user_item.values())
        if total == 0:
            return 0.0, 0
        correct = self._count_hits()
        return round((correct / total) * 100, 4), correct

    def coverage(self):
        """Return (#distinct recommended items) / (#distinct held-out items).

        The result is a plain ratio (not a percentage) rounded to 4 places,
        or 0.0 when there are no held-out items.
        """
        all_item = set()
        recom_item = set()
        for user, items in self.user_item.items():
            all_item.update(items)
            recom_item.update(movie for movie, _score in self._ranked(user))
        if not all_item:
            return 0.0
        return round(len(recom_item) / len(all_item), 4)

    def popularity(self, item_pop):
        """Return the mean of ``log(1 + item_pop[item])`` over all recommendations.

        ``item_pop`` maps an item to its interaction count. The result is
        rounded to 4 places, or 0.0 when nothing was recommended.
        """
        num = 0
        pop = 0
        for user in self.user_item:
            for movie, _score in self._ranked(user):
                pop += math.log(1 + item_pop[movie])
                num += 1
        if num == 0:
            return 0.0
        return round(pop / num, 4)
                
        

In [None]:
class LFM:
    """Latent factor model for implicit feedback, trained with SGD.

    Every user and every item gets a length-``K`` vector; the predicted
    preference is the sigmoid of their dot product. Items a user interacted
    with are positives (label 1); the most popular items the user did not
    interact with are used as negatives (label 0).

    Parameters
    ----------
    epoches : int
        Number of passes over all users.
    K : int
        Length of the user / item factor vectors.
    alpha : float
        Initial learning rate; ``train`` decays a local copy by 0.9 per epoch.
    lambda_r : float
        L2 regularisation weight.
    ratio : number
        Negatives requested per positive.
    """

    def __init__(self, epoches=10, K=10, alpha=0.01, lambda_r=0.1, ratio=1):
        self.epoches = epoches
        self.item_pool = []
        self.P = dict()
        self.Q = dict()
        self.alpha = alpha
        self.lambda_r = lambda_r
        self.K = K
        self.item_pop = dict()
        self.ratio = ratio
        # Created here so the attribute exists before train() is called.
        self.user_item = dict()

    def get_itemPool(self, train_data):
        """Rebuild ``item_pop`` and ``item_pool`` from ``train_data['movie_id']``.

        ``item_pop`` maps movie_id -> number of rows; ``item_pool`` lists the
        movie ids from most to least frequent (ties keep first-seen order).
        Both are rebuilt from scratch so repeated calls do not duplicate items.
        """
        counts = dict()
        for movie_id in train_data['movie_id']:
            counts[movie_id] = counts.get(movie_id, 0) + 1

        self.item_pop = counts
        # sorted() is stable, so equally popular items stay in first-seen order.
        popular = sorted(counts.items(), reverse=True, key=lambda x: x[1])
        self.item_pool = [movie_id for movie_id, _times in popular]

    def load_data(self, data_dir='movielens_data'):
        """Read ``users.dat``, ``ratings.dat`` and ``movies.dat`` from ``data_dir``.

        Returns ``(users, ratings, movies)`` as DataFrames with named columns.
        """
        def read(filename, columns):
            # A multi-character separator is only supported by the python engine.
            return pd.read_csv(os.path.join(data_dir, filename), header=None,
                               sep='::', engine='python', names=columns)

        users = read('users.dat', ['user_id', 'gender', 'age', 'occupation', 'zip_code'])
        ratings = read('ratings.dat', ['user_id', 'movie_id', 'rating', 'timestamp'])
        movies = read('movies.dat', ['movie_id', 'title', 'genres'])

        return users, ratings, movies

    def split_train_test(self, users, ratings, test_ratio=0.2, seed=None):
        """Left-join ``ratings`` with ``users`` and split the rows at random.

        ``test_ratio`` is the fraction of rows placed in the test set. With
        ``seed=None`` the global NumPy random state is used; otherwise a
        dedicated ``RandomState(seed)`` makes the split repeatable.

        Returns ``(train, test)``.
        """
        all_data = ratings.merge(users, on='user_id', how='left')

        rng = np.random if seed is None else np.random.RandomState(seed)
        shuffled_indices = rng.permutation(len(all_data))
        test_set_size = int(len(all_data) * test_ratio)

        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]

        return all_data.iloc[train_indices], all_data.iloc[test_indices]

    def collect_samples(self, user_items):
        """Build the ``{item: label}`` training samples for one user.

        Items in ``user_items`` (the set of items the user interacted with)
        get label 1. Negatives (label 0) are taken from ``item_pool`` in
        popularity order, skipping anything already labelled, until
        ``len(user_items) * ratio`` negatives are collected or the pool runs
        out. The pool bound is what prevents an IndexError for users who
        interacted with most of the catalogue.
        """
        # collect positive samples
        labels = {movie: 1 for movie in user_items}

        # collect negative samples
        wanted = len(user_items) * self.ratio
        n_negative = 0
        for candidate in self.item_pool:
            if n_negative >= wanted:
                break
            if candidate in labels:
                continue
            labels[candidate] = 0
            n_negative += 1

        return labels

    def loss(self, train_error):
        """Return the sum of the per-user errors in ``train_error``, rounded to 3 places."""
        return round(sum(train_error.values(), 0.0), 3)

    def predict(self, user, item):
        """Return sigmoid(P[user] . Q[item])."""
        rank = np.dot(self.P[user], self.Q[item])
        rank = 1.0 / (1 + np.exp(-rank))

        return rank

    def init_model(self):
        """Draw fresh uniform [0, 1) factor vectors for every known user and item."""
        self.P = {user: np.random.rand(self.K) for user in self.user_item}
        self.Q = {item: np.random.rand(self.K) for item in self.item_pool}

    def train(self, train_data):
        """Fit the model on ``train_data`` (needs ``user_id`` and ``movie_id`` columns).

        Rebuilds the item pool, the user -> item-set table and the factors,
        then runs ``epoches`` SGD passes and prints the summed squared error
        after each pass. ``self.alpha`` is left untouched so that training
        again starts from the configured learning rate.
        """
        # get popular items
        self.get_itemPool(train_data)

        # user_item table
        self.user_item = dict()
        print('establishing user-item tabel...')
        for user_id, movie_id in zip(train_data['user_id'], train_data['movie_id']):
            self.user_item.setdefault(user_id, set()).add(movie_id)

        # initial model
        self.init_model()

        alpha = self.alpha
        for epoch in range(self.epoches):
            print('epoch {} training...'.format(epoch))
            train_error = dict.fromkeys(self.user_item, 0)
            for user, items in self.user_item.items():
                samples = self.collect_samples(items)

                # samples : {movie_id : label}
                for item, label in samples.items():
                    error = label - self.predict(user, item)
                    # Sequential update: Q[item] sees the freshly updated P[user].
                    self.P[user] -= alpha * (-error * self.Q[item] + self.lambda_r * self.P[user])
                    self.Q[item] -= alpha * (-error * self.P[user] + self.lambda_r * self.Q[item])

                    train_error[user] += np.square(error)

            alpha *= 0.9
            print('loss: {}'.format(self.loss(train_error)))
            print('=============================')

    def recommend(self, user, N):
        """Return the top ``N`` ``(item, score)`` pairs for ``user``.

        Only items the user has not interacted with are scored; the score is
        the raw dot product P[user] . Q[item], sorted in descending order.
        Raises ``KeyError`` if ``user`` was not seen during training.
        """
        interacted_item = self.user_item[user]
        # recommend_score : item_id -> score
        recommend_score = {
            item: np.dot(self.P[user], self.Q[item])
            for item in self.item_pool
            if item not in interacted_item
        }

        return sorted(recommend_score.items(), reverse=True, key=lambda x: x[1])[0:N]

In [None]:
# Load the users, ratings and movies tables; the LFM instance created here
# is only a handle for calling load_data() and is discarded afterwards.
users, ratings, movies = LFM().load_data()
print('data loaded')

In [None]:
# Seed NumPy's global RNG before any random draw so the train/test split
# (np.random.permutation) and the factor initialisation (np.random.rand)
# come out the same after Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

lfm = LFM(epoches=200, K=5, alpha=0.02, lambda_r=0.05, ratio=1)
train_data, test_data = lfm.split_train_test(users, ratings)

In [None]:
# Fit on the leading 1/100 of the training rows only, to keep the run short.
lfm.train(train_data.iloc[:len(train_data) // 100])
# To fit on every training row instead: lfm.train(train_data)

In [None]:
# Produce a top-N list for every test user the model was trained on.
N = 100
test_user = list(set(test_data['user_id']))
user_list = []   # test users that also appear in the training table
recs = {}        # user_id -> [(movie_id, score), ...]
for user in test_user:
    if user not in lfm.user_item:
        # No factors were learned for this user, so nothing can be ranked.
        continue
    user_list.append(user)
    recs[user] = lfm.recommend(user, N)
    print('user_id: ', user)
    # To list titles too, look each movie id up in `movies`:
    #for movie, score in recs[user]:
        #print(movie, movies.loc[movies['movie_id'] == movie]['title'].values[0], score)
    #print('==================================================')

In [None]:
# Build user_id -> set of held-out movie ids, restricted to the users that
# received recommendations.
print('establishing user item table...')
# Membership is tested once per test row; a set makes each test O(1) where
# scanning the user_list list made the whole loop quadratic.
evaluated_users = set(user_list)
user_item = dict()
for index, row in test_data.iterrows():
    uid = row['user_id']
    if uid in evaluated_users:
        user_item.setdefault(uid, set()).add(row['movie_id'])
print('done')

In [None]:
# Score the recommendations against the held-out interactions.
metric = Metrics(user_item, recs)
# recall() returns (percentage, hit count); compute it once instead of
# running the full pass twice just to print each element.
recall_pct, n_correct = metric.recall()
print('precision: {}'.format(metric.precision()))
print('recall: {}'.format(recall_pct))
#print('coverage: {}'.format(metric.coverage()))
#print('popularity: {}'.format(metric.popularity(lfm.item_pop)))
print('total: {}'.format(N * len(user_list)))
print('correct: {}'.format(n_correct))

In [None]:
# Average number of training items per user.
each = []
for u, m in lfm.user_item.items():
    each.append(len(m))
m_num = sum(each)
print(m_num / len(lfm.user_item))
# Keep the per-user counts as a DataFrame for further inspection.
each = pd.DataFrame(each)
#print(each.describe())

lfm = LFM(epoches=80, K=2, alpha=0.02, lambda_r=0.01, ratio=1)     100
precision: 8.1894
recall: 1.7076
total: 33690
correct: 2759

In [None]:
# Display the learned user factors: a dict of user_id -> length-K vector.
lfm.P

In [None]:
# Scratch check: draw 5 uniform [0, 1) samples, the same call init_model uses.
np.random.rand(5)