In [1]:
import numpy as np
import pandas as pd
import warnings
import os
import sys
import math
import copy
warnings.filterwarnings('ignore')

In [2]:
class LFM:
    """Latent factor model trained by SGD on implicit feedback.

    Every (user, movie) pair present in the training frame is treated as a
    positive sample (label 1); for each user an equal number of the most
    popular movies the user has *not* interacted with are used as negative
    samples (label 0).  The predicted preference is the sigmoid of the dot
    product of a user vector ``P[user]`` and an item vector ``Q[item]``.

    Attributes:
        epoches:   number of passes over the samples.
        K:         length of each latent vector.
        alpha:     SGD learning rate.
        lambda_r:  L2 regularisation weight used in the updates.
        item_pool: movie ids ordered from most to least frequent in the
                   training data (filled by ``get_itemPool``).
        user_item: ``{user_id: set(movie_id)}`` built by ``train``.
        P, Q:      ``{id: np.ndarray of shape (K,)}`` latent vectors.
    """

    def __init__(self, epoches=10, K=10, alpha=0.01, lambda_r=0.1):
        self.epoches = epoches
        self.item_pool = []
        self.user_item = dict()
        self.P = dict()
        self.Q = dict()
        self.alpha = alpha
        self.lambda_r = lambda_r
        self.K = K

    def get_itemPool(self, train_data):
        """Rebuild ``self.item_pool`` as movie ids sorted by descending frequency.

        @train_data: DataFrame with a 'movie_id' column.

        The pool is replaced (not extended), so calling this method or
        ``train`` more than once does not accumulate duplicate entries.
        Ties keep first-appearance order because ``sorted`` is stable.
        """
        counts = dict()
        # Reading the column directly avoids the per-row Series construction
        # of DataFrame.iterrows().
        for movie in train_data['movie_id'].tolist():
            counts[movie] = counts.get(movie, 0) + 1

        # popular : (movie_id, times)
        popular = sorted(counts.items(), reverse=True, key=lambda x: x[1])
        self.item_pool = [movie for movie, _ in popular]

    def load_data(self, data_dir='movielens_data', encoding=None):
        """Read the users / ratings / movies '::'-separated files.

        @data_dir: directory containing users.dat, ratings.dat, movies.dat.
        @encoding: forwarded to ``pd.read_csv``; ``None`` keeps pandas'
                   default.
        Returns (users, ratings, movies) DataFrames with named columns.
        """
        def read_table(file_name, columns):
            # A multi-character separator is a regular expression, which only
            # the python parser engine supports; request it explicitly rather
            # than relying on the fallback.
            table = pd.read_csv(os.path.join(data_dir, file_name),
                                header=None, sep='::', engine='python',
                                encoding=encoding)
            table.columns = columns
            return table

        users = read_table('users.dat',
                           ['user_id', 'gender', 'age', 'occupation', 'zip_code'])
        ratings = read_table('ratings.dat',
                             ['user_id', 'movie_id', 'rating', 'timestamp'])
        movies = read_table('movies.dat',
                            ['movie_id', 'title', 'genres'])

        return users, ratings, movies

    def split_train_test(self, users, ratings, test_ratio=0.2, random_state=None):
        """Randomly split the ratings (left-joined with users) into train/test.

        @users:        DataFrame with a 'user_id' column.
        @ratings:      DataFrame with a 'user_id' column.
        @test_ratio:   fraction of rows placed in the test set (0..1).
        @random_state: optional seed for a private RandomState; when None the
                       global ``np.random`` state is used.
        Returns (train_frame, test_frame).
        Raises ValueError if test_ratio is outside [0, 1].
        """
        if not 0.0 <= test_ratio <= 1.0:
            raise ValueError('test_ratio must be between 0 and 1')

        all_data = ratings.merge(users, on='user_id', how='left')

        rng = np.random if random_state is None else np.random.RandomState(random_state)
        shuffled_indices = rng.permutation(len(all_data))
        test_set_size = int(len(all_data) * test_ratio)

        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]

        return all_data.iloc[train_indices], all_data.iloc[test_indices]

    def collect_samples(self, user_items):
        '''
        Collect samples: the items the user has interacted with are
        positives (label 1); the most popular items the user has not
        interacted with are negatives (label 0).  At most as many
        negatives as positives are collected; if the item pool runs out
        of unseen items, fewer negatives are returned.
        @user_items: item set of user (set)
        Returns {movie_id: label}.
        '''
        # collect positive samples
        labels = {movie: 1 for movie in user_items}

        # collect negative samples from the item pool, which is ordered by
        # popularity; iterating (instead of indexing) stops cleanly at the
        # end of the pool.
        n_wanted = len(user_items)
        n_negative = 0
        for candidate in self.item_pool:
            if n_negative >= n_wanted:
                break
            if candidate in labels:
                continue
            labels[candidate] = 0
            n_negative += 1

        return labels

    def loss(self, train_error):
        """Sum the per-user errors in ``train_error`` and round to 3 decimals."""
        C = 0.0
        for error in train_error.values():
            C += error

        return round(C, 3)

    def predict(self, user, item):
        """Return sigmoid(P[user] . Q[item]) as a float in [0, 1]."""
        score = float(np.dot(self.P[user], self.Q[item]))
        # Only exponentiate non-positive values so math.exp cannot overflow.
        if score >= 0:
            return 1.0 / (1.0 + math.exp(-score))
        z = math.exp(score)
        return z / (1.0 + z)

    def init_model(self):
        """Draw fresh standard-normal latent vectors for all users and items.

        P and Q are rebuilt from scratch, so vectors from an earlier
        training run are discarded.
        """
        self.P = {user: np.random.normal(size=self.K)
                  for user in self.user_item.keys()}
        self.Q = {item: np.random.normal(size=self.K)
                  for item in self.item_pool}

    def train(self, train_data):
        """Fit P and Q on ``train_data`` with SGD.

        @train_data: DataFrame with 'user_id' and 'movie_id' columns.

        Prints progress and, per epoch, the summed squared error (the
        regularisation term is not included in the printed value).
        """
        # get popular items
        self.get_itemPool(train_data)

        # user_item table
        self.user_item = dict()
        print('establishing user-item tabel...')
        for user, movie in zip(train_data['user_id'].tolist(),
                               train_data['movie_id'].tolist()):
            self.user_item.setdefault(user, set()).add(movie)

        # initial model
        self.init_model()

        # The samples depend only on item_pool and user_item, both fixed
        # during training, so build them once instead of once per epoch.
        # samples : {movie_id : label}
        samples_by_user = {user: self.collect_samples(items)
                           for user, items in self.user_item.items()}

        for epoch in range(self.epoches):
            print('epoch {} training...'.format(epoch))
            train_error = dict.fromkeys(self.user_item, 0)
            for user, samples in samples_by_user.items():
                for item, label in samples.items():
                    error = label - self.predict(user, item)
                    # P[user] is updated in place first; the Q[item] update
                    # below therefore sees the refreshed user vector.
                    self.P[user] += self.alpha * (error * self.Q[item] -
                                                  self.lambda_r * self.P[user])
                    self.Q[item] += self.alpha * (error * self.P[user] -
                                                  self.lambda_r * self.Q[item])
                    train_error[user] += np.square(error)

            #self.alpha *= 0.9
            print('loss: {}'.format(self.loss(train_error)))
            print('=============================')
                

In [3]:
# Load the users / ratings / movies tables through a dedicated LFM instance.
data_loader = LFM()
users, ratings, movies = data_loader.load_data()
print('data loaded')

data loaded


In [4]:
# Seed NumPy's global generator so the shuffled train/test split below (and
# the np.random.normal factor initialisation done later inside train()) gives
# the same result on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Model hyper-parameters: 500 epochs, 15 latent factors, learning rate 0.1,
# L2 regularisation weight 0.01.
lfm = LFM(epoches=500, K=15, alpha=0.1, lambda_r=0.01)
train_data, test_data = lfm.split_train_test(users, ratings)

In [None]:
# Train on the first 1/500 of the training rows only (presumably to keep the
# run time manageable - confirm before using the full set).
subset_rows = int(train_data.shape[0] / 500)
lfm.train(train_data.head(subset_rows))