In [1]:
import numpy as np
import pandas as pd
import warnings
import os
import sys
import math
import copy
warnings.filterwarnings('ignore')

In [2]:
class ItemCF:
    """Item-based collaborative filtering over (user_id, movie_id, rating) tables.

    Similarity between items i and j is

        w_ij = sum_{u rated both i and j} 1 / log(1 + |N(u)|)
               / (|N(i)| ** (1 - alpha) * |N(j)| ** alpha)

    where |N(u)| is the number of items user u rated and |N(i)| the number of
    users who rated item i. Each row of the matrix is then scaled so that its
    largest entry is 1.

    Attributes:
        alpha: exponent of the popularity penalty applied to the column item.
        user_items: dict mapping user_id -> set of movie_ids seen in training
            (filled by ``item_similarity``).
        item_list: movie_ids in the row/column order of the similarity matrix
            (filled by ``item_similarity``).
    """

    def __init__(self, alpha=0.5):
        self.alpha = alpha
        # Initialised here so that recommend() on an unfitted model returns
        # "no recommendations" instead of raising AttributeError.
        self.user_items = dict()
        self.item_list = []

    def load_data(self, data_dir='movielens_data'):
        """Read users.dat, ratings.dat and movies.dat from ``data_dir``.

        Args:
            data_dir: directory holding the three '::'-separated files.

        Returns:
            Tuple ``(users, ratings, movies)`` of DataFrames with named columns.
        """
        def read_table(file_name, columns):
            # A multi-character separator is only handled by pandas' python
            # engine; request it explicitly rather than relying on the fallback.
            return pd.read_csv(os.path.join(data_dir, file_name), header=None,
                               names=columns, sep='::', engine='python')

        users = read_table('users.dat',
                           ['user_id', 'gender', 'age', 'occupation', 'zip_code'])
        ratings = read_table('ratings.dat',
                             ['user_id', 'movie_id', 'rating', 'timestamp'])
        movies = read_table('movies.dat', ['movie_id', 'title', 'genres'])

        return users, ratings, movies

    def split_train_test(self, users, ratings, test_ratio=0.2, seed=None):
        """Join ratings with user attributes and split the rows at random.

        Args:
            users: DataFrame with a 'user_id' column.
            ratings: DataFrame with a 'user_id' column.
            test_ratio: fraction of rows placed in the test set (0..1).
            seed: optional integer seed for a private RNG. When None the
                global ``np.random`` state is used, so ``np.random.seed``
                still controls the split.

        Returns:
            Tuple ``(train, test)`` of DataFrames.

        Raises:
            ValueError: if ``test_ratio`` is outside [0, 1].
        """
        if not 0 <= test_ratio <= 1:
            raise ValueError('test_ratio must be between 0 and 1')

        all_data = ratings.merge(users, on='user_id', how='left')

        rng = np.random if seed is None else np.random.RandomState(seed)
        shuffled_indices = rng.permutation(len(all_data))
        test_set_size = int(len(all_data) * test_ratio)

        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]

        return all_data.iloc[train_indices], all_data.iloc[test_indices]

    def item_similarity(self, train_data):
        """Build the row-normalised item-item similarity matrix.

        Also stores ``self.user_items`` and ``self.item_list`` for later use
        by ``recommend``.

        Args:
            train_data: DataFrame with 'user_id' and 'movie_id' columns.

        Returns:
            Square ``numpy.ndarray``; row/column ``n`` corresponds to
            ``self.item_list[n]``. Rows of items that were never co-rated
            with another item are all zeros.
        """
        print('establishing the user_item table...')
        self.user_items = dict()
        for user_id, movie_id in zip(train_data['user_id'], train_data['movie_id']):
            self.user_items.setdefault(user_id, set()).add(movie_id)

        # First-appearance order keeps the matrix layout deterministic.
        self.item_list = list(dict.fromkeys(train_data['movie_id']))
        # One dict lookup per item instead of a linear list.index() scan.
        position = {movie: idx for idx, movie in enumerate(self.item_list)}
        n_items = len(self.item_list)
        W = np.zeros([n_items, n_items])
        if n_items == 0:
            return W

        print('calculating the co-rated matrix...')
        N = np.zeros(n_items)   # number of users who rated each item
        for items in self.user_items.values():
            idx = np.fromiter((position[m] for m in items), dtype=np.intp,
                              count=len(items))
            N[idx] += 1
            # Heavy raters say less about item relatedness, so each of their
            # co-ratings is damped by the inverse log of their activity.
            # idx has no repeats (it comes from a set), so the block update
            # touches every pair exactly once.
            W[np.ix_(idx, idx)] += 1 / math.log(1 + len(items))
        # The block update above also hit (i, i); an item is not its own neighbour.
        np.fill_diagonal(W, 0)

        print('calculating the final similarity matrix...')
        # Every item in item_list was rated at least once, so N >= 1 and the
        # denominator is strictly positive.
        W /= np.outer(N ** (1 - self.alpha), N ** self.alpha)

        # normalization for W: scale each row by its maximum, leaving all-zero
        # rows untouched instead of turning them into NaN via 0 / 0.
        row_max = W.max(axis=1, keepdims=True)
        np.divide(W, row_max, out=W, where=row_max > 0)

        return W

    def recommend(self, ratings, user, W, k=5):
        """Score unseen items for ``user`` from the neighbours of seen items.

        For every item in the user's training history the ``k`` most similar
        items are taken; each one the user has not interacted with adds
        ``similarity * average_rating`` to its score.

        Args:
            ratings: DataFrame with 'movie_id' and 'rating' columns used to
                compute each candidate's average rating.
            user: user_id to recommend for.
            W: similarity matrix laid out like ``self.item_list``.
            k: number of neighbours considered per history item.

        Returns:
            Dict mapping movie_id -> accumulated score. Empty when the user is
            unknown or no neighbour with non-zero similarity exists.
        """
        # if the user is not in the dict, there is nothing to recommend
        if user not in self.user_items:
            return dict()

        position = {movie: idx for idx, movie in enumerate(self.item_list)}
        seen = self.user_items[user]
        average_rating = dict()   # per-call memo: movie_id -> mean rating
        recommend_score = dict()

        for item in seen:
            row = np.asarray(W[position[item]])
            # Stable sort on the negated row: descending similarity, ties
            # resolved by ascending column index.
            top = np.argsort(-row, kind='stable')[:max(k, 0)]

            # An item without any similar item contributes nothing; move on
            # to the rest of the history instead of discarding it all.
            if len(top) == 0 or row[top[0]] == 0:
                continue

            for idx in top:
                similarity = row[idx]
                movie = self.item_list[idx]
                if similarity == 0 or movie in seen:
                    continue
                if movie not in average_rating:
                    average_rating[movie] = ratings.loc[
                        ratings['movie_id'] == movie, 'rating'].mean()
                if np.isnan(average_rating[movie]):
                    # No rating available for this movie: skip it rather than
                    # emit a NaN score.
                    continue
                # recommend_score = similarity_value * average_rating
                recommend_score[movie] = (recommend_score.get(movie, 0)
                                          + similarity * average_rating[movie])

        return recommend_score

In [3]:
# Read the users / ratings / movies tables once; later cells reuse these frames.
loader = ItemCF()
users, ratings, movies = loader.load_data()
print('data loaded')

data loaded


In [4]:
# split_train_test shuffles with the global NumPy RNG; seed it so that
# Restart & Run All reproduces the same train/test split.
SEED = 42
np.random.seed(SEED)

icf = ItemCF(alpha = 0.7)
train_data, test_data = icf.split_train_test(users, ratings)

# The similarity matrix is fitted on the first 1/TRAIN_SAMPLE_DIVISOR of the
# training rows only, which keeps this cell quick to re-run.
TRAIN_SAMPLE_DIVISOR = 100
train_sample = train_data.head(len(train_data) // TRAIN_SAMPLE_DIVISOR)
W = icf.item_similarity(train_sample)
print('done.')

establishing the user_item table...
calculating the co-rated matrix...
calculating the final similarity matrix...
done.


In [None]:
# Distinct users of the held-out split; each one is scored against W.
test_user = list(set(test_data['user_id']))
for user in test_user:
    result = icf.recommend(ratings, user, W, k=3)
    if len(result) == 0:
        # nothing to show for this user
        continue
    print('user_id: ', user)
    for movie, score in result.items():
        # Look the title up by id in the movies table.
        title = movies.loc[movies['movie_id'] == movie, 'title'].values[0]
        print(movie, title, score)
    print('==================================================')