In [125]:
import numpy as np
import pandas as pd
import warnings
import os
import sys
import math
import copy
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True)


class UserCF:
    def __init__(self):
        pass
    
    def load_data(self):
        path1 = 'movielens_data/users.dat'
        path2 = 'movielens_data/ratings.dat'
        path3 = 'movielens_data/movies.dat'
        
        users = pd.read_csv(path1, header = None, sep = '::')
        ratings = pd.read_csv(path2, header = None, sep = '::')
        movies = pd.read_csv(path3, header = None, sep = '::')
        
        users.columns = ['user_id', 'gender', 'age', 'occupation', 'zip_code']
        ratings.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
        movies.columns = ['movie_id', 'title', 'genres']
        
        return users, ratings, movies
    
    def split_train_test(self, users, ratings):
        all_data = ratings.merge(users, on = 'user_id', how = 'left')
        
        shuffled_indices = np.random.permutation(len(all_data))
        test_set_size = int(len(all_data) * 0.2)
        
        test_indices = shuffled_indices[:test_set_size]
        train_indices = shuffled_indices[test_set_size:]
        
        return all_data.iloc[train_indices], all_data.iloc[test_indices]
    
    def user_similarity(self, train_data):
        
        # build inverse table for item_users
        print('establising the inverse table...')
        self.item_users = dict()
        for index, row in train_data.iterrows():
            if row['movie_id'] not in self.item_users:
                self.item_users[row['movie_id']] = set()
            self.item_users[row['movie_id']].add(row['user_id'])
            
        # calculate co-rated items between users
        print('calculating the co-rated matrix...')
        self.user_list = list(set(train_data['user_id']))
        movie_list_len = len(set(train_data['movie_id']))
        
        C = [[0 for i in range(len(self.user_list))] for i in range(len(self.user_list))]
        W = copy.deepcopy(C)
        
        self.N = dict()
        i = 1
        for movie, users in self.item_users.items():
            #print(str(i) + ' in ' + str(movie_list_len))
            i += 1
            for u in users:
                if self.user_list.index(u) not in self.N:
                    self.N[self.user_list.index(u)] = 0
                self.N[self.user_list.index(u)] += 1
                for v in users:
                    if u == v:
                        continue
                    #C[self.user_list.index(u)][self.user_list.index(v)] += 1
                    C[self.user_list.index(u)][self.user_list.index(v)] += 1 / math.log(1 + len(users))
                    
        # calculate finial similarity matrix W
        print('calculating the final similarity matrix...')
        for i in range(len(C)):
            for j in range(len(C)):
                if i == j or C[i][j] == 0:
                    continue
                W[i][j] = C[i][j] / math.sqrt(self.N[i] * self.N[j])
        
        return W
    
    def recommend(self, ratings, user, W, k=5, threshold=3):
        # find the first k highest similarity values
        similar = sorted(enumerate(W[self.user_list.index(user)]), reverse=True, key=lambda x:x[1])[0:k]
        
        movie_list = np.array(k)
        first_k = dict()
        for s in similar:
            # {user_index : similarity_value}
            first_k[s[0]] = s[1]
            # find the movies which the similar uses have interacted
            movie_list = np.append(movie_list, np.array(ratings.loc[ratings['user_id'] == s[0]]['movie_id'].values))
        
        # if the highest value zero, retun emtpy
        if similar[0][1] == 0:
            return []
    
    
        recommend_score = dict()
    
        # if users's 'similar users' have touched the product,
        # but the user have interacted with the producet,
        # give an evluation more than the threshold,
        # then recommend it to the current user
        for movie in movie_list:
            if movie in self.item_users:
                if user not in self.item_users[movie]:    # if user didn't interact with the product
                    for u in first_k.keys():
                        if u in self.item_users[movie]:    # if the similar users have interacted with the product
                            rating = int(ratings.loc[(ratings['user_id'] == u) & (ratings['movie_id'] == movie)]['rating'])
                            if rating >= threshold:
                                if movie not in recommend_score:
                                    recommend_score[movie] = 0
                                recommend_score[movie] += first_k[u] * rating     

                        
        return recommend_score

In [23]:
users, ratings, movies = UserCF().load_data()
print('data loaded')

data loaded


In [126]:
ucf = UserCF()
train_data, test_data = ucf.split_train_test(users, ratings)
W = ucf.user_similarity(train_data.head(int(train_data.shape[0]/1000)))
print('done')

establising the inverse table...
calculating the co-rated matrix...
calculating the final similarity matrix...
done


In [127]:
test_user = list(set(test_data['user_id']))
for user in test_user:
    if user not in ucf.user_list:
        continue
    result = ucf.recommend(ratings, user, W, k=3, threshold=5)
    if len(result) != 0:
        print('user_id: ', user)
        for movie, score in result.items():
            print(movie, movies.loc[movies['movie_id'] == movie]['title'].values[0], score)
        print('==================================================')

user_id:  25
1965 Repo Man (1984) 3.6067376022224087
user_id:  1873
953 It's a Wonderful Life (1946) 4.551196133134186
user_id:  2885
1965 Repo Man (1984) 3.6067376022224087
user_id:  3780
3578 Gladiator (2000) 2.275598066567093
user_id:  5389
527 Schindler's List (1993) 3.6067376022224087
user_id:  5976
527 Schindler's List (1993) 3.6067376022224087
user_id:  5982
3785 Scary Movie (2000) 4.551196133134186


In [88]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [133]:
a = [0,1,2]
b = [a] * len(a)
for i in range(len(b)):
    for j in range(len(b)):
        if i == j:
            continue
        b[i][j] += 3
print(id(b[0]), id(b[1]))

140478493192648 140478493192648


In [131]:
a = [0,1,2]
b = [[0 for i in range(len(a))] for i in range(len(a))]
for i in range(len(b)):
    for j in range(len(b)):
        if i == j:
            continue
        b[i][j] += 3
b

[[0, 3, 3], [3, 0, 3], [3, 3, 0]]