# User-Based recommender system
This notebook is written by Zitong Li

In [4]:
import pandas as pd
import copy as cp

In [5]:
# Load the ratings and movie metadata, keeping only the columns used below.
data = pd.read_csv('../Data/ratings.csv')[['userId', 'movieId', 'rating']]
movies = pd.read_csv('../Data/movies.csv')[['movieId', 'title']]

### Train test split
For each user, randomly select a fixed portion of the movies they rated as the training set and use the rest as the test set.

In [6]:
def train_test_split(data, seed = 6, portion = 0.8):
    """Split every user's ratings into a training part and a held-out part.

    Each user's rows are shuffled with ``random_state=seed``; the first
    ``int(portion * n)`` shuffled rows go to the training set and the
    remaining rows go to the test set.

    Parameters
    ----------
    data : DataFrame with at least 'userId', 'movieId' and 'rating' columns.
    seed : random state passed to ``DataFrame.sample`` for every user.
    portion : fraction of each user's rows assigned to the training set.

    Returns
    -------
    (train_set, test_set) : two dicts keyed by user id whose values are
        DataFrames holding the 'movieId' and 'rating' columns.
    """
    wanted = ['movieId', 'rating']
    train_set, test_set = {}, {}
    for user_id, user_rows in data.groupby('userId'):
        shuffled = user_rows.sample(frac=1, random_state=seed).reset_index(drop=True)
        # Single cut point: everything before it is train, the rest is test.
        cut = int(portion * len(shuffled))
        train_set[user_id] = shuffled.iloc[:cut][wanted]
        test_set[user_id] = shuffled.iloc[cut:][wanted]
    return train_set, test_set

In [7]:
def itemUserAndRating(train_set):
    """Build the inverted item->users index and a nested rating lookup.

    Parameters
    ----------
    train_set : dict mapping user id to a DataFrame with 'movieId' and
        'rating' columns.

    Returns
    -------
    item_user : dict mapping movie id to the set of users who rated it.
    rating : dict mapping user id to ``{movie id: [rating, 0]}``. The second
        slot starts at 0; ``predict`` accumulates its unnormalised score there
        for predicted items.
    """
    item_user = {}
    rating = {}
    for user_id, frame in train_set.items():
        movie_ids = frame['movieId'].tolist()
        scores = frame['rating'].tolist()
        per_user = rating[user_id] = {}
        for movie_id, score in zip(movie_ids, scores):
            per_user[movie_id] = [score, 0]
            item_user.setdefault(movie_id, set()).add(user_id)
    return item_user, rating


### User-based theory
1. Calculate the user-similarity matrix using a cosine-style similarity over co-rated movies.
2. For a specific user–item combination (u, i), take all the other users who watched movie i and choose the top K most similar to user u.
3. Predict the score for that combination from those top K similar users.

In [8]:
from operator import itemgetter
import numpy as np

def User_Similarity(train_set):
    """Compute a dense user-user similarity table from the training ratings.

    For every pair of distinct users (u, v) and every movie i both rated,
    the co-rating score ``5 - |r_ui - r_vi|`` is weighted by
    ``1 / ln(1 + number of users who rated i)`` and accumulated. The total is
    then divided by ``sqrt(N[u] * N[v])``, where ``N[x]`` is the number of
    training rows of user x.

    Parameters
    ----------
    train_set : dict mapping user id to a DataFrame with 'movieId' and
        'rating' columns.

    Returns
    -------
    W : dict of dicts; ``W[u][v]`` exists for every pair of users in
        ``train_set`` (including u == v, which is always 0) and is 0 when the
        two users share no rated movie.

    Notes
    -----
    A user with an empty training frame has ``N == 0``; such pairs get
    similarity 0.0 instead of raising ZeroDivisionError.
    """
    users = list(train_set.keys())
    N = {u: frame.shape[0] for u, frame in train_set.items()}
    # Dense co-rating accumulator: every (u, v) pair starts at 0.
    C = {u: dict.fromkeys(users, 0) for u in users}

    # Build inverse table
    item_user, rating = itemUserAndRating(train_set)
    for i, watchers in item_user.items():
        # Popularity penalty depends only on the item, so compute it once.
        weight = 1/np.log(1+len(watchers)*1.0)
        for u in watchers:
            for v in watchers:
                if v != u:
                    # 5 is presumably the maximum rating, so identical ratings
                    # contribute the most — TODO confirm against the data.
                    corating = 5 - abs(rating[u][i][0] - rating[v][i][0])
                    C[u][v] += weight * corating

    # Calculate similarity matrix
    W = {}
    for u, related_users in C.items():
        W[u] = {}
        for v, cuv in related_users.items():
            denom = (N[u]*N[v])**0.5
            # denom is 0 only when one of the users has no training ratings.
            W[u][v] = cuv/denom if denom else 0.0
    print('user similarity finished')
    return W


Predict all the possible ratings

In [9]:
def predict(train_set, K, user_sim):
    """Predict a rating for every (user, unseen movie) pair that has neighbours.

    For user u and a movie they have not rated in ``train_set``, the K users
    most similar to u among those who rated the movie are selected. Two
    numbers are stored per movie:

    * slot 0 - the similarity-weighted average of the neighbours' ratings
      (weights normalised to sum to 1); this is the predicted rating.
    * slot 1 - the same weighted sum without normalisation.

    Parameters
    ----------
    train_set : dict mapping user id to a DataFrame with 'movieId' and
        'rating' columns.
    K : maximum number of neighbours used for each prediction.
    user_sim : dict of dicts as returned by ``User_Similarity``.

    Returns
    -------
    dict mapping user id to a list of ``(movieId, [slot0, slot1])`` tuples
    sorted by movie id. The list also contains the user's training movies as
    ``[actual rating, 0]``. Movies whose selected neighbours all have zero
    similarity are omitted.
    """
    # store the original ratings
    item_user, orig_ratings = itemUserAndRating(train_set)

    # Copy each per-user dict so that predictions are never written back into
    # orig_ratings (a shallow copy of the outer dict would share the inner ones).
    predict_ratings = {u: dict(rated) for u, rated in orig_ratings.items()}
    for u, wu in user_sim.items():  # u is the user, wu maps other users to their similarity with u
        # movies already rated by u; a set keeps the membership test O(1)
        already_items = set(train_set[u]['movieId'].tolist())
        user_pred = predict_ratings[u]
        for item, vs in item_user.items():  # vs is the set of users who rated this movie
            if item in already_items:
                continue
            wuv = [(v, wu[v]) for v in vs]
            # at most K neighbours, most similar first (slicing caps at len(vs))
            topv = sorted(wuv, key=itemgetter(1), reverse=True)[:K]
            sum_wuv = sum([w for _, w in topv])
            if sum_wuv == 0:
                # no selected neighbour has any similarity with u
                continue
            if item not in user_pred:
                user_pred[item] = [0, 0]
            entry = user_pred[item]
            for v, w in topv:
                neighbour_rating = orig_ratings[v][item][0]
                # slot 0: normalised weights -> predicted rating
                entry[0] += neighbour_rating * (w/sum_wuv)
            for v, w in topv:
                # slot 1: raw weights -> unnormalised score
                entry[1] += orig_ratings[v][item][0] * w
        predict_ratings[u] = sorted(user_pred.items(), key = itemgetter(0))
    print('prediction finished.')
    return predict_ratings

In [10]:
def rmse(predict, real):
    """Return the root-mean-square error between two aligned numeric sequences.

    Both arguments must support element-wise subtraction (e.g. NumPy arrays
    or pandas Series).
    """
    squared_error = (predict - real) ** 2
    return np.sqrt(np.mean(squared_error))

In [11]:
def get_score(predictions, test_set):
    """Return the mean per-user RMSE of the predictions on the test set.

    For each user in ``test_set`` the predicted ratings (slot 0 of every
    prediction entry) are joined to the held-out ratings on 'movieId' and the
    RMSE over the matching movies is computed. The result is the unweighted
    mean of those per-user RMSE values.

    Parameters
    ----------
    predictions : dict mapping user id to a list of
        ``(movieId, [predicted rating, raw score])`` tuples, as returned by
        ``predict``.
    test_set : dict mapping user id to a DataFrame with 'movieId' and
        'rating' columns.

    Returns
    -------
    Mean RMSE over the users that have at least one test movie with a
    prediction; NaN when no user can be evaluated.

    Notes
    -----
    Users with no overlap between predictions and test rows are skipped;
    otherwise their NaN RMSE would turn the overall mean into NaN.
    """
    rmselist = []
    for user, test in test_set.items():
        # Keep (movieId, predicted rating); the raw score is not evaluated.
        rows = [(movie, scores[0]) for movie, scores in predictions[user]]
        if not rows:
            continue
        prediction = pd.DataFrame(rows, columns=['movieId', 'predicted'])
        # Inner join: only movies that are both predicted and held out.
        merge = pd.merge(prediction, test, on='movieId')
        if merge.empty:
            continue
        rmselist.append(rmse(merge['predicted'], merge['rating']))
    return np.mean(rmselist) if rmselist else np.nan

Let's test the model and get the rmse score

In [19]:
# Single hold-out run: default split, K = 20 neighbours.
train_set, test_set = train_test_split(data)
user_similarity = User_Similarity(train_set)
predictions = predict(train_set, K=20, user_sim=user_similarity)
print(get_score(predictions, test_set))


user similarity finished
prediction finished.
0.9460420702603404


Let's run cross-validation (repeated random train/test splits) to get a more reliable estimate of the error and its variance.

In [0]:
def cross_validation(data, fold = 5, K = 20):
    """Evaluate the recommender on ``fold`` random train/test splits.

    Split number ``s`` (for ``s`` in ``range(fold)``) uses ``s`` as the
    shuffling seed of ``train_test_split``. Every split is trained and scored
    independently and its RMSE is printed.

    Parameters
    ----------
    data : ratings DataFrame accepted by ``train_test_split``.
    fold : number of random splits to evaluate.
    K : number of neighbours passed to ``predict``.

    Returns
    -------
    (mean, variance) of the per-split RMSE values; the variance is the sample
    variance (``ddof=1``).
    """
    def _evaluate_split(seed):
        # Full pipeline for one split: split -> similarity -> predict -> score.
        train_set, test_set = train_test_split(data, seed)
        predictions = predict(train_set, K, User_Similarity(train_set))
        return get_score(predictions, test_set)

    score_list = []
    for seed in range(fold):
        test_score = _evaluate_split(seed)
        print(f"test:{seed}, rmse is {test_score}.")
        score_list.append(test_score)
    return np.mean(score_list), np.var(score_list, ddof=1)

In [0]:
# Aggregate the RMSE over the repeated random splits and report it.
mean, variance = cross_validation(data)
print('____________________________________', f"Mean RMSE: {mean}, variance {variance}", sep='\n')

Let's see how the score changes as we vary the value of K.

In [0]:
import matplotlib.pyplot as plt
%matplotlib inline

# Sweep the neighbourhood size on one fixed split; the similarity matrix does
# not depend on K, so it is computed once and reused.
train_set, test_set = train_test_split(data)
user_similarity = User_Similarity(train_set)
x = []
error = []
for K in range(1, 16, 2):
    predictions = predict(train_set, K, user_similarity)
    test_score = get_score(predictions, test_set)
    print(f"K:{K}, rmse is {test_score}.")
    x.append(K)
    error.append(test_score)

# RMSE as a function of K.
fig, ax = plt.subplots()
ax.plot(x, error)
ax.set_xlabel('K')
ax.set_ylabel('rmse error')
plt.show()


In [0]:
# Build a movieId -> title lookup for displaying recommendations.
# NOTE(review): this path differs from the '../Data/movies.csv' used when the
# notebook first loads the movie metadata — confirm which location is intended.
movies = pd.read_csv('./ml-latest-small/movies.csv')[['movieId', 'title']].set_index('movieId')
titles = movies['title'].to_dict()

Let's recommend some movies for some users.

In [0]:
def recommend(user, train_set, predicitons, titles, topn = 10):
    """Print and return the ``topn`` highest-scoring unseen movies for a user.

    Candidates are ranked by the unnormalised score (slot 1 of each
    prediction entry), highest first, and movies the user already rated in
    ``train_set`` are skipped.

    Parameters
    ----------
    user : user id to recommend for.
    train_set : dict mapping user id to a DataFrame with a 'movieId' column.
    predicitons : dict mapping user id to a list of
        ``(movieId, [predicted rating, raw score])`` tuples. The misspelled
        name is kept so existing keyword callers keep working.
    titles : mapping from movie id to title.
    topn : maximum number of movies to return.

    Returns
    -------
    DataFrame with columns 'movieId', 'title' and 'score'; it has fewer than
    ``topn`` rows (possibly none) when there are not enough candidates.
    """
    print(f"For user {user}, the top {topn} movies we recommend:")
    # Read the argument itself rather than a global that happens to share the name.
    ranked = sorted(predicitons[user], key = lambda x:x[1][1], reverse = True)
    already_items = set(train_set[user]['movieId'].tolist())
    result = []
    for movie, score in ranked:
        # Check the limit first so that topn == 0 yields no rows.
        if len(result) >= topn:
            break
        if movie not in already_items:
            result.append((movie, titles[movie], score[1]))
    # Passing the column names here keeps an empty result valid.
    result = pd.DataFrame(result, columns=['movieId', 'title', 'score'])
    print(result)
    return result


In [0]:
#recomd = recommend(3, train_set, predictions, titles)
# Show the top-10 recommendations for two sample users.
for user in (1, 2):
    recommend(user, train_set, predictions, titles, topn=10)
    print('_____________________________________________________________')