In [2101]:
import numpy as np
import pandas as pd

In [2102]:
# define column names
col_names = {
    "data": [ 'user id' , 'item id' , 'rating' , 'timestamp'],
    "item": ['movie id' , 'movie title' , 'release date' , 'video release date' ,
              'IMDb URL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
              "Children's" , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western'],
    "user": ['user id' , 'age' , 'gender' , 'occupation' , 'zip code'],
    "genre": ['genre', 'genre id']
}

In [2103]:
def read_data(file_name, sep, encoding, col_names):
    output = pd.read_csv(file_name, sep=sep, encoding=encoding, names=col_names)
    return output

# Task 1: Read and display data


In [2104]:
ratings = read_data("./u.data", "\t", 'utf-8', col_names["data"])
ratings.head(5)

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


## Downloaded the correct data

In [2105]:
print(len(ratings))

100000


In [2106]:
movies = read_data("./u.item", "|", 'latin-1', col_names["item"])
movies.head(5)

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [2107]:
movies.drop(columns= ['video release date', 'IMDb URL'], inplace=True)
movies.head(5)

Unnamed: 0,movie id,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [2108]:
users = read_data("./u.user", "|", 'utf-8', col_names["user"])
users.head(5)

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [2109]:
genres = read_data("./u.genre", "|", 'utf-8', col_names["genre"])
genres.head(20)

Unnamed: 0,genre,genre id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


In [2110]:
def get_ratings_single_movie(movie_id):
    # get all available ratings for a single movie
    return ratings[ratings["item id"] == movie_id].sort_values(by=['user id'])

In [2111]:
def get_ratings_single_user(user_id):
    # get all movies rated by the user
    return ratings[ratings["user id"] == user_id].sort_values(by=['item id'])  

In [2112]:
def get_both_rated_set(user1_ratings, user2_ratings):
    # return all items rated by both user sorted by the item id
    sim_user1_ratings = user1_ratings[user1_ratings["item id"].isin(user2_ratings["item id"])]
    sim_user2_ratings = user2_ratings[user2_ratings["item id"].isin(user1_ratings["item id"])]
    
    sim_user1_ratings = sim_user1_ratings.sort_values(by=['item id'])
    sim_user2_ratings = sim_user2_ratings.sort_values(by=['item id'])
    return sim_user1_ratings, sim_user2_ratings

In [2113]:
def get_both_raters_set(item1_ratings, item2_ratings):
    # return all ratings bytserh users who rated both items sorted by the user id
    item1_raters = item1_ratings[item1_ratings["user id"].isin(item2_ratings["user id"])]["user id"].tolist()
    item2_raters = item2_ratings[item2_ratings["user id"].isin(item1_ratings["user id"])]["user id"].tolist()
    users_rated_both = np.unique(item1_raters + item2_raters)
    ratings_both = ratings[ratings["user id"].isin(users_rated_both)]
    ratings_both = ratings_both.sort_values(by=['user id'])
    return ratings_both

# Task 2: Pearson correlation function

In [2114]:
def pearson_correlation(user1_id, user2_id):
    # calculate pearson correlation between 2 users
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # step3: calculate mean ratings ra, rb
    mean_user1_ratings = np.mean(sim_user1_ratings['rating'])
    mean_user2_ratings = np.mean(sim_user2_ratings['rating'])
    # step4: calculate the variance  
    var_1 = np.array(np.subtract(sim_user1_ratings["rating"], [mean_user1_ratings]))
    var_2 = np.array(np.subtract(sim_user2_ratings["rating"], [mean_user2_ratings]))

    # step5: compute the pearson correlation
    numerator = np.sum(var_1*var_2)
    denominator = np.sqrt(np.sum(np.power(var_1, 2)))*np.sqrt(np.sum(np.power(var_2, 2)))

    if denominator == 0:
        # in the case that denominator = 0 return -1
        return -1, user1_id, user2_id
    else:
        correlation = numerator / denominator
        return correlation, user1_id, user2_id


# Task 3: Prediction function user based

In [2115]:
def predict_single_pair_user(user1_id, user2_id, item_id):
    # predict item's score of user 1 based on user 2
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # if there is no similar rated item, return nan
    if sim_user1_ratings.empty:
        return list([float('NaN'), float('NaN')])
    
    # step3: compute the mean rating of user 2
    mean_user2_ratings = np.mean(sim_user2_ratings['rating'])
    
    # step4: get the pearson correlation
    correlation, user1_id, user2_id = pearson_correlation(user1_id, user2_id)
    var_2 = float(user2_ratings[user2_ratings["item id"] == item_id]["rating"]) - mean_user2_ratings
    
    # step5: return the output
    numerator = (correlation*var_2)
    denominator = correlation
    return list([numerator, denominator])

In [2116]:
def predict_user_item(user_id, item_id):
    # predict item's score for user
    # step 1: get user ratings
    user_ratings = get_ratings_single_user(user_id)
    # step 2: compute the mean rating
    mean_user_ratings = np.mean(user_ratings['rating'])
    
    # step 3:  get all other users which rated the item
    users_domain = ratings[ratings["item id"] == item_id]
    
    # step 4: predict for each user in the users domain
    correlations = users_domain.apply(lambda row: predict_single_pair_user(user_id, row["user id"], item_id), axis=1, result_type="expand")
    correlations = np.array(correlations)
    # filter all nan, which cause by no same rated item between 2 users
    correlations = correlations[~np.isnan(correlations).any(axis=1), :]
    
    # step 5: calculate the score and return
    pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])
    return item_id, movies.at[item_id - 1, 'movie title'], pred_score

In [2117]:
predict_user_item(4, 346)

(346, 'Jackie Brown (1997)', 2.343200686356572)

# Task 4: 10 most similar users and 20 most relevant movies for a user

In [2118]:
def recommend_users_and_movies_user_based(user_id, users_num=10, movies_num=20):
    # recommend users and movies for user

    # step 1: get correlation of other users to the user
    other_users = users[users["user id"] != user_id]
    user_correlations = other_users.apply(lambda row: pearson_correlation(user_id, row["user id"]), axis=1, result_type="expand")
    
    # step 2: sort the correclation and choose the top 10
    user_correlations = np.array(user_correlations)
    user_correlations = user_correlations[np.argsort(user_correlations[:,0])]
    user_correlations = user_correlations[-users_num:]
    top_users = user_correlations[:,-1]

    # step 3: get movies not yet rated by this user
    user_ratings = get_ratings_single_user(user_id)
    movies_not_rated = movies[~movies["movie id"].isin(user_ratings["item id"])]

    # step 4: predict movie ratings for this user
    movies_ratings = movies_not_rated.apply(lambda row: predict_user_item(user_id, row["movie id"]), axis=1, result_type="expand")
    movies_ratings = np.array(movies_ratings)
    movies_ratings = movies_ratings[np.argsort(movies_ratings[:,2])]
    movies_ratings = movies_ratings[-movies_num:]
    top_movies = movies_ratings[:,-1]
    
    return top_users, top_movies
    

In [2119]:
# recommend_users_and_movies_user_based(4)

# Task 5: Adjusted cosine similarity

In [2120]:
def adjusted_cosine_similarity(item1_id, item2_id):
    # calculate adjusted cosine similarity between 2 items
    # step1: get all ratings of both items
    item1_ratings = get_ratings_single_movie(item1_id)
    item2_ratings = get_ratings_single_movie(item2_id)
    
    # step2: get the users who rated both items
    users_rated_both = get_both_raters_set(item1_ratings, item2_ratings)
    
    # step3: get average ratings from these users
    mean_user_ratings = users_rated_both.groupby("user id").mean()

    # step4: calculate the variance  
    var_1 = np.array(np.subtract(np.float64(item1_ratings[item1_ratings["user id"].isin(item2_ratings["user id"])]["rating"]), mean_user_ratings["rating"]))
    var_2 = np.array(np.subtract(np.float64(item2_ratings[item2_ratings["user id"].isin(item1_ratings["user id"])]["rating"]), mean_user_ratings["rating"]))

    # step5: compute the adjusted cosine similarity
    numerator = np.sum(var_1*var_2)
    denominator = np.sqrt(np.sum(np.power(var_1, 2)))*np.sqrt(np.sum(np.power(var_2, 2)))

    if denominator == 0:
        # in the case that denominator = 0 return -1
        return -1, item1_id, item2_id
    else:
        adjusted_cosine_similarity = numerator / denominator
        return adjusted_cosine_similarity, item1_id, item2_id


In [2121]:
adjusted_cosine_similarity(4, 346)

(0.22128604193998852, 4, 346)

# Task 6: Item-based prediction function

In [2122]:
def predict_single_pair_item(item1_id, item2_id, user_id):
    # predict item's score of user 1 based on user 2
    # step1: get all ratings by 2 users
    item1_ratings = get_ratings_single_movie(item1_id)
    item2_ratings = get_ratings_single_movie(item2_id)
    
    # step2: get the items rated by both users
    users_rated_both = get_both_raters_set(item1_ratings, item2_ratings)
    
    # if there is no mutal person who rated this item, return nan
    if users_rated_both.empty:
        return list([float('NaN'), float('NaN')])
    
    # step3: calcalate adjusted cosine similarity
    similarity, item1_id, item2_id = adjusted_cosine_similarity(item1_id, item2_id)

    var_2 = np.float64(item2_ratings[item2_ratings["user id"] == user_id]["rating"])

    # step5: return the output
    numerator = (similarity*var_2)
    denominator = similarity
    return list([numerator, denominator])

In [2123]:
def item_based_predicter(user_id, item_id):
    # predict item's score for user
    # step 1: get user ratings
    user_ratings = get_ratings_single_user(user_id)
    user_ratings = user_ratings[user_ratings["item id"] != item_id]

    # step 2: predict for each rated item
    similarities = user_ratings.apply(lambda row: predict_single_pair_item(item_id, row["item id"], user_id), axis=1, result_type="expand")
    similarities = np.array(similarities)
    # filter all nan, which cause by no same rated item between 2 users
    similarities = similarities[~np.isnan(similarities).any(axis=1), :]

    # step 5: calculate the score and return
    pred_score = np.sum(similarities[:,0]) / np.sum(similarities[:,1])
    return item_id, movies.at[item_id - 1, 'movie title'], pred_score

In [2124]:
item_based_predicter(4, 34)

[  -3.59483799  -66.14822675  -21.53898399  -59.11211862   82.67954406
   76.70688037   49.37648733   53.03839269  -49.9164868   -49.9164868
   40.47244269 -100.          -66.0810607    26.90464733  100.
  -28.92505384  -10.2020538   -73.93127429  100.        ]
-0.1881891074144164
-5.522502371623476 -0.0018818910741440487


(34, 'Doom Generation, The (1995)', 2934.549426106029)

# Task 7: Top 20 relevant movies that item-based recommender suggests

In [2125]:
def recommend_movies_item_based(user_id, movies_num=20):
    # recommend users and movies for user

    # step 1: get movies not yet rated by this user
    user_ratings = get_ratings_single_user(user_id)
    movies_not_rated = movies[~movies["movie id"].isin(user_ratings["item id"])]

    # step 2: predict movie ratings for this user
    movies_ratings = movies_not_rated.apply(lambda row: item_based_predicter(user_id, row["movie id"]), axis=1, result_type="expand")
    movies_ratings = movies_ratings.sort_values(by=2, ascending=False)
    print(movies_ratings)
    top_movies = movies_ratings[-movies_num:]
    return top_movies

In [2126]:
recommend_movies_item_based(4)

[  26.23215648   31.94826473   37.82493313   12.25552783  -32.33963708
  -33.54217961   -8.03182956    7.75610435   12.9554       16.35107122
   -0.70516233    3.28124722  -12.31649886  -10.00951415   10.13588385
  -47.0134524    -1.33066942  -18.67984617   24.933717    -36.85848253
  -95.90096722    2.76348116 -100.           -3.36768809]
-213.65814046783163
-9.67743348327999 -2.136581404678316
[   3.02907031  -18.51298632    2.06017167   -2.8839288   -13.36905954
   30.56430349   29.45020088   -4.69503839   23.29289361   24.98429152
  -19.40073447   51.89207839   16.08113914   -4.85955931   19.99506881
   31.50511152   47.30341179   -5.07605397  -43.36288628   13.47888756
  100.         -100.         -100.           16.79847541]
98.27485704604015
3.847983555609932 0.9827485704604019
[ -26.84904803  -36.26081836  -35.8163955    -3.83436161  -12.16993676
    3.9459525    24.55126266  -10.64557535   14.34185116   -4.66449158
  -20.110854    -56.00274803    0.59086721   21.82527588   23.

KeyboardInterrupt: 