In [97]:
import numpy as np
import pandas as pd

In [98]:
# define column names
col_names = {
    "data": [ 'user id' , 'item id' , 'rating' , 'timestamp'],
    "item": ['movie id' , 'movie title' , 'release date' , 'video release date' ,
              'IMDb URL' , 'unknown' , 'Action' , 'Adventure' , 'Animation' ,
              "Children's" , 'Comedy' , 'Crime' , 'Documentary' , 'Drama' , 'Fantasy' ,
              'Film-Noir' , 'Horror' , 'Musical' , 'Mystery' , 'Romance' , 'Sci-Fi' ,
              'Thriller' , 'War' , 'Western'],
    "user": ['user id' , 'age' , 'gender' , 'occupation' , 'zip code'],
    "genre": ['genre', 'genre id']
}

In [99]:
def read_data(file_name, sep, encoding, col_names):
    output = pd.read_csv(file_name, sep=sep, encoding=encoding, names=col_names)
    return output

In [100]:
ratings = read_data("./u.data", "\t", 'utf-8', col_names["data"])
ratings.head(5)

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [101]:
movies = read_data("./u.item", "|", 'latin-1', col_names["item"])
movies.drop(columns= ['video release date', 'IMDb URL'], inplace=True)
movies.head(5)

Unnamed: 0,movie id,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [102]:
users = read_data("./u.user", "|", 'utf-8', col_names["user"])
users.head(5)

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [103]:
genres = read_data("./u.genre", "|", 'utf-8', col_names["genre"])
genres.head(5)

Unnamed: 0,genre,genre id
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4


In [104]:
def get_ratings_single_movie(movie_id):
    # get all available ratings for a single movie
    return ratings[ratings["item id"] == movie_id].sort_values(by=['user id'])

In [105]:
def get_ratings_single_user(user_id):
    # get all movies rated by the user
    return ratings[ratings["user id"] == user_id].sort_values(by=['item id'])  

In [106]:
def get_both_rated_set(user1_ratings, user2_ratings):
    # return all items rated by both user sorted by the item id
    sim_user1_ratings = user1_ratings[user1_ratings["item id"].isin(user2_ratings["item id"])]
    sim_user2_ratings = user2_ratings[user2_ratings["item id"].isin(user1_ratings["item id"])]
    
    sim_user1_ratings = sim_user1_ratings.sort_values(by=['item id'])
    sim_user2_ratings = sim_user2_ratings.sort_values(by=['item id'])
    return sim_user1_ratings, sim_user2_ratings

In [107]:
def get_both_raters_set(item1_ratings, item2_ratings):
    # return all ratings bytserh users who rated both items sorted by the user id
    item1_raters = item1_ratings[item1_ratings["user id"].isin(item2_ratings["user id"])]["user id"].tolist()
    item2_raters = item2_ratings[item2_ratings["user id"].isin(item1_ratings["user id"])]["user id"].tolist()
    users_rated_both = np.unique(item1_raters + item2_raters)
    ratings_both = ratings[ratings["user id"].isin(users_rated_both)]
    ratings_both = ratings_both.sort_values(by=['user id'])
    return ratings_both

# Pearson correlation function

In [108]:
def pearson_correlation(user1_id, user2_id):
    # calculate pearson correlation between 2 users
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # step3: calculate mean ratings ra, rb
    mean_user1_ratings = np.mean(user1_ratings['rating'])
    mean_user2_ratings = np.mean(user2_ratings['rating'])

    # step4: calculate the variance  
    var_1 = np.array(np.subtract(sim_user1_ratings["rating"], [mean_user1_ratings]))
    var_2 = np.array(np.subtract(sim_user2_ratings["rating"], [mean_user2_ratings]))

    # step5: compute the pearson correlation
    numerator = np.sum(var_1*var_2)
    denominator = np.sqrt(np.sum(np.power(var_1, 2)))*np.sqrt(np.sum(np.power(var_2, 2)))

    if denominator == 0:
        # in the case that denominator = 0 return NaN
        return float('NaN'), user1_id, user2_id
    else:
        correlation = numerator / denominator
        return correlation, user1_id, user2_id


# User-based prediction function

In [109]:
def predict_single_pair_user(user1_id, user2_id, item_id):
    # predict item's score of user 1 based on user 2
    # step1: get all ratings by 2 users
    user1_ratings = get_ratings_single_user(user1_id)
    user2_ratings = get_ratings_single_user(user2_id)
    
    # step2: get the items rated by both users
    sim_user1_ratings, sim_user2_ratings = get_both_rated_set(user1_ratings, user2_ratings)
    
    # if there is no similar rated item, return nan
    if sim_user1_ratings.empty:
        return [float('NaN'), float('NaN')]
    
    # step3: compute the mean rating of user 2
    mean_user2_ratings = np.mean(user2_ratings['rating'])
    
    # step4: get the pearson correlation
    correlation, user1_id, user2_id = pearson_correlation(user1_id, user2_id)
    var_2 = float(user2_ratings[user2_ratings["item id"] == item_id]["rating"]) - mean_user2_ratings

    # step5: return the output
    numerator = (correlation*var_2)
    denominator = correlation
    return [numerator, denominator]

In [110]:
def predict_user_item(user_id, item_id):
    # predict item's score for user
    # if user already rated the item, return the rating
    existing_rating = ratings.loc[(ratings['user id'] == user_id) & (ratings['item id'] == item_id)]
    if not existing_rating.empty:
        return item_id, movies.at[item_id - 1, 'movie title'], existing_rating['rating'].values[0]
        
    # step 1: get user ratings
    user_ratings = get_ratings_single_user(user_id)
    # step 2: compute the mean rating
    mean_user_ratings = np.mean(user_ratings['rating'])
    
    # step 3:  get all other users which rated the item
    users_domain = ratings[ratings["item id"] == item_id]
    
    # step 4: predict for each user in the users domain
    correlations = users_domain.apply(lambda row: predict_single_pair_user(user_id, row["user id"], item_id), axis=1, result_type="expand")
    correlations = np.array(correlations)
    
    # filter all nan, which cause by no same rated item between 2 users
    correlations = correlations[~np.isnan(correlations).any(axis=1), :]

    # step 5: calculate the score and return
    pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])
    return item_id, movies.at[item_id - 1, 'movie title'], pred_score

In [111]:
def get_predicted_ratings(user_id):
    movies_ratings = movies.apply(lambda row: predict_user_item(user_id, row["movie id"]), axis=1, result_type="expand")
    movies_ratings.columns = ["movie id", "movie title", "pred_rating"]

    return movies_ratings

# Task 1: Average aggregation method

In [112]:
def avg_aggration(user1, user2, user3):
    # get predicted ratings of 3 users
    user1_ratings = get_predicted_ratings(user1)
    user2_ratings = get_predicted_ratings(user2)
    user3_ratings = get_predicted_ratings(user3)

    # create dataframe with all 3 users' predicted ratings
    data = [user1_ratings["movie title"], user1_ratings["pred_rating"], user2_ratings["pred_rating"], user3_ratings["pred_rating"]]
    headers = ["movie title", "user1 rating", "user2 rating", "user3 rating"]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['average'] = all_users_ratings.iloc[:, 1:3].mean(axis=1)

    return all_users_ratings

In [113]:
def top_avg_movies(user1, user2, user3, movies_num):
    # get top movies with average rating
    all_users_ratings = avg_aggration(user1, user2, user3)
    top_movies = all_users_ratings.sort_values(by=['average'], ascending=False)
    top_movies = top_movies.iloc[:movies_num, :]
    
    return top_movies

In [114]:
# This took about 7.5 min to run
top_movies_avg = top_avg_movies(2, 17, 35, 20)
top_movies_avg.head(20)

  pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])
  pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])


Unnamed: 0,movie title,user1 rating,user2 rating,user3 rating,average
845,To Gillian on Her 37th Birthday (1996),2.616093,95.530013,3.272804,49.073053
1028,Jury Duty (1995),2.168984,92.983486,2.187442,47.576235
1498,Grosse Fatigue (1994),2.54801,88.704098,2.02015,45.626054
1606,Hurricane Streets (1998),4.494416,27.045019,5.183859,15.769717
1154,"Rendezvous in Paris (Rendez-vous de Paris, Les...",3.21178,24.069365,1.058021,13.640572
1024,Fire Down Below (1997),2.852918,19.794104,3.0,11.323511
883,Year of the Horse (1997),18.46317,3.94791,4.331758,11.20554
1241,"Old Lady Who Walked in the Sea, The (Vieille q...",3.309313,15.830928,4.954293,9.57012
916,Mercury Rising (1998),3.729833,13.358204,-19.776488,8.544019
871,Love Jones (1997),3.651978,12.685553,0.632951,8.168766


# Task 2: Least-misery aggregation method

In [115]:
def least_misery_aggration(user1, user2, user3):
    # get predicted ratings of 3 users
    user1_ratings = get_predicted_ratings(user1)
    user2_ratings = get_predicted_ratings(user2)
    user3_ratings = get_predicted_ratings(user3)

    # create dataframe with all 3 users' predicted ratings
    data = [user1_ratings["movie title"], user1_ratings["pred_rating"], user2_ratings["pred_rating"], user3_ratings["pred_rating"]]
    headers = ["movie title", "user1 rating", "user2 rating", "user3 rating"]
    all_users_ratings = pd.concat(data, axis=1, keys=headers)
    all_users_ratings['minimum'] = all_users_ratings.iloc[:, 1:3].min(axis=1)

    return all_users_ratings

In [116]:
def top_least_misery_movies(user1, user2, user3, movies_num):
    # get top movies with least-misery rating
    all_users_ratings = least_misery_aggration(user1, user2, user3)
    top_movies = all_users_ratings.sort_values(by=['minimum'], ascending=False)
    top_movies = top_movies.iloc[:movies_num, :]
    
    return top_movies

In [117]:
# This took about 7.5 min to run
top_movies_least_misery = top_least_misery_movies(2, 17, 35, 20)
top_movies_least_misery.head(20)

  pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])
  pred_score = mean_user_ratings + np.sum(correlations[:,0]) / np.sum(correlations[:,1])


Unnamed: 0,movie title,user1 rating,user2 rating,user3 rating,minimum
1320,Open Season (1996),6.457737,5.772432,2.508046,5.772432
1642,Angel Baby (1995),5.317964,6.526303,3.131678,5.317964
1448,Pather Panchali (1955),4.998092,5.260339,11.972835,4.998092
813,"Great Day in Harlem, A (1994)",5.612193,4.93823,4.902516,4.93823
1292,Star Kid (1997),4.770224,5.8497,4.389185,4.770224
1130,Safe (1995),4.915867,4.765145,3.863128,4.765145
1535,Aiqing wansui (1994),5.414304,4.740341,,4.740341
1494,Flirt (1995),4.679563,5.137684,0.712499,4.679563
1499,Santa with Muscles (1996),4.986808,4.637955,4.214286,4.637955
1303,New York Cop (1996),4.601792,5.614251,0.197605,4.601792
