# Combined version


In [5]:
# All our intelligence together in this version

import numpy as np
import pandas as pd

#Load the data and some pre-processing step
path = 'https://raw.githubusercontent.com/duy7590/Group-Recommender-System/main/ratings.csv'
df = pd.read_csv(path, sep=',')

#Merge titles and prettify
path_titles = "https://raw.githubusercontent.com/duy7590/Group-Recommender-System/main/movies.csv"
movie_titles = pd.read_csv(path_titles)
df = pd.merge(df,movie_titles,on='movieId')
df = df.drop(labels=['timestamp'], axis=1)
df = df.drop(labels=['genres'], axis=1)
df = df.drop(labels=['title'], axis=1) # The titles look cool in the print, but movieId is safer solution.

In [6]:
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,5,1,4.0
2,7,1,4.5
3,15,1,2.5
4,17,1,4.5
...,...,...,...
100831,610,160341,2.5
100832,610,160527,4.5
100833,610,160836,3.0
100834,610,163937,3.5


In [7]:
# This part is the similarity computation using Duy's code
df1 = df.pivot_table(index='movieId', columns='userId', values='rating')
df1.head()

matrix_corr_user = df1.corr(method='pearson').fillna(0) #User similarity matrix

In [8]:
# Predictions computation for individual users using Duy's code
mean_user_rating = df1.mean(axis=1) #mean rating of each movies
#You use np.newaxis so that mean_user_rating has same format as ratings
ratings_diff = (df1 - mean_user_rating[:, np.newaxis]).fillna(0)
#Prediction based on user-similarity and pearson correlation formula 
pred = mean_user_rating[:, np.newaxis] + (matrix_corr_user.dot(ratings_diff.T) / np.array([np.abs(matrix_corr_user).sum()]).T).T

  after removing the cwd from sys.path.
  


In [9]:
# Due to the reason that some items only have few ratings, it could cause prediction bias.
# We accept only movies, whose rating count exceeds a certain threshold.

necessary_ratings = 100  # We have to discuss this later
movie_list = []
for i in range(len(pred)):
    if df1.iloc[i].count() > necessary_ratings:
        movie_list.append(df1.iloc[i].name)
    else:
        pass
print('Number of movies that has more than ' + str(necessary_ratings) + ' ratings: '+str(len(movie_list)))

test_list = np.array(movie_list)

Number of movies that has more than 100 ratings: 134


In [10]:
def find_unseen_movies(user1, user2, user3, data):
    
    data1 = data.loc[data['userId']==user1] # Data where user1 has ratings
    mask1 = ~data['movieId'].isin(data1['movieId']) # Drops the movies of 1
    data_left1 = data[mask1] # What remains after user1
    
    data2 = data.loc[data['userId']==user2] # Data where user2 has ratings
    mask2 = ~data_left1['movieId'].isin(data2['movieId']) # Dropping off
    data_left2 = data_left1[mask2] # What remains after user2 
    
    data3 = data.loc[data['userId']==user3] # Data where user3 has ratings
    mask3 = ~data_left2['movieId'].isin(data3['movieId'])
    data_left3 = data_left2[mask3]  # Here are only unseen movies
    
    # Numpy array of unique values of the rest of the movieIds
    return data_left3['movieId'].unique()

In [11]:
# Choosing the users for demonstration part
user1 = 1
user2 = 2
user3 = 3

# Testing the approaches for 3 users
print()
print('The users chosen for testing are ' + str(user1) + ', ', end='')
print(str(user2) + ' and ' + str(user3))
print()

print('Assumption made here is, that only such movies that none of ', end='')
print('the test users have rated, are taken into recommendations.')
print()

unseen_movies = find_unseen_movies(user1, user2, user3, df) # Array


The users chosen for testing are 1, 2 and 3

Assumption made here is, that only such movies that none of the test users have rated, are taken into recommendations.



In [12]:
# This is the table for storaging the individual predictions of the test users.
predicts = pd.DataFrame(data=None, index=test_list)
predicts.index.name = 'movieId'

predict1 = pred[user1]
predict2 = pred[user2]
predict3 = pred[user3]

predicts = predicts.merge(predict1, how='left', on='movieId')
predicts = predicts.merge(predict2, how='left', on='movieId')
predicts = predicts.merge(predict3, how='left', on='movieId')

predicts.columns=['predict1', 'predict2', 'predict3']

In [13]:
predicts[0:5]

Unnamed: 0_level_0,predict1,predict2,predict3
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.898379,3.869372,3.914575
2,3.417943,3.46192,3.482031
6,3.935026,3.937365,4.061274
10,3.472662,3.585496,3.530641
32,3.992793,3.966754,3.995862


In [14]:
print('The top 20 recommended movies for this group, using i) average method:')
print()

# Compute and storage the average ratings
predicts['average'] = predicts.loc[:,
        ['predict1', 'predict2', 'predict3']].mean(axis=1)
# Inter-storage for the movies sorted by average
ave = predicts.sort_values(by='average', ascending=False)

# Pretty printing of the movieIds
for i in range(20):
    print(ave.iloc[i].name, end=' ')
print()
print()

The top 20 recommended movies for this group, using i) average method:

858 318 58559 48516 2959 1221 260 1196 1213 2329 1198 2028 2571 50 1208 1089 1197 4973 7361 296 



In [15]:
predicts

Unnamed: 0_level_0,predict1,predict2,predict3,average
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.898379,3.869372,3.914575,3.894109
2,3.417943,3.461920,3.482031,3.453965
6,3.935026,3.937365,4.061274,3.977888
10,3.472662,3.585496,3.530641,3.529600
32,3.992793,3.966754,3.995862,3.985136
...,...,...,...,...
48516,4.276681,4.322131,4.238543,4.279119
58559,4.226314,4.320131,4.292878,4.279774
60069,4.058605,4.099338,4.068206,4.075383
68954,4.016408,4.002610,3.968902,3.995973


In [16]:
print('The top 20 recommended movies for this group, using ii) least misery method:')
print()

predicts['min_pred'] = predicts[['predict1', 'predict2', 'predict3']].min(axis=1)
least_mis = predicts.sort_values(by='min_pred', ascending=False)

# Pretty printing 
for j in range(20):
    print(least_mis.iloc[j].name, end=' ')
print()

The top 20 recommended movies for this group, using ii) least misery method:

858 48516 1221 58559 1213 2959 1198 2329 260 318 1197 2571 1208 1196 1089 7153 4973 7361 2028 50 


In [17]:
predicts

Unnamed: 0_level_0,predict1,predict2,predict3,average,min_pred
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,3.898379,3.869372,3.914575,3.894109,3.869372
2,3.417943,3.461920,3.482031,3.453965,3.417943
6,3.935026,3.937365,4.061274,3.977888,3.935026
10,3.472662,3.585496,3.530641,3.529600,3.472662
32,3.992793,3.966754,3.995862,3.985136,3.966754
...,...,...,...,...,...
48516,4.276681,4.322131,4.238543,4.279119,4.238543
58559,4.226314,4.320131,4.292878,4.279774,4.226314
60069,4.058605,4.099338,4.068206,4.075383,4.058605
68954,4.016408,4.002610,3.968902,3.995973,3.968902


In [18]:
# And finally the B part, which needs one more function using the same individual predictions.

# Add the max values
predicts['max_pred'] = predicts[['predict1', 'predict2', 'predict3']].max(axis=1)
predicts['biggest_diff'] = predicts['max_pred'] - predicts['min_pred']

In [19]:
predicts['biggest_diff'].max()

0.3664043050278605

In [20]:
a = 0.3

In [21]:
predicts['predict_B'] = predicts['average'] - a * predicts['biggest_diff']

In [22]:
predicts['predict_B'].max()

4.320774956759661

In [23]:
predicts

Unnamed: 0_level_0,predict1,predict2,predict3,average,min_pred,max_pred,biggest_diff,predict_B
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,3.898379,3.869372,3.914575,3.894109,3.869372,3.914575,0.045203,3.880548
2,3.417943,3.461920,3.482031,3.453965,3.417943,3.482031,0.064089,3.434738
6,3.935026,3.937365,4.061274,3.977888,3.935026,4.061274,0.126249,3.940014
10,3.472662,3.585496,3.530641,3.529600,3.472662,3.585496,0.112834,3.495750
32,3.992793,3.966754,3.995862,3.985136,3.966754,3.995862,0.029108,3.976404
...,...,...,...,...,...,...,...,...
48516,4.276681,4.322131,4.238543,4.279119,4.238543,4.322131,0.083588,4.254042
58559,4.226314,4.320131,4.292878,4.279774,4.226314,4.320131,0.093818,4.251629
60069,4.058605,4.099338,4.068206,4.075383,4.058605,4.099338,0.040733,4.063163
68954,4.016408,4.002610,3.968902,3.995973,3.968902,4.016408,0.047506,3.981721


In [24]:
print('The top 20 recommended movies for this group, using our method of the B part:')
print()

result_B = predicts.sort_values(by='predict_B', ascending=False)

# Pretty printing 
for j in range(20):
    print(result_B.iloc[j].name, end=' ')
print()


The top 20 recommended movies for this group, using our method of the B part:

858 48516 58559 1221 2959 1213 1198 2329 318 1196 260 1089 1208 2571 2028 1197 4973 7153 7361 50 


# DUY'S Version

**Assignment 2: Group Recommendations Due: November 15, 2020**

**A.**

For producing group recommendation, we will use the user-based collaborative filtering approach as this implemented in Assignment 1. Specifically, for producing group recommendations, we will first compute the movies recommendations for each user in the group, and then we will aggregate
the lists of the individual users, so as to produce a single list of movies for the group.

You will implement two well established aggregation methods for producing the group recommendations.
- The first aggregation approach is the average method. The main idea behind this approach is that all members are considered equals. So, the rating of an item for a group of users will be given be averaging the scores of an item across all group members. Score: 30%
- The second aggregation method is the least misery method, where one member can act as a veto for the rest of the group. In this case, the rating of an item for a group of users is computed as the minimum score assigned to that item in all group members recommendations. Score: 30%

Produce a group of 3 users, and for this group, show the top-20 recommendations, i.e., the 20 movies with the highest prediction scores that (i) the average method suggests, and (ii) the least misery method suggest. Use the MovieLens 100K rating dataset. Score: 10%

In [1]:
import numpy as np
import pandas as pd

#Load the data and some pre-processing step
column_names = ['userId', 'movieId', 'rating', 'timestamp']
df = pd.read_csv('https://raw.githubusercontent.com/duy7590/Group-Recommender-System/main/ratings.csv', sep=',', names=column_names)
df = df[1:] #take the data less the header row
df['movieId']=df['movieId'].astype(int) 
df['rating']=df['rating'].astype(float) 
df['userId']=df['userId'].astype(int) 

#Merge movie ID and title
movie_titles = pd.read_csv("https://raw.githubusercontent.com/duy7590/Group-Recommender-System/main/movies.csv")
df = pd.merge(df,movie_titles,on='movieId')
df.head(100)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
95,269,1,5.0,850865423,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
96,270,1,5.0,853918728,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
97,273,1,5.0,835861234,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
98,274,1,4.0,1171410158,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [2]:
df[df['userId']==12].sort_values('rating',ascending=False).head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
41245,12,6942,5.0,1247264138,Love Actually (2003),Comedy|Drama|Romance
33503,12,838,5.0,1247264096,Emma (1996),Comedy|Drama|Romance
44453,12,2581,5.0,1247263444,Never Been Kissed (1999),Comedy|Romance
44399,12,2572,5.0,1247263689,10 Things I Hate About You (1999),Comedy|Romance
44370,12,2485,5.0,1247264406,She's All That (1999),Comedy|Romance


To compare the results between Item-based and user-based approach, I will take User id no.1 as the target to experiment. To compute similarity for User-based and item-based approach, we use different tools:

- **Pearson Similarity**: I used pandas.DataFrame.corr() and pandas.DataFrame.corrwith() to compute the pearson correlation between rows or columns of one or two DataFrame objects
- **Pearson Similarity**: I used pairwise_distances module in sklearn library

## Implement the user-based collaborative filtering approach, using the Pearson correlation function

Implement the user-based collaborative filtering approach, using the Pearson correlation function
for computing similarities between users (Score: 20%), and the prediction function presented in
class for predicting movies scores (Score: 20%).
Select a user from the dataset, and for this user, show the 10 most similar users and the 20 most
relevant movies that the recommender suggests. Score: 5%

In [3]:
#df = df.pivot_table(index='userId', columns='title', values='rating')
df1 = df.pivot_table(index='title', columns='userId', values='rating')
df1.head()

matrix_corr_user = df1.corr(method='pearson').fillna(0) #User similarity matrix
mean_user_rating = df1.mean(axis=1) #mean rating of each movies
#You use np.newaxis so that mean_user_rating has same format as ratings
ratings_diff = (df1 - mean_user_rating[:, np.newaxis]).fillna(0)
#Prediction based on user-similarity and pearson correlation formula 
pred = mean_user_rating[:, np.newaxis] + (matrix_corr_user.dot(ratings_diff.T) / np.array([np.abs(matrix_corr_user).sum()]).T).T

# Due to the reason that some companies only have several rating and it could make the prediction bias 
# so when make some recommendation, we only focus on those movies that have at least 100 ratings made 
# List of movies that has more than 100 rating counts  
title_list = []
for i in range(len(pred)):
    if df1.iloc[i].count() > 100:
        title_list.append(df1.iloc[i].name)
    else:
        pass
print('Number of movies that has more than 100 rating counts: '+str(len(title_list)))
  

  
  # Remove the CWD from sys.path while we load stuff.


Number of movies that has more than 100 rating counts: 134


Check the top prediction could be recommended for user id 1: 

In [4]:
print('-----------------------')
print('Top most relevant movies that the recommender suggests based on user similarity')
print('-----------------------')

#User id no.12 as target
user_id = 1
final_list = pred[user_id].sort_values(ascending=False)
for i in range(len(final_list)): 
    if final_list.index[i] in title_list:
        print(final_list.index[i],"---------Predicted rate:",final_list[i])
    else:
        pass 

        

-----------------------
Top most relevant movies that the recommender suggests based on user similarity
-----------------------
Shawshank Redemption, The (1994) ---------Predicted rate: 4.490513373908258
Star Wars: Episode IV - A New Hope (1977) ---------Predicted rate: 4.352317929655573
Godfather, The (1972) ---------Predicted rate: 4.334089317133157
Schindler's List (1993) ---------Predicted rate: 4.320860739252987
Star Wars: Episode V - The Empire Strikes Back (1980) ---------Predicted rate: 4.305943897259695
Fight Club (1999) ---------Predicted rate: 4.293044583601053
Usual Suspects, The (1995) ---------Predicted rate: 4.292857973297115
Departed, The (2006) ---------Predicted rate: 4.276681428637795
Godfather: Part II, The (1974) ---------Predicted rate: 4.268328225136489
Goodfellas (1990) ---------Predicted rate: 4.26195189676241
Matrix, The (1999) ---------Predicted rate: 4.253508790334696
Princess Bride, The (1987) ---------Predicted rate: 4.24536685717501
Raiders of the Lost Ar

# ANNA Version

In [25]:
import pandas as pd
import numpy as np

In [26]:
# Data reading in Anna's style
path = 'https://raw.githubusercontent.com/duy7590/Group-Recommender-System/main/ratings.csv'
data = pd.read_csv(path)
data = data.drop(labels=['timestamp'], axis=1)

# Shortening the data, will fasten the computing in developing phase
data = data[0:10000] 
#data = data[0:5000]  # Tämä ottaa tietoja 32 käyttäjältä

print('View to the data')
print()
print(data[0:6])

View to the data

   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0
5       1       70     3.0


In [27]:
def compute_uratmeans(data): # Collection of mean ratings of all users 
    
    users = data['userId'].unique()
    uratmeans = {}
    
    for user in users:
        all_data_curr_user = data.loc[data['userId']==user]
        curr_mean = all_data_curr_user['rating'].mean()
        # Add the result to a dictionary
        if user in uratmeans:
            print("Error. The user is already in the dict.")
        else:
            uratmeans[user] = curr_mean
            
    return uratmeans

In [28]:
# Dictionary of the mean ratings of all users
uratmeans = compute_uratmeans(data)

In [29]:
# Now I need the similarities and and predictions for each test user.
# They are computed using the same ideas as in the first part of the first assignment.

In [30]:
def compute_pearson_sims(data, user_a, uratmeans): 
    # This function computes all similarities between user_a ant the others.
    # Similarity of user_a with oneself is included in the result.
    
    # The group of unique users
    users = data['userId'].unique()
    sims = {}  # In this dictionary, users as keys and similarities as values
    r_a_mean = uratmeans[user_a]
    
    # I would like to change this for loop into something else, but it seems
    # to not take so much computing time on my own computer.
    for user_b in users:
        #print('user_b ' + str(user_b), end= ' ')
        data_a = data.loc[data['userId']==user_a]
        data_b = data.loc[data['userId']==user_b]
            
        # Intersection of movies, that both a and b have rated
        common_items = data_a.loc[data_a['movieId'].isin(data_b['movieId'])].copy()
        common_items = common_items.set_index('movieId')
        # There may be no common items. That is handled later.
        
        # Adding empty columns for storaging the similarity sum aux terms
        # Terms to be storaged here refer to the sum tems of the formula
        common_items['num_term'] = None
        common_items['den_term_a'] = None
        common_items['den_term_b'] = None
        common_items = common_items.drop(labels=['userId', 'rating'], axis=1)
        
        # Iterate over all common items to get gather needed ratings
        # Tässä siis käyttäjät a ja b ovat kiinnitettyjä
        r_b_mean = uratmeans[user_b]
        
        numerator = 0.0
        denominator_a = 0.0 
        denominator_b = 0.0
        
        # Aux functions to apply for computing the sum terms of the sim formula
        def pear_sim_num_term_func(unit_row):
            #print(testi)
            movie=unit_row.name
            rap = data_a.loc[data_a['movieId']==movie]['rating'].values[0]
            rbp = data_b.loc[data_b['movieId']==movie]['rating'].values[0]
            num_term = np.round((rap - r_a_mean)*(rbp - r_b_mean), decimals = 5)
            return num_term
        def pear_sim_den_term_a_func(unit_row):
            movie=unit_row.name
            rap = data_a.loc[data_a['movieId']==movie]['rating'].values[0]
            rbp = data_b.loc[data_b['movieId']==movie]['rating'].values[0] 
            den_term_a = np.round((rap - r_a_mean)**2, decimals = 5)
            return den_term_a    
        def pear_sim_den_term_b_func(unit_row):
            movie=unit_row.name
            rap = data_a.loc[data_a['movieId']==movie]['rating'].values[0]
            rbp = data_b.loc[data_b['movieId']==movie]['rating'].values[0]
            den_term_b = np.round((rbp - r_b_mean)**2, decimals = 5)
            return den_term_b
        
        # Computing and storaging the sum terms
        if len(common_items) > 0:
            common_items.loc[:,'num_term'] = common_items.apply(
                    pear_sim_num_term_func, axis=1)
            common_items.loc[:,'den_term_b'] = common_items.apply(
                    pear_sim_den_term_b_func, axis=1)
            common_items.loc[:,'den_term_a'] = common_items.apply(
                    pear_sim_den_term_a_func, axis=1)
            
            # Computing the factors of the sim formula
            numerator = common_items.sum()['num_term']
            denominator_a = np.sqrt(common_items.sum()['den_term_a'])
            denominator_b = np.sqrt(common_items.sum()['den_term_b'])
        
        pearson_similarity = 0.0
        try:  # My try did not solve the zero division. Had to add this if.
            if abs(denominator_a * denominator_b) > 0.00000001:
                pearson_similarity = numerator/(denominator_a * denominator_b)
                pearson_similarity = np.round(pearson_similarity, decimals = 3)
        except ZeroDivisionError:
            print('Somebody made a zero division.')
        except e:
            print(e)
        
        sims[user_b] = pearson_similarity
    
    return sims

In [31]:
# Similarities for 3 test users
print('Computing similarities for user 1.')
similarities1 = compute_pearson_sims(data, user1, uratmeans)
print('Similarities 1 computed.')
print('Computing similarities for user 2.')
similarities2 = compute_pearson_sims(data, user2, uratmeans)
print('Similarities 2 computed.')
print('Computing similarities for user 3.')
similarities3 = compute_pearson_sims(data, user3, uratmeans)
print('Similarities 3 computed.')

Computing similarities for user 1.
Similarities 1 computed.
Computing similarities for user 2.
Similarities 2 computed.
Computing similarities for user 3.
Similarities 3 computed.


In [32]:
# See what kind of similarities I got for the user 1. similarities1 is a dictionary
# including the similarity with the user herself.
for i in similarities1.keys():
  print(similarities1[i], end=' ')

1.0 0.999 0.011 0.21 0.195 -0.302 -0.047 0.427 0.355 -0.151 -0.45 1.0 0.948 0.225 0.312 0.05 0.198 0.231 0.226 0.526 0.086 -0.13 -0.261 0.052 0.018 -0.148 0.181 0.009 -0.063 0.052 0.052 0.27 0.105 0.1 0.313 0.442 -0.419 0.058 -0.225 -0.355 -0.143 0.145 0.151 0.609 0.239 -0.017 0.519 -0.155 0.753 0.004 0.068 -0.359 0.0 -0.189 0.37 0.128 0.353 0.256 0.057 0.36 0.188 0.509 0.31 0.199 0.824 -0.014 

In [33]:
# And for user 2
for i in similarities2.keys():
  print(similarities2[i], end=' ')

0.999 1.0 0.0 -1.0 1.0 -0.668 -0.551 -1.0 0.0 0.027 -0.256 0.0 0.999 1.0 -0.215 -0.192 -0.015 -0.306 -0.21 0.999 -0.039 -0.359 -0.999 -0.477 0.272 0.0 0.0 -0.154 -0.406 -0.513 0.0 -1.0 0.347 -0.293 0.0 -0.705 1.0 -1.0 0.0 -1.0 0.011 0.18 1.0 0.0 0.772 0.0 0.103 0.0 0.061 -0.077 0.032 0.259 0.0 -1.0 0.999 -0.289 -0.463 -0.668 0.0 0.311 -1.0 -0.159 -0.474 -0.42 0.048 0.0 

In [34]:
# And user 3
for i in similarities3.keys():
  print(similarities3[i], end=' ')

0.011 0.0 1.0 -1.0 -1.0 0.322 0.0 -1.0 0.0 0.0 0.0 0.0 0.0 1.0 -0.946 -0.201 -0.16 -0.372 -0.459 -0.609 -0.687 0.356 -1.0 -1.0 -1.0 0.0 -1.0 0.33 -0.582 0.0 -1.0 -0.874 -0.881 1.0 0.0 0.0 0.0 -1.0 1.0 -1.0 -1.0 -0.324 0.0 -1.0 -0.661 0.0 0.725 0.0 -1.0 0.206 -0.367 0.0 0.0 0.0 1.0 0.0 -0.656 -0.767 -0.551 -1.0 -1.0 -1.0 -0.867 -0.535 0.997 0.0 

In [35]:
def compute_pear_prediction(unit_row, user_a, pearson_sims, data, uratmeans):
    # This function computes the prediction for a known pair of
    # user_a and movie_p, finding the movie information from unit_row of a df
    if unit_row is None:
        return 0.0
    r_a_mean = uratmeans[user_a]
    item_p = unit_row.name
    data_p = data.loc[data['movieId']==item_p] # Data, where item_p has ratings
    users_p = data_p['userId'] # The users, who have rated item_p
    
    # Place to storage the sum terms of the prediction formula
    users_p = pd.DataFrame(index=data_p['userId'].unique()) # Drop duplicates
    users_p.index.name = 'userId'
    users_p.loc[:, 'num'] = 0.0
    users_p.loc[:, 'den'] = 0.0
    
    # Aux function to compute the numerator terms
    def num_term_func(unit_row):
        user_b = unit_row.name
        sim_ab = pearson_sims[user_b]
        row_bp = None
        try:
            row_bp = data_p.loc[data_p['userId']==user_b]
        except:
            return 0.0
        if len(row_bp) < 1:
            return 0.0
        if len(row_bp) > 1:  # If there are 2 ratings from same user
            row_bp = row_bp.iloc[0]
        rbp = row_bp['rating'].values[0]
        r_b_mean = uratmeans[user_b]
        try:
            if sim_ab * (rbp - r_b_mean) is not None:
                return np.round(sim_ab * (rbp - r_b_mean), decimals=4)
        except:
            return 0.0
    
    def den_term_func(unit_row):
        user_b = unit_row.name
        sim_ab = pearson_sims[user_b]
        return sim_ab
    
    users_p.loc[:, 'num'] = users_p.apply(num_term_func, axis=1)
    users_p.loc[:, 'den'] = users_p.apply(den_term_func, axis=1)
    
    # Next the sum parts of the prediction formula
    numerator = 0.0
    if users_p.sum()['num'] is not None:
        numerator = users_p.sum()['num']
    denominator = 0.0
    if users_p.sum()['den'] is not None:
        denominator = users_p.sum()['den']
    
    # Complete the computation of the formula. 
    pear_pred = r_a_mean
    try:
        if abs(denominator) > 0.00000001:
            pear_pred = np.round(r_a_mean + numerator/denominator, decimals=2)
            return pear_pred
    except ZeroDivisionError:
        print('Zero division, caught by try-except -scope.')
    
    return np.round(pear_pred, decimals=2)    

In [36]:
predicts.loc[:, 'predict1'] = predicts.apply(compute_pear_prediction, axis=1, args=(user1, similarities1, data, uratmeans))

predicts.loc[:, 'predict2'] = predicts.apply(
    compute_pear_prediction, axis=1, args=(
    user2, similarities2, data, uratmeans))

predicts.loc[:, 'predict3'] = predicts.apply(
    compute_pear_prediction, axis=1, args=(
    user3, similarities3, data, uratmeans))

In [38]:
#Test 1 with Borda approach
predicts_test1=predicts.drop(['average','min_pred'],axis=1)
predicts_test1['rank_1']= predicts_test1["predict1"].rank(method ='average') 
predicts_test1['rank_2']= predicts_test1["predict2"].rank(method ='average') 
predicts_test1['rank_3']= predicts_test1["predict3"].rank(method ='average') 
predicts_test1['Borda_SumOfRank'] = predicts_test1['rank_1']+predicts_test1['rank_2']+predicts_test1['rank_3']
predicts_test1

Unnamed: 0_level_0,predict1,predict2,predict3,max_pred,biggest_diff,predict_B,rank_1,rank_2,rank_3,Borda_SumOfRank
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,4.32,5.41,2.66,3.914575,0.045203,3.880548,41.0,118.0,69.0,228.0
2,4.24,4.83,2.55,3.482031,0.064089,3.434738,38.5,107.0,56.0,201.5
6,3.87,4.35,2.53,4.061274,0.126249,3.940014,14.5,80.0,52.0,146.5
10,4.18,3.22,1.70,3.585496,0.112834,3.495750,32.0,22.0,9.0,63.0
32,4.63,3.84,2.38,3.995862,0.029108,3.976404,63.0,46.0,38.0,147.0
...,...,...,...,...,...,...,...,...,...,...
48516,5.26,-12.62,2.40,4.322131,0.083588,4.254042,113.0,2.0,42.0,157.0
58559,4.84,4.02,2.83,4.320131,0.093818,4.251629,81.0,56.5,86.5,224.0
60069,4.87,4.24,2.67,4.099338,0.040733,4.063163,83.0,74.0,71.0,228.0
68954,5.42,4.24,3.04,4.016408,0.047506,3.981721,122.5,74.0,105.0,301.5


In [39]:
#Test 2 with this approach i learned from here:
#https://people.uta.fi/~kostas.stefanidis/dbir16/papers/amer-yahia09.pdf

#Average Pair-wise Disagreements of groups over item i 
predicts_test2=predicts_test1.drop(['Borda_SumOfRank'],axis=1)
predicts_test2['Disagreements'] = (abs(predicts_test2['rank_1']-predicts_test2['rank_2'])+abs(predicts_test2['rank_1']-predicts_test2['rank_3'])+abs(predicts_test2['rank_2']-predicts_test2['rank_3']))*2/(3*(3-1))
#(Group Relevance). The relevance of an item i to a group based on average method
predicts_test2['average'] = predicts['average']
#Consensus function
predicts_test2['consensus value'] = 0.8*predicts['average']-0.2*(1-predicts_test2['Disagreements'])
predicts_test2.sort_values(by=['consensus value'], ascending=False)


Unnamed: 0_level_0,predict1,predict2,predict3,max_pred,biggest_diff,predict_B,rank_1,rank_2,rank_3,Disagreements,average,consensus value
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
434,28.94,1.74,0.86,3.060756,0.072316,2.997255,134.0,9.0,1.0,88.666667,3.018950,19.948493
5989,4.74,-35.39,3.44,3.925642,0.062059,3.871353,72.0,1.0,128.0,84.666667,3.889971,19.845310
1258,2.85,5.68,3.02,4.142206,0.092470,4.070575,5.0,120.0,101.5,76.666667,4.098316,18.411986
4878,3.34,3.83,3.32,3.994877,0.024269,3.974969,8.0,45.0,122.5,76.333333,3.982250,18.252467
380,6.31,6.23,1.87,3.628177,0.120833,3.525156,131.0,126.0,14.0,78.000000,3.561406,18.249125
...,...,...,...,...,...,...,...,...,...,...,...,...
349,2.65,1.76,1.83,3.623980,0.030528,3.603146,1.0,10.0,11.0,6.666667,3.612305,4.023177
208,3.50,2.70,1.08,2.938639,0.049135,2.900707,9.0,15.0,3.0,8.000000,2.915448,3.732358
316,2.83,1.43,1.85,3.490030,0.186761,3.360561,4.0,8.0,12.0,5.333333,3.416589,3.599938
2683,4.10,3.53,2.11,3.226094,0.045110,3.183282,26.0,32.0,24.5,5.000000,3.196815,3.357452
