In [1]:
#Importing the relevant libraries
import numpy as np
import pandas as pd
import math
import statistics as stat
from scipy.stats import pearsonr, kendalltau

In [2]:
#Importing the Datasets ratings and movies
ratings = pd.read_csv(r"/Volumes/bp first/Tampere University/Recommender Systems/ml-100k/userdata.csv", sep='\t', header=None)
ratings.columns=['userId','movieId','rating','timestamp']

d = 'movieId | title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western'
column_names2 = d.split(' | ')

# Loading the movies dataset
movies_data = pd.read_csv('/Volumes/bp first/Tampere University/Recommender Systems/ml-100k/uitem.csv', sep='|',header=None,names=column_names2,encoding='latin-1')
movies=movies_data[['movieId','title']]
movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [3]:
#Dropping 'Timestamp' as it is not required
ratings = ratings.drop(columns = ["timestamp"])

In [4]:
#Ratings Dataset
ratings

Unnamed: 0,userId,movieId,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [5]:
#Movies Dataset
movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [6]:
# Calculating the length of unique users in the dataset just for the knowledge and check purpose
print(f'Number of unique users {len(ratings["userId"].unique())}')

Number of unique users 943


In [7]:
#Picking up a random user (userId= 87) 
userA = 87

In [8]:
#Fuction for creating dataset of a single user with all the movies he has rated
def create_userX_matrix(userId):
    userX_matrix = ratings.query('userId == @userId',inplace = False)
    return userX_matrix

#Fuction for finding other similar user who has watched some/all similar movies with the selected user(userId)
def user_subset_common_movie(userId):
    userX_matrix = create_userX_matrix(userId)
    users = ratings[ratings['movieId'].isin(userX_matrix['movieId'].tolist())]
    # Creating sub dataframes based on userId
    userSubsetGroup = users.groupby(['userId'])
    # Sorting the sub dataframe of all simmilar users so that top common with the selected user will have the priority
    userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)
    userSubsetGroup = userSubsetGroup[1:81]
    return userSubsetGroup, userX_matrix

In [9]:
# Now we will find out the similar users like the selected user (userId= 87) 
# Alternately, we can say, finding the useres who rates similar movies as our selected user
userSubsetGroupA, userAMatrix = user_subset_common_movie(userA)
userAMatrix

Unnamed: 0,userId,movieId,rating
56,87,384,4
92,87,1016,4
141,87,274,4
240,87,554,4
375,87,40,3
...,...,...,...
93711,87,128,3
95110,87,1178,3
95466,87,598,2
95984,87,849,5


In [10]:
# Inspecting the 2nd top user (1st one is the same as our selected user)
userSubsetGroupA[1]

(450,
        userId  movieId  rating
 17680     450      783       3
 17963     450      100       4
 18102     450      801       4
 18124     450      216       5
 18148     450      144       5
 ...       ...      ...     ...
 93026     450      628       4
 93734     450       22       5
 94234     450       25       3
 94454     450      228       4
 98871     450      732       3
 
 [146 rows x 3 columns])

In [11]:
#Total number of similar user who has rated most common movies as selected user (userId= 87)  
len(userSubsetGroupA)

80

In [12]:
#Store the Pearson Correlation in a dictionary, 
# where the key is the user Id and the value is the coefficient
def pearsonCorr(inputMovies, similarUsersGroup):
    pearsonCorrelationDict = {}
    #For every similar user group in our subset
    for name, group in similarUsersGroup:

        #Let's start by sorting the input and current user group so the values aren't mixed up later on
        group = group.sort_values(by='movieId')
        inputMovies1 = inputMovies.sort_values(by='movieId')

        #Get the N (total similar movies watched) for the formula 
        #nRatings = len(group)

        #Get the review scores of the Selected user for the movies that they both have in common
        temp_df = inputMovies1[inputMovies1['movieId'].isin(group['movieId'].tolist())]

        #And then store them in a temporary buffer variable in a list format to facilitate future calculations
        selectedUserTempRatingList = temp_df['rating'].tolist()
        #print(selectedUserTempRatingList)

        #Let's also put the current user group reviews in a list format
        similarUserTempRatingList = group['rating'].tolist()

        #Calculating pearson similarity
        simXX = 0
        meanSelectedUserTempRating = stat.mean(selectedUserTempRatingList)
        for i in selectedUserTempRatingList:
            simXX = simXX + pow((i - meanSelectedUserTempRating),2)

        simYY = 0
        meanSimilarUserTempRaning = stat.mean(similarUserTempRatingList)
        for j in similarUserTempRatingList:
            simYY = simYY + pow((j - meanSimilarUserTempRaning),2)

        simXY = 0
        for i, j in zip(selectedUserTempRatingList, similarUserTempRatingList):
            simXY = simXY+ ((i - meanSelectedUserTempRating ) * (j-meanSimilarUserTempRaning))

        #If the denominator is different than zero, then divide, else, 0 correlation.
        if simXX != 0 and simYY != 0:
            pearsonCorrelationDict[name] = simXY/np.sqrt(simXX*simYY)
        else:
            pearsonCorrelationDict[name] = 0                    
    
    maxSimilarUser = dict(sorted(pearsonCorrelationDict.items(), key=lambda item: item[1], reverse= True))
    return maxSimilarUser

In [13]:
#Similarity score of other similar users to our selected user (155)
def similarUsersofUserID (inputMovies, pearsonCorrelationDict):
    similarityScoreofSimilarUsers_data = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    similarityScoreofSimilarUsers_data.head()
    similarityScoreofSimilarUsers_data.columns = ['similarityScore']
    similarityScoreofSimilarUsers_data['userId'] = similarityScoreofSimilarUsers_data.index
    similarityScoreofSimilarUsers_data.index = range(len(similarityScoreofSimilarUsers_data))
    #similarityScoreofSimilarUsers_data.head()
    topSimilarUsers=similarityScoreofSimilarUsers_data.sort_values(by='similarityScore', ascending=False)
    topSimilarUsers.head(10)
    topSimilarUsersRating = topSimilarUsers.merge(ratings, left_on='userId', right_on='userId', how='inner')
    topSimilarUsersRating.head()
    meanRb = topSimilarUsersRating.groupby('userId').mean()[['rating']]
    meanRb.columns = ['avgRating']
    meanRb['userId'] = meanRb.index
    meanRb.index = range(len(meanRb))
    topSimilarUsersRating = topSimilarUsersRating.merge(meanRb, left_on='userId', right_on='userId', how='inner')
    topSimilarUsersRating['weightedRatingScore'] = topSimilarUsersRating['similarityScore']*(topSimilarUsersRating['rating']-topSimilarUsersRating['avgRating'])
    tempTopSimilarUsersRating = topSimilarUsersRating.groupby('movieId').sum()[['weightedRatingScore']]
    tempTopSimilarUsersRating.columns = ['sum_weightedRatingScore']
    tempTopSimilarUsersRating['movieId'] = tempTopSimilarUsersRating.index
    
    #Creates an empty dataframe
    recommendation_data = pd.DataFrame()
    #Now we take the weighted average
    meanRa = inputMovies['rating'].mean()
    recommendation_data['weighted average recommendation score'] = meanRa+(tempTopSimilarUsersRating['sum_weightedRatingScore']/topSimilarUsers['similarityScore'].sum())
    recommendation_data['movieId'] = recommendation_data.index
    recommendation_data = recommendation_data.sort_values(by='weighted average recommendation score', ascending=False)
    recommendation_data.index = [x for x in range(1,len(recommendation_data)+1)]
    return recommendation_data

In [14]:
## Calling the function Pearson Correlation for calculating the similarity index in terms of 1st user in the group (userId= 87)  
pearsonCorrelationDictA = pearsonCorr(userAMatrix, userSubsetGroupA)

## Calling the function for generating recommendations in terms of 1st user in the group userA (userId= 87) 
recMovieA = similarUsersofUserID(userAMatrix, pearsonCorrelationDictA)
recMovieA

Unnamed: 0,weighted average recommendation score,movieId
1,5.007778,50
2,4.939580,174
3,4.798163,64
4,4.764024,172
5,4.763139,98
...,...,...
1540,3.324132,412
1541,3.304386,122
1542,3.287322,546
1543,3.262478,29


In [15]:
#lets see users who are similar to 1st user in the group userA (userId= 87) by looking at correlation dictionary
pearsonCorrelationDictA

{796: 0.5057821247747694,
 533: 0.47937128941312074,
 804: 0.4482773005730129,
 896: 0.44610708130314314,
 222: 0.4244888274560173,
 457: 0.4166728208938566,
 650: 0.4160032518506304,
 416: 0.41389648854252375,
 95: 0.40355266537945583,
 648: 0.3966827659931506,
 653: 0.39557704109598135,
 660: 0.3872892928109776,
 881: 0.36450499285237564,
 311: 0.36065736241592206,
 749: 0.35498964061184024,
 301: 0.35316140710884586,
 450: 0.3455739759457818,
 933: 0.33614580138693934,
 551: 0.3294458490338792,
 880: 0.3277933375077127,
 178: 0.32011284916254873,
 886: 0.31868084804955943,
 268: 0.31833351981578084,
 916: 0.31814550083442983,
 374: 0.3171173575726351,
 682: 0.312221269924082,
 13: 0.3100584680366438,
 303: 0.3056027379099257,
 474: 0.29885188497771925,
 1: 0.29858730044677895,
 234: 0.29646255605061517,
 889: 0.2951439187884295,
 393: 0.2815628463768405,
 109: 0.2754462275814459,
 497: 0.2727241994305363,
 92: 0.271381029568534,
 276: 0.27005537501834065,
 405: 0.2693189309740629,
 

In [16]:
#As userId = 796 and userId = 533 has the highest similarity score with our selected userA
#So, lets take 796 and 533 so lets produce recommendation for them, based on pearson correlation value
#The same approach is followed for the following two users as done for the userA.
#The approach is basically finding the most similar users for userB and userC 

In [17]:
#2nd user of the group
userB = 796

# finding the useres who rates similar movies as our selected userB
userSubsetGroupB, userBMatrix = user_subset_common_movie(userB)

#Calling the function Pearson Correlation for calculating the similarity index in terms of 1st user in the group userB
pearsonCorrelationDictB = pearsonCorr(userBMatrix, userSubsetGroupB)

#Calling the function for generating recommendations in terms of 1st user in the group userB 
recMovieB = similarUsersofUserID(userBMatrix, pearsonCorrelationDictB)
recMovieB

Unnamed: 0,weighted average recommendation score,movieId
1,4.856139,50
2,4.768973,174
3,4.702548,172
4,4.639317,12
5,4.607882,22
...,...,...
1548,3.130155,412
1549,3.115721,554
1550,3.109293,29
1551,3.102263,231


In [18]:
#2nd user of the group
userC = 533

# finding the useres who rates similar movies as our selected userC
userSubsetGroupC, userCMatrix = user_subset_common_movie(userC)

#Calling the function Pearson Correlation for calculating the similarity index in terms of 1st user in the group userC
pearsonCorrelationDictC = pearsonCorr(userCMatrix, userSubsetGroupC)

#Calling the function for generating recommendations in terms of 1st user in the group userC
recMovieC = similarUsersofUserID(userCMatrix, pearsonCorrelationDictC)
recMovieC

Unnamed: 0,weighted average recommendation score,movieId
1,4.505790,64
2,4.440320,50
3,4.416857,174
4,4.377839,98
5,4.260990,172
...,...,...
1554,2.884996,412
1555,2.877763,554
1556,2.854517,931
1557,2.853632,53


In [19]:
#List of recommendations movies for 1st user in the group userA
aUserList = recMovieA['movieId'].tolist()

#List of recommendations movies for 1st user in the group userB
bUserList = recMovieB['movieId'].tolist()

#List of recommendations movies for 1st user in the group userC
cUserList = recMovieC['movieId'].tolist()

In [20]:
aUserList

[50,
 174,
 64,
 172,
 98,
 173,
 181,
 127,
 22,
 12,
 195,
 318,
 168,
 191,
 96,
 183,
 483,
 89,
 204,
 56,
 79,
 69,
 357,
 210,
 100,
 176,
 144,
 28,
 603,
 651,
 265,
 194,
 187,
 511,
 186,
 134,
 496,
 196,
 208,
 216,
 185,
 228,
 199,
 313,
 480,
 474,
 427,
 180,
 1,
 484,
 182,
 211,
 527,
 193,
 202,
 132,
 153,
 215,
 197,
 82,
 655,
 11,
 479,
 520,
 87,
 435,
 117,
 423,
 223,
 121,
 178,
 135,
 234,
 203,
 238,
 588,
 8,
 7,
 169,
 523,
 657,
 222,
 684,
 268,
 48,
 71,
 258,
 385,
 83,
 42,
 746,
 97,
 156,
 272,
 429,
 177,
 198,
 302,
 246,
 124,
 568,
 257,
 430,
 498,
 95,
 9,
 531,
 478,
 150,
 170,
 663,
 154,
 431,
 190,
 735,
 515,
 315,
 157,
 705,
 528,
 192,
 275,
 136,
 316,
 317,
 692,
 151,
 200,
 333,
 443,
 526,
 732,
 205,
 23,
 114,
 654,
 660,
 125,
 239,
 482,
 99,
 742,
 188,
 15,
 129,
 250,
 519,
 661,
 31,
 504,
 137,
 164,
 237,
 298,
 143,
 378,
 282,
 652,
 521,
 52,
 566,
 408,
 506,
 497,
 179,
 161,
 510,
 513,
 285,
 269,
 276,
 462,
 

In [21]:
#The common recommended movies of all users (userA, userB, userC) in group
commonRecMovAverage = set(aUserList) & set(bUserList) & set(cUserList)

#Total number of common recommended movies of all users (userA, userB, userC) in group
len(commonRecMovAverage)

1542

In [22]:
#Finding only the common movies between users with the recommendation score with UserA
recMovieA = recMovieA[recMovieA['movieId'].isin(commonRecMovAverage)]

#Finding only the common movies between users with the recommendation score with UserB
recMovieB = recMovieB[recMovieB['movieId'].isin(commonRecMovAverage)]

#Finding only the common movies between users with the recommendation score with UserC
recMovieC = recMovieC[recMovieC['movieId'].isin(commonRecMovAverage)]

In [23]:
recMovieA

Unnamed: 0,weighted average recommendation score,movieId
1,5.007778,50
2,4.939580,174
3,4.798163,64
4,4.764024,172
5,4.763139,98
...,...,...
1540,3.324132,412
1541,3.304386,122
1542,3.287322,546
1543,3.262478,29


In [24]:
#Calculate the average aggregation with the value of individual recommendation score of each common movie between the users (userA, userB, userC) in group
frames = [recMovieA, recMovieB, recMovieC]
result = pd.concat(frames)

GroupRecommendationAverage = result.groupby(['movieId'], as_index= False).mean().sort_values(by='weighted average recommendation score', ascending=False)
GroupRecommendationAverage.index = [x for x in range(1, len(GroupRecommendationAverage)+1)]
GroupRecommendationAverage

Unnamed: 0,movieId,weighted average recommendation score
1,50,4.768079
2,174,4.708470
3,64,4.630003
4,172,4.575854
5,98,4.548956
...,...,...
1538,554,3.119239
1539,412,3.113094
1540,29,3.089260
1541,122,3.082150


In [25]:
#Concating the movieId with the movie title from the collective movie list after average aggregation  
movies.loc[movies['movieId'].isin(GroupRecommendationAverage.head(20)['movieId'])]['title']

11                 Usual Suspects, The (1995)
21                          Braveheart (1995)
49                           Star Wars (1977)
55                        Pulp Fiction (1994)
63           Shawshank Redemption, The (1994)
68                        Forrest Gump (1994)
78                       Fugitive, The (1993)
95          Terminator 2: Judgment Day (1991)
97           Silence of the Lambs, The (1991)
126                     Godfather, The (1972)
167    Monty Python and the Holy Grail (1974)
171           Empire Strikes Back, The (1980)
172                Princess Bride, The (1987)
173            Raiders of the Lost Ark (1981)
180                 Return of the Jedi (1983)
190                            Amadeus (1984)
194                    Terminator, The (1984)
203                 Back to the Future (1985)
317                   Schindler's List (1993)
482                         Casablanca (1942)
Name: title, dtype: object

In [26]:
#Calculate the Least Misery aggregation with the value of individual recommendation score of each common movie between the users (userA, userB, userC) in group
GroupRecommendationLM = result.groupby(['movieId'], as_index = False).min().sort_values(by='weighted average recommendation score', ascending=False)
GroupRecommendationLM.index = [x for x in range(1, len(GroupRecommendationLM)+1)]
GroupRecommendationLM

Unnamed: 0,movieId,weighted average recommendation score
1,64,4.505790
2,50,4.440320
3,174,4.416857
4,98,4.377839
5,172,4.260990
...,...,...
1538,412,2.884996
1539,554,2.877763
1540,931,2.854517
1541,53,2.853632


In [27]:
#Concating the movieId with the movie title from the collective movie list after Least Misery aggregation 
movies.loc[movies['movieId'].isin(GroupRecommendationLM.head(20)['movieId'])]['title']

11                 Usual Suspects, The (1995)
21                          Braveheart (1995)
49                           Star Wars (1977)
55                        Pulp Fiction (1994)
63           Shawshank Redemption, The (1994)
78                       Fugitive, The (1993)
95          Terminator 2: Judgment Day (1991)
97           Silence of the Lambs, The (1991)
99                               Fargo (1996)
126                     Godfather, The (1972)
171           Empire Strikes Back, The (1980)
172                Princess Bride, The (1987)
173            Raiders of the Lost Ark (1981)
180                 Return of the Jedi (1983)
182                              Alien (1979)
190                            Amadeus (1984)
215            When Harry Met Sally... (1989)
317                   Schindler's List (1993)
356    One Flew Over the Cuckoo's Nest (1975)
482                         Casablanca (1942)
Name: title, dtype: object

In [28]:
#let's create Kendall-Tau distance calculation to evaluate the recommendation of our grouping (against Average Aggregation method)

#List of recommendations movies for 1st user in the group userA
aUserList = recMovieA['movieId'].tolist()

#List of recommendations movies for 1st user in the group userB
bUserList = recMovieB['movieId'].tolist()

#List of recommendations movies for 1st user in the group userC
cUserList = recMovieC['movieId'].tolist()

#listing the common movie Id from the Average Aggregation method
averageList = GroupRecommendationAverage['movieId'].tolist()
len(averageList)

1542

In [29]:
len(cUserList)

1542

In [30]:
#Kandall Tau Correlation between movie rank between userA with the commom group movie rank (aginst Average Aggregation method) 
tauAAVG, p_valueA = kendalltau(aUserList, averageList) 
#Kandall Tau Correlation between movie rank between userB with the commom group movie rank (aginst Average Aggregation method)
tauBAVG, p_valueB = kendalltau(bUserList, averageList) 
#Kandall Tau Correlation between movie rank between userC with the commom group movie rank (aginst Average Aggregation method)
tauCAVG, p_valueC = kendalltau(cUserList, averageList) 

print(f'KT Correlation for user A against Average aggregation method: {tauAAVG}')
print(f'KT Correlation for user B against Average aggregation method: {tauBAVG}')
print(f'KT Correlation for user C against Average aggregation method: {tauCAVG}')

KT Correlation for user A against Average aggregation method: 0.25831845677718657
KT Correlation for user B against Average aggregation method: 0.24405211297597612
KT Correlation for user C against Average aggregation method: 0.26604164089045546


In [31]:
p_valueA

3.6267835145732094e-52

In [32]:
#let's create Kendall-Tau distance calculation to evaluate the recommendation of our grouping (against Least Misery Aggregation method)

#listing the common movie Id from the Least Misery Aggregation method
lmList = GroupRecommendationLM['movieId'].tolist()

#Kandall Tau Correlation between movie rank between userA with the commom group movie rank (aginst Least Misery Aggregation method)
tauALM, p_valueALM = kendalltau(aUserList, lmList) 
#Kandall Tau Correlation between movie rank between userB with the commom group movie rank (aginst Least Misery Aggregation method)
tauBLM, p_valueBLM = kendalltau(bUserList, lmList) 
#Kandall Tau Correlation between movie rank between userC with the commom group movie rank (aginst Least Misery Aggregation method)
tauCLM, p_valueCLM = kendalltau(cUserList, lmList) 

print(f'KT Correlation for user A against LM aggregation method: {tauALM}')
print(f'KT Correlation for user B against LM aggregation method: {tauBLM}')
print(f'KT Correlation for user C against LM aggregation method: {tauCLM}')

KT Correlation for user A against LM aggregation method: 0.25153457883985586
KT Correlation for user B against LM aggregation method: 0.251670929736363
KT Correlation for user C against LM aggregation method: 0.952088651649551


In [33]:
#Let's find users that have less disagreement, instead of using pearson correlation, let's count userA against other users
#and find the ones that has the highest Kendall tau correlation
#let's choose users who at least have 30 common rated movies
similarUserA2 = [name for name,group in userSubsetGroupA] 
len(similarUserA2)


80

In [34]:
#Function for finding recommendation with a single user with other user
def produce_recommendation(userId):
    userSubsetGroup, userMatrix = user_subset_common_movie(userId)
    pearsonCorrelationDict = pearsonCorr(userMatrix, userSubsetGroup)
    recMovieX = similarUsersofUserID(userMatrix, pearsonCorrelationDict)
    return recMovieX
    
#Function for kendall_tau correlatuib between users
def kendall_tau_between_users(userId, otherUsers, recMovieA):
    recMovieX = produce_recommendation(userId)
    recMovieListX = recMovieX['movieId'].tolist() #list Of Recommended Movie For UserId
    aUserList = recMovieA['movieId'].tolist()
    commonRecMovAverage = set(aUserList) & set(recMovieListX)
    
    recMovieA = recMovieA[recMovieA['movieId'].isin(commonRecMovAverage)]
    recMovieX = recMovieX[recMovieX['movieId'].isin(commonRecMovAverage)]
    
    recMovieListA = recMovieA['movieId'].tolist()
    recMovieListX = recMovieX['movieId'].tolist()
    
    tauUser, p_valueUser = kendalltau(recMovieListA, recMovieListX) 
    return tauUser
    

In [35]:
#Create dictionary for storing userId with their kendall tau correlation value
KTDict = {}
for user in similarUserA2:
    x = kendall_tau_between_users(user, userSubsetGroupA, recMovieA)
    KTDict[user] = x 
    KTDict = dict(sorted(KTDict.items(), key=lambda item: item[1], reverse= True))

In [36]:
#Showing the itemms of all kendall tau correlation value
KTDict 

#let's take 749 as new userB and 682 new userC because it has the highest kendall tau correlatuib

{749: 0.2811782737471499,
 682: 0.2760443968750263,
 497: 0.2752949850645267,
 495: 0.2736556509512958,
 301: 0.27271946813050296,
 537: 0.27139535542412196,
 472: 0.27138625936465527,
 758: 0.26941088182017386,
 561: 0.2688124551427152,
 276: 0.2677333063297976,
 56: 0.265893690455094,
 92: 0.2656393215785393,
 308: 0.26228175004023435,
 406: 0.2622485337772701,
 727: 0.2618016636186655,
 417: 0.26157613365881105,
 1: 0.26048189318792825,
 660: 0.2603137666446132,
 95: 0.25918881715262787,
 416: 0.258523824794148,
 268: 0.25779600516442625,
 474: 0.25777539049910836,
 886: 0.2573673240339907,
 864: 0.2566283789982585,
 648: 0.2558601683845032,
 269: 0.25565844016683525,
 222: 0.25561668901306356,
 94: 0.25516723605791036,
 327: 0.25450010843634785,
 551: 0.2544965106368616,
 363: 0.2543566751224116,
 151: 0.25395406058501174,
 889: 0.2516707517134775,
 13: 0.2512455149001998,
 178: 0.2511103432582991,
 387: 0.25100811917154625,
 533: 0.24964586642157166,
 59: 0.2492223080676642,
 7: 0

In [37]:
#Finding top recommended movies of new userB 
newRecMovieB = produce_recommendation(749)

#Finding top recommended movies of new userC 
newRecMovieC = produce_recommendation(682)

In [38]:
#List of recommendations movies for 1st user in the group new userA
newAUserList = recMovieA['movieId'].tolist()

#List of recommendations movies for 1st user in the group new userB
newBUserList = newRecMovieB['movieId'].tolist()

#List of recommendations movies for 1st user in the group new userC
newCUserList = newRecMovieC['movieId'].tolist()

#List of recommendations movies for all users in the group users
newCommonRecMov = set(newAUserList) & set(newBUserList) & set(newCUserList)

newRecMovieA = recMovieA[recMovieA['movieId'].isin(newCommonRecMov)]
newRecMovieB = newRecMovieB[newRecMovieB['movieId'].isin(newCommonRecMov)]
newRecMovieC = newRecMovieC[newRecMovieC['movieId'].isin(newCommonRecMov)]

In [39]:
#Calculate the new average aggregation with the value of individual recommendation score of each common movie between the new users (new userA, new userB, new userC) in group
newFrames = [newRecMovieA, newRecMovieB , newRecMovieC]
newResult = pd.concat(newFrames)
newGroupRecommendationAverage =  newResult.groupby(['movieId'], as_index= False).mean().sort_values(by='weighted average recommendation score', ascending=False)
newGroupRecommendationAverage.index = [x for x in range(1, len(newGroupRecommendationAverage)+1)]
newGroupRecommendationAverage

Unnamed: 0,movieId,weighted average recommendation score
1,50,4.698143
2,174,4.605160
3,64,4.514545
4,172,4.494437
5,98,4.492951
...,...,...
1537,122,3.072752
1538,412,3.054437
1539,546,3.019849
1540,231,3.008005


In [40]:
#Concating the movie data with movie title
#movies.loc[movies['movieId'].isin(newGroupRecommendationAverage.head(20)['movieId'])]['title']
newGroupRecommendationAverage['movieName'] = newGroupRecommendationAverage['movieId'].map(movies['title'])
newGroupRecommendationAverage.head(20)

Unnamed: 0,movieId,weighted average recommendation score,movieName
1,50,4.698143,Legends of the Fall (1994)
2,174,4.60516,Brazil (1985)
3,64,4.514545,What's Eating Gilbert Grape (1993)
4,172,4.494437,"Princess Bride, The (1987)"
5,98,4.492951,Snow White and the Seven Dwarfs (1937)
6,173,4.423396,Raiders of the Lost Ark (1981)
7,12,4.377031,Mighty Aphrodite (1995)
8,127,4.376642,Supercop (1992)
9,181,4.353085,GoodFellas (1990)
10,22,4.347621,Taxi Driver (1976)


In [41]:
#Calculate the Least Misery aggregation with the value of individual recommendation score of each common movie between the new users (new userA, new userB, new userC) in group
newGroupRecommendationLM = newResult.groupby(['movieId'], as_index = False).min().sort_values(by='weighted average recommendation score', ascending=False)
newGroupRecommendationLM.index = [x for x in range(1, len(newGroupRecommendationLM)+1)]
newGroupRecommendationLM

Unnamed: 0,movieId,weighted average recommendation score
1,50,4.282362
2,174,4.179346
3,64,4.140541
4,172,4.128661
5,98,4.091056
...,...,...
1537,38,2.682526
1538,546,2.677297
1539,29,2.674228
1540,53,2.666939


In [42]:
newGroupRecommendationLM['movieName'] = newGroupRecommendationLM['movieId'].map(movies['title'])
newGroupRecommendationLM.head(20)

Unnamed: 0,movieId,weighted average recommendation score,movieName
1,50,4.282362,Legends of the Fall (1994)
2,174,4.179346,Brazil (1985)
3,64,4.140541,What's Eating Gilbert Grape (1993)
4,172,4.128661,"Princess Bride, The (1987)"
5,98,4.091056,Snow White and the Seven Dwarfs (1937)
6,12,4.047161,Mighty Aphrodite (1995)
7,127,4.028786,Supercop (1992)
8,22,4.020556,Taxi Driver (1976)
9,56,4.003605,Priest (1994)
10,173,3.982075,Raiders of the Lost Ark (1981)


In [43]:
#Kendall-Tau value calculation to evaluate the recommendation of our grouping (against Average Aggregation method)
newRecMovieA 
newAverageList = newGroupRecommendationAverage['movieId'].tolist()
newLmList = newGroupRecommendationLM['movieId'].tolist()
newListA = newRecMovieA['movieId'].tolist()
newListB = newRecMovieB['movieId'].tolist()
newListC = newRecMovieC['movieId'].tolist()

newTauAAVG, p_valueA = kendalltau(newListA, newAverageList) 
newTauBAVG, p_valueB = kendalltau(newListB, newAverageList) 
newTauCAVG, p_valueC = kendalltau(newListC, newAverageList) 

In [44]:
print(f'KT Correlation for user A against Average aggregation method: {newTauAAVG}')
print(f'KT Correlation for user B against Average aggregation method: {newTauBAVG}')
print(f'KT Correlation for user C against Average aggregation method: {newTauCAVG}')

KT Correlation for user A against Average aggregation method: 0.27102320132819807
KT Correlation for user B against Average aggregation method: 0.2538425883007323
KT Correlation for user C against Average aggregation method: 0.2757190894763899


In [45]:
#Kendall-Tau value calculation to evaluate the recommendation of our grouping (against Least Misery Aggregation method)
newTauALM, p_valueA = kendalltau(newListA, newLmList) 
newTauBLM, p_valueB = kendalltau(newListB, newLmList) 
newTauCLM, p_valueC = kendalltau(newListC, newLmList) 

In [46]:
print(f'KT Correlation for user A against LM aggregation method: {newTauALM}')
print(f'KT Correlation for user B against LM aggregation method: {newTauBLM}')
print(f'KT Correlation for user C against LM aggregation method: {newTauCLM}')

KT Correlation for user A against LM aggregation method: 0.2814111261872455
KT Correlation for user B against LM aggregation method: 0.25236437799708406
KT Correlation for user C against LM aggregation method: 0.9502937037005822
