In [1]:
#Importing the relevant libraries
import numpy as np
import pandas as pd
import math
import statistics as stat
import matplotlib.pyplot as plt

In [2]:
#Importing the Datasets ratings and movies
dratings = pd.read_csv(r"/Volumes/bp first/Tampere University/Recommender Systems/ml-100k/userdata.csv", sep='\t', header=None)
dratings.columns=['userId','movieId','rating','timestamp']

d = 'movieId | title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western'
column_names2 = d.split(' | ')

# Loading the movies dataset
movies_data = pd.read_csv('/Volumes/bp first/Tampere University/Recommender Systems/ml-100k/uitem.csv', sep='|',header=None,names=column_names2,encoding='latin-1')
dmovies=movies_data[['movieId','title']]
dmovies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [3]:
## Ratings Dataset
dratings  

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [4]:
## Movies Dataset
dmovies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [5]:
## Dropping 'Timestamp' as it is not required
dratings = dratings.drop(columns = ["timestamp"])

In [6]:
##Dataset after dropping the column 'timestamp'
dratings

Unnamed: 0,userId,movieId,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [7]:
# Calculating the length of unique users in the dataset just for the knowledge and check purpose
len(dratings["userId"].unique())

943

In [8]:
#Picking up a random user (userId= 155) 
uId = 405
inputMovies = dratings[dratings['userId']== uId]
inputMovies.head(10)

Unnamed: 0,userId,movieId,rating
12276,405,56,4
12383,405,592,1
12430,405,1582,1
12449,405,171,1
12460,405,580,1
12472,405,1409,1
12484,405,953,3
12604,405,994,1
12607,405,387,1
12718,405,1432,1


In [9]:
# Number of movies that selected user has rated/watched
len(inputMovies)

737

In [10]:
# Now we will find out the similar users like the selected user
# Alternately, we can say, finding the useres who rates similar movies as our selected user
similarUsersA = dratings[dratings['movieId'].isin(inputMovies['movieId'].tolist())]
len(similarUsersA['userId'].unique())

930

In [11]:
# Creating sub dataframes based on userId
similarUsersGroupA = similarUsersA.groupby(['userId'])

In [12]:
# Let's sort those users dataframes
# So that, users with most in common with the selected user will have the priority
similarUsersGroupA = sorted(similarUsersGroupA, key= lambda x:len(x[1]), reverse = True)

In [13]:
# Inspecting the 2nd top user (1st one is the same as our selected user)
similarUsersGroupA[1]

(846,
        userId  movieId  rating
 56664     846     1074       3
 56764     846       94       4
 56811     846      627       4
 56859     846       57       2
 57066     846      377       2
 ...       ...      ...     ...
 99238     846      210       5
 99549     846      318       5
 99669     846       40       2
 99670     846       98       4
 99809     846      101       4
 
 [346 rows x 3 columns])

In [14]:
#Length of the movies watched by the userId=846 and its respective ratings
len(similarUsersGroupA[1][1])

346

In [15]:
### Calculating The Similarity (Pearson Correlation) of the Selected (Input) User to the Similar User
# Here, limiting the number to 100 Similar Users while computing the Similarity Score using pearson corrrelation.
similarUsersGroupA = similarUsersGroupA[1:101] # Removing the first user (userId =405), as it is our selected user
len(similarUsersGroupA)

100

In [16]:
#Store the Pearson Correlation in a dictionary, 
# where the key is the user Id and the value is the coefficient
def pearsonCorr(similarUsersGroup):
    pearsonCorrelationDict = {}
    #For every similar user group in our subset
    for name, group in similarUsersGroup:

        #Let's start by sorting the input and current user group so the values aren't mixed up later on
        group = group.sort_values(by='movieId')
        inputMovies1 = inputMovies.sort_values(by='movieId')

        #Get the N (total similar movies watched) for the formula 
        #nRatings = len(group)

        #Get the review scores of the Selected user for the movies that they both have in common
        temp_df = inputMovies1[inputMovies1['movieId'].isin(group['movieId'].tolist())]

        #And then store them in a temporary buffer variable in a list format to facilitate future calculations
        selectedUserTempRatingList = temp_df['rating'].tolist()
        #print(selectedUserTempRatingList)

        #Let's also put the current user group reviews in a list format
        similarUserTempRatingList = group['rating'].tolist()

        #Calculating pearson similarity
        simXX = 0
        meanSelectedUserTempRating = stat.mean(selectedUserTempRatingList)
        for i in selectedUserTempRatingList:
            simXX = simXX + pow((i - meanSelectedUserTempRating),2)

        simYY = 0
        meanSimilarUserTempRaning = stat.mean(similarUserTempRatingList)
        for j in similarUserTempRatingList:
            simYY = simYY + pow((j - meanSimilarUserTempRaning),2)

        simXY = 0
        for i, j in zip(selectedUserTempRatingList, similarUserTempRatingList):
            simXY = simXY+ ((i - meanSelectedUserTempRating ) * (j-meanSimilarUserTempRaning))

        #If the denominator is different than zero, then divide, else, 0 correlation.
        if simXX != 0 and simYY != 0:
            pearsonCorrelationDict[name] = simXY/np.sqrt(simXX*simYY)
        else:
            pearsonCorrelationDict[name] = 0                    
    
    maxSimilarUser = dict(sorted(pearsonCorrelationDict.items(), key=lambda item: item[1], reverse= True))
    return maxSimilarUser

In [17]:
def similarUsersofUserID (pearsonCorrelationDict):
    
    #Converting the Pearson Correlation Dictionary into a Data Frame
    similarityScoreofSimilarUsers_data = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    similarityScoreofSimilarUsers_data.head()
    
    #Similarity score of other similar users to our selected user (155)
    similarityScoreofSimilarUsers_data.columns = ['similarityScore']
    similarityScoreofSimilarUsers_data['userId'] = similarityScoreofSimilarUsers_data.index
    similarityScoreofSimilarUsers_data.index = range(len(similarityScoreofSimilarUsers_data))
    
    #Sorting the DataFrame by the parameter called 'SimilarityScore'
    topSimilarUsers=similarityScoreofSimilarUsers_data.sort_values(by='similarityScore', ascending=False)
    
    # Merging the top similar users similarity score with their ratings of movies
    topSimilarUsersRating = topSimilarUsers.merge(dratings, left_on='userId', right_on='userId', how='inner')
    
    # Calculating the mean ratings of all the similar users
    meanRb = topSimilarUsersRating.groupby('userId').mean()[['rating']]
    meanRb.columns = ['avgRating']
    meanRb['userId'] = meanRb.index
    meanRb.index = range(len(meanRb))
    
    # Merging the top similar users average rating score 
    topSimilarUsersRating = topSimilarUsersRating.merge(meanRb, left_on='userId', right_on='userId', how='inner')
    
    #Multiplies the similarity by the user's ratings
    topSimilarUsersRating['weightedRatingScore'] = topSimilarUsersRating['similarityScore']*(topSimilarUsersRating['rating']-topSimilarUsersRating['avgRating'])
    
    #Applies a sum to the topUsers after grouping it up by userId
    tempTopSimilarUsersRating = topSimilarUsersRating.groupby('movieId').sum()[['weightedRatingScore']]
    tempTopSimilarUsersRating.columns = ['sum_weightedRatingScore']
    tempTopSimilarUsersRating['movieId'] = tempTopSimilarUsersRating.index
    
    #Creates an empty dataframe
    recommendation_data = pd.DataFrame()
    
    #Now we take the weighted average
    meanRa = inputMovies['rating'].mean()
    recommendation_data['weighted average recommendation score'] = meanRa+(tempTopSimilarUsersRating['sum_weightedRatingScore']/topSimilarUsers['similarityScore'].sum())
    recommendation_data['movieId'] = recommendation_data.index
    recommendation_data = recommendation_data.sort_values(by='weighted average recommendation score', ascending=False)
    recommendation_data.index = [x for x in range(1,len(recommendation_data)+1)]
    
    return recommendation_data

In [18]:
## Calling the function Pearson Correlation for calculating the similarity index
pearsonCorrelationDictA = pearsonCorr(similarUsersGroupA)

## Calling the function for generating recommendations for every user
recMovieA = similarUsersofUserID(pearsonCorrelationDictA)
recMovieA

Unnamed: 0,weighted average recommendation score,movieId
1,3.085856,50
2,3.010456,174
3,2.965200,172
4,2.857734,64
5,2.830892,98
...,...,...
1495,1.376844,395
1496,1.371075,231
1497,1.294310,546
1498,1.253499,29


In [19]:
#lets see users who are similar to user A by looking at correlation dictionary
pearsonCorrelationDictA

{130: 0.37961846221436457,
 222: 0.3429066839146551,
 276: 0.3197632949832927,
 303: 0.31890705970857397,
 880: 0.31809860423634717,
 618: 0.3121222403179797,
 896: 0.3067435633859717,
 864: 0.2994859766318906,
 749: 0.2874513850195474,
 622: 0.2859064276858622,
 437: 0.2735623522390208,
 416: 0.2663795737832617,
 95: 0.2650805900462755,
 291: 0.2618014265086014,
 379: 0.25533391720072673,
 429: 0.2472260089383148,
 301: 0.2455439411554271,
 886: 0.24466510320470453,
 417: 0.24120007956206244,
 271: 0.23124177641790494,
 7: 0.21510987177036997,
 474: 0.20441515266675514,
 378: 0.20435360376779346,
 727: 0.20326537806118627,
 194: 0.19668865053987647,
 457: 0.19147292934392982,
 758: 0.19130664988230153,
 648: 0.19022962121178844,
 407: 0.18841138553822653,
 506: 0.18821284080254094,
 497: 0.18720478821842731,
 796: 0.1847371600144272,
 747: 0.18400537855859128,
 650: 0.18359849171374107,
 653: 0.1809037320071532,
 286: 0.17871489070964677,
 343: 0.17790025641209048,
 328: 0.17391260970

In [20]:
#Lets take 130 and 222, so lets produce recommendation for them
#The same approach is followed for the following two users as done for the userA.
#The approach is basically finding the most similar users for userB and userC 
#Then followed by calculating the pearson correlation similarity index and moreover generating recommendations for every user.
# Following are the users and their respective ids for understanding the meaning for every variable name.
# For UserB: - userId= 130
# For UserC: - userId = 222

In [21]:
# For UserB: - userId= 130
uId = 130
inputMovies = dratings[dratings['userId']== uId]
similarUsersB = dratings[dratings['movieId'].isin(inputMovies['movieId'].tolist())]
similarUsersB= dratings[dratings['movieId'].isin(inputMovies['movieId'].tolist())]
similarUsersGroupB= similarUsersB.groupby(['userId'])
similarUsersGroupB= sorted(similarUsersGroupB, key= lambda x:len(x[1]), reverse = True)
similarUsersGroupB= similarUsersGroupB[1:101] # Removing the first user (userId =130 as it is our selected user
pearsonCorrelationDictB = pearsonCorr(similarUsersGroupB)
recMovieB = similarUsersofUserID(pearsonCorrelationDictB)

In [22]:
recMovieB

Unnamed: 0,weighted average recommendation score,movieId
1,5.092521,50
2,5.018192,174
3,4.926366,172
4,4.892665,98
5,4.883274,64
...,...,...
1607,3.565534,120
1608,3.540767,231
1609,3.505565,122
1610,3.493753,931


In [23]:
# For UserC: - userId = 222
uId = 222
inputMovies = dratings[dratings['userId']== uId]
similarUsersC= dratings[dratings['movieId'].isin(inputMovies['movieId'].tolist())]
similarUsersC=dratings[dratings['movieId'].isin(inputMovies['movieId'].tolist())]
similarUsersGroupC=similarUsersC.groupby(['userId'])
similarUsersGroupC= sorted(similarUsersGroupC, key= lambda x:len(x[1]), reverse = True)
similarUsersGroupC= similarUsersGroupC[1:101] # Removing the first user (userId =222 as it is our selected user
pearsonCorrelationDictC = pearsonCorr(similarUsersGroupC)
recMovieC = similarUsersofUserID(pearsonCorrelationDictC)

In [24]:
recMovieC

Unnamed: 0,weighted average recommendation score,movieId
1,4.159497,50
2,4.069838,174
3,4.052330,98
4,3.985572,64
5,3.951120,172
...,...,...
1556,2.644530,235
1557,2.603150,231
1558,2.581719,931
1559,2.576554,122


In [25]:
## Mergeing the recommendation DataFrames for userA and userB 
u1_2_df = pd.merge(recMovieA, recMovieB, how='inner', on=['movieId'])
u1_2_df

Unnamed: 0,weighted average recommendation score_x,movieId,weighted average recommendation score_y
0,3.085856,50,5.092521
1,3.010456,174,5.018192
2,2.965200,172,4.926366
3,2.857734,64,4.883274
4,2.830892,98,4.892665
...,...,...,...
1484,1.376844,395,3.703338
1485,1.371075,231,3.540767
1486,1.294310,546,3.666743
1487,1.253499,29,3.489634


In [26]:
## Mergeing the recommendation DataFrames for userA,userB with useC 
uCombined_df = pd.merge(u1_2_df, recMovieC, how='inner', on=['movieId'])
uCombined_df

Unnamed: 0,weighted average recommendation score_x,movieId,weighted average recommendation score_y,weighted average recommendation score
0,3.085856,50,5.092521,4.159497
1,3.010456,174,5.018192,4.069838
2,2.965200,172,4.926366,3.951120
3,2.857734,64,4.883274,3.985572
4,2.830892,98,4.892665,4.052330
...,...,...,...,...
1482,1.376844,395,3.703338,2.783498
1483,1.371075,231,3.540767,2.603150
1484,1.294310,546,3.666743,2.655897
1485,1.253499,29,3.489634,2.535653


In [27]:
## Calculating the average aggregration dictionary for group of three users: - UserA, UserB, UserC
avgMovieRatings = {}
for i in range(0, len(uCombined_df)-1):
    sum = uCombined_df.iloc[i]['weighted average recommendation score_x'] + uCombined_df.iloc[i]['weighted average recommendation score_y'] + uCombined_df.iloc[i]['weighted average recommendation score']
    avgRating = sum/3
    tempDict = {uCombined_df.iloc[i]['movieId']: avgRating}
    avgMovieRatings.update(tempDict)
    
avgMovieRatings = dict(sorted(avgMovieRatings.items(), key=lambda item: item[1], reverse= True))
avgMovieRatings

{50.0: 4.112624601147859,
 174.0: 4.032828821850178,
 172.0: 3.947561862450241,
 98.0: 3.925295618969416,
 64.0: 3.9088599737966043,
 12.0: 3.825137485528483,
 181.0: 3.7998919772803497,
 127.0: 3.7873965226745923,
 173.0: 3.779689579054216,
 22.0: 3.7657521157270017,
 318.0: 3.6887775264687037,
 195.0: 3.6751479260272895,
 79.0: 3.6582088465010414,
 96.0: 3.655841417768795,
 56.0: 3.6516787931779375,
 183.0: 3.6513686435615456,
 168.0: 3.639536464742294,
 191.0: 3.638089358861778,
 100.0: 3.6136049151602054,
 176.0: 3.5915083432381323,
 204.0: 3.5661046006382104,
 357.0: 3.56604320765704,
 483.0: 3.5605523489391064,
 89.0: 3.533907732916516,
 144.0: 3.5145796843649353,
 69.0: 3.5036130517740474,
 210.0: 3.4847603016905127,
 603.0: 3.478416357377713,
 187.0: 3.4658503273842207,
 185.0: 3.4604906067830385,
 11.0: 3.458917351420497,
 28.0: 3.455850235683638,
 651.0: 3.4557378275367756,
 496.0: 3.4513212047371176,
 186.0: 3.424697068719181,
 1.0: 3.4180233429585427,
 180.0: 3.411804991103

In [28]:
# Formating the dataFrame by assigning the column names
avgMovieRatings_df = pd.DataFrame(avgMovieRatings.items())
avgMovieRatings_df = pd.DataFrame(avgMovieRatings.items(), columns = ['movieId', 'group_ratings'])
avgMovieRatings_df

Unnamed: 0,movieId,group_ratings
0,50.0,4.112625
1,174.0,4.032829
2,172.0,3.947562
3,98.0,3.925296
4,64.0,3.908860
...,...,...
1481,546.0,2.538983
1482,122.0,2.530923
1483,931.0,2.524667
1484,231.0,2.504997


In [29]:
## Calculating the least misery dictionary for group of three users: - UserA, UserB, UserC
leastMiseryRatings = {}
for i in range(0, len(uCombined_df)-1):
    min_val = int(min(uCombined_df.iloc[i]['weighted average recommendation score_x'], uCombined_df.iloc[i]['weighted average recommendation score_y'], uCombined_df.iloc[i]['weighted average recommendation score']))
    tempDict1 = {uCombined_df.iloc[i]['movieId']: min_val}
    leastMiseryRatings.update(tempDict1)
    
leastMiseryRatings = dict(sorted(leastMiseryRatings.items(), key=lambda item: item[1], reverse= True))
leastMiseryRatings

{50.0: 3,
 174.0: 3,
 172.0: 2,
 64.0: 2,
 98.0: 2,
 181.0: 2,
 12.0: 2,
 127.0: 2,
 173.0: 2,
 22.0: 2,
 318.0: 2,
 79.0: 2,
 96.0: 2,
 168.0: 2,
 191.0: 2,
 357.0: 2,
 204.0: 2,
 483.0: 2,
 100.0: 2,
 56.0: 2,
 496.0: 2,
 183.0: 2,
 195.0: 2,
 176.0: 2,
 144.0: 2,
 69.0: 2,
 651.0: 2,
 603.0: 2,
 186.0: 2,
 28.0: 2,
 185.0: 2,
 187.0: 2,
 423.0: 2,
 89.0: 2,
 199.0: 2,
 210.0: 2,
 194.0: 2,
 1.0: 2,
 196.0: 2,
 427.0: 2,
 234.0: 2,
 655.0: 2,
 511.0: 2,
 313.0: 2,
 223.0: 2,
 132.0: 2,
 216.0: 2,
 180.0: 2,
 208.0: 2,
 202.0: 2,
 7.0: 2,
 82.0: 2,
 215.0: 2,
 8.0: 2,
 11.0: 2,
 265.0: 2,
 134.0: 2,
 153.0: 2,
 484.0: 2,
 657.0: 2,
 474.0: 2,
 480.0: 2,
 9.0: 2,
 527.0: 2,
 211.0: 2,
 228.0: 2,
 83.0: 2,
 520.0: 2,
 479.0: 2,
 203.0: 2,
 182.0: 2,
 435.0: 2,
 97.0: 2,
 71.0: 2,
 588.0: 2,
 193.0: 2,
 258.0: 2,
 531.0: 2,
 526.0: 2,
 42.0: 2,
 385.0: 2,
 302.0: 2,
 272.0: 2,
 238.0: 2,
 87.0: 2,
 151.0: 2,
 197.0: 2,
 200.0: 2,
 143.0: 2,
 135.0: 2,
 178.0: 2,
 156.0: 2,
 95.0: 2,
 124

In [30]:
# Formating the dataFrame by assigning the column names
leastMiseryRatings_df = pd.DataFrame(leastMiseryRatings.items())
leastMiseryRatings_df = pd.DataFrame(leastMiseryRatings.items(), columns = ['movieId', 'group_ratings'])
leastMiseryRatings_df

Unnamed: 0,movieId,group_ratings
0,50.0,3
1,174.0,3
2,172.0,2
3,64.0,2
4,98.0,2
...,...,...
1481,450.0,1
1482,395.0,1
1483,231.0,1
1484,546.0,1


In [31]:
## Top 20 Recommendations for the Group of users (UserA, UserB, UserC) by the score of Average Aggregration
dmovies.loc[dmovies['movieId'].isin(avgMovieRatings_df.head(20)['movieId'].tolist())]

Unnamed: 0,movieId,title
11,12,"Usual Suspects, The (1995)"
21,22,Braveheart (1995)
49,50,Star Wars (1977)
55,56,Pulp Fiction (1994)
63,64,"Shawshank Redemption, The (1994)"
78,79,"Fugitive, The (1993)"
95,96,Terminator 2: Judgment Day (1991)
97,98,"Silence of the Lambs, The (1991)"
99,100,Fargo (1996)
126,127,"Godfather, The (1972)"


In [32]:
## Top 20 Recommendations for the Group of users (UserA, UserB, UserC) by the score of Least Misery
dmovies.loc[dmovies['movieId'].isin(leastMiseryRatings_df.head(20)['movieId'].tolist())]

Unnamed: 0,movieId,title
11,12,"Usual Suspects, The (1995)"
21,22,Braveheart (1995)
49,50,Star Wars (1977)
55,56,Pulp Fiction (1994)
63,64,"Shawshank Redemption, The (1994)"
78,79,"Fugitive, The (1993)"
95,96,Terminator 2: Judgment Day (1991)
97,98,"Silence of the Lambs, The (1991)"
99,100,Fargo (1996)
126,127,"Godfather, The (1972)"
