In [1]:
import numpy as np
import pandas as pd
import math
import statistics as stat
from scipy.stats import pearsonr, kendalltau
import matplotlib.pyplot as plt

In [2]:
#loading datasets

ratings = pd.read_csv("/Volumes/bp first/Tampere University/Recommender Systems/ml-latest-small/ratings.csv")
movies = pd.read_csv("/Volumes/bp first/Tampere University/Recommender Systems/ml-latest-small/movies.csv")

In [3]:
##Dropping the columns from the ratings--timestamp
ratings = ratings.drop(columns = ["timestamp"])

In [4]:
## Takes UserId and creates a User Matrix
def create_userX_matrix(userId):
    userX_matrix = ratings.query('userId == @userId',inplace = False)
    return userX_matrix
## Creates a user subset where one groups by userID which will be taken into consideration for calculating 
## Pearson correlation and find out the most similar users.
def user_subset_common_movie(userId):
    userX_matrix = create_userX_matrix(userId)
    users = ratings[ratings['movieId'].isin(userX_matrix['movieId'].tolist())]
    userSubsetGroup = users.groupby(['userId'])
    userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]) and len(x[1]) > 50, reverse=True)
    return userSubsetGroup, userX_matrix
## Function for calculating the Pearson Correlation Function
def pearsonCorr(inputMovies, similarUsersGroup):
    pearsonCorrelationDict = {}
    for name, group in similarUsersGroup:
        group = group.sort_values(by='movieId')
        inputMovies1 = inputMovies.sort_values(by='movieId')
        temp_df = inputMovies1[inputMovies1['movieId'].isin(group['movieId'].tolist())]
        selectedUserTempRatingList = temp_df['rating'].tolist()
        similarUserTempRatingList = group['rating'].tolist()

        simXX = 0
        meanSelectedUserTempRating = stat.mean(selectedUserTempRatingList)
        for i in selectedUserTempRatingList:
            simXX = simXX + pow((i - meanSelectedUserTempRating),2)
        simYY = 0
        meanSimilarUserTempRaning = stat.mean(similarUserTempRatingList)
        for j in similarUserTempRatingList:
            simYY = simYY + pow((j - meanSimilarUserTempRaning),2)
        simXY = 0
        for i, j in zip(selectedUserTempRatingList, similarUserTempRatingList):
            simXY = simXY+ ((i - meanSelectedUserTempRating ) * (j-meanSimilarUserTempRaning))

        if simXX != 0 and simYY != 0:
            pearsonCorrelationDict[name] = simXY/np.sqrt(simXX*simYY)
        else:
            pearsonCorrelationDict[name] = 0                    
    
    maxSimilarUser = dict(sorted(pearsonCorrelationDict.items(), key=lambda item: item[1], reverse= True))
    return maxSimilarUser

In [5]:
## Function for producing Recommendations for the specific User
def produceRec (inputMovies, pearsonCorrelationDict):
    similarityScoreofSimilarUsers_data = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    similarityScoreofSimilarUsers_data.head()
    similarityScoreofSimilarUsers_data.columns = ['similarityScore']
    similarityScoreofSimilarUsers_data['userId'] = similarityScoreofSimilarUsers_data.index
    similarityScoreofSimilarUsers_data.index = range(len(similarityScoreofSimilarUsers_data))
    topSimilarUsers=similarityScoreofSimilarUsers_data.sort_values(by='similarityScore', ascending=False)
    topSimilarUsersRating = topSimilarUsers.merge(ratings, left_on='userId', right_on='userId', how='inner')
    topSimilarUsersRating.head()
    meanRb = topSimilarUsersRating.groupby('userId').mean()[['rating']]
    meanRb.columns = ['avgRating']
    meanRb['userId'] = meanRb.index
    meanRb.index = range(len(meanRb))
    topSimilarUsersRating = topSimilarUsersRating.merge(meanRb, left_on='userId', right_on='userId', how='inner')
    topSimilarUsersRating['weightedRatingScore'] = topSimilarUsersRating['similarityScore']*(topSimilarUsersRating['rating']-topSimilarUsersRating['avgRating'])
    tempTopSimilarUsersRating = topSimilarUsersRating.groupby('movieId').sum()[['weightedRatingScore']]
    tempTopSimilarUsersRating.columns = ['sum_weightedRatingScore']
    tempTopSimilarUsersRating['movieId'] = tempTopSimilarUsersRating.index
    

    recommendation_data = pd.DataFrame()
    meanRa = inputMovies['rating'].mean()
    recommendation_data['weighted average recommendation score'] = meanRa+(tempTopSimilarUsersRating['sum_weightedRatingScore']/topSimilarUsers['similarityScore'].sum())
    recommendation_data['movieId'] = recommendation_data.index
    recommendation_data = recommendation_data.sort_values(by='weighted average recommendation score', ascending=False)
    recommendation_data.index = [x for x in range(1,len(recommendation_data)+1)]
    recommendation_data = recommendation_data.rename(columns=  {'weighted average recommendation score': 'scores'})
    return recommendation_data

In [6]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [7]:
#UserA = 196 --> Taken Randomly 
userA = 196
userSubsetGroupA, userAMatrix = user_subset_common_movie(userA)
pearsonCorrelationDictA = pearsonCorr(userAMatrix, userSubsetGroupA)
recMovieA = produceRec(userAMatrix, pearsonCorrelationDictA)
recMovieA = recMovieA.merge(movies, left_on = "movieId", right_on = "movieId", how="left")

#User B and User C are the similar Users of User A --> Through Pearson Correlation
userB = 37
userSubsetGroupB, userBMatrix = user_subset_common_movie(userB)
pearsonCorrelationDictB = pearsonCorr(userBMatrix, userSubsetGroupB)
recMovieB = produceRec(userBMatrix, pearsonCorrelationDictB)
recMovieB = recMovieB.merge(movies, left_on = "movieId", right_on = "movieId", how="left")

userC = 48
userSubsetGroupC, userCMatrix = user_subset_common_movie(userC)
pearsonCorrelationDictC = pearsonCorr(userCMatrix, userSubsetGroupC)
recMovieC = produceRec(userCMatrix, pearsonCorrelationDictC)
recMovieC = recMovieC.merge(movies, left_on = "movieId", right_on = "movieId", how="left")

Mergeing dataFrames and performing inner join on recMovieA and recMovieC.
The agenda behind is to find out the common movies between User A and User C and checking movies which are not present in the recommendation List (recMovieB) of User B.
The question for why not a specific movie will be basically asked and the explanation will  be generated.

In [8]:
#pearsonCorrelationDictA

In [9]:
mergedInner_UserA_UserC = pd.merge(recMovieA, recMovieC, how="inner", on=['movieId'])
# df = mergedInner_UserA_UserC.merge(recMovieB, how="outer", indicator=True).loc[lambda x:x['_merge']=='left_only']
# df

In [10]:
def produce_rec_average_agg(recMovieA, recMovieB, recMovieC):
    aL = recMovieA['movieId'].tolist()
    bL = recMovieB['movieId'].tolist()
    cL = recMovieC['movieId'].tolist()

    ## Finding out the common movies between UserA, UserB and UserC
    commonRecMov = set(aL) & set(bL) & set(cL)

    ## Fetching the movies from the recommendation List which are found in the common recommendation of the movies
    recA = recMovieA[recMovieA['movieId'].isin(commonRecMov)]
    recB = recMovieB[recMovieB['movieId'].isin(commonRecMov)]
    recC = recMovieC[recMovieC['movieId'].isin(commonRecMov)]

    # Averaging the three user's recommendation with averaging method
    frames = [recA, recB, recC]
    result = pd.concat(frames)
    GrAv = result.groupby(['movieId'], as_index= False).mean().sort_values(by='scores', ascending=False)
    GrAv.index = [x for x in range(1, len(GrAv)+1)]
    #GrAv['title'] = GrAv['movieId'].map(movies['title'])
    groupAv = GrAv.merge(movies, left_on="movieId", right_on ="movieId", how="left")
    return groupAv                        

In [11]:
#showing the top 20 reccommended movie for average method
#20 recommended movies for the first iteration
groupRec = produce_rec_average_agg(recMovieA, recMovieB, recMovieC)
groupRec.head(20)
recMovieA.head(20)

Unnamed: 0,scores,movieId,title,genres
0,11.299886,110,Braveheart (1995),Action|Drama|War
1,10.234323,457,"Fugitive, The (1993)",Thriller
2,8.867873,2762,"Sixth Sense, The (1999)",Drama|Horror|Mystery
3,7.923131,104,Happy Gilmore (1996),Comedy
4,7.631838,2028,Saving Private Ryan (1998),Action|Drama|War
5,7.603641,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
6,7.385968,527,Schindler's List (1993),Drama|War
7,7.249728,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
8,6.960272,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
9,6.959424,1527,"Fifth Element, The (1997)",Action|Adventure|Comedy|Sci-Fi


In [12]:
print(len(groupRec), len(recMovieA), len(recMovieB), len(recMovieC))

9582 9673 9626 9665


In [13]:
a = set(recMovieA["movieId"].head(20).tolist())
b = set(recMovieB["movieId"].head(20).tolist())
c = set(recMovieC["movieId"].head(20).tolist())
g = set(groupRec["movieId"].head(20).tolist())

x = a - g
y = b - g
z = c - g
print(x)
print(y)
print(z)

{1968, 2997, 70286}
{608, 1089, 4226, 1219, 260, 231, 1258, 2959, 1136, 858, 344, 1210, 316, 541}
{608, 1089, 260, 356, 2571, 1196, 589, 1198, 2959, 1200, 1265, 1210, 79132}


In [14]:
## Unique genres and all genres of a particular group or user.  
def get_genres(rec):
    genres = rec["genres"].tolist()
    all_genres = []
    for s in genres:
        seperated = s.split("|")
        for s2 in seperated:
            all_genres.append(s2)
    unique_genres = list(set(all_genres))
    return all_genres, unique_genres

##
def genres_proportion(all_genres, unique_g):
    dict_genres = {}
    n = len(all_genres)
    for g in unique_g:
        b = all_genres.count(g)
        dict_genres[g] = round(b/n, 3)   
    dict_genres = dict(sorted(dict_genres.items(), key=lambda item: item[1], reverse= True)) 
    return dict_genres

def genres_rank(rec):
    all_genres, unique_genres = get_genres(rec)
    #make dictionary for all genres in the recommender system
    dict_genres = {}
    for g in unique_genres:
        dict_genres[g] = []  
    for l, m in rec.iterrows():
        gList = m["genres"].split("|")
        for gen in gList:
            dict_genres[gen].append(m["movieId"])   
    dict_genres = dict(sorted(dict_genres.items(), key=lambda item: item[1], reverse= True))
    new_dict_genres = {}
    for key, item in dict_genres.items():
        new_dict_genres[key] = []
        for mid in item:
            mid = float(rec[rec["movieId"] == mid]["scores"])
            mid = round(mid, 4)
            new_dict_genres[key].append(mid)      
    for key, item in new_dict_genres.items():
        new_dict_genres[key] = np.average(item) 
    new_dict_genres = dict(sorted(new_dict_genres.items(), key=lambda item: item[1], reverse= True))
    df = pd.DataFrame(list(new_dict_genres.items()),columns = ['genres','average_score'])
    return df

In [15]:
#Find the average recoommendation score for each genres in the group recommendation
top20_total_g, unique_g_top20 = get_genres(groupRec.head(20))
dictAgregate = genres_proportion(top20_total_g, unique_g_top20 )#ignoring the prediction score of the movie
genresRankGR = genres_rank(groupRec) #take into account the prediction score of the movie
genresRankGR

Unnamed: 0,genres,average_score
0,War,4.066933
1,Documentary,4.045009
2,Mystery,4.043733
3,Drama,4.041526
4,Fantasy,4.041327
5,Crime,4.040685
6,Western,4.040613
7,Horror,4.03996
8,Comedy,4.03672
9,Film-Noir,4.034769


In [16]:
dictAgregate

{'Drama': 0.173,
 'Thriller': 0.135,
 'Crime': 0.096,
 'Adventure': 0.096,
 'Comedy': 0.096,
 'Mystery': 0.077,
 'Sci-Fi': 0.058,
 'War': 0.058,
 'Action': 0.058,
 'Horror': 0.038,
 'Fantasy': 0.038,
 'Animation': 0.038,
 'Musical': 0.019,
 'Romance': 0.019}

In [17]:
#User A
userAGen, uniqueUserAGen = get_genres(recMovieA.head(20))
dictAGen = genres_proportion(userAGen, uniqueUserAGen )
userA_genre_score = genres_rank(recMovieA.head(20))
userA_genre_score

Unnamed: 0,genres,average_score
0,War,8.772567
1,Action,8.630367
2,Horror,7.9141
3,Drama,7.333089
4,Adventure,6.90582
5,Fantasy,6.898333
6,Thriller,6.8419
7,Mystery,6.68104
8,Crime,6.586275
9,Comedy,6.5509


In [18]:
# UserB
userBGen, uniqueUserBGen = get_genres(recMovieB.head(20))
dictBGen = genres_proportion(userBGen, uniqueUserBGen )
userB_genre_score = genres_rank(recMovieB.head(20))
userB_genre_score

Unnamed: 0,genres,average_score
0,Comedy,5.47538
1,Drama,5.41886
2,Crime,5.340187
3,Horror,5.309067
4,Thriller,5.2974
5,Romance,5.0344
6,Fantasy,4.9699
7,Mystery,4.95134
8,Sci-Fi,4.91962
9,Adventure,4.90402


In [19]:
# User C
userCGen, uniqueUserCGen = get_genres(recMovieC.head(20))
dictCGen = genres_proportion(userCGen, uniqueUserCGen )
userC_genre_score = genres_rank(recMovieC.head(20))
userC_genre_score

Unnamed: 0,genres,average_score
0,Sci-Fi,4.952371
1,Action,4.83848
2,Thriller,4.801229
3,Adventure,4.736171
4,IMAX,4.6414
5,Crime,4.599833
6,Horror,4.5873
7,Comedy,4.5863
8,Drama,4.567837
9,Fantasy,4.553967


# Q1. Why not Matrix? ( Movie Id: 2571)

Function for Creating Atomic Cases by considering the MovieId: 2571, MovieName: Matrix, The (1999)

In [20]:
def atomicCase(mId, recMovieA, recMovieB, recMovieC, groupRec):
    k = 20
    e = []
    boolValResult = mId in groupRec.movieId
    boolA = mId in recMovieA.movieId
    boolB = mId in recMovieB.movieId
    boolC = mId in recMovieC.movieId
    
    if(boolValResult == True):
        movieName = groupRec[groupRec["movieId"] == mId]["title"].values[0]
        indexValGroup = groupRec[groupRec["movieId"] == mId].index.values[0]
        indexValA = recMovieA[recMovieA["movieId"] == mId].index.values[0]
        indexValB = recMovieB[recMovieB["movieId"] == mId].index.values[0]
        indexValC = recMovieC[recMovieC["movieId"] == mId].index.values[0]
        
        if(indexValGroup in range(k, len(groupRec))):
            e.append(f"The Rank for the Movie {movieName} has a quite low rankings")
            e.append(f"The Reason for the k Value Low because the recommendation made for the {movieName}:")
            e.append(f"{movieName} :: > -- User A --> {indexValA}")
            e.append(f"{movieName} :: > -- User B --> {indexValB}")
            e.append(f"{movieName} :: > -- User C --> {indexValC}")
            return e
    else:
        if boolA == True and boolB == True:
            e.append(f'{movieName} is not recommended by User C')
        elif boolB == True and boolC == True:
            e.append(f'{movieName} is not recommended by User A')
        elif boolA == True and boolC == True:
            e.append(f'{movieName} is not recommended by User B')
        elif boolA == True:
            e.append(f'{movieName} is not recommended by User B and User C')
        elif boolB == True:
            e.append(f'{movieName} is not recommended by User B and User C')
        elif boolC == True:
            e.append(f'{movieName} is not recommended by User A and User B')
        else:
            e.append(f"Movie {movieName} does not exist in any user's recommendation list")
            return e
    

In [21]:
result = atomicCase(2571,  recMovieA, recMovieB, recMovieC, groupRec)
result

['The Rank for the Movie Matrix, The (1999) has a quite low rankings',
 'The Reason for the k Value Low because the recommendation made for the Matrix, The (1999):',
 'Matrix, The (1999) :: > -- User A --> 9672',
 'Matrix, The (1999) :: > -- User B --> 344',
 'Matrix, The (1999) :: > -- User C --> 0']

 # Q2. Why not "Documentary" movies?

In [22]:
def general_genres_rating(rec):
    gen, uniqueGen = get_genres(rec)
    dictGen = genres_proportion(gen, uniqueGen )
    genre_score = genres_rank(rec)
    return genre_score

def top_k(grec, k, *userRecs):
    allRecs = {}
    if k == "all":
        allRecs["group"] = grec
        allRecs["userA"] = userRecs[0]
        allRecs["userB"] = userRecs[1]
        allRecs["userC"] = userRecs[2] 
    else:
        allRecs["group"] = grec.head(k)
        allRecs["userA"] = userRecs[0].head(k)
        allRecs["userB"] = userRecs[1].head(k)
        allRecs["userC"] = userRecs[2].head(k)
    #produce genre's rating for each reccommendation
    listOfGenreAvg = {}
    for key, val in allRecs.items():
        listOfGenreAvg[key] = general_genres_rating(val)
        
    return listOfGenreAvg

def wn_group(genres, grec, *userRecs):
    explanationsList = []
    k = 20
    listOfGenreAvg20 = top_k(grec, k, *userRecs)   
    # check the genre in question existance in each reccommendation
    for key, val in listOfGenreAvg20.items():
        if(genres not in val["genres"].tolist()):
            strExp = f'{genres} is not in {key} top{k} reccomendation'
            explanationsList.append(strExp)
    # check the genre in top 40 of each reccommendation
    k = 40
    listOfGenreAvg40 = top_k(grec, k, *userRecs)
    for key, val in listOfGenreAvg40.items():
        if(genres not in val["genres"].tolist()):
            strExp = f'{genres} is not in {key} top{k} reccomendation'
            explanationsList.append(strExp)
    return explanationsList

In [23]:
#Western, Film-Noir, Documentary
y = wn_group("Documentary", groupRec, recMovieA, recMovieB, recMovieC )
y

['Documentary is not in group top20 reccomendation',
 'Documentary is not in userA top20 reccomendation',
 'Documentary is not in userB top20 reccomendation',
 'Documentary is not in userC top20 reccomendation',
 'Documentary is not in group top40 reccomendation',
 'Documentary is not in userA top40 reccomendation',
 'Documentary is not in userB top40 reccomendation',
 'Documentary is not in userC top40 reccomendation']

# Q3. Why not rank Matrix first?

In [24]:
def avgGenresRanking(m_Id,groupRec,genresRankGR):
    avgScoreGenres = []
    indexOfMovie = groupRec[groupRec["movieId"] == m_Id].values[0]
    genresListed = indexOfMovie[3] #Returning all the genres
    genresSeperateList = genresListed.split("|")
    for score in genresSeperateList:
        tempVal = genresRankGR[genresRankGR["genres"] == score].index.values[0]
        #print(tempVal)
        avgScoreGenres.append(tempVal)
    meanAvgGenresScore = np.mean(avgScoreGenres)
    noOfGenresPerMovie = len(avgScoreGenres)
    return meanAvgGenresScore, noOfGenresPerMovie

In [25]:
def getMovieRankingByAllUsers(mId):
    e = []
    boolValResult = mId in groupRec.movieId
    boolA = mId in recMovieA.movieId
    boolB = mId in recMovieB.movieId
    boolC = mId in recMovieC.movieId
    if(boolValResult == True):
        movieName = groupRec[groupRec["movieId"] == mId]["title"].values[0]
        indexValGroup = groupRec[groupRec["movieId"] == mId].index.values[0]
        indexValA = recMovieA[recMovieA["movieId"] == mId].index.values[0]
        indexValB = recMovieB[recMovieB["movieId"] == mId].index.values[0]
        indexValC = recMovieC[recMovieC["movieId"] == mId].index.values[0]
        e.append(f"The Reason for the k Value Low because the recommendation made for the {movieName}:")
        e.append(f"{movieName} :: > -- User A --> {indexValA+1}")
        e.append(f"{movieName} :: > -- User B --> {indexValB+1}")
        e.append(f"{movieName} :: > -- User C --> {indexValC+1}")
        return e

In [26]:
m_Id = 457
#6,16,32,296
avgScoreRatingsGenres = []
qtyScore = []
indexOfMId = groupRec[groupRec["movieId"] == m_Id].index.values[0]
for i in range(0, indexOfMId):
    movieIdTemp = groupRec[groupRec.index == i]["movieId"].values[0]
    tempVal1, lenGenres = avgGenresRanking(movieIdTemp,groupRec,genresRankGR)
    avgScoreRatingsGenres.append(tempVal1)
    qtyScore.append(lenGenres)
res=getMovieRankingByAllUsers(m_Id)
print(res)
print("Average Score Rating of Genres :", avgScoreRatingsGenres)
print("Number of Genres per movie :",qtyScore)

['The Reason for the k Value Low because the recommendation made for the Fugitive, The (1993):', 'Fugitive, The (1993) :: > -- User A --> 2', 'Fugitive, The (1993) :: > -- User B --> 8384', 'Fugitive, The (1993) :: > -- User C --> 7758']
Average Score Rating of Genres : [7.0, 7.25]
Number of Genres per movie : [3, 4]
