In [None]:
# There are two types of collaborative filtering
# 1) User to user collaborative filter
# 2) Item to Item collaborative filter
# Usually item to item collab filter works better than user to user blc noramlly there r more users in the system than the no of products or categories in a system


# **Collaborative Filtering using K neighrest neighbour**

In [1]:
# Extract dataset

from zipfile import ZipFile

# Create a ZipFile Object and load sample.zip in it
with ZipFile('/content/drive/My Drive/datasets/ml-100k.zip', 'r') as zipObj:
   # Extract all the contents of zip file in current directory
   zipObj.extractall()

In [None]:
# we r interested in u.data. it has complete rating of movies along with userid
# this dataset contains 1682 movies and 943 users. It's a tab seperated file. cols are userid and itemid which is id for the movie.
# u.item contains information about the movies themselves. like title, genres, imdb url etc.
# u.data contains the information about the user. such as userid, movieid, the rating and date at which the rating was given.
# for our purpose we r gonna combine 2 files i.e. u.data and u.item


In [1]:
import pandas as pd
import numpy as np

datafile = "./ml-100k/u.data"
data = pd.read_csv(datafile, sep = '\t', header = None, names = ['userId', 'itemId', 'rating', 'timestamp'])
# header = None mean the file has no col name so col number will be asssigned.
data.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [2]:
movieInfoFile = './ml-100k/u.item'
movieInfo = pd.read_csv(movieInfoFile, sep = '|', header = None, index_col = False, encoding='latin-1',
                        names = ['itemId', 'title'], usecols = [0, 1])

# the datset was giving utf-8 encoding error therefore i used latin-1 encoding, on which the dataset worked perfectly ok
# usecols mean dont use all the columns present in the dataset. just use 0 and 1 column. 
# if we dont set index_col = False then pandas by default would have taken itemId col as an row index no col. 

movieInfo.head()

Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [3]:
# now merging both the tables with a common col name in both the dataframes

data = pd.merge(data, movieInfo, left_on='itemId', right_on= 'itemId')

data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [4]:
data.loc[0: 10, ['userId']]
# loc will give u 11 values from the userId col. 

Unnamed: 0,userId
0,196
1,63
2,226
3,154
4,306
5,296
6,34
7,271
8,201
9,209


In [5]:
toyStoryUsers = data[data.title =='Toy Story (1995)']
# this will give us subset of dataframe with title as Toy story. It is a kind of filter. 
toyStoryUsers

Unnamed: 0,userId,itemId,rating,timestamp,title
3397,308,1,4,887736532,Toy Story (1995)
3398,287,1,5,875334088,Toy Story (1995)
3399,148,1,4,877019411,Toy Story (1995)
3400,280,1,4,891700426,Toy Story (1995)
3401,66,1,3,883601324,Toy Story (1995)
...,...,...,...,...,...
3844,895,1,4,879437950,Toy Story (1995)
3845,747,1,5,888639138,Toy Story (1995)
3846,786,1,4,882841828,Toy Story (1995)
3847,800,1,4,887646283,Toy Story (1995)


In [10]:
# how to sort values in a dataframe
data = pd.DataFrame.sort_values(data, ['userId', 'itemId'], ascending = [0, 1])
# it will sort the dataframe data on the cols userId and itemId.
# ascending = [0, 1] mean 1st col will be sort in descending order and 2nd col will be sort in ascending order.
data.head()


Unnamed: 0,userId,itemId,rating,timestamp,title
23781,943,2,5,888639953,GoldenEye (1995)
65410,943,9,3,875501960,Dead Man Walking (1995)
35098,943,11,4,888639000,Seven (Se7en) (1995)
43773,943,12,5,888639093,"Usual Suspects, The (1995)"
57040,943,22,4,888639042,Braveheart (1995)


In [18]:
# for finding the max userId and itemId

numUsers = max(data.userId)
numMovies = max(data.itemId)

# we can also see how many movies were rated by each user and the number of users that rated each movie

moviesPerUser = data.userId.value_counts()
# this will tell how many times each user appears in the dataframe. i.e how many movies each user rated. 
usersPerMovie = data.title.value_counts()
# this will tell how many times each title appears in the dataframe. i.e. how many users rated each movie 

print(moviesPerUser)
print("-------------")
print(usersPerMovie)

405    737
655    685
13     636
450    540
276    518
      ... 
147     20
19      20
572     20
636     20
895     20
Name: userId, Length: 943, dtype: int64
-------------
Star Wars (1977)                                             583
Contact (1997)                                               509
Fargo (1996)                                                 508
Return of the Jedi (1983)                                    507
Liar Liar (1997)                                             485
                                                            ... 
Homage (1995)                                                  1
I Don't Want to Talk About It (De eso no se habla) (1993)      1
Power 98 (1995)                                                1
Invitation, The (Zaproszenie) (1986)                           1
Dadetown (1995)                                                1
Name: title, Length: 1664, dtype: int64


In [22]:
# lets write a function to find top N favourite movies of a user.

def favoriteMovies(activeUser, N):
    user = data[data.userId == activeUser]
    top_movies = pd.DataFrame.sort_values(user, ['rating'], ascending = [0])[:N]
    # 0 mean sort in descending order.
    # fetch all the records/rows till N. 
    return list(top_movies.title) # we only need title col of that user. and this will return a list not df.

print(favoriteMovies(5, 3))  # 5 is the userId and 3 is the total top 3 rated movies by that user.

['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)']


In [None]:
# let's get down to finding some recommendations now.

# Using Neigbour based collaborative filtering model

In [23]:
# we r going to find k nearest neigbours of the user. the one who are very similar to our active user. and user 
# their rating to predict what the rating of the active user will be for the movie he hasn't rated yet. 
# the activeuser is whom we are finding the recommendation. 

# we will represent each user as a vector. each element in the vector will be the rating of the specific movie 
# by that user. since we have 1600 odd movies so each vector will contain 1600 elements. 

userItemRatingMatrix = pd.pivot_table(data, values = 'rating', index= ['userId'],
                                     columns=['itemId'])
# each row will be a user and cols will be rating for each movie
# index hoga userId and uski values hongi ratings. cols name will be itemId.
userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [94]:
# Compute similarity blw two users.
# using correlation

from scipy.spatial.distance import correlation

def similarity(user1, user2):
    # 1st we will normalize the ratings given by each user.
    user1 = np.array(user1) - np.nanmean(user1) # np.nanmean() will ignore all the nan values.
    user2 = np.array(user2) - np.nanmean(user2)
    # Now we will find the similarity blw 2 users.
    # 1st we will subset each user to be represented only by the ratings for the movies both the users have in common.
    # we will calculate the similarity blw only those movies which both the users have rated. 
    # for this we will take intersection blw both the users. so will find similar movies both the user had rated.
    commonItemIds = [i for i in range(len(user1)) if user1[i] > 0 and user2[i] > 0]
    # gives us movies in which both the users have ratings.
    if len(commonItemIds) == 0:
        # if there are no movies in common
        return 0
    else:
        user1 = np.array([user1[i] for i in commonItemIds])
        user2 = np.array([user2[i] for i in commonItemIds])
        return correlation(user1, user2)

# The above function is explained in steps below

In [104]:
# user1 = userItemRatingMatrix.loc[1]
# user2 = userItemRatingMatrix.loc[2]
# usera = np.array(user1) - np.nanmean(user1)
# userb = np.array(user2) - np.nanmean(user2)
# usera

In [105]:
# commonItemIds = []
# for i in range(len(usera)):
#     if usera[i] > 0 and userb[i] > 0:
#         commonItemIds.append(i)
        
# for i in commonItemIds:
#     print(usera[i])
#     # returns the rating at specific index

# # OR
# usertt = np.array([usera[i] for i in commonItemIds])
# usertt

# Continue 

In [97]:
# using above similarity function, lets find the nearest neighbours of the active user, i.e. by using above funtion
# we can find the nearest neighbors to avtive user by comapring the active user with all the other users. 
def nearestNeighbourRatings(activeUser, k):
    # k is the no of nearest neighbors
    # this function will find the k nearest neighbours of the active user, then 
    # it will find the rating of the nearest neighbours for all the movies to predict the rating 
    # for other movies of active user. Jin movies ki rating nhi available activeUser k pass, un movies 
    # ki rating vo nearest neighbours ki rating ko dekh kr kry ga.
    
    similarityMatrix = pd.DataFrame(index = userItemRatingMatrix.index, columns = ['Similarity'])
    # userItemRatingMatrix.index is the userId
    # creates an empty matrix whose rows index is userIds and one creates one col name simiarity. 
    # this similairty col is currently empty. 
    # we will fill it as soon as we compute the similairty blw each user and active user
    
    for i in userItemRatingMatrix.index:
        similarityMatrix.loc[i] = similarity(userItemRatingMatrix.loc[activeUser], userItemRatingMatrix.loc[i])
        # this will compute the similarity blw 2 users and store it at the corresponding element of similarity-
        # Matrix that we setup.
    # now lets sort the similarity matrix by Similarity value in descending order.
    similarityMatrix = pd.DataFrame.sort_values(similarityMatrix, ['Similarity'], ascending = [0])
    # now we will pick the top k rows of this matrix and that will give us the k nearest neighbors
    nearestNeighbors = similarityMatrix[:k]
    # we will use the rating of these nearest neighbors to predict the rating for the activeUser that he hasnt rated.
    # to do this lets find the rating of each of the nearest neighbors for every other movie. 
    neighborItemRatings = userItemRatingMatrix.loc[nearestNeighbors.index]
    predictItemRating = pd.DataFrame(index = userItemRatingMatrix.columns, columns= ["Rating"])
    # lets fill this up 
    for i in userItemRatingMatrix.columns:
        # for each item 
        # start with the average rating of the user
        predictedRating = np.nanmean(userItemRatingMatrix.loc[activeUser])
        for j in neighborItemRatings.index:
            # for each neighbor in the neighbor list 
            if userItemRatingMatrix.loc[j, i] > 0:
                # if the neighbor has rated that item 
                # add the rating of the neighbor for that item adjusted by the average rating of the neighbor
                # weighted by similarity of the neighbor to the activeUser.
                predictedRating += (userItemRatingMatrix.loc[j, i] - np.nanmean(userItemRatingMatrix.loc[j]))* nearestNeighbors.loc[j, 'Similarity']
                    
        # now we have predictedRating for every item for the active user.
        # we just need to add the rating to the predictedRatingMatrix. 
        predictItemRating.loc[i, 'Rating'] = predictedRating
        
    return predictItemRating
        
    

# Rough work

In [103]:
# similarityMatrix = pd.DataFrame(index = userItemRatingMatrix.index, columns = ['Similarity'])
# similarityMatrix.loc[3] = 3
# similarityMatrix

In [102]:
# nearestNeighbors = similarityMatrix[:5]
# bj = userItemRatingMatrix.loc[nearestNeighbors.index]
# bj

# Continue

In [100]:
# Now we have found all the ratings of the active user using k nearest neighbors. i.e. all the other users whose 
# rating is similar to activeUser.
# we can sort these ratings now and can find the top N recommendations for an activeUser. 

def topNRecommendations(activeUser, N):
    predictItemRating = nearestNeighbourRatings(activeUser, 10)
    # it will use the 10 nearest neighbors to find the predicted ratings for all the movies of active user. 
    # lets just drop the rating for the movies which the user has allready watched. 
    single_row = userItemRatingMatrix.loc[activeUser]
    moviesAllreadyWatched = list(single_row.loc[single_row > 0].index)   
    # we fetched the list of ratings which the user had allready given. and we removed them from the 
    # predictItemRating list. then we have only ratings for the movies which the user has allready not watched.
    # we will sort those ratings and find top N recommendations.
    predictItemRating = predictItemRating.drop(moviesAllreadyWatched)
    # so now predictItemRating doesnt have any rating which is allready watched by the user.
    # now we will sort the top rated movies in descending order.
    topRecommendations = pd.DataFrame.sort_values(predictItemRating,
                                                  ["Rating"], ascending = [0])[:N]
    # This will return the list of ids of the movies which are at the top recommendation
    # we will replace ids with movie title. 
    topRecommendationsTitles = (movieInfo.loc[movieInfo.itemId.isin(topRecommendations.index)])
    return list(topRecommendationsTitles.title)
    

In [109]:
# lets print the top N recommendation and favorite movies of the user. 
activeUser = 5
N = 5
print(favoriteMovies(activeUser, N))
print("\n", topNRecommendations(activeUser, 5))


['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)', 'Wrong Trousers, The (1993)', 'Blues Brothers, The (1980)']

 ['Truth About Cats & Dogs, The (1996)', 'Sense and Sensibility (1995)', 'Scream (1996)', 'L.A. Confidential (1997)', 'First Wives Club, The (1996)']


In [83]:
# Rough work

# hh = userItemRatingMatrix.loc[1]
# hh
# hh.loc[hh > 0].index

In [82]:
# R. work
# movieInfo.itemId.isin([1,5,6,7,8])