In [1]:
import numpy as np 
import pandas as pd 

In [2]:
dataFile='./ml-100k/u.data'
data=pd.read_csv(dataFile,sep="\t",header=None,
                 names=['userId','itemId','rating','timestamp'])

In [3]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [4]:
movieInfoFile="./ml-100k/u.item"
movieInfo=pd.read_csv(movieInfoFile,sep="|", header=None, index_col=False,
                     names=["itemId","title"], usecols=[0,1],encoding = 'latin')

In [5]:
movieInfo.head()

Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [6]:
data=pd.merge(data,movieInfo,left_on='itemId',right_on="itemId")
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [7]:
userIds=data.userId # a Pandas series object
userIds2=data[['userId']] # a Pandas DataFrame object

In [10]:
data=pd.DataFrame.sort_values(data,['userId','itemId'],ascending=[0,1])
numUsers=max(data.userId)
numMovies=max(data.itemId)

In [11]:
moviesPerUser=data.userId.value_counts()
usersPerMovie=data.title.value_counts()

usersPerMovie

Star Wars (1977)                                     584
Contact (1997)                                       509
Fargo (1996)                                         508
Return of the Jedi (1983)                            507
Liar Liar (1997)                                     485
English Patient, The (1996)                          481
Scream (1996)                                        478
Toy Story (1995)                                     452
Air Force One (1997)                                 431
Independence Day (ID4) (1996)                        429
Raiders of the Lost Ark (1981)                       420
Godfather, The (1972)                                413
Pulp Fiction (1994)                                  394
Twelve Monkeys (1995)                                392
Silence of the Lambs, The (1991)                     390
Jerry Maguire (1996)                                 384
Chasing Amy (1997)                                   379
Rock, The (1996)               

In [12]:
def favoriteMovies(activeUser,N):
    topMovies=pd.DataFrame.sort_values(
        data[data.userId==activeUser],['rating'],ascending=[0])[:N]
    return list(topMovies.title)

print(favoriteMovies(5,3)) 

['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)']


In [13]:
userItemRatingMatrix=pd.pivot_table(data, values='rating',
                                    index=['userId'], columns=['itemId'])

In [14]:
userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,,,,,,,,,,,...,,,,,,,,,,
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [15]:
#Normalize user's rating; for the required pair of users find the correlation of the common movies' ratings

from scipy.spatial.distance import correlation 
def similarity(user1,user2):
    user1=np.array(user1)-np.nanmean(user1) # np.nanmean() returns the mean of an array after ignoring and NaN values 
    user2=np.array(user2)-np.nanmean(user2)
    commonItemIds=[i for i in range(len(user1)) if user1[i]>0 and user2[i]>0]
    if len(commonItemIds)==0:
        return 0
    else:
        user1=np.array([user1[i] for i in commonItemIds])
        user2=np.array([user2[i] for i in commonItemIds])
        return correlation(user1,user2)

In [16]:
# Similarity with nearest neighbors...(often item similarity could be better than user similarity)
def nearestNeighbourRatings(activeUser,K):

    similarityMatrix=pd.DataFrame(index=userItemRatingMatrix.index,
                                  columns=['Similarity'])

    for i in userItemRatingMatrix.index:
        similarityMatrix.loc[i]=similarity(userItemRatingMatrix.loc[activeUser],
                                          userItemRatingMatrix.loc[i])

    similarityMatrix=pd.DataFrame.sort_values(similarityMatrix,
                                              ['Similarity'],ascending=[0])

    nearestNeighbours=similarityMatrix[:K]
    neighbourItemRatings=userItemRatingMatrix.loc[nearestNeighbours.index]
    predictItemRating=pd.DataFrame(index=userItemRatingMatrix.columns, columns=['Rating'])
    for i in userItemRatingMatrix.columns:
        predictedRating=np.nanmean(userItemRatingMatrix.loc[activeUser])
        for j in neighbourItemRatings.index:
            if userItemRatingMatrix.loc[j,i]>0:
                predictedRating += (userItemRatingMatrix.loc[j,i]
                                    -np.nanmean(userItemRatingMatrix.loc[j]))*nearestNeighbours.loc[j,'Similarity']
        predictItemRating.loc[i,'Rating']=predictedRating
    return predictItemRating



In [17]:
# KNN Regression
def topNRecommendations(activeUser,N):
    predictItemRating=nearestNeighbourRatings(activeUser,10)
    moviesAlreadyWatched=list(userItemRatingMatrix.loc[activeUser]
                              .loc[userItemRatingMatrix.loc[activeUser]>0].index)
    predictItemRating=predictItemRating.drop(moviesAlreadyWatched)
    topRecommendations=pd.DataFrame.sort_values(predictItemRating,
                                                ['Rating'],ascending=[0])[:N]
    topRecommendationTitles=(movieInfo.loc[movieInfo.itemId.isin(topRecommendations.index)])
    return list(topRecommendationTitles.title)

In [18]:
activeUser=5
print(favoriteMovies(activeUser,5),"\n",topNRecommendations(activeUser,3))

  dist = 1.0 - uv / np.sqrt(uu * vv)


['Men in Black (1997)', 'Blade Runner (1982)', 'Empire Strikes Back, The (1980)', 'Wrong Trousers, The (1993)', 'Blues Brothers, The (1980)'] 
 ['Truth About Cats & Dogs, The (1996)', 'Scream (1996)', 'First Wives Club, The (1996)']


Alternative approach would be to use Item based collaborative filtering...

https://www.udemy.com/course/recommendation-systems/learn/lecture/4695452#overview