In [2]:
import numpy as np
import pandas as pd

In [3]:
dataFile='C:\\NLP\\ml1\\u.data'
data=pd.read_csv(dataFile,sep="\t",header=None,names=['userId','itemId','rating','timestamp'])

In [4]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [5]:
movieInfoFile='C:\\NLP\\ml1\\u.item'
movieInfo=pd.read_csv(movieInfoFile,sep="|",header=None,index_col=False,names=['itemId','title'],usecols=[0,1],encoding='ISO-8859-1')

In [6]:
movieInfo.head()

Unnamed: 0,itemId,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [7]:
data=pd.merge(data,movieInfo,left_on='itemId',right_on='itemId')

In [8]:
data.head()

Unnamed: 0,userId,itemId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,63,242,3,875747190,Kolya (1996)
2,226,242,5,883888671,Kolya (1996)
3,154,242,3,879138235,Kolya (1996)
4,306,242,5,876503793,Kolya (1996)


In [9]:
data.loc[0:10,['userId']]

Unnamed: 0,userId
0,196
1,63
2,226
3,154
4,306
5,296
6,34
7,271
8,201
9,209


In [11]:
def favoriteMovies(activeUser,N):
    topMovies=pd.DataFrame.sort_values(data[data.userId==activeUser],['rating'],ascending=[0])[:N]
    return list(topMovies.title)
favoriteMovies(5,5)

['Empire Strikes Back, The (1980)',
 'Blade Runner (1982)',
 'Wrong Trousers, The (1993)',
 'Duck Soup (1933)',
 'Return of the Pink Panther, The (1974)']

In [12]:
#Create pivot Table with row as userId and columns as Movie Id and value as RAting for each movie by the user
userItemRatingMatrix=pd.pivot_table(data,values='rating',index=['userId'],columns=['itemId'])

In [14]:
userItemRatingMatrix.head()

itemId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [17]:
#finding similarity between users rating selection
from scipy.spatial.distance import correlation
def similarity(user1,user2):
    user1=np.array(user1)-np.nanmean(user1)
    user2=np.array(user2)-np.nanmean(user2)
    
    #Find movies for which both user has provided some rating
    commonItemIds=[i for i in range(len(user1)) if user1[i]>0 and user2[i]>0]
    if len(commonItemIds)==0:
        return 0
    else:
        user1=np.array([user1[i] for i in commonItemIds])
        user2=np.array([user2[i] for i in commonItemIds])
        return correlation(user1,user2)

In [18]:
#Finding the nearest neighbour for a particular user based on similarity
def nearestNeighboursRatings(activeUser,K):
    similarityMatrix=pd.DataFrame(index=userItemRatingMatrix.index,columns=['Similarity'])
    for i in userItemRatingMatrix.index:
        similarityMatrix.loc[i]=similarity(userItemRatingMatrix.loc[activeUser],userItemRatingMatrix.loc[i])
        similarityMatrix=pd.DataFrame.sort_values(similarityMatrix,['Similarity'],ascending=False)
        nearestNeighbours=similarityMatrix[:K]
        neighbourItemRatings=userItemRatingMatrix.loc[nearestNeighbours.index]
        predictItemRating=pd.DataFrame(index=userItemRatingMatrix.columns,columns=['Rating'])
        for i in userItemRatingMatrix.columns:
            predictedRating=np.nanmean(userItemRatingMatrix.loc[activeUser])
            for j in neighbourItemRatings.index:
                if userItemRatingMatrix.loc[j,1]>0:
                    predictedRating+=(userItemRatingMatrix.loc[j,i]-np.nanmean(userItemRatingMatrix.loc[j]))*nearestNeighbours.loc[j,'Similarity']
            predictItemRating.loc[i,'Rating']=predictedRating
        return predictItemRating        

In [26]:
def topNRecommendations(activeUser,N):
    predictItemRating=nearestNeighboursRatings(activeUser,10)
   
    moviesAlreadyWatched=list(userItemRatingMatrix.loc[activeUser].loc[userItemRatingMatrix.loc[activeUser]>0].index)
    predictItemRating=predictItemRating.drop(moviesAlreadyWatched)
    topRecommendations=pd.DataFrame.sort_values(predictItemRating,["Rating"],ascending=False)[:N]
    topRecommendationTitles=(movieInfo.loc[movieInfo.itemId.isin(topRecommendations.index)])
    print(topRecommendationTitles)
    return list(topRecommendationTitles.title)

In [27]:
print(topNRecommendations(5,10))

    itemId                                              title
2        3                                  Four Rooms (1995)
3        4                                  Get Shorty (1995)
4        5                                     Copycat (1995)
5        6  Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6        7                              Twelve Monkeys (1995)
7        8                                        Babe (1995)
8        9                            Dead Man Walking (1995)
9       10                                 Richard III (1995)
10      11                               Seven (Se7en) (1995)
11      12                         Usual Suspects, The (1995)
['Four Rooms (1995)', 'Get Shorty (1995)', 'Copycat (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Twelve Monkeys (1995)', 'Babe (1995)', 'Dead Man Walking (1995)', 'Richard III (1995)', 'Seven (Se7en) (1995)', 'Usual Suspects, The (1995)']


In [None]:
def matrixFactorization(R,K,steps=10,gamma=0.001,lamda=0.02):
    N=len(R.index)
    M=len(R.columns)
    P=pd.DataFrame(np.random.rand(N,K),index=R.index)
    Q=pd.DataFrame(np.random.rand(M,K),index=R.columns)