In [1]:
import pandas as pd
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

In [2]:
# Load the ratings file: tab-separated, with the column names supplied here.
df = pd.read_csv(
    './ml-100k/u.data',
    delimiter='\t',
    names=['userID', 'movieID', 'rating', 'time'],
)

In [3]:
# Remove the 'time' column from df in place.
df.drop(columns=['time'], inplace=True)

In [4]:
# Global mean rating: sum of all ratings divided by the number of non-null ratings.
average_ratings = df['rating'].sum() / df['rating'].notna().sum()

In [5]:
average_ratings

3.52986

In [6]:
# ID of the target user: getRating() reads this global to pick the user's bias
# (dKBu[muserID]) and the user's own rows in df.
muserID = 399

In [7]:
# Damped per-user bias: for every user, the sum of (rating - global mean) over
# that user's ratings, multiplied by 1 / (number of ratings + 10).
#
# Group by the bare column name rather than a one-element list: with a list key,
# recent pandas versions yield 1-tuples such as (296,) while iterating, so the
# dict would be keyed by tuples and integer lookups like dKBu[296] would fail.
tdf = df.groupby('userID')
dKBu = {}
for i, f in tdf:
    f_count = f['movieID'].count()
    dKBu[i] = (1 / (f_count + 10)) * (f['rating'] - average_ratings).sum()

In [8]:
# Turn the per-user bias dict into a frame with a 'KBu' value column and a
# 'userID' column copied from the index, ready to be merged on 'userID'.
KBu = (
    pd.DataFrame.from_dict(dKBu, orient='index')
    .rename(columns={0: 'KBu'})
    .assign(userID=lambda frame: frame.index)
)

In [9]:
dKBu[296]

0.605799872611465

In [10]:
# Attach each user's bias (KBu) to every rating row of that user.
df = pd.merge(df, KBu, on="userID", how="inner")

In [11]:
# Damped per-movie bias: for every movie, the sum of
# (rating - user bias - global mean) over its ratings, multiplied by
# 1 / (number of ratings + 25).
#
# Group by the bare column name rather than a one-element list: with a list key,
# recent pandas versions yield 1-tuples such as (50,) while iterating, so the
# dict would be keyed by tuples and integer lookups like dKBi[i] would fail.
tdf = df.groupby('movieID')
dKBi = {}
for i, f in tdf:
    f_count = f['userID'].count()
    dKBi[i] = (1 / (f_count + 25)) * (f['rating'] - f['KBu'] - average_ratings).sum()
    

In [12]:
# Turn the per-movie bias dict into a frame with a 'KBi' value column and a
# 'movieID' column copied from the index, ready to be merged on 'movieID'.
KBi = (
    pd.DataFrame.from_dict(dKBi, orient='index')
    .rename(columns={0: 'KBi'})
    .assign(movieID=lambda frame: frame.index)
)

In [13]:
# Attach each movie's bias (KBi) to every rating row of that movie.
df = pd.merge(df, KBi, on="movieID", how="inner")

In [14]:
# Baseline estimate per rating row: global mean + user bias + movie bias.
df['KBui'] = (average_ratings + df['KBu']).add(df['KBi'])

In [15]:
# Residual of each rating after subtracting its baseline estimate.
df['rating_new'] = df['rating'].sub(df['KBui'])

In [16]:
# Movie x user matrix of rating residuals; (movie, user) pairs with no rating become 0.
d = df.pivot_table(
    values='rating_new',
    index='movieID',
    columns='userID',
).fillna(0)
# Independent copy of the filled matrix.
dt = d.copy()

In [17]:
# Cosine similarity between the rows of d (one row per movie), computed as
# 1 - cosine distance.
movie_similarity = 1 - pairwise_distances(d.to_numpy(), metric="cosine")
# Zero the diagonal so a movie never counts as its own neighbour.
np.fill_diagonal(movie_similarity, 0)
# Rows and columns are labelled by position (0..n-1), not by movieID.
ratings_matrix = pd.DataFrame(data=movie_similarity)


In [18]:
ratings_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.000000,-0.003603,-0.012158,-0.035144,0.039609,0.022780,0.021228,0.052695,-0.004018,-0.015940,...,0.077118,0.0,0.00000,0.00000,0.028896,0.0,0.0,0.0,0.039629,-0.018718
1,-0.003603,0.000000,-0.001984,0.050458,0.002228,-0.003639,-0.016975,0.040146,-0.089526,-0.007177,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.0,0.0,0.022621,0.016415
2,-0.012158,-0.001984,0.000000,-0.113536,-0.005417,0.060713,-0.024408,-0.094746,-0.016745,-0.039964,...,0.000000,0.0,0.00000,0.00000,0.183583,0.0,0.0,0.0,0.000000,0.004880
3,-0.035144,0.050458,-0.113536,0.000000,-0.149507,-0.019089,0.005145,0.110511,0.033087,0.010968,...,0.000000,0.0,-0.11599,-0.11599,0.089827,0.0,0.0,0.0,0.001824,-0.048334
4,0.039609,0.002228,-0.005417,-0.149507,0.000000,-0.031989,-0.015494,0.038203,-0.042447,-0.036512,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.027241
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,1.0,1.0,0.000000,0.000000
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,1.0,0.0,1.0,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,1.0,1.0,0.0,0.000000,0.000000
1680,0.039629,0.022621,0.000000,0.001824,0.000000,0.000000,0.041703,0.117235,0.041731,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,0.0,0.0,0.0,0.000000,0.000000


In [19]:
# Movies the target user has rated. A boolean-mask selection keeps movieID as
# integers; the previous where(...).dropna() round trip first blanked every
# other row to NaN, which cast the ids to float.
user_movies = df.loc[df['userID'] == muserID, ['movieID']]
# Distinct movie ids that appear in df but were not rated by the target user.
not_watched = df.loc[~df['movieID'].isin(user_movies['movieID']), 'movieID'].unique()

In [28]:

def getRating(not_watched, pos=False):
    """Predict the target user's rating for each movie in ``not_watched``.

    For movie ``i`` the prediction is the baseline
    ``average_ratings + dKBu[muserID] + dKBi[i]`` plus a similarity-weighted
    average of the user's residual ratings (``rating_new``), taken over those
    of the 30 movies most similar to ``i`` that the user has rated.

    Reads the notebook-level globals ``average_ratings``, ``dKBu``, ``dKBi``,
    ``muserID``, ``df`` and ``ratings_matrix``.  Row ``i - 1`` of
    ``ratings_matrix`` is used as the similarities of movie ``i``, so movie ids
    are assumed to be 1-based and aligned with the matrix's row positions.

    Parameters
    ----------
    not_watched : iterable of int
        Movie ids to score.
    pos : bool, default False
        When exactly ``True``, neighbours whose similarity is not strictly
        positive are ignored.

    Returns
    -------
    dict
        Maps each movie id to its predicted rating.  The value is 0 when no
        rated neighbour contributes a non-zero weight.
    """
    # Global mean plus the target user's bias; identical for every movie.
    r_mean_a = average_ratings + dKBu[muserID]

    # The target user's rows do not depend on the movie, so select them once
    # instead of re-filtering the whole frame on every iteration.
    user_rows = df[df['userID'] == muserID]

    films = {}

    for i in not_watched:
        # Similarities of movie i to every movie, with positions mapped back
        # to 1-based movie ids; keep the 30 largest.
        df_tp30 = ratings_matrix.iloc[i - 1].to_frame(name='Sij')
        df_tp30['movieID'] = df_tp30.index + 1
        df_tp30 = df_tp30.sort_values(by=['Sij'], ascending=False).head(30)

        # Neighbours the target user has actually rated.
        ndf = user_rows.merge(df_tp30, on='movieID', how='inner')

        if pos is True:
            ndf = ndf[ndf['Sij'] > 0]

        Sy = ndf['Sij'].abs().sum()
        if Sy == 0:
            # No rated neighbour, or every remaining weight is zero: dividing
            # would give 0/0 (NaN), so report the same 0 used for "no data".
            films[i] = 0
            continue

        r_KBui = r_mean_a + dKBi[i]
        Sx = (ndf['Sij'] * ndf['rating_new']).sum()
        films[i] = r_KBui + (Sx / Sy)

    return films


    

In [29]:
%%time
# Score every not-yet-rated movie using all top neighbours (no sign filter).
films = getRating(not_watched, pos=False)

CPU times: user 11.4 s, sys: 349 ms, total: 11.8 s
Wall time: 10.9 s


In [22]:
# Tabulate the predictions (one row per movie id) and keep the ten highest.
nd = pd.DataFrame.from_dict(films, orient='index').rename(columns={0: 'raitng'})
nd['moviesID'] = nd.index
top_films = nd.sort_values(by='raitng', ascending=False).iloc[:10]

In [23]:
top_films

Unnamed: 0,raitng,moviesID
65,5.075504,65
207,5.019786,207
316,4.67306,316
515,4.603637,515
408,4.502725,408
315,4.456098,315
169,4.44717,169
242,4.418403,242
750,4.342368,750
853,4.195976,853


In [34]:
top_films

Unnamed: 0,raitng,moviesID
65,5.075504,65
207,5.019786,207
316,4.67306,316
515,4.603637,515
408,4.502725,408
315,4.456098,315
169,4.44717,169
242,4.418403,242
750,4.342368,750
853,4.195976,853


In [24]:
# Same ranking as before, but only neighbours with positive similarity are used.
films = getRating(not_watched, pos=True)
nd = pd.DataFrame.from_dict(films, orient='index').rename(columns={0: 'raitng'})
nd['moviesID'] = nd.index
top_films_p = nd.sort_values(by='raitng', ascending=False).iloc[:10]

In [25]:
top_films_p

Unnamed: 0,raitng,moviesID
65,5.075504,65
207,5.019786,207
316,4.67306,316
515,4.603637,515
408,4.502725,408
315,4.456098,315
169,4.44717,169
242,4.418403,242
750,4.342368,750
853,4.195976,853


In [26]:
# Summary: the rounded global mean and the two top-10 movie id lists.
out = dict(
    average_rating=round(average_ratings, 4),
    predicators_positive_top10=top_films_p['moviesID'].tolist(),
    predicators_top10=top_films['moviesID'].tolist(),
)

In [27]:
out

{'average_rating': 3.5299,
 'predicators_positive_top10': [65,
  207,
  316,
  515,
  408,
  315,
  169,
  242,
  750,
  853],
 'predicators_top10': [65, 207, 316, 515, 408, 315, 169, 242, 750, 853]}

In [None]:
import json

# Serialise the summary dict and write it to lab06s.json in the working directory.
with open("lab06s.json", mode="w") as outfile:
    outfile.write(json.dumps(out))
