In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data_prep.csv', sep=';')

In [3]:
df.Movie_Id.nunique()

17770

In [4]:
df.Cust_Id.nunique()

480189

In [5]:
cust_sample=df.Cust_Id.sample(10000)
movie_sample=df.Movie_Id.sample(5000)

In [6]:
import surprise
from surprise import Reader, Dataset

In [7]:
reader=Reader(rating_scale=(0.5, 5.0))
#data = Dataset.load_from_df(df[['Cust_Id','Movie_Id','Rating']],reader)
#Далее для облегчения вычислений
data = Dataset.load_from_df(df[df.Cust_Id.isin(cust_sample) &
                               df.Movie_Id.isin(movie_sample)][['Cust_Id', 'Movie_Id', 'Rating']], reader)

In [8]:
from surprise import SVD

In [9]:
svd=SVD()
trainingSet=data.build_full_trainset()

In [10]:
svd.fit(trainingSet)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x20f8076d730>

In [11]:
testSet=trainingSet.build_testset()
predictions=svd.test(testSet)

In [12]:
titles = pd.read_csv('movie_titles.csv', encoding = "ISO-8859-1", 
                     header = None, 
                     names = ['Movie_Id', 'Year', 'Name'])

In [13]:
from collections import defaultdict

In [14]:
def get_top3_recommendations(predictions, topN = 3):
     
    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
     
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]
     
    return top_recs

In [15]:
top3_recommendations = get_top3_recommendations(predictions)

In [16]:
top3_recommendations

defaultdict(list,
            {1436762: [(7742, 4.6749033637302855),
              (11521, 4.422091870727236),
              (14187, 4.344403056543064)],
             79160: [(2743, 4.30514394875645),
              (12870, 4.25371654724434),
              (6428, 4.195980807021064)],
             1204327: [(12870, 5.0), (14240, 5.0), (14550, 4.96075139194799)],
             1286051: [(11164, 4.989069018156612),
              (8438, 4.9048777534576224),
              (270, 4.846492395834034)],
             1198785: [(11022, 4.5158106281804855),
              (6145, 4.319314046687543),
              (3371, 4.308147875461413)],
             2554745: [(16006, 4.93544278687875),
              (13840, 4.81364109011948),
              (17219, 4.809583186608766)],
             1033433: [(6428, 4.655526056273812),
              (13073, 4.622791050278278),
              (8327, 4.597142048359508)],
             514312: [(11164, 5.0), (14691, 5.0), (12293, 4.997440120568109)],
             244266: 

In [17]:
def print_recs(i):
    for (a, b) in top3_recommendations[i]:
        print(titles[titles.Movie_Id == a]['Name'].values[0], np.round(b,2))

In [18]:
i = np.random.choice(list(top3_recommendations.keys()))
print(i)
print_recs(i)

969909
Curb Your Enthusiasm: Season 1 4.78
City of God 4.71
Monty Python and the Holy Grail 4.64


In [19]:
films = data.df[(data.df.Cust_Id == i) & (data.df.Rating == 5)]['Movie_Id'].values
titles[titles.Movie_Id.isin(films)]['Name'].values

array(['Pirates of the Caribbean: The Curse of the Black Pearl',
       'Shrek 2', 'Curb Your Enthusiasm: Season 1', 'Starsky & Hutch',
       'City of God', 'Akira', "Kiki's Delivery Service",
       'Monty Python and the Holy Grail', 'Eurotrip', 'The Green Mile'],
      dtype=object)