In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import surprise
import sklearn 
%matplotlib inline

movies = pd.read_csv('../data/movies.csv')
ratings = pd.read_csv('../data/ratings.csv')

Функция которая принимает датафрейм.

In [2]:
def build_df(movies,ratings):
    movies=movies.drop('genres', axis=1)
    ratings=ratings.drop('timestamp', axis=1)
    df=ratings.merge(movies, left_on='movieId', right_on='movieId',)
    y=df['rating']
    X=df.drop('rating', axis=1)
    return X,y

In [3]:
X, y = build_df(movies,ratings)

# Popularity based

In [4]:
from sklearn.preprocessing import MinMaxScaler
    
class PopularityRecommender():
    
    def fit(self, X, y):
        self.X=X
        Ratings=X
        Ratings['rating']=y
        self.movie_ids = Ratings['movieId'].unique()
        Ratings=Ratings.set_index('movieId')
        summ=[]
        for i in range(len(self.movie_ids)):
            summ.append(Ratings.loc[self.movie_ids[i]]['rating'].sum())
        film_weights=pd.DataFrame({'movieId': self.movie_ids, 'weight': summ})
        self.weights=film_weights
        scaler = MinMaxScaler()
        self.weights['weight']=scaler.fit_transform(self.weights['weight'].to_numpy().reshape(-1, 1))
        self.weights=dict(zip(self.weights['movieId'],self.weights['weight']))

    def predict_one(self, user_id, item_id):
        if item_id in self.weights:
            return self.weights[item_id]
        else:
            return np.mean(list(self.weights.values()))
        
        
    def predict(self, X):
        y = []
        for i,row in X.iterrows():
            y.append(self.predict_one(row['userId'], row['movieId']))
        return y
        
    def recomend_items(self, user_id, n_items=10):
        Ratings=self.X
        watched=list(Ratings[Ratings['userId']==user_id]['movieId'])
        movie_ids = np.array(self.movie_ids)
        non_watched=movie_ids[np.where(~np.isin(movie_ids, watched))]
        df_non_watched=pd.DataFrame({'movieId': non_watched, 'userId': [user_id]*len(non_watched)})
        df_non_watched  = df_non_watched.merge(movies)
        y=self.predict(df_non_watched)
        df_non_watched["score"] = y
        df_non_watched.sort_values(by="score", ascending=False, inplace=True)
        return df_non_watched.iloc[:n_items]
    
    def get_most_popular(self, n_items=10):
        movies=self.X
        movies=movies.drop_duplicates(subset=['movieId'])
        pop=pd.DataFrame.from_dict({'movieId': self.weights.keys(), 'weight': self.weights.values()})
        pop=pop.merge(movies, left_on='movieId', right_on='movieId',)
        return pop.sort_values(by='weight', ascending=False).head(n_items)

In [5]:
pop_rec=PopularityRecommender()
pop_rec.fit(X,y)

In [6]:
user_id = 407

In [7]:
X[X["userId"] == user_id ]

Unnamed: 0,userId,movieId,title,rating
1734,407,260,Star Wars: Episode IV - A New Hope (1977),4.0
6925,407,1196,Star Wars: Episode V - The Empire Strikes Back...,4.0
8405,407,1240,"Terminator, The (1984)",3.0
9157,407,1291,Indiana Jones and the Last Crusade (1989),4.0
10900,407,2028,Saving Private Ryan (1998),4.0
12825,407,2571,"Matrix, The (1999)",5.0
14462,407,2959,Fight Club (1999),5.0
17162,407,58559,"Dark Knight, The (2008)",5.0
30575,407,293,Léon: The Professional (a.k.a. The Professiona...,3.0
36056,407,4993,"Lord of the Rings: The Fellowship of the Ring,...",5.0


In [8]:
pop_rec.get_most_popular(10)

Unnamed: 0,movieId,weight,userId,title,rating
232,318,1.0,2,"Shawshank Redemption, The (1994)",3.0
20,356,0.975775,1,Forrest Gump (1994),4.0
16,296,0.917706,1,Pulp Fiction (1994),3.0
166,2571,0.830068,1,"Matrix, The (1999)",5.0
34,593,0.826861,1,"Silence of the Lambs, The (1991)",4.0
15,260,0.756323,1,Star Wars: Episode IV - A New Hope (1977),5.0
7,110,0.680442,1,Braveheart (1995),4.0
192,2959,0.663342,1,Fight Club (1999),5.0
28,527,0.661917,1,Schindler's List (1993),5.0
26,480,0.635554,1,Jurassic Park (1993),4.0


In [9]:
pop_rec.recomend_items(user_id)

Unnamed: 0,movieId,userId,title,genres,score
225,318,407,"Shawshank Redemption, The (1994)",Crime|Drama,1.0
19,356,407,Forrest Gump (1994),Comedy|Drama|Romance|War,0.975775
15,296,407,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,0.917706
33,593,407,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,0.826861
7,110,407,Braveheart (1995),Action|Drama|War,0.680442
27,527,407,Schindler's List (1993),Drama|War,0.661917
25,480,407,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,0.635554
470,589,407,Terminator 2: Judgment Day (1991),Action|Sci-Fi,0.633416
4,50,407,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,0.615604
0,1,407,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.600285


## Сontent based

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine

class ContentRecommender():
    
    def fit(self, X, y):
        self.X=X
        Ratings=X
        movies=X
        Ratings['rating']=y
        self.movie_ids = Ratings['movieId'].unique()
        self.tfidf = TfidfVectorizer(stop_words='english')
        movies=movies.drop_duplicates(subset=['movieId'])
        overview_matrix = self.tfidf.fit_transform(movies['title']).toarray()
        self.film_vectors=dict()
        for i in range(len(self.movie_ids)):
            self.film_vectors[self.movie_ids[i]]=overview_matrix[i]
        
        user_ids=Ratings['userId'].unique()
        self.user_vectors=dict()
        for ID in user_ids:
            user_movies=Ratings[Ratings['userId'] == ID]['movieId'].to_numpy()
            user_raitings=Ratings[Ratings['userId'] == ID]['rating'].to_numpy()
            summ=0
            for i in range(len(user_raitings)):
                summ+=user_raitings[i]*self.film_vectors[user_movies[i]]
            self.user_vectors[ID]=summ

    def predict_one(self, user_id, item_id):
        if item_id in self.film_vectors:
            return cosine(self.user_vectors[user_id],self.film_vectors[item_id])
        else:
            return 0
        
    def predict(self, X):
        y = []
        for i,row in X.iterrows():
            y.append(self.predict_one(row['userId'], row['movieId']))
        return y
        
    def recomend_items(self, user_id, n_items=10):
        Ratings=self.X
        watched=list(Ratings[Ratings['userId']==user_id]['movieId'])
        movie_ids = np.array(self.movie_ids)
        non_watched=movie_ids[np.where(~np.isin(movie_ids, watched))]
        df_non_watched=pd.DataFrame({'movieId': non_watched
                                     , 'userId': [user_id]*len(non_watched)})
        df_non_watched  = df_non_watched.merge(movies)
        y=self.predict(df_non_watched)
        df_non_watched["score"] = y
        df_non_watched.sort_values(by="score", ascending=True, inplace=True)
        return df_non_watched.iloc[:n_items]
    
    def features(self,user_id, n_features=10):
        voc=dict(sorted(self.tfidf.vocabulary_.items(), key=lambda x: x[1]))
        h=pd.DataFrame({'user_vector':self.user_vectors[user_id],'features':voc.keys()})
        return h.sort_values(by='user_vector',ascending=False)

In [11]:
cont_rec=ContentRecommender()
cont_rec.fit(X,y)

In [12]:
X[X["userId"] == user_id ]

Unnamed: 0,userId,movieId,title,rating
1734,407,260,Star Wars: Episode IV - A New Hope (1977),4.0
6925,407,1196,Star Wars: Episode V - The Empire Strikes Back...,4.0
8405,407,1240,"Terminator, The (1984)",3.0
9157,407,1291,Indiana Jones and the Last Crusade (1989),4.0
10900,407,2028,Saving Private Ryan (1998),4.0
12825,407,2571,"Matrix, The (1999)",5.0
14462,407,2959,Fight Club (1999),5.0
17162,407,58559,"Dark Knight, The (2008)",5.0
30575,407,293,Léon: The Professional (a.k.a. The Professiona...,3.0
36056,407,4993,"Lord of the Rings: The Fellowship of the Ring,...",5.0


In [13]:
cont_rec.recomend_items(407, n_items=10)


Unnamed: 0,movieId,userId,title,genres,score
130,2116,407,"Lord of the Rings, The (1978)",Adventure|Animation|Children|Fantasy,0.592164
3966,147936,407,The Lord's Lantern in Budapest (1999),Comedy|Drama,0.734251
5744,62437,407,W. (2008),Drama,0.752873
5946,58627,407,Never Back Down (2008),Action,0.752873
1222,36529,407,Lord of War (2005),Action|Crime|Drama|Thriller|War,0.783924
165,2628,407,Star Wars: Episode I - The Phantom Menace (1999),Action|Adventure|Sci-Fi,0.783988
1291,77561,407,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,0.793248
1893,61160,407,Star Wars: The Clone Wars (2008),Action|Adventure|Animation|Sci-Fi,0.795891
5573,62299,407,Alone in the Dark II (2008),Action|Horror,0.797819
1971,112175,407,How to Train Your Dragon 2 (2014),Action|Adventure|Animation,0.7983


In [14]:
cont_rec.features(407)

Unnamed: 0,user_vector,features
6695,7.797095,rings
4879,7.140930,lord
133,5.625215,2008
5137,4.427792,matrix
120,4.244941,1999
...,...,...
3022,0.000000,flowers
3021,0.000000,flower
3020,0.000000,flow
3019,0.000000,florette


# Collaborative filtering

In [15]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import GridSearchCV

class CollaborativeFilteringRecommender():
    
    def fit(self, X, y):
        self.X=X
        Ratings=X
        movies=X
        Ratings['rating']=y
        self.movie_ids = Ratings['movieId'].unique()
        
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(Ratings[['userId','movieId','rating']], reader)
        param_grid = {'n_factors':[50,100],'n_epochs':[30],  'lr_all':[0.005],'reg_all':[0.02]}
        self.gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'],refit=True,cv=3)
        self.gs.fit(data)


    def predict_one(self, user_id, item_id):
        return self.gs.predict(user_id,item_id).est
        
    def predict(self, X):
        y = []
        for i,row in X.iterrows():
            y.append(self.predict_one(row['userId'], row['movieId']))
        return y
        
    def recomend_items(self, user_id, n_items=10):
        Ratings=self.X
        watched=list(Ratings[Ratings['userId']==user_id]['movieId'])
        movie_ids = np.array(self.movie_ids)
        non_watched=movie_ids[np.where(~np.isin(movie_ids, watched))]
        df_non_watched=pd.DataFrame({'movieId': non_watched
                                     , 'userId': [user_id]*len(non_watched)})
        df_non_watched  = df_non_watched.merge(movies)
        y=self.predict(df_non_watched)
        df_non_watched["score"] = y
        df_non_watched.sort_values(by="score", ascending=False, inplace=True)
        return df_non_watched.iloc[:n_items]
    

In [16]:
coll_rec=CollaborativeFilteringRecommender()
coll_rec.fit(X,y)

In [17]:
X[X["userId"] == user_id ]

Unnamed: 0,userId,movieId,title,rating
1734,407,260,Star Wars: Episode IV - A New Hope (1977),4.0
6925,407,1196,Star Wars: Episode V - The Empire Strikes Back...,4.0
8405,407,1240,"Terminator, The (1984)",3.0
9157,407,1291,Indiana Jones and the Last Crusade (1989),4.0
10900,407,2028,Saving Private Ryan (1998),4.0
12825,407,2571,"Matrix, The (1999)",5.0
14462,407,2959,Fight Club (1999),5.0
17162,407,58559,"Dark Knight, The (2008)",5.0
30575,407,293,Léon: The Professional (a.k.a. The Professiona...,3.0
36056,407,4993,"Lord of the Rings: The Fellowship of the Ring,...",5.0


In [18]:
coll_rec.recomend_items(407, n_items=10)

Unnamed: 0,movieId,userId,title,genres,score
225,318,407,"Shawshank Redemption, The (1994)",Crime|Drama,4.573699
2038,56782,407,There Will Be Blood (2007),Drama|Western,4.52725
1163,3275,407,"Boondock Saints, The (2000)",Action|Crime|Drama|Thriller,4.499927
713,750,407,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,4.480678
72,1213,407,Goodfellas (1990),Crime|Drama,4.458057
230,48516,407,"Departed, The (2006)",Crime|Drama|Thriller,4.456581
383,2324,407,Life Is Beautiful (La Vita è bella) (1997),Comedy|Drama|Romance|War,4.439491
2405,1104,407,"Streetcar Named Desire, A (1951)",Drama,4.438826
1109,7361,407,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,4.436484
751,5618,407,Spirited Away (Sen to Chihiro no kamikakushi) ...,Adventure|Animation|Fantasy,4.429843
