In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import turicreate
import math
import random

In [2]:
RANDOM_SEED = 42

In [3]:
movies_data: pd.DataFrame = pd.read_csv('ml-20m/movies.csv')
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings_data = pd.read_csv('ml-20m/ratings.csv')
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [5]:
tags_data = pd.read_csv('ml-20m/tags.csv')
tags_data.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


## Popularity

In [6]:
movies_data['Number of ratings'] = 0
for i in range(movies_data.shape[0]):
    number_of_ratings = ratings_data[ratings_data['movieId'] == movies_data.at[i, 'movieId']].shape[0]
    movies_data.at[i, 'Number of ratings'] = number_of_ratings
movies_data.head()


Unnamed: 0,movieId,title,genres,Number of ratings
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,49695
1,2,Jumanji (1995),Adventure|Children|Fantasy,22243
2,3,Grumpier Old Men (1995),Comedy|Romance,12735
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2756
4,5,Father of the Bride Part II (1995),Comedy,12161


In [7]:
movies_data['Average rating'] = 0.0
for i in range(movies_data.shape[0]):
    mean = ratings_data[ratings_data['movieId'] == movies_data.at[i, 'movieId']]['rating'].mean()
    movies_data.at[i, 'Average rating'] = mean if not math.isnan(mean) else 0
movies_data.head()

Unnamed: 0,movieId,title,genres,Number of ratings,Average rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,49695,3.92124
1,2,Jumanji (1995),Adventure|Children|Fantasy,22243,3.211977
2,3,Grumpier Old Men (1995),Comedy|Romance,12735,3.15104
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,2756,2.861393
4,5,Father of the Bride Part II (1995),Comedy,12161,3.064592


In [8]:
m = movies_data['Number of ratings'].quantile(0.90)
m

1265.0

In [9]:
C = movies_data['Average rating'].mean()
C

3.071863792650544

In [10]:
def weighted_rating(x, m=m, C=C):
    v = x['Number of ratings']
    R = x['Average rating']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [11]:
movies_data['Score'] = movies_data.apply(weighted_rating, axis=1)

In [12]:
def recommend_aggregated(n=5):
    return movies_data.sort_values(by='Score', ascending=False).head(n)

In [13]:
recommend_aggregated(2)

Unnamed: 0,movieId,title,genres,Number of ratings,Average rating,Score
315,318,"Shawshank Redemption, The (1994)",Crime|Drama,63366,4.44699,4.420076
843,858,"Godfather, The (1972)",Crime|Drama,41355,4.364732,4.326359


In [14]:
## Content Filter

In [15]:
set('|'.join(movies_data['genres'].unique()).split('|'))

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [16]:
for i in range(movies_data.shape[0]):
    if movies_data.at[i, 'genres'] == '(no genres listed)':
        movies_data.at[i, 'genres'] = 'None'
    movies_data.at[i, 'genres'] = movies_data.at[i, 'genres'].replace('|', ' ')
movies_data.head()

Unnamed: 0,movieId,title,genres,Number of ratings,Average rating,Score
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,49695,3.92124,3.900155
1,2,Jumanji (1995),Adventure Children Fantasy,22243,3.211977,3.204437
2,3,Grumpier Old Men (1995),Comedy Romance,12735,3.15104,3.143886
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2756,2.861393,2.927607
4,5,Father of the Bride Part II (1995),Comedy,12161,3.064592,3.065277


In [17]:
tags_data[tags_data['movieId'] == movies_data.at[0, 'movieId']]['tag'].unique()

array(['Watched', 'computer animation', 'Disney animated feature',
       'Pixar animation', 'TÃ©a Leoni does not star in this movie',
       'Pixar', 'animation', 'family', 'Tom Hanks', 'witty', 'adventure',
       'animated', 'clever', 'comedy', 'fantasy', 'bright',
       'DARING RESCUES', 'fanciful', 'HEROIC MISSION', 'humorous',
       'light', 'rousing', 'TOYS COME TO LIFE', 'UNLIKELY FRIENDSHIPS',
       'warm', 'time travel', 'kids movie', 'Disney', 'Tim Allen',
       'action figure', 'action figures', 'Buzz Lightyear',
       'CG animation', 'toy', 'toys', 'Woody', 'villian hurts toys',
       'pixar', 'disney', 'children', 'é˜®ä¸€é¸£', 'funny', '3D',
       'Cartoon', 'ya boy', 'cgi', 'rated-G', 'lots of heart',
       'Animation', 'want to see again', 'imdb top 250', 'buddy movie',
       'the boys', 'very good', 'Best of Rotten Tomatoes: All Time',
       'John Lasseter', 'USA', 'classic', 'avi', 'buy', 'fun', 'Want',
       'CGI', 'soothing', 'almost favorite', 'friendshi

In [18]:
movies_data['Tags'] = ''
for i in range(movies_data.shape[0]):
    tags = tags_data[tags_data['movieId'] == movies_data.at[i, 'movieId']]['tag'].unique()
    new_tags = []
    for tag in tags:
        if isinstance(tag, float) and math.isnan(tag):
            new_tags.append('')
        else:
            if isinstance(tag, float):
                print(tag)
            new_tags.append(tag.lower().replace(' ', ''))
    movies_data.at[i, 'Tags'] = ' '.join(set(new_tags))

In [19]:
movies_data.head()

Unnamed: 0,movieId,title,genres,Number of ratings,Average rating,Score,Tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,49695,3.92124,3.900155,cganimation woody imdbtop250 verygood cgi brig...
1,2,Jumanji (1995),Adventure Children Fantasy,22243,3.211977,3.204437,childish badcgi game chrisvanallsburg magicboa...
2,3,Grumpier Old Men (1995),Comedy Romance,12735,3.15104,3.143886,grunrunning old howarddeutch oldpeoplethatisac...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2756,2.861393,2.927607,revenge clv characters chickflick
4,5,Father of the Bride Part II (1995),Comedy,12161,3.064592,3.065277,dianekeaton touching comedy remake sequel itth...


In [20]:
movies_data['Metadata'] = ''
for i in range(movies_data.shape[0]):
    genres = movies_data['genres'].iloc[i].split(' ')
    tags = movies_data['Tags'].iloc[i].split(' ')
    metadata = ' '.join([data.lower() for data in genres + tags])
    movies_data.at[i, 'Metadata'] = metadata

In [21]:
movies_data.head()

Unnamed: 0,movieId,title,genres,Number of ratings,Average rating,Score,Tags,Metadata
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,49695,3.92124,3.900155,cganimation woody imdbtop250 verygood cgi brig...,adventure animation children comedy fantasy cg...
1,2,Jumanji (1995),Adventure Children Fantasy,22243,3.211977,3.204437,childish badcgi game chrisvanallsburg magicboa...,adventure children fantasy childish badcgi gam...
2,3,Grumpier Old Men (1995),Comedy Romance,12735,3.15104,3.143886,grunrunning old howarddeutch oldpeoplethatisac...,comedy romance grunrunning old howarddeutch ol...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,2756,2.861393,2.927607,revenge clv characters chickflick,comedy drama romance revenge clv characters ch...
4,5,Father of the Bride Part II (1995),Comedy,12161,3.064592,3.065277,dianekeaton touching comedy remake sequel itth...,comedy dianekeaton touching comedy remake sequ...


In [22]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
movies_data['Metadata'] = movies_data['Metadata'].fillna('')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data['Metadata'])

In [23]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [24]:
corpus_index = [n for n in movies_data['Metadata']]

indices = pd.Series(movies_data.index, index=movies_data['title']).drop_duplicates()

title = "Toy Story (1995)"
idx = indices[title]
sim_scores = []
for i, j in enumerate(cosine_sim[idx]):
    k = movies_data['Score'].iloc[i]
    if j != 0 :
        sim_scores.append((i, j, k))

In [25]:
sim_scores = sorted(sim_scores, key=lambda x: (x[1],x[2]) , reverse=True)
sim_scores = sim_scores[0:10]
rest_indices = [i[0] for i in sim_scores]

In [26]:
data_x = movies_data[['title', 'Score']].iloc[rest_indices]

data_x['Cosine Similarity'] = 0
for i, j in enumerate(sim_scores):
    data_x['Cosine Similarity'].iloc[i]=round(sim_scores[i][1],2)

data_x

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,title,Score,Cosine Similarity
0,Toy Story (1995),3.900155,1.0
3027,Toy Story 2 (1999),3.801328,0.52
2270,"Bug's Life, A (1998)",3.579241,0.26
11026,Cars (2006),3.407532,0.25
6271,Finding Nemo (2003),3.844745,0.24
27270,Brother Bear 2 (2006),3.072597,0.23
24849,The Magic Crystal (2011),3.071412,0.23
590,Pinocchio (1940),3.46116,0.23
26714,The Gruffalo's Child (2011),3.074395,0.22
25816,The Land Before Time VII: The Stone of Cold Fi...,3.073607,0.22


In [27]:
def recommend_cosine(title, n=5):
    idx = indices[title]
    sim_scores = []
    for i, j in enumerate(cosine_sim[idx]):
        k = movies_data['Score'].iloc[i]
        if j != 0 :
            sim_scores.append((i, j, k))

    sim_scores = sorted(sim_scores, key=lambda x: (x[1],x[2]) , reverse=True)
    sim_scores = sim_scores[0:n]
    rest_indices = [i[0] for i in sim_scores]

    data_x = movies_data[['title', 'Score']].iloc[rest_indices]

    data_x['Cosine Similarity'] = 0
    for i, j in enumerate(sim_scores):
        data_x['Cosine Similarity'].iloc[i]=round(sim_scores[i][1],2)

    return data_x

In [28]:
recommend_cosine('Jumanji (1995)', 5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,title,Score,Cosine Similarity
1,Jumanji (1995),3.204437,1.0
11136,"Thief Lord, The (2006)",3.074437,0.26
25742,Snow Queen (2002),3.070961,0.26
26268,The Cave of the Golden Rose (1991),3.070622,0.26
21923,Back to the Secret Garden (2001),3.070566,0.26


## Collaborative

In [59]:
n_reduced = 1000000

In [60]:
user_movies_data = ratings_data[:n_reduced].pivot(index='movieId', columns='userId', values='rating').fillna(0)
user_movies_data

userId,1,2,3,4,5,6,7,8,9,10,...,6734,6735,6736,6737,6738,6739,6740,6741,6742,6743
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,4.0,...,0.5,4.0,4.0,3.0,0.0,4.0,0.0,4.0,0.0,4.0
2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
3,0.0,4.0,0.0,0.0,0.0,3.0,3.0,5.0,0.0,0.0,...,0.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,3.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130462,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
130490,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
movies_data_reduced = [movies_data.iloc[i] for i in range(movies_data.shape[0]) if user_movies_data.index.tolist().__contains__(movies_data.at[i, 'movieId'])]

In [62]:
movies_data_reduced = pd.DataFrame(movies_data_reduced)

In [63]:
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_data_reduced['Metadata'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=movies_data_reduced.index.tolist())

In [64]:
svd = TruncatedSVD(n_components=19)
latent_matrix = svd.fit_transform(tfidf_df)

In [75]:
n = 20
latent_matrix_1_df = pd.DataFrame(latent_matrix[:,0:n], index=movies_data_reduced['title'].tolist())

In [76]:
latent_matrix_2 = svd.fit_transform(user_movies_data)
latent_matrix_2_df = pd.DataFrame(latent_matrix_2, index=movies_data_reduced['title'].tolist())

In [84]:
a_1 = np.array(latent_matrix_1_df.loc['Jumanji (1995)']).reshape(1, -1)
a_2 = np.array(latent_matrix_2_df.loc['Jumanji (1995)']).reshape(1, -1)

score_1 = cosine_similarity(latent_matrix_1_df, a_1).reshape(-1)
score_2 = cosine_similarity(latent_matrix_2_df, a_2).reshape(-1)

hybrid = ((score_1 + score_2)/2.0)

dictDf = {'content': score_1 , 'collaborative': score_2, 'hybrid': hybrid}
similar = pd.DataFrame(dictDf, index=latent_matrix_2_df.index)

similar.sort_values('hybrid', ascending=False, inplace=True)
similar

Unnamed: 0,content,collaborative,hybrid
Jumanji (1995),1.000000,1.000000,1.000000
Casper (1995),0.987449,0.910505,0.948977
Free Willy (1993),0.969169,0.866445,0.917807
"Indian in the Cupboard, The (1995)",0.988560,0.824272,0.906416
Babe (1995),0.975708,0.807475,0.891592
...,...,...,...
Brothers in Trouble (1995),0.004610,-0.113331,-0.054360
Bigger Than the Sky (2005),0.017355,-0.128319,-0.055482
"Quick and the Dead, The (1987)",-0.031035,-0.098038,-0.064536
In This Our Life (1942),0.023629,-0.155431,-0.065901


In [85]:
def recommend_hybrid(title, n=5):
    a_1 = np.array(latent_matrix_1_df.loc[title]).reshape(1, -1)
    a_2 = np.array(latent_matrix_2_df.loc[title]).reshape(1, -1)

    score_1 = cosine_similarity(latent_matrix_1_df, a_1).reshape(-1)
    score_2 = cosine_similarity(latent_matrix_2_df, a_2).reshape(-1)

    hybrid = ((score_1 + score_2)/2.0)

    dictDf = {'content': score_1 , 'collaborative': score_2, 'hybrid': hybrid}
    similar = pd.DataFrame(dictDf, index = latent_matrix_2_df.index )

    return similar.sort_values('hybrid', ascending=False)

In [86]:
recommend_hybrid('Jumanji (1995)')

Unnamed: 0,content,collaborative,hybrid
Jumanji (1995),1.000000,1.000000,1.000000
Casper (1995),0.987449,0.910505,0.948977
Free Willy (1993),0.969169,0.866445,0.917807
"Indian in the Cupboard, The (1995)",0.988560,0.824272,0.906416
Babe (1995),0.975708,0.807475,0.891592
...,...,...,...
Brothers in Trouble (1995),0.004610,-0.113331,-0.054360
Bigger Than the Sky (2005),0.017355,-0.128319,-0.055482
"Quick and the Dead, The (1987)",-0.031035,-0.098038,-0.064536
In This Our Life (1942),0.023629,-0.155431,-0.065901


## Turicreate

In [87]:
ratings_train, ratings_test = train_test_split(ratings_data, test_size=0.1)

In [88]:
train_ratings_sframe = turicreate.SFrame(ratings_train)
test_ratings_sframe = turicreate.SFrame(ratings_test)

In [89]:
popularity_model = turicreate.popularity_recommender.create(train_ratings_sframe, user_id='userId', item_id='movieId', target='rating')

In [111]:
popularity_recomm = popularity_model.recommend(users=[1, 2, 3, 4, 5],k=1)

for recom in popularity_recomm:
    print(movies_data[movies_data['movieId'] == recom['movieId']]['title'])

12291    All Passion Spent (1986)
Name: title, dtype: object
12291    All Passion Spent (1986)
Name: title, dtype: object
12291    All Passion Spent (1986)
Name: title, dtype: object
12291    All Passion Spent (1986)
Name: title, dtype: object
12291    All Passion Spent (1986)
Name: title, dtype: object


#### Recommender recommends same movies to different users

## Surprise

In [114]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

In [112]:
Mapping_file = dict(zip(movies_data['title'].tolist(), movies_data['movieId'].tolist()))

In [115]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_data[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=.25)
algorithm = SVD()
algorithm.fit(trainset)
predictions = algorithm.test(testset)

accuracy.rmse(predictions)

RMSE: 0.7892


0.7892339030966591

In [118]:
def pred_user_rating(ui):
    if ui in ratings_data['userId'].unique():
        ui_list = ratings_data[ratings_data['userId'] == ui]['movieId'].tolist()
        d = {k: v for k,v in Mapping_file.items() if not v in ui_list}
        predictedL = []
        for i, j in d.items():
            predicted = algorithm.predict(ui, j)
            predictedL.append((i, predicted[3]))
        pdf = pd.DataFrame(predictedL, columns = ['movies', 'ratings'])
        pdf.sort_values('ratings', ascending=False, inplace=True)
        pdf.set_index('movies', inplace=True)
        return pdf.head(10)
    else:
        print("User Id does not exist in the list!")
        return None

In [120]:
pred_user_rating(2)


Unnamed: 0_level_0,ratings
movies,Unnamed: 1_level_1
Gladiator (1992),5.0
The Imitation Game (2014),5.0
"Phantom of the Opera, The (2004)",5.0
Gladiator (2000),5.0
"Shawshank Redemption, The (1994)",5.0
"Lord of the Rings: The Fellowship of the Ring, The (2001)",5.0
Prime Suspect (1991),5.0
"Dark Knight, The (2008)",5.0
"Lord of the Rings: The Return of the King, The (2003)",5.0
Voices from the List (2004),5.0
