In [2]:
## dieu
import pandas as pd 
import numpy as np 
# sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.decomposition import TruncatedSVD


In [3]:
metadata = pd.read_csv('../data/movies_metadata.csv')

In [4]:
m = metadata['vote_count'].quantile(0.90)
C = metadata['vote_average'].mean()

q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
q_movies.shape

def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [5]:
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
q_movies = q_movies.sort_values('score', ascending=False)
q_movies[['title', 'vote_count', 'vote_average', 'score']].head(20)

Unnamed: 0,title,vote_count,vote_average,score
314,The Shawshank Redemption,8358.0,8.5,8.445869
834,The Godfather,6024.0,8.5,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453
12481,The Dark Knight,12269.0,8.3,8.265477
2843,Fight Club,9678.0,8.3,8.256385
292,Pulp Fiction,8670.0,8.3,8.251406
522,Schindler's List,4436.0,8.3,8.206639
23673,Whiplash,4376.0,8.3,8.205404
5481,Spirited Away,3968.0,8.3,8.196055
2211,Life Is Beautiful,3643.0,8.3,8.187171


In [6]:
#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
metadata['overview'] = metadata['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(metadata['overview'])

#Output the shape of tfidf_matrix
print(tfidf_matrix.shape)
print(tfidf.get_feature_names()[5000:5010])

(45466, 75827)
['avails', 'avaks', 'avalanche', 'avalanches', 'avallone', 'avalon', 'avant', 'avanthika', 'avanti', 'avaracious']


In [7]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix[18000])
print(cosine_sim.shape)
print(cosine_sim)

(45466, 1)
[[0.        ]
 [0.        ]
 [0.        ]
 ...
 [0.04413835]
 [0.01250697]
 [0.        ]]


In [8]:
indices = pd.Series(metadata.index, index=metadata['title']).drop_duplicates()
indices[:18001]

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Paranormal Activity 3          17996
Puncture                       17997
Polisse                        17998
A Little Bit of Heaven         17999
Year of the Carnivore          18000
Length: 18001, dtype: int64

In [9]:
def get_recommendations(title):
    # Get the index of the movie that matches the title
    idx = indices[title]
    n_films = int(input())
    
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix[idx])
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n_films + 1]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [10]:
result = get_recommendations('Year of the Carnivore')
for i,j in enumerate(result):
    print(i+1,j)

1 The Skateboard Kid II
2 Sammy and Rosie Get Laid
3 I Was a Shoplifter
4 Long Day's Journey Into Night
5 The Comedian
6 A Turtle’s Tale 2: Sammy’s Escape From Paradise
7 Tomato Red
8 Surviving Life (Theory and Practice)
9 Sleepaway Camp IV: The Survivor
10 A Turtle's Tale: Sammy's Adventures
11 Dominick and Eugene
12 Ad Fundum
13 One Fine Day
14 The Barber
15 Summer in the City
16 The Trouble with Harry
17 Boy Meets Girl
18 Time of Eve: The Movie
19 Fear
20 One More Time


In [11]:
tfidf_matrix

<45466x75827 sparse matrix of type '<class 'numpy.float64'>'
	with 1210882 stored elements in Compressed Sparse Row format>

In [12]:
svd = TruncatedSVD(n_components=2000,n_iter=15,random_state=8)
tfidf_matrix_svd = svd.fit_transform(tfidf_matrix)

In [13]:
def get_recommendations_svd(title):
    # Get the index of the movie that matches the title
    idx = indices[title]
    n_films = int(input())
    
    cosine_sim = linear_kernel(tfidf_matrix_svd, tfidf_matrix_svd[idx].reshape(1,-1))
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:n_films + 1]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    return metadata['title'].iloc[movie_indices]

In [14]:
result_svd = get_recommendations_svd('Year of the Carnivore')
for i,j in enumerate(result_svd):
    print(i+1,j)

ValueError: invalid literal for int() with base 10: ''

In [18]:
def weighted_recommendations(film):
    filmscores = {}
    scores = []
    result = {}
    m = metadata['vote_count'].quantile(0.90)
    C = metadata['vote_average'].mean()

    for i,j in enumerate(get_recommendations(film)):
        x = metadata[metadata['original_title'] == j]
        v = x['vote_count']
        R = x['vote_average']
        score = (v/(v+m) * R) + (m/(m+v) * C)
        for v,w in enumerate(score):
            #print("score:",w)
            filmscores[j] = w
            scores.append(w)
    
    scores.sort()
    scores.reverse()
    for sc in scores :
        for key in filmscores:
            if filmscores[key] == sc :
                result[key] == sc
        if len(result) == 10:
            break
    return result

In [19]:
weighted_recommendations('Year of the carnivore')

KeyError: 'Year of the carnivore'