In [1]:
import pandas as pd

# Load the raw movie catalogue from a path relative to the working directory.
movies_database = pd.read_csv(filepath_or_buffer='../data/raw/movies.csv')
# Display (rows, columns) of the loaded frame.
movies_database.shape

(19265, 16)

In [3]:
# Preview the first five rows to check the columns that were loaded.
movies_database.head(n=5)

Unnamed: 0,id,year,title,description,genres,director,actors,writers,music,art,producer,imdb_id,tmdb_id,imdb_rating,tmdb_rating,available_now
0,1,2008,The Girl from Monaco,A brilliant and neurotic attorney goes to Mona...,"Comedy,Drama",Anne Fontaine,"Fabrice Luchini,Helene de Saint-Pere,Jeanne Ba...","Anne Fontaine,Benoît Graffin",Philippe Rombi,,"Philippe Carcassonne,Bruno Pesary,Christine Ra...",tt1139800,15342.0,5.8,4.8,yes
1,2,2008,Every Jack has a Jill,Jack is encouraged to take the romantic Paris ...,"Comedy,Melodrama",Jennifer Devoldère,"Justin Lee Bartha,Billy Boyd,Maurice Bénichou,...",Jennifer Devoldère,,"Hervé Gallet,Alix Deschamps","Bruno Chiche,Nicolas Duval Adassovsky,Maxime R...",tt1094668,32338.0,6.1,5.4,yes
2,3,2009,Dorian Gray,A vain London playboy offers his soul in excha...,"Thriller,Sci-Fi",Oliver Parker,"Colin Firth,Nathan Rosen,Caroline Goodall,John...","Toby Finlay,Oscar Wilde",Charlie Mole,"John Beard,Ruth Myers,Niamh Coulter","Barnaby Thompson,Paul Brett,Simon Fawcett",tt1235124,23488.0,6.3,5.9,yes
3,4,2008,How to Lose Friends & Alienate People,A British writer struggles to fit in at a high...,"Comedy,Drama",Robert B. Willow,"Simon Pegg,Jeff Bridges,Danny Huston,Jillian A...","Peter Straughan,Toby Young",David Arnold,"John Beard,Ray Chan,Anthony Gasparro","Elizabeth Karlsen,Laurie Borg,Stephen Woolley",tt0455538,13092.0,6.4,6.1,yes
4,5,2009,Cell 211,The story of two men on different sides of a p...,"Action,Thriller",Daniel Monzon,"Luis Tosar,Vicente Romero,Fernando Soto,Luis Z...","Daniel Monzon,F.P. Gandull,Jorge Guerricaechev...",Roque Baños,"Antón Laguna,Montse Sancho","Álvaro Augustín,Juan Gordon,Emma Lustres",tt1242422,33273.0,7.6,7.4,yes


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing descriptions become empty strings so every row is a valid document.
movies_database['description'] = movies_database['description'].fillna('')

# TF-IDF vectorizer that ignores common English stop words such as 'the', 'a'.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the descriptions and encode them in one pass;
# the result has one row per movie and one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(raw_documents=movies_database['description'])

# Display (number of movies, vocabulary size).
tfidf_matrix.shape

(19265, 38411)

In [5]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise similarity of every description against every other one.
# linear_kernel returns plain dot products; they equal cosine similarities here
# because TfidfVectorizer L2-normalises each row by default. Omitting the second
# argument compares the matrix with itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [6]:
# Lookup table: movie id -> row label of that movie in movies_database.
# Series.drop_duplicates() compares *values*, which here are the always-unique
# row labels, so it never removed anything. De-duplicate on the id labels
# instead (keeping the first occurrence) so that indices[some_id] always yields
# a single scalar rather than a Series when an id is repeated.
indices = pd.Series(movies_database.index, index=movies_database['id'])
indices = indices[~indices.index.duplicated(keep='first')]

In [8]:
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the ids of the movies most similar to a given movie.

    Args:
        title: Key used to look the movie up in ``indices``. Despite the name,
            ``indices`` is keyed by the ``id`` column, so this must be a movie
            id (the parameter name is kept for backward compatibility).
        cosine_sim: Pairwise similarity matrix whose row/column positions line
            up with the rows of ``movies_database``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        A Series with the ``id`` of up to ``top_n`` movies, ordered from most
        to least similar. The query movie itself is never included.

    Raises:
        KeyError: If ``title`` is not a key of ``indices``.
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Row position of the query movie in cosine_sim / movies_database
    idx = indices[title]

    # Pair every *other* movie's position with its similarity to the query.
    # The query row is excluded explicitly: dropping "the first sorted entry"
    # removes the wrong movie whenever another movie ties for the top score
    # (e.g. an identical description) or the query has an empty description
    # (all scores are 0), which would let the query recommend itself.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Most similar first; the sort is stable, so ties keep their row order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the top_n most similar movies
    movie_indices = [pos for pos, _ in sim_scores[:top_n]]

    # Translate row positions back into movie ids
    return movies_database['id'].iloc[movie_indices]

In [10]:
# The `title` argument is looked up by movie id; id 3 is "Dorian Gray" in the preview above.
recoms = get_recommendations(title=3)

In [13]:
# Show the recommended movie ids (Series index = row label in movies_database).
recoms

15681    16212
17789    18822
6943      7012
8210      8361
18328    19721
359        360
3837      3874
1946      1970
17684    18615
8192      8343
Name: id, dtype: int64

In [14]:
# Print each recommended id followed by the title(s) stored under that id.
for rec_id in recoms:
    print(rec_id)
    matching_titles = movies_database.loc[movies_database['id'] == rec_id, 'title']
    print(matching_titles)

16212
15681    Dorian Gray
Name: title, dtype: object
18822
17789    Catch and Release
Name: title, dtype: object
7012
6943    24 Hours Rain
Name: title, dtype: object
8361
8210    My Week with Marilyn
Name: title, dtype: object
19721
18328    Penny Dreadful
Name: title, dtype: object
360
359    I Don't Believe You Anymore
Name: title, dtype: object
3874
3837    Where's The Head?
Name: title, dtype: object
1970
1946    Beauty and the Least
Name: title, dtype: object
18615
17684    Stay
Name: title, dtype: object
8343
8192    Unrequited
Name: title, dtype: object
