In [1]:
pip install numpy pandas scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
movies = pd.read_csv('movies.csv')  
movies['genres'] = movies['genres'].str.replace('|', ' ')
movies['genres'] = movies['genres'].fillna('')
print(movies.head())

   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy based on novel sec

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['genres'])
print(tfidf_matrix.shape)  


(4803, 22)


In [4]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
print(cosine_sim.shape) 

(4803, 4803)


In [5]:
print(movies['title'].head(20))  



0                                          Avatar
1        Pirates of the Caribbean: At World's End
2                                         Spectre
3                           The Dark Knight Rises
4                                     John Carter
5                                    Spider-Man 3
6                                         Tangled
7                         Avengers: Age of Ultron
8          Harry Potter and the Half-Blood Prince
9              Batman v Superman: Dawn of Justice
10                               Superman Returns
11                              Quantum of Solace
12     Pirates of the Caribbean: Dead Man's Chest
13                                The Lone Ranger
14                                   Man of Steel
15       The Chronicles of Narnia: Prince Caspian
16                                   The Avengers
17    Pirates of the Caribbean: On Stranger Tides
18                                 Men in Black 3
19      The Hobbit: The Battle of the Five Armies


In [6]:

indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()
def get_recommendations(title, cosine_sim=cosine_sim):
    try:
        idx = indices[title]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:11]
        movie_indices = [i[0] for i in sim_scores]
        return movies['title'].iloc[movie_indices].tolist()
    except KeyError:
        print(f"Movie '{title}' not found in the dataset.")
        return []
movie_title = 'Spider-Man 3' 
recommendations = get_recommendations(movie_title)
print("Recommendations for:", movie_title)
print(recommendations)

Recommendations for: Spider-Man 3
['Spider-Man 3', 'Batman v Superman: Dawn of Justice', "Pirates of the Caribbean: Dead Man's Chest", 'Pirates of the Caribbean: On Stranger Tides', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Spider-Man 2', 'The Amazing Spider-Man 2', 'The Mummy: Tomb of the Dragon Emperor', 'The Hobbit: An Unexpected Journey']
