In [23]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
data= pd.read_csv("TMDB_movie_dataset_v11.csv")

In [25]:
data.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords'],
      dtype='object')

In [26]:
data=data[['id', 'title', 'overview', 'genres', 'keywords']]

In [27]:
data.head()

Unnamed: 0,id,title,overview,genres,keywords
0,27205.0,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc..."
1,157336.0,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction","rescue, future, spacecraft, race against time,..."
2,155.0,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller","joker, sadism, chaos, secret identity, crime f..."
3,19995.0,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","future, society, culture clash, space travel, ..."
4,24428.0,The Avengers,When an unexpected enemy emerges and threatens...,"Science Fiction, Action, Adventure","new york city, superhero, shield, based on com..."


In [28]:
data['keywords']= data['overview']+data['genres']

In [29]:
data.head()

Unnamed: 0,id,title,overview,genres,keywords
0,27205.0,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","Cobb, a skilled thief who commits corporate es..."
1,157336.0,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction",The adventures of a group of explorers who mak...
2,155.0,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller",Batman raises the stakes in his war on crime. ...
3,19995.0,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","In the 22nd century, a paraplegic Marine is di..."
4,24428.0,The Avengers,When an unexpected enemy emerges and threatens...,"Science Fiction, Action, Adventure",When an unexpected enemy emerges and threatens...


In [30]:
data= data.drop(columns=['overview', 'genres'])

In [31]:
data.head()

Unnamed: 0,id,title,keywords
0,27205.0,Inception,"Cobb, a skilled thief who commits corporate es..."
1,157336.0,Interstellar,The adventures of a group of explorers who mak...
2,155.0,The Dark Knight,Batman raises the stakes in his war on crime. ...
3,19995.0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
4,24428.0,The Avengers,When an unexpected enemy emerges and threatens...


In [32]:
data.isnull().sum()

id          1027608
title       1027608
keywords    1027613
dtype: int64

In [33]:
data = data.dropna(subset=['title', 'keywords'])
data = data[(data['title'] != '') & (data['keywords'] != '')]

In [34]:
data.isnull().sum()

id          0
title       0
keywords    0
dtype: int64

In [35]:
cv = CountVectorizer(max_features=10000, stop_words='english')

In [36]:
vector_data= cv.fit_transform(data['keywords'].values.astype('U')).toarray()

In [37]:
vector_data.shape

(9994, 10000)

In [38]:
similarity= cosine_similarity(vector_data)

In [39]:
similarity

array([[1.        , 0.14048787, 0.03009646, ..., 0.        , 0.        ,
        0.        ],
       [0.14048787, 1.        , 0.0338255 , ..., 0.        , 0.        ,
        0.        ],
       [0.03009646, 0.0338255 , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.14142136,
        0.06593805],
       [0.        , 0.        , 0.        , ..., 0.14142136, 1.        ,
        0.09325048],
       [0.        , 0.        , 0.        , ..., 0.06593805, 0.09325048,
        1.        ]])

In [51]:
def recommend(movie_title):
    index = data[data['title'] == movie_title].index[0]
    similar_movies = sorted(((i, similarity[index][i]) for i in range(len(similarity[index]))), key=lambda x: x[1], reverse=True)[1:6]
    recommended_titles = [data.iloc[movie[0]].title for movie in similar_movies]
    return recommended_titles

In [52]:
recommended_movies= recommend("The Dark Knight")
for movie_title in recommended_movies:
    print(movie_title)

NameError: name 'idx' is not defined

In [None]:
import pickle

In [None]:
pickle.dump(data, open('movie_list.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [None]:
pickle.load(open('similarity.pkl', 'rb'))