In [1]:
import numpy as np
import pandas as pd

In [2]:
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movie_df = pd.read_csv('tmdb_5000_movies.csv')

In [4]:
print("credits:",credits.shape)
print("movies dataframe:",movie_df.shape)

credits: (4803, 4)
movies dataframe: (4803, 20)


In [6]:
credits_column_renamed = credits.rename(index=str, columns={'movie_id':'id'})
movie_df_merge = movie_df.merge(credits_column_renamed,on='id')

In [7]:
movies_cleaned_df = movie_df_merge.drop(columns = ['homepage','title_x','title_y','status','production_countries'])

In [8]:
movies_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 18 columns):
budget                  4803 non-null int64
genres                  4803 non-null object
id                      4803 non-null int64
keywords                4803 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
tagline                 3959 non-null object
vote_average            4803 non-null float64
vote_count              4803 non-null int64
cast                    4803 non-null object
crew                    4803 non-null object
dtypes: float64(3), int64(4), object(11)
memory usage: 712.9+ KB


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(min_df=3, analyzer='word', token_pattern=r'\w{1,}',
                     ngram_range = (1,3),
                     stop_words = 'english')

movies_cleaned_df['overview'] = movies_cleaned_df['overview'].fillna('')

In [13]:
tfv_matrix = tfv.fit_transform(movies_cleaned_df['overview'])

In [14]:
tfv_matrix

<4803x10415 sparse matrix of type '<class 'numpy.float64'>'
	with 127193 stored elements in Compressed Sparse Row format>

In [15]:
tfv_matrix.shape

(4803, 10415)

In [17]:
from sklearn.metrics.pairwise import sigmoid_kernel

sig = sigmoid_kernel(tfv_matrix,tfv_matrix)

In [18]:
sig[0]

array([0.76163448, 0.76159416, 0.76159416, ..., 0.76159416, 0.76159416,
       0.76159416])

In [21]:
indices = pd.Series(movies_cleaned_df.index,index=movies_cleaned_df['original_title']).drop_duplicates()

In [22]:
indices

original_title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [23]:
indices['Newlyweds']

4799

In [24]:
sig[4799]

array([0.76159416, 0.76159416, 0.76159438, ..., 0.76159432, 0.76159416,
       0.76159479])

In [25]:
list(enumerate(sig[indices['Newlyweds']]))

[(0, 0.7615941559557649),
 (1, 0.7615941559557649),
 (2, 0.7615943792052133),
 (3, 0.7615945565001923),
 (4, 0.7615945780152884),
 (5, 0.7615943268299626),
 (6, 0.7615948191687398),
 (7, 0.7615943470083448),
 (8, 0.7615943903808948),
 (9, 0.761594688358109),
 (10, 0.7615941559557649),
 (11, 0.7615941559557649),
 (12, 0.7615941559557649),
 (13, 0.7615941559557649),
 (14, 0.7615941559557649),
 (15, 0.7615941559557649),
 (16, 0.7615943548512393),
 (17, 0.76159496557644),
 (18, 0.7615942820186986),
 (19, 0.7615945083121634),
 (20, 0.7615942928644458),
 (21, 0.7615949834421317),
 (22, 0.7615941559557649),
 (23, 0.761594428422618),
 (24, 0.7615941559557649),
 (25, 0.7615941559557649),
 (26, 0.7615941559557649),
 (27, 0.7615944534166859),
 (28, 0.7615941559557649),
 (29, 0.761594305379781),
 (30, 0.7615943362100155),
 (31, 0.7615944505217749),
 (32, 0.7615941559557649),
 (33, 0.7615944287462042),
 (34, 0.7615941559557649),
 (35, 0.7615941559557649),
 (36, 0.7615941559557649),
 (37, 0.76159439

In [26]:
sorted(list(enumerate(sig[indices['Newlyweds']])), key=lambda x: x[1], reverse=True)

[(4799, 0.7616344769958086),
 (616, 0.7616048180003857),
 (2689, 0.7616040137754826),
 (869, 0.7616023462370238),
 (3969, 0.7615999252108214),
 (1576, 0.7615998981568298),
 (2290, 0.7615997926823583),
 (1032, 0.7615997304206809),
 (3145, 0.7615995828740623),
 (2531, 0.7615992287095679),
 (504, 0.7615991582262818),
 (866, 0.761598689439311),
 (1157, 0.7615985027054996),
 (2962, 0.7615983457425377),
 (242, 0.7615982597110722),
 (4576, 0.7615982169631158),
 (1223, 0.7615982059179011),
 (3479, 0.7615982048229571),
 (2586, 0.7615979979090771),
 (2688, 0.7615979288529928),
 (3155, 0.7615979191512952),
 (2869, 0.7615978895603),
 (3559, 0.761597883711739),
 (4641, 0.7615978743617422),
 (1632, 0.7615977966503327),
 (4616, 0.7615977253920972),
 (1071, 0.7615977205092846),
 (3393, 0.76159764686818),
 (1970, 0.7615974169291484),
 (1856, 0.7615973816003898),
 (1385, 0.7615973013985569),
 (3583, 0.7615972684238029),
 (4591, 0.7615971608939307),
 (1110, 0.7615971516950014),
 (237, 0.7615970796479883)

In [27]:
def give_rec(title,sig=sig):
    idx = indices[title]
    sig_scores = list(enumerate(sig[idx]))
    sig_scores = sorted(sig_scores, key=lambda x:x[1],reverse=True)
    sig_scores = sig_scores[1:11]
    movie_indices = [i[0] for i in sig_scores]
    return movies_cleaned_df['original_title'].iloc[movie_indices]

In [29]:
give_rec('Spy Kids')

1302    Spy Kids 2: The Island of Lost Dreams
1155                  Spy Kids 3-D: Game Over
1769      Spy Kids: All the Time in the World
4044                               Go for It!
3359                              In Too Deep
1631                                 Mr. 3000
1825                Jimmy Neutron: Boy Genius
339                           The Incredibles
3793                     The Velocity of Gary
1081                       Revolutionary Road
Name: original_title, dtype: object