In [1]:
import pandas as pd
import numpy as np

# Read the two TMDB 5000 CSV exports (looked up relative to the working directory)
# into data frames, credits first and then movies.
credits, movies = (
    pd.read_csv(csv_name)
    for csv_name in ("tmdb_5000_credits.csv", "tmdb_5000_movies.csv")
)

In [2]:
#credits.head()

In [3]:
#movies.head()

In [4]:
# Report (rows, columns) for each raw table before merging them.
for label, frame in (("Credits: ", credits), ("Movies: ", movies)):
    print(label, frame.shape)

Credits:  (4803, 4)
Movies:  (4803, 20)


In [5]:
# Align the join key: the credits table names it "movie_id", while the merge
# below joins on "id".
# (index=str additionally converts the row labels to strings; the merge joins on
# the "id" column and does not use those labels.)
credits_column_renamed = credits.rename(index=str, columns={"movie_id": "id"})

# Merge the two data frames on the shared "id" column (pandas' default inner join)
movies_merge = movies.merge(credits_column_renamed, on='id')

#print("Merged: ", movies_merge.head())

# Preview the merged frame. This previously referenced `movies_merged`, a name
# that is never defined, so the cell raised NameError on a fresh kernel.
movies_merge.head()

In [6]:
# Display the merged frame's (rows, columns) as a quick check on the join result.
movies_merge.shape

(4803, 23)

In [7]:
# Drop the columns the recommender does not use. title_x / title_y are presumably
# the suffixed copies of a "title" column present in both merged tables
# (pandas' default merge suffixes) -- TODO confirm.
movies_cleaned = movies_merge.drop(
    labels=[
        'homepage',
        'title_x',
        'title_y',
        'status',
        'production_countries',
    ],
    axis='columns',
)
#print(movies_cleaned.head())
#print(movies_cleaned.info())
#print(movies_cleaned.head(1)['overview'])

In [8]:
# Display (rows, columns) after the column drop; the row count should be unchanged.
movies_cleaned.shape

(4803, 18)

In [9]:
# Prepare to analyze and match the contents of the 'Overview' column to recommend movies using TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Word-level TF-IDF vectorizer for the movie overviews.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',     # a token is any run of one or more word characters
    ngram_range=(1, 3),          # unigrams, bigrams and trigrams
    stop_words='english',        # scikit-learn's built-in English stop-word list
    strip_accents='unicode',
    min_df=3,                    # ignore terms that occur in fewer than 3 overviews
    max_features=None,           # no cap on vocabulary size
)

In [10]:
# Fit the TF-IDF onto the 'Overview' column.
# Missing overviews are replaced with an empty string first: casting NaN straight
# to text yields the literal word "nan", which the vectorizer would tokenize and
# treat as a real term shared by every movie that lacks an overview.
overview_text = movies_cleaned['overview'].fillna('').astype(str)
tfv_matrix = tfv.fit_transform(overview_text)
#print(tfv_matrix)

In [11]:
#tfv_matrix.shape

In [12]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel similarity between every pair of overviews.
# Y is left out: sigmoid_kernel then evaluates X against itself, which is the
# same computation as passing tfv_matrix twice. The result is a dense
# (n_movies, n_movies) array.
sig = sigmoid_kernel(tfv_matrix)
#print(sig[0])

In [13]:
# Reverse mapping of movie titles -> row labels of movies_cleaned.
# De-duplicate on the *title* (the Series index), keeping the first row for each
# title. Series.drop_duplicates() compares the values instead -- the row labels,
# which are already unique -- so it left repeated titles in place and
# indices[title] returned a Series rather than a single row for those titles.
title_to_row = pd.Series(movies_cleaned.index, index=movies_cleaned['original_title'])
indices = title_to_row[~title_to_row.index.duplicated(keep='first')]

# Test indices accuracy
#print(indices)
#print(indices['Newlyweds'])
#print(sig[4799])
#print(list(enumerate(sig[indices['Newlyweds']])))
#print(sorted(list(enumerate(sig[indices['Newlyweds']])), key=lambda x: x[1], reverse=True))

In [14]:
def give_recommendations(movies, sig=sig, top_n=10):
    """Recommend titles most similar to a set of seed movies.

    The similarity rows of all seed movies are summed element-wise, the rows of
    the catalogue are ranked by that combined score (highest first), and the
    seed movies themselves are removed from the ranking.

    Parameters
    ----------
    movies : iterable of str
        Seed titles, looked up in the module-level ``indices`` Series. A title
        listed more than once contributes its similarity row more than once.
        (Inside this function the name shadows the notebook-level ``movies``
        data frame.)
    sig : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the scores of movie ``i``
        against every movie. Defaults to the ``sig`` object that existed when
        this function was defined.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        ``original_title`` values of the best-scoring non-seed movies, in
        descending score order. Holds fewer than ``top_n`` entries when the
        catalogue has fewer non-seed movies than that (the previous
        implementation raised IndexError in that situation).

    Raises
    ------
    KeyError
        If a title is not present in ``indices``.
    """
    sig = np.asarray(sig)

    # Resolve every seed title to its row position.
    seed_rows = []
    for title in movies:
        # A title shared by several movies makes the lookup return a Series of
        # rows; atleast_1d handles both that and the scalar case, and the first
        # matching row is used.
        seed_rows.append(int(np.atleast_1d(indices[title])[0]))
    seed_rows = np.asarray(seed_rows, dtype=int)

    # Accumulate the seeds' similarity rows in input order.
    combined_scores = np.zeros(sig.shape[1])
    for row in seed_rows:
        combined_scores += sig[row]

    # Highest score first. A stable sort on the negated scores keeps tied movies
    # in ascending row order, matching sorted(..., reverse=True) on enumerate().
    ranked_rows = np.argsort(-combined_scores, kind='stable')

    # Never recommend a movie the user supplied as a seed.
    candidate_rows = ranked_rows[~np.isin(ranked_rows, seed_rows)]

    # Slicing (rather than looping until N items are found) cannot overrun.
    return movies_cleaned['original_title'].iloc[candidate_rows[:top_n]]

In [15]:
print("Type in as many movie titles as you like. When you're ready to calculate recommendations, enter 'q'. ")
print()

# The collected titles get their own name: rebinding `movies` here would replace
# the data frame loaded at the top of the notebook, so re-running the merge cell
# afterwards would fail.
selected_titles = []

# Build the lookup of valid titles once instead of recomputing it per prompt.
known_titles = set(movies_cleaned['original_title'])

while True:
    textIn = input("Type a movie title ('q' to quit and get results) ")
    if textIn in ('q', 'Q'):
        break
    if textIn in known_titles:
        selected_titles.append(textIn)
    else:
        print("Title not valid. Try a different spelling. Otherwise, the database does not contain your title")

if selected_titles:
    print(give_recommendations(selected_titles))
else:
    print("You did not enter any movies!")


Type in as many movie titles as you like. When you're ready to calculate recommendations, enter 'q'. 

Type a movie title ('q' to quit and get results) Avatar
Type a movie title ('q' to quit and get results) q
1341                Obitaemyy Ostrov
634                       The Matrix
3604                       Apollo 18
2130                    The American
775                        Supernova
529                 Tears of the Sun
151                          Beowulf
311     The Adventures of Pluto Nash
847                         Semi-Pro
942                 The Book of Life
Name: original_title, dtype: object
