In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.compose import ColumnTransformer
import ast #This is used to parse strings that look like python objects into actual python objects

In [2]:
def _genre_names(raw_genres):
    """Return the genre names held in one raw ``genres`` cell (first ten entries at most)."""
    # The CSV stores each genres cell as a string that looks like a Python list of
    # dicts; literal_eval turns that string into the actual list of dicts.
    parsed = ast.literal_eval(raw_genres)
    return [entry["name"] for entry in parsed[:10]]


data = pd.read_csv("tmdb_5000_movies.csv")

# Keep only the columns the recommender uses; a missing overview gets placeholder text
# so the text vectorizer always receives a string.
df = pd.DataFrame({
    'title': data['title'],
    'overview': data['overview'].fillna("no overview available"),
    'genres': data['genres'].apply(_genre_names),
    'vote_average': data['vote_average'],
})

df.head()

Unnamed: 0,title,overview,genres,vote_average
0,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]",7.2
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]",6.9
2,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]",6.3
3,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]",7.6
4,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]",6.1


In [3]:

# Turn every movie overview into a numerical vector with TF-IDF
# (Term Frequency-Inverse Document Frequency).
#     A word gets a high weight when it is frequent in one overview but rare across all overviews.
#     Each row of the result is one overview; each column is the weight of one vocabulary word.
#     No stop-word list is passed here, so very common words (like "the") are only
#     down-weighted by the IDF term, not removed.
pre_vectorization = df['overview'].copy()
vectorizer = TfidfVectorizer()
# Learn the vocabulary and build the (n_movies, n_terms) TF-IDF matrix in one step.
overview_vector = vectorizer.fit_transform(pre_vectorization)
# Vocabulary terms in column order; kept for inspection only (not used by the cells shown here).
temp = vectorizer.get_feature_names_out()


In [4]:

# cosine_similarity measures how similar two overviews are: it is the cosine of the
# angle between their TF-IDF vectors.
#     1 means the two vectors point in the same direction (same relative word weights).
#     0 means the two overviews share no weighted words.
#     Called with a single matrix, it compares every row with every other row, so the
#     output is a square matrix where entry [i][j] is the similarity of overview i to overview j.
overview_similarity = cosine_similarity(overview_vector)


In [5]:
def top_recommended(similarity_vector, top_n=5, exclude_index=None):
    """Return the indices of the ``top_n`` highest-scoring entries.

    Parameters
    ----------
    similarity_vector : iterable of float
        Similarity of one movie to every movie (one row of a similarity matrix).
    top_n : int, default 5
        Number of indices to return.
    exclude_index : int or None, default None
        Position of the query movie. When given, exactly that index is left out
        of the result. When None, the single highest-ranked entry is dropped
        instead, on the assumption that it is the query movie itself.

    Returns
    -------
    list of int
        Indices ordered by descending score; ties keep their original order.

    Notes
    -----
    Dropping "the first entry" is only correct when the query movie has the
    strictly highest score. If another movie ties with it (for example two
    identical overviews), the sort may rank that movie first, and the query
    would then be recommended to itself. Pass ``exclude_index`` to avoid this.
    """
    scores = list(similarity_vector)
    # sorted() is stable even with reverse=True, so tied scores stay in index order.
    ranked = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    if exclude_index is None:
        # Legacy behaviour: assume the top-ranked entry is the query movie.
        return ranked[1:top_n + 1]

    limit = max(top_n, 0)
    return [idx for idx in ranked if idx != exclude_index][:limit]


In [6]:

def recommended(movie_title, similarity_matrix, top_n=5):
    """Return the row indices of the ``top_n`` movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``df['title']``; the first match is used.
    similarity_matrix : 2-D array-like
        Square matrix whose row ``i`` holds the similarity of movie ``i`` to every movie.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list of int
        Indices of the recommended movies, most similar first.

    Raises
    ------
    IndexError
        If no row of ``df`` has the given title.
    """
    matches = df[df['title'] == movie_title].index
    if len(matches) == 0:
        raise IndexError(f"no movie titled {movie_title!r} found in df['title']")
    # NOTE(review): the index label is used as a row position below, which assumes
    # df still has its default 0..n-1 index.
    movie_index = matches[0]

    # Work on a copy so the shared similarity matrix is never modified.
    scores = np.array(similarity_matrix[movie_index], dtype=float)
    # top_recommended drops the single highest-scoring entry. Force that entry to be
    # the query movie itself, so a tie with another movie (e.g. identical overviews)
    # can never make the query recommend itself.
    scores[movie_index] = np.inf
    return top_recommended(scores, top_n)



In [7]:

# Show the overview-based recommendations for a sample title, one title per line.
recommended_movie = recommended('John Carter', overview_similarity)
for movie_idx in recommended_movie:
    print(df.loc[movie_idx, 'title'])


Get Carter
The Marine 4: Moving Target
The Hurricane
Raising Cain
Mad Max: Fury Road


In [8]:
# Multi-feature recommender: combine the overview text and the genre names into a
# single feature matrix.
# Both columns currently use TfidfVectorizer with its default settings. To make a
# shared term such as 'action' weigh the same in every row (no IDF down-weighting,
# no per-row normalisation) and so correlate more strongly across rows, the genres
# vectorizer could be swapped for TfidfVectorizer(use_idf=False, norm=None) or a
# CountVectorizer().
transformer = ColumnTransformer(transformers=[
    # A plain string selector hands each vectorizer a single 1-D column of text.
    ('overview_transformed', TfidfVectorizer(), 'overview'),
    ('genres_transformed', TfidfVectorizer(), 'genres'),
])

# The vectorizer needs one string per row, so flatten each list of genre names into
# a space-separated string. Multi-word names such as "Science Fiction" end up as
# separate words in that string.
tmp_genres = [" ".join(genre_names) for genre_names in df['genres']]

# assign() returns a new frame, so df keeps its list-valued genres column.
t_df = df.assign(genres=tmp_genres)
t_df.head()

Unnamed: 0,title,overview,genres,vote_average
0,Avatar,"In the 22nd century, a paraplegic Marine is di...",Action Adventure Fantasy Science Fiction,7.2
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",Adventure Fantasy Action,6.9
2,Spectre,A cryptic message from Bond’s past sends him o...,Action Adventure Crime,6.3
3,The Dark Knight Rises,Following the death of District Attorney Harve...,Action Crime Drama Thriller,7.6
4,John Carter,"John Carter is a war-weary, former military ca...",Action Adventure Science Fiction,6.1


In [9]:
# Fit both vectorizers and stack their outputs side by side into one feature matrix.
multi_fit_tf = transformer.fit_transform(t_df)
multi_fit_tf.shape  # (number of movies, overview vocabulary size + genres vocabulary size)

(4803, 21284)

In [10]:
# Pairwise cosine similarity between movies on the combined overview + genres features;
# entry [i][j] is the similarity of movie i to movie j.
multi_fit_tf_similarity = cosine_similarity(multi_fit_tf)

In [11]:
def multi_recommend(movie, multi_vector, top_n=5):
    """Return the row indices of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``t_df['title']``; the first match is used.
    multi_vector : 2-D array-like
        Square similarity matrix built from the combined overview + genres features.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list of int
        Indices of the recommended movies, most similar first.

    Raises
    ------
    IndexError
        If no row of ``t_df`` has the given title.
    """
    matches = t_df[t_df['title'] == movie].index
    if len(matches) == 0:
        raise IndexError(f"no movie titled {movie!r} found in t_df['title']")
    # NOTE(review): the index label is used as a row position below, which assumes
    # t_df still has its default 0..n-1 index.
    multi_movie_idx = matches[0]

    # Copy the row so the shared similarity matrix is never modified.
    scores = np.array(multi_vector[multi_movie_idx], dtype=float)
    # The shared ranking helper drops the single highest-scoring entry. Force that
    # entry to be the query movie, so a tie with another movie cannot make the
    # query recommend itself.
    scores[multi_movie_idx] = np.inf
    return top_recommended(scores, top_n)

# Show the multi-feature recommendations for the same sample title, one title per line.
mov_idx = multi_recommend('John Carter', multi_fit_tf_similarity)
for movie_idx in mov_idx:
    print(t_df.loc[movie_idx, 'title'])


The Helix... Loaded
Captain America: The Winter Soldier
Star Trek Into Darkness
Captain America: The First Avenger
Avengers: Age of Ultron
