# Movie Recommendation using TF-IDF

Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import json

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

Load the dataset into a DataFrame.

In [None]:
# Original dataset (Kaggle, TMDB movie metadata):
# https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata?select=tmdb_5000_movies.csv

# The CSV is read over HTTP from a copy hosted in a GitHub repository.
url = 'https://raw.githubusercontent.com/ghifari226/movie_recommendation_using_tf-idf/master/tmdb_5000_movies.csv'
df = pd.read_csv(url)

In [None]:
# df.shape is (row count, column count); report both dimensions.
print("number of rows:", df.shape[0])
print("number of columns:", df.shape[1])

number of rows: 4803
number of columns: 20


Determine which columns should be used for comparison

In [None]:
# Inspect the first row to see what kind of value each column holds.
df.iloc[0]

budget                                                          237000000
genres                  [{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage                                      http://www.avatarmovie.com/
id                                                                  19995
keywords                [{"id": 1463, "name": "culture clash"}, {"id":...
original_language                                                      en
original_title                                                     Avatar
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                     150.437577
production_companies    [{"name": "Ingenious Film Partners", "id": 289...
production_countries    [{"iso_3166_1": "US", "name": "United States o...
release_date                                                   2009-12-10
revenue                                                        2787965087
runtime                               

The `genres` and `keywords` columns are the two columns that will be used for comparison:

In [None]:
# Raw `genres` value of the first movie: a JSON-encoded string, not a Python list.
df.genres[0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [None]:
# Raw `keywords` value of the first movie: also a JSON-encoded string.
df.keywords[0]

'[{"id": 1463, "name": "culture clash"}, {"id": 2964, "name": "future"}, {"id": 3386, "name": "space war"}, {"id": 3388, "name": "space colony"}, {"id": 3679, "name": "society"}, {"id": 3801, "name": "space travel"}, {"id": 9685, "name": "futuristic"}, {"id": 9840, "name": "romance"}, {"id": 9882, "name": "space"}, {"id": 9951, "name": "alien"}, {"id": 10148, "name": "tribe"}, {"id": 10158, "name": "alien planet"}, {"id": 10987, "name": "cgi"}, {"id": 11399, "name": "marine"}, {"id": 13065, "name": "soldier"}, {"id": 14643, "name": "battle"}, {"id": 14720, "name": "love affair"}, {"id": 165431, "name": "anti war"}, {"id": 193554, "name": "power relations"}, {"id": 206690, "name": "mind and soul"}, {"id": 209714, "name": "3d"}]'

The tags are stored as JSON strings, so we will use the `json` library to parse them and build a single string from the "name" value of each tag.

Show the parsed genres, then the first 5 tags from the keywords.

In [None]:
# Decode the JSON string into a list of dicts, each with an 'id' and a 'name'.
json.loads(df.genres[0])

[{'id': 28, 'name': 'Action'},
 {'id': 12, 'name': 'Adventure'},
 {'id': 14, 'name': 'Fantasy'},
 {'id': 878, 'name': 'Science Fiction'}]

In [None]:
# Decode the keywords JSON string and keep only the first five entries.
json.loads(df.keywords[0])[:5]

[{'id': 1463, 'name': 'culture clash'},
 {'id': 2964, 'name': 'future'},
 {'id': 3386, 'name': 'space war'},
 {'id': 3388, 'name': 'space colony'},
 {'id': 3679, 'name': 'society'}]

Remove the whitespace inside each tag, so that a multi-word tag becomes a single token.

In [None]:
# Parse the keyword JSON once instead of on every iteration, then print the
# first five tag names with all inner whitespace removed. Slicing (rather than
# indexing 0..4) also avoids an IndexError when fewer than five tags exist.
first_movie_keywords = json.loads(df.keywords[0])
for tag in first_movie_keywords[:5]:
  # str.split() with no argument splits on any run of whitespace
  print(''.join(tag['name'].split()))

cultureclash
future
spacewar
spacecolony
society


In [None]:
def convert_to_string(row):
  """Build a space-separated genre string for one movie row.

  Parses the JSON list stored in ``row['genres']`` and removes all
  whitespace inside each entry's ``name`` so a multi-word genre becomes
  one token (e.g. "Science Fiction" -> "ScienceFiction").
  Only the ``genres`` field is read; ``keywords`` is not used here.

  Args:
    row: a mapping-like object (such as a DataFrame row) whose 'genres'
      value is a JSON string encoding a list of objects with a 'name' key.

  Returns:
    The collapsed genre names joined by single spaces; an empty string
    when the parsed list is empty.
  """
  tokens = []
  for entry in json.loads(row['genres']):
    # split() with no argument breaks on any whitespace run; joining the
    # pieces with '' glues the words of one genre together
    tokens.append(''.join(entry['name'].split()))
  return ' '.join(tokens)

In [None]:
# Run convert_to_string on every row (axis=1) and store the result in a new 'string' column.
df['string'] = df.apply(convert_to_string, axis=1)

In [None]:
# Join the 'string' values of rows 0 and 1 into one sample document.
X = ' '.join(df.loc[[0, 1], 'string'])

In [None]:
# TF-IDF vectorizer created with its default settings.
tfidf = TfidfVectorizer()

In [None]:
# Selects the 'string' value at index label 0 (a single string).
# NOTE(review): the output recorded below this cell is the whole 4803-row
# Series, which this expression would not produce; the output looks stale
# (possibly from an earlier `df['string']`). Re-run the cell to confirm.
df['string'][0]

0       Action Adventure Fantasy ScienceFiction
1                      Adventure Fantasy Action
2                        Action Adventure Crime
3                   Action Crime Drama Thriller
4               Action Adventure ScienceFiction
                         ...                   
4798                      Action Crime Thriller
4799                             Comedy Romance
4800               Comedy Drama Romance TVMovie
4801                                           
4802                                Documentary
Name: string, Length: 4803, dtype: object

In [None]:
# Display the combined sample document.
X

'Action Adventure Fantasy ScienceFiction Adventure Fantasy Action'

In [None]:
# Fit the vectorizer on a corpus that contains only the single document X,
# and transform that document in the same call.
# NOTE(review): with a one-document corpus every term should get the same IDF,
# so the weights would only reflect normalized term counts; confirm this is
# intended as a toy example.
y = tfidf.fit_transform([X])

In [None]:
# Convert the result to a dense array so the individual weights can be read.
y.toarray()

array([[0.5547002, 0.5547002, 0.5547002, 0.2773501]])

Recommendation

In [None]:
# Lookup table: movie title -> row label in df.
get_index = pd.Series(df.index, index=df['title'])
# Titles are not guaranteed to be unique. With a duplicated title,
# get_index[title] would return a Series of labels instead of one label,
# so keep only the first row for each title.
get_index = get_index[~get_index.index.duplicated(keep='first')]

In [None]:
# picking 5 random titles
# draw 5 titles at random; the fixed random_state keeps the draw reproducible
titles = df['title'].sample(5, random_state=42).tolist()
titles

['I Spy',
 'Split Second',
 'Gossip',
 'Vicky Cristina Barcelona',
 'Harry Potter and the Half-Blood Prince']

In [None]:
# Look up the row label of each sampled title through the title -> index Series.
index = [get_index[title] for title in titles]
index

[596, 3372, 2702, 2473, 8]

In [None]:
# 596 is the first entry of `index`, i.e. the row of the first sampled title ('I Spy').
convert_to_string(df.iloc[596])

'Action Adventure Comedy Thriller'