# Movie recommendation system

## Problem Statement
Would we be able to predict which movies might or might not be a commercial success? </br>
This dataset collects part of the knowledge from the API TMDB, which contains only </br>
5000 movies out of the total number. 

## Dictionary
movie_id</br>
title</br>
overview</br>
genres</br>
keywords</br>
cast</br>
crew</br>

In [1]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import sqlite3
import json

In [2]:
movies_raw = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv")
credits_raw = pd.read_csv("https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv")

In [3]:
movies_raw.head(1)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
credits_raw.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
conn = sqlite3.connect('movie_database.db')

In [6]:
movies_raw.to_sql('movies', conn, if_exists='replace', index=False)
credits_raw.to_sql('credits', conn, if_exists='replace', index=False)

4803

In [7]:
query = """
SELECT c.movie_id, m.title, m.overview, m.genres, m.keywords, c.cast, c.crew
FROM movies m
JOIN credits c
ON m.title = c.title;
"""
df_raw = pd.read_sql_query(query, conn)

In [8]:
conn.close()

In [9]:
df_raw.head(5)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [10]:
df_processed = df_raw.copy()

In [11]:
df_processed['genres'] = df_raw['genres'].apply(lambda x: [entry['name'] for entry in json.loads(x)] if pd.notna(x) else [])
df_processed['keywords'] = df_raw['keywords'].apply(lambda x: [entry['name'] for entry in json.loads(x)] if pd.notna(x) else [])
df_processed['cast'] = df_raw['cast'].apply(lambda x: [entry['name'] for entry in json.loads(x)][:3] if pd.notna(x) else [])
df_processed['crew'] = df_processed['crew'].apply(lambda x: [entry['name'] for entry in json.loads(x) if entry['job'] == 'Director'][0] if x and any(entry['job'] == 'Director' for entry in json.loads(x)) else '')
df_processed['overview'] = df_processed['overview'].apply(lambda x: [x] if pd.notna(x) else [])
df_processed['overview'] = df_raw['overview'].apply(lambda x: [x] if pd.notna(x) else [])


In [12]:
df_processed.sample(5, random_state=1010)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
4160,98549,The Legend of Hell's Gate: An American Conspiracy,"[In 1870s Texas, a ruthless bounty hunter and ...","[Action, Adventure, History, Western]",[based on real events],"[Eric Balfour, Lou Taylor Pucci, Henry Thomas]",Tanner Beard
1587,36593,The Naked Gun 33⅓: The Final Insult,[Frank Drebin is persuaded out of retirement t...,"[Comedy, Crime]","[undercover, spoof, state prison]","[Leslie Nielsen, Priscilla Presley, George Ken...",Peter Segal
2022,14635,The Rookie,[Jim Morris never made it out of the minor lea...,"[Drama, Family]","[father son relationship, baseball, sports tea...","[Dennis Quaid, Rachel Griffiths, Beth Grant]",John Lee Hancock
2221,26171,Everybody's Fine,"[Eight months after the death of his wife, Fra...",[Drama],"[family relationships, doctor, retired, visit,...","[Robert De Niro, Drew Barrymore, Kate Beckinsale]",Kirk Jones
3822,37495,Four Lions,[Four Lions tells the story of a group of Brit...,"[Comedy, Crime, Drama]","[terrorism, british farce]","[Riz Ahmed, Nigel Lindsay, Kayvan Novak]",Chris Morris


In [13]:
def remove_spaces(text):
    if isinstance(text, list):
        return [entry.replace(' ', '') for entry in text]
    else:
        return text.replace(' ', '')

# Apply the function to the specified columns
df_processed['genres'] = df_processed['genres'].apply(remove_spaces)
df_processed['cast'] = df_processed['cast'].apply(remove_spaces)
df_processed['crew'] = df_processed['crew'].apply(remove_spaces)
df_processed['keywords'] = df_processed['keywords'].apply(remove_spaces)

In [14]:
df_processed.sample(5, random_state=1010)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
4160,98549,The Legend of Hell's Gate: An American Conspiracy,"[In 1870s Texas, a ruthless bounty hunter and ...","[Action, Adventure, History, Western]",[basedonrealevents],"[EricBalfour, LouTaylorPucci, HenryThomas]",TannerBeard
1587,36593,The Naked Gun 33⅓: The Final Insult,[Frank Drebin is persuaded out of retirement t...,"[Comedy, Crime]","[undercover, spoof, stateprison]","[LeslieNielsen, PriscillaPresley, GeorgeKennedy]",PeterSegal
2022,14635,The Rookie,[Jim Morris never made it out of the minor lea...,"[Drama, Family]","[fathersonrelationship, baseball, sportsteam, ...","[DennisQuaid, RachelGriffiths, BethGrant]",JohnLeeHancock
2221,26171,Everybody's Fine,"[Eight months after the death of his wife, Fra...",[Drama],"[familyrelationships, doctor, retired, visit, ...","[RobertDeNiro, DrewBarrymore, KateBeckinsale]",KirkJones
3822,37495,Four Lions,[Four Lions tells the story of a group of Brit...,"[Comedy, Crime, Drama]","[terrorism, britishfarce]","[RizAhmed, NigelLindsay, KayvanNovak]",ChrisMorris


In [15]:
df_processed['tags'] = df_processed[df_processed.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

df_processed.sample(10)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,tags
3007,1832,Dogma,[The latest battle in the eternal war between ...,"[Fantasy, Comedy, Adventure]","[angel, wisconsin, churchservice, church, afte...","[BenAffleck, MattDamon, LindaFiorentino]",KevinSmith,"Dogma,['The latest battle in the eternal war b..."
349,116745,The Secret Life of Walter Mitty,[A timid magazine photo manager who lives life...,"[Adventure, Comedy, Drama, Fantasy]","[himalaya, photographer, magazine, iceland, da...","[BenStiller, KristenWiig, PattonOswalt]",BenStiller,"The Secret Life of Walter Mitty,['A timid maga..."
1570,59967,Looper,"[In the futuristic action thriller Looper, tim...","[Action, Thriller, ScienceFiction]","[suicide, assassin, drugaddiction, future, tim...","[JosephGordon-Levitt, BruceWillis, EmilyBlunt]",RianJohnson,"Looper,[""In the futuristic action thriller Loo..."
173,65759,Happy Feet Two,[Mumble the penguin has a problem: his son Eri...,"[Animation, Comedy, Family]","[penguin, musical, aftercreditsstinger, 3d]","[ElijahWood, RobinWilliams, Pink]",GeorgeMiller,"Happy Feet Two,['Mumble the penguin has a prob..."
2263,10448,Rapa Nui,[Inter-tribal rivalry leads to a competition t...,[Adventure],"[inlovewithenemy, indigenous, island]","[JasonScottLee, EsaiMorales, SandrineHolt]",KevinReynolds,"Rapa Nui,['Inter-tribal rivalry leads to a com..."
2996,74534,The Best Exotic Marigold Hotel,[British retirees travel to India to take up r...,"[Drama, Comedy]","[hotel, basedonnovel, india, ensemblecast, eld...","[BillNighy, MaggieSmith, TomWilkinson]",JohnMadden,"The Best Exotic Marigold Hotel,['British retir..."
541,9425,Soldier,[Sergeant Todd is a veteran soldier for an eli...,"[Action, War, ScienceFiction]","[spacemarine, dystopia, alienplanet, geneticen...","[KurtRussell, JasonScottLee, JasonIsaacs]",PaulW.S.Anderson,"Soldier,['Sergeant Todd is a veteran soldier f..."
1553,9880,The Princess Diaries,[A socially awkward but very bright 15-year-ol...,"[Comedy, Family, Romance]","[heirtothethrone, grandmothergranddaughterrela...","[AnneHathaway, JulieAndrews, HeatherMatarazzo]",GarryMarshall,"The Princess Diaries,['A socially awkward but ..."
399,7484,Open Season,"[Boog, a domesticated 900lb. Grizzly bear find...","[Adventure, Animation, Family]","[hunter, mountains, garage, grizzlybear, bunny...","[MartinLawrence, AshtonKutcher, GarySinise]",JillCulton,"Open Season,['Boog, a domesticated 900lb. Griz..."
3598,10987,Halloween: The Curse of Michael Myers,"[Six years ago, Michael Myers terrorized the t...","[Horror, Thriller]","[massmurder, nudity, halloween, attempttoescap...","[DonaldPleasence, PaulRudd, MarianneHagan]",JoeChappelle,"Halloween: The Curse of Michael Myers,[""Six ye..."


In [16]:
df_processed["overview"] = df_processed["overview"].apply(lambda x: [str(x)])
df_processed["genres"] = df_processed["genres"].apply(lambda x: [str(genre) for genre in x])
df_processed["keywords"] = df_processed["keywords"].apply(lambda x: [str(keyword) for keyword in x])
df_processed["cast"] = df_processed["cast"].apply(lambda x: [str(actor) for actor in x])
df_processed["crew"] = df_processed["crew"].apply(lambda x: [str(crew_member) for crew_member in x])

In [21]:
df_final = df_processed
df_final["tags"] = df_processed["overview"] + df_processed["genres"] + df_processed["keywords"] + df_processed["cast"] + df_processed["crew"]

In [22]:
df_final["tags"] = df_final["tags"].apply(lambda x: ",".join(x).replace(",", " "))

In [23]:
df_final.drop(columns = ["genres", "keywords", "cast", "crew", "overview"], inplace = True)

In [27]:
df_final.iloc[0].tags

"['In the 22nd century  a paraplegic Marine is dispatched to the moon Pandora on a unique mission  but becomes torn between following orders and protecting an alien civilization.'] Action Adventure Fantasy ScienceFiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver J a m e s C a m e r o n"

In [28]:
df_final.to_csv("../data/processed/clean_data.csv", index = False)

conn = sqlite3.connect("../data/movies_database.db")

movies_raw.to_sql("clean_movies_data", conn, if_exists = "replace", index = False)

4803

In [34]:
# KNN modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_final["tags"])

model = NearestNeighbors(n_neighbors = 6, algorithm = "brute", metric = "cosine")
model.fit(tfidf_matrix)

def get_movie_recommendations(movie_title):
    movie_index = df_final[df_final["title"] == movie_title].index[0]
    distances, indices = model.kneighbors(tfidf_matrix[movie_index])
    similar_movies = [(df_final["title"][i], distances[0][j]) for j, i in enumerate(indices[0])]
    return similar_movies[1:]

def recommend(input_movie):
    recommendations = get_movie_recommendations(input_movie)
    print("Film recommendations '{}'".format(input_movie))
    for movie, distance in recommendations:
        print("- Film: {}".format(movie))

In [42]:
recommend("Meet Joe Black")

Film recommendations 'Meet Joe Black'
- Film: The Best Years of Our Lives
- Film: Dragon Nest: Warriors' Dawn
- Film: Summer Catch
- Film: One Hour Photo
- Film: Christmas with the Kranks


In [30]:
from pickle import dump

dump(model, open("../models/knn_neighbors-6_algorithm-brute_metric-cosine.sav", "wb"))