**Imports**

In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from pickle import dump

**Loading the data**

In [2]:
url1 = 'https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv'
url2 = 'https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv'

df1 = pd.read_csv(url1)
df2 = pd.read_csv(url2)

In [3]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [4]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


**Processing the data with SQLite3**

In [5]:
connection = sqlite3.connect("movie.db")

In [6]:
df1.to_sql('movies', connection, if_exists='replace', index=False)
df2.to_sql('credits', connection, if_exists='replace', index=False)

4803

In [7]:
query = 'SELECT * FROM movies INNER JOIN credits ON movies.title = credits.title'

raw_data = pd.read_sql_query(query, connection)
raw_data = raw_data.loc[:, ~raw_data.columns.duplicated()]

In [8]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [9]:
df = raw_data[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

In [10]:
#Disable SettingWithCopyWarning
pd.set_option('mode.chained_assignment', None)

In [11]:
for i in range(len(df.genres)):
    genres_list = []
    for d in json.loads(df.genres[i]):
        genres_list.append(d['name'])
    df.genres[i] = f','.join(genres_list)    

In [12]:
for i in range(len(df.keywords)):
    key_list = []
    for d in json.loads(df.keywords[i]):
        key_list.append(d['name'])
    df.keywords[i] = f','.join(key_list)

In [13]:
for i in range(len(df.cast)):
    cast_list = []
    for d in json.loads(df.cast[i]):
        cast_list.append(d['name'])
    df.cast[i] = f','.join(cast_list[0:3])

In [14]:
for i in range(len(df.crew)):
    director = next((d['name'] for d in json.loads(df.crew[i]) if d['job'] == 'Director'), None)
    df.crew[i] = director

In [15]:
for i in range(len(df.genres)):
    df.genres[i] = df.genres[i].replace(' ', '').replace(',', ' ')

In [16]:
for i in range(len(df.keywords)):
    df.keywords[i] = df.keywords[i].replace(' ', '').replace(',', ' ')

In [17]:
for i in range(len(df.cast)):
    df.cast[i] = df.cast[i].replace(' ', '').replace(',', ' ')

In [18]:
for i in range(len(df.crew)):
    df.crew[i] = str(df.crew[i]).replace(' ', '')

In [19]:
df.tail()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
4804,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,Action Crime Thriller,unitedstates–mexicobarrier legs arms paperknif...,CarlosGallardo JaimedeHoyos PeterMarquardt,RobertRodriguez
4805,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,Comedy Romance,,EdwardBurns KerryBishé MarshaDietlein,EdwardBurns
4806,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",Comedy Drama Romance TVMovie,date loveatfirstsight narration investigation ...,EricMabius KristinBooth CrystalLowe,ScottSmith
4807,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,,,DanielHenney ElizaCoupe BillPaxton,DanielHsia
4808,25975,My Date with Drew,Ever since the second grade when he first saw ...,Documentary,obsession camcorder crush dreamgirl,DrewBarrymore BrianHerzlinger CoreyFeldman,BrianHerzlinger


In [20]:
df['tags'] = df['overview'] + ' ' + df['genres'] + ' ' + df['keywords'] + ' ' + df['cast'] + ' ' + df['crew']

In [21]:
df.tags[1]

"Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of the Earth with Will Turner and Elizabeth Swann. But nothing is quite as it seems. Adventure Fantasy Action ocean drugabuse exoticisland eastindiatradingcompany loveofone'slife traitor shipwreck strongwoman ship alliance calypso afterlife fighter pirate swashbuckler aftercreditsstinger JohnnyDepp OrlandoBloom KeiraKnightley GoreVerbinski"

In [22]:
df.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
tags        3
dtype: int64

In [23]:
df['tags'] = df['tags'].fillna('')

**Transforming the text in array**

In [24]:
vectorizer = TfidfVectorizer()
vector = vectorizer.fit_transform(df['tags'])

**Train the model**

In [25]:
model=NearestNeighbors(metric='cosine')
model.fit(vector)

**Make recommendation**

In [83]:
def rec(movie):
    recs=[]
    movie_index = df[df['title'] == movie].index[0]
    distances, indices = model.kneighbors(vector[movie_index], n_neighbors=6)
    similar_movies = [(df['title'][i], distances[0][j]) for j, i in enumerate(indices[0])]
    for m in range(len(similar_movies)-1):
        recs.append(similar_movies[1:][m][0])
    return recs

In [84]:
print(rec('Avatar'))

['Aliens', 'Battle: Los Angeles', 'Falcon Rising', 'Apollo 18', 'Titan A.E.']


**Save the model**

In [None]:
dump(model, open("../models/knn_neighbors_project.sav", "wb"))