In [1]:
import pandas as pd
import sqlalchemy
from sqlalchemy import *
import re
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Open the SQLite movie database and list the tables it contains.
# The module-qualified calls resolve to the same callables that the
# notebook's wildcard import of sqlalchemy exposes.
engine = sqlalchemy.create_engine("sqlite:///data/movies-db.sqlite")
conn = engine.connect()
inspector = sqlalchemy.inspect(engine)
tables = inspector.get_table_names()
tables

[]

In [3]:
# Load the movie table, then discard every row that has a missing value.
full = pd.read_csv(filepath_or_buffer='movies.csv')
full = full.dropna()

In [4]:
# Row label (index value in `full`) of the first movie titled 'Toy Story'.
# Raise a descriptive IndexError when the title is absent, instead of the
# opaque "index 0 is out of bounds" the bare `[0]` lookup would produce.
title_matches = full[full['Title'] == 'Toy Story'].index.values
if len(title_matches) == 0:
    raise IndexError("no movie titled 'Toy Story' found in `full`")
value = title_matches[0]

In [5]:
# Display the row label found for the query title.
value

0

In [6]:
# Remove the columns listed below; the remaining columns are the input
# handed to the clustering step.
excluded_columns = [
    'Unnamed: 0',
    'movieId',
    'imdbID',
    'Title',
    'Year',
    'Ratings',
    'Released',
    'Runtime',
    'Plot',
    'Poster',
    'imdbVotes',
]
selections = full.drop(excluded_columns, axis=1)

In [7]:
# Partition the movies into 800 clusters (seeded for repeatability) and
# store each movie's cluster id in a new 'clusters' column of `full`.
kmeans = KMeans(random_state=42, n_clusters=800)
# `fit` returns the estimator itself, so the prediction can be chained.
clusters = kmeans.fit(selections).predict(selections)
full['clusters'] = clusters

In [8]:
# use index of entered value to output cluster and sort

In [9]:
# Keep only the movies assigned to the same cluster as the query movie.
# `value` is a row label of `full`, hence the label-based lookup.
query_cluster = full.loc[value, 'clusters']
in_query_cluster = full['clusters'] == query_cluster
# reset_index() renumbers the subset from 0 and keeps the old labels as a column.
new_full = full[in_query_cluster].reset_index()

# Plot texts of the cluster members, as a NumPy array and as a plain list.
plots_arr = new_full['Plot'].to_numpy()
plots_l = [plot for plot in plots_arr]

In [10]:
# Display the titles of the movies that share the query movie's cluster.
new_full['Title']

0                                          Toy Story
1                                       The Rescuers
2                            Who Framed Roger Rabbit
3                                        Toy Story 2
4                           The Emperor's New Groove
5                                              Shrek
6                                     Monsters, Inc.
7                                            Ice Age
8                                       Brother Bear
9                        The Twelve Tasks of Asterix
10                                Asterix and Caesar
11                                Asterix in Britain
12    DuckTales the Movie: Treasure of the Lost Lamp
13                                    Over the Hedge
14                                     Monster House
15                                      Flushed Away
16                               Horton Hears a Who!
17                                             Ponyo
18                               A Town Called

In [11]:
def _clean_plot(raw_plot):
    """Return a lower-cased, letters-and-spaces-only version of one plot string."""
    # Replace every non-letter character with a space, then lower-case.
    cleaned = re.sub('[^a-zA-Z]', ' ', raw_plot).lower()
    # NOTE: only letters and spaces remain at this point, so this pattern
    # (which needs '&' and ';') cannot match; kept to preserve behaviour.
    cleaned = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", cleaned)
    # Collapse each run of digits / non-word characters into a single space.
    return re.sub("(\\d|\\W)+", " ", cleaned)


# One cleaned document per plot, in the same order as `plots_l`.
corpus = [_clean_plot(plot) for plot in plots_l]

In [12]:
# TF-IDF model over the cleaned plots: English stop words removed and the
# vocabulary capped at 100 features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=100)
# Learn the vocabulary and produce the document-term matrix in one pass.
X = vectorizer.fit_transform(raw_documents=corpus)

In [13]:
# Vocabulary terms learned by the vectorizer, used as tag names.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in
# 1.2; prefer `get_feature_names_out()` and fall back for old versions.
# The result is converted to a list of plain `str` because later cells
# rely on list methods such as `my_tags.index(...)`.
if hasattr(vectorizer, 'get_feature_names_out'):
    my_tags = [str(tag) for tag in vectorizer.get_feature_names_out()]
else:
    my_tags = list(vectorizer.get_feature_names())

In [14]:
# All-zero indicator table: one row per plot, one column per tag.
row_labels = np.arange(len(plots_l))
my_tag_matrix = pd.DataFrame(data=0, index=row_labels, columns=my_tags)

In [15]:
# Set cell (row, tag) to 1 when the tag occurs as a substring of that
# movie's plot text.
# The write uses a single `.iloc[row, col]` indexer: the previous chained
# form `.iloc[i][j] = 1` can assign into a temporary row copy and is a
# silent no-op under pandas copy-on-write.
# NOTE(review): the tags are lower-case, but the match runs against the raw,
# mixed-case plot and is substring-based (e.g. 'boy' matches 'cowboy').
# Confirm this is intended.
for i in range(len(my_tag_matrix)):
    plot_text = new_full['Plot'].iloc[i]
    for j, tag in enumerate(my_tags):
        if tag in plot_text:
            my_tag_matrix.iloc[i, j] = 1

In [16]:
# Attach each movie's imdbID to its tag row (values are aligned on the index).
my_tag_matrix = my_tag_matrix.assign(imdbID=new_full['imdbID'])

In [17]:
# Tag indicators only, without the imdbID column, as input for the neighbour search.
new_selections = my_tag_matrix.drop('imdbID', axis=1)

In [18]:
# Display the tag-indicator table that is fed to the nearest-neighbour model.
new_selections

Unnamed: 0,asterix,baby,bear,best,boy,brings,cat,children,city,cowboy,...,velma,village,villain,visit,wants,water,weren,woody,world,young
0,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Feature row of the query movie inside the cluster subset.
# `value` is a row label of `full`, not a position within `new_selections`,
# so using it with `.iloc` is only correct when the query happens to sit at
# that position. Locate the query by its imdbID instead; the rows of
# `new_selections` are in the same order as the rows of `new_full`.
query_imdb_id = full.loc[value, 'imdbID']
query_pos = np.flatnonzero((new_full['imdbID'] == query_imdb_id).to_numpy())[0]
thing = new_selections.iloc[query_pos]

In [33]:
# Nearest-neighbour model returning up to 4 matches (the query included).
# The count is capped at the cluster size: asking for more neighbours than
# there are fitted samples makes `kneighbors` raise a ValueError.
knn = NearestNeighbors(n_neighbors=min(4, len(new_selections)))

In [34]:
# Fit on the cluster's tag rows, then look up the rows closest to the query.
# `fit` returns the estimator, so the query is chained; only the neighbour
# positions are requested, not the distances.
arr = knn.fit(new_selections).kneighbors([thing], return_distance=False)

In [22]:
# Display the positions (within the cluster subset) of the nearest neighbours.
arr

array([[ 0, 16,  9,  2]])

In [23]:
# One record per neighbour found by the k-NN query: title, IMDb link,
# plot text and poster URL, all read from the cluster subset `new_full`.
results = [
    {
        'title': new_full['Title'].iloc[pos],
        'url': f'https://imdb.com/title/{new_full["imdbID"][pos]}',
        'plot': new_full['Plot'].iloc[pos],
        'poster': new_full['Poster'].iloc[pos],
    }
    for pos in arr[0]
]

In [24]:
# Display the recommendation records built from the neighbour positions.
results

[{'title': 'Toy Story',
  'url': 'https://imdb.com/title/tt0114709',
  'plot': "A cowboy doll is profoundly threatened and jealous when a new spaceman figure supplants him as top toy in a boy's room.",
  'poster': 'https://m.media-amazon.com/images/M/MV5BMDU2ZWJlMjktMTRhMy00ZTA5LWEzNDgtYmNmZTEwZTViZWJkXkEyXkFqcGdeQXVyNDQ2OTk4MzI@._V1_SX300.jpg'},
 {'title': 'Horton Hears a Who!',
  'url': 'https://imdb.com/title/tt0451079',
  'plot': 'Horton the Elephant struggles to protect a microscopic community from his neighbors who refuse to believe it exists.',
  'poster': 'https://m.media-amazon.com/images/M/MV5BYzk5YzhjODYtZDc3Mi00OTYyLWIyYzAtMzZjYTljYTYzODM1XkEyXkFqcGdeQXVyODU2MDg1NzU@._V1_SX300.jpg'},
 {'title': 'The Twelve Tasks of Asterix',
  'url': 'https://imdb.com/title/tt0072901',
  'plot': 'A group of indomitable Gauls are challenged by Roman Emperor Julius Caesar to accomplish twelve impossible tasks.',
  'poster': 'https://m.media-amazon.com/images/M/MV5BYjZlOWFmMTMtMTRkNy00NDA0LTg2