In [4]:
import pandas as pd
import sqlalchemy
from sqlalchemy import *
import re
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
engine = create_engine("sqlite:///movies-db.sqlite")
conn = engine.connect()
inspector = inspect(engine)
tables = inspector.get_table_names()
tables

['final_movies.csv']

In [6]:
full = pd.read_sql('final_movies.csv', con=conn).dropna()

In [4]:
selections = full.drop(columns=['Unnamed: 0','movieId','imdbID','Title','Year','Ratings','Released','Runtime','Plot','Poster','imdbVotes'])

In [5]:
kmeans = KMeans(n_clusters=800, random_state = 42)
kmeans.fit(selections)
clusters = kmeans.predict(selections)
full['clusters']=clusters

In [None]:
# use index of entered value to output cluster and sort

In [17]:
searched = 'The Shawshank Redemption'
value = full[full['Title']==searched].index.values[0]

In [18]:
new_full = full[full['clusters'] == full['clusters'][value]].reset_index()
plots_arr = new_full['Plot'].to_numpy()
plots_l = list(plots_arr)

In [19]:
corpus = []
for i in range(0, len(plots_l)):
    text = re.sub('[^a-zA-Z]', ' ', plots_l[i])
    text = text.lower()
    text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    text=re.sub("(\\d|\\W)+"," ",text)
    corpus.append(text)

In [20]:
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
X = vectorizer.fit_transform(corpus)

In [21]:
my_tags = vectorizer.get_feature_names()

In [22]:
my_tag_matrix = pd.DataFrame(0, index=np.arange(len(plots_l)), columns=my_tags)

In [23]:
for i in range(0,len(my_tag_matrix)):
    for j in range(0,len(my_tags)):        
        if my_tags[j] in new_full['Plot'].iloc[i]:
            my_tag_matrix.iloc[i][my_tags.index(my_tags[j])] = 1

In [24]:
my_tag_matrix['imdbID'] = new_full['imdbID']

In [25]:
new_value = new_full[new_full['Title']==searched].index.values[0]

In [26]:
new_selections = my_tag_matrix.drop(columns = ['imdbID'])

In [27]:
new_selections

Unnamed: 0,accused,age,armenian,aunt,berlin,best,black,born,boy,boys,...,village,visit,war,way,wife,woman,world,year,years,young
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
110,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
111,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
thing = new_selections.iloc[new_value]

In [29]:
knn = NearestNeighbors(n_neighbors=4)

In [30]:
knn.fit(new_selections)
arr = knn.kneighbors([thing], return_distance = False)

In [31]:
arr

array([[ 0, 64, 56, 51]])

In [32]:
results = []
for i in arr[0]:
    url = f'https://imdb.com/title/{new_full["imdbID"][i]}'
    plot= new_full['Plot'].iloc[i]
    title = new_full['Title'].iloc[i]
    poster = new_full['Poster'].iloc[i]
    temp_dict = {
        'title':title,
        'url':url,
        'plot':plot,
        'poster':poster
    }
    results.append(temp_dict)

In [33]:
results

[{'title': 'The Shawshank Redemption',
  'url': 'https://imdb.com/title/tt0111161',
  'plot': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
  'poster': 'https://m.media-amazon.com/images/M/MV5BMDFkYTc0MGEtZmNhMC00ZDIzLWFmNTEtODM1ZmRlYWMwMWFmXkEyXkFqcGdeQXVyMTMxODk2OTU@._V1_SX300.jpg'},
 {'title': 'People on the Alps',
  'url': 'https://imdb.com/title/tt0034701',
  'plot': 'Mail author for translation. Kodos hegycsucsok, fekete fenyvesek vilagaban el a havasok nehezsorsu nepe. Csutak Gergely favago nyomorusagos eletet felesege irant erzett nagy szerelme es ...',
  'poster': 'https://m.media-amazon.com/images/M/MV5BMTQyODI2OTExMV5BMl5BanBnXkFtZTcwNzk3MDA0MQ@@._V1_SX300.jpg'},
 {'title': 'Horses of God',
  'url': 'https://imdb.com/title/tt2369047',
  'plot': 'A fictional account of the lives of the men responsible for the suicide bombings in Casablanca in 2003.',
  'poster': 'https://m.media-amazon.com/images/M/MV