In [1]:
import numpy as np 
import pandas as pd

import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.neighbors import NearestNeighbors


In [2]:
movie_df = pd.read_csv('../py2/exported.csv')
keywords = pd.read_csv('../py2/keywords.csv')
credits = pd.read_csv('../py2/credits.csv')

movie_df['genres'] = movie_df['genres'].tolist()
movie_df['title'] = movie_df['title'].astype('string')
movie_df['vote_average'] = movie_df['vote_average'].astype('float16')
keywords['id'] = keywords['id'].astype('string')
movie_df['id'] = movie_df['id'].astype('string')
credits['id'] = credits['id'].astype('string')



In [3]:
credits = credits.drop(columns='cast', axis=1)


In [4]:
selectColumns = ['title','genres', 'imdb_id', 'release_date', 'vote_average']

i = 1
for column in selectColumns:
    movie_df[column] = movie_df[column].fillna('') 

movie_df = movie_df.drop(columns=['Unnamed: 0'])

keywords['id'].info

<bound method Series.info of 0           862
1          8844
2         15602
3         31357
4         11862
          ...  
31619     84419
31620    390959
31621    289923
31622    439050
31623    111109
Name: id, Length: 31624, dtype: string>

In [5]:
use_df = pd.merge(movie_df, keywords, on='id', how='inner')

use1_df = pd.merge(use_df, credits, on='id', how='inner')

use1_df.to_csv('use.csv', index=False)

In [7]:
use_df.to_csv('movies_md.csv', index=False)

In [6]:
selectColumns2 = ['crew', 'keywords']

i = 1
for column in selectColumns2:
    use1_df[column] = use1_df[column].fillna('') 


use1_df['vote_average'] = use1_df['vote_average'].astype('string')

useColumns = use1_df['title']+""+use1_df['crew']+''+use1_df['genres']+''+use1_df['keywords']+''+use1_df['vote_average']

useColumns

0        Toy Story[{'credit_id': '52fe4284c3a36847f8024...
1        Jumanji[{'credit_id': '52fe44bfc3a36847f80a7cd...
2        Grumpier Old Men[{'credit_id': '52fe466a925141...
3        Waiting to Exhale[{'credit_id': '52fe447792514...
4        Father of the Bride Part II[{'credit_id': '52f...
                               ...                        
30684    House of Horrors[{'credit_id': '58152c13925141...
30685    Shadow of the Blair Witch[{'credit_id': '56ff6...
30686    The Burkittsville 7[{'credit_id': '5403d669c3a...
30687    Subdue[{'credit_id': '5894a97d925141426c00818c...
30688    Century of Birthing[{'credit_id': '52fe4af1c3a...
Length: 30689, dtype: string

In [7]:
vectorizer = TfidfVectorizer()

useVectorize = vectorizer.fit_transform(useColumns)

In [8]:

nn = NearestNeighbors(metric='cosine', algorithm='brute')

nn.fit(useVectorize)

distances, indices = nn.kneighbors(useVectorize, n_neighbors=30)

distances.shape

(30689, 30)

In [9]:
moviesList = use1_df['title']
moviesList

0                          Toy Story
1                            Jumanji
2                   Grumpier Old Men
3                  Waiting to Exhale
4        Father of the Bride Part II
                    ...             
30684               House of Horrors
30685      Shadow of the Blair Witch
30686            The Burkittsville 7
30687                         Subdue
30688            Century of Birthing
Name: title, Length: 30689, dtype: string

In [111]:
def find_similar_movies(movie_title, n =29):
    movie_title = difflib.get_close_matches(movie_title, moviesList)
    
    title = movie_title

    errorMsg = {'error': 'Could not find movie'}

    if not title:
        return (errorMsg)


    if title:

        movie = title[0]
        idx = use1_df[use1_df['title'] == movie].index[0]
        distances, indices = nn.kneighbors(useVectorize[idx], n_neighbors=n+1)
        return use1_df.iloc[indices[0][1:]][['title','vote_average']]






In [112]:
find_similar_movies('iron man')

Unnamed: 0,title,vote_average
12639,Iron Man 2,6.6015625
16453,Iron Man 3,6.80078125
20080,Avengers: Age of Ultron,7.30078125
20084,Ant-Man,7.0
20087,Thor: Ragnarok,0.0
5526,Hulk,5.30078125
8701,Fantastic Four,5.5
20089,Captain America: Civil War,7.1015625
20090,Doctor Strange,7.1015625
29019,Logan,7.6015625


In [19]:
from flask import Flask, request, jsonify
from flask_cors import CORS

app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}})

@app.route('/recommend', methods=['GET'])
def recommend():

    movie_title = request.args.get('title')
    n = int(request.args.get('n', 29))
    similar_movies = find_similar_movies(movie_title, n)

    if 'Error' in similar_movies:
        return jsonify({'this': similar_movies}), 404
    
    return jsonify(similar_movies.to_dict(orient ='records'))

if __name__ == '__main__':
    app.run(debug=True)

Unnamed: 0,title,vote_average
29019,Logan,7.6015625
4622,Minority Report,7.1015625
8182,The Assassination of Richard Nixon,6.3984375
5535,Bad Boys II,6.30078125
6854,The Bourne Supremacy,7.19921875
7112,Alexander,5.6015625
3697,A.I. Artificial Intelligence,6.80078125
66,Bed of Roses,5.1015625
6855,Catwoman,4.19921875
6275,Man on Fire,7.30078125
