# Recommender Exercise

In [14]:
import json
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
import requests

# Download the TMDB 5000 movies CSV. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() makes an HTTP error
# fail here rather than letting an error page be parsed as CSV further down.
data_request = requests.get(
    'https://lazyprogrammer.me/course_files/nlp/tmdb_5000_movies.csv',
    timeout=30,
)
data_request.raise_for_status()

In [16]:
import io

# Wrap the downloaded bytes in an in-memory file object so pandas can parse
# them without writing anything to disk.
csv_buffer = io.BytesIO(data_request.content)
df = pd.read_csv(csv_buffer)

In [17]:
# Preview the first rows to confirm the CSV parsed into the expected columns.
df.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [18]:
# JSON-encoded strings, one per movie, taken from the keyword and genre columns.
kw_inputs, gr_inputs = df['keywords'], df['genres']

In [19]:
# Largest keyword id in the dataset (0 when there are no keywords at all).
# tokenize() adds this value to genre ids to build their tokens.
keyword_ids = [entry['id'] for raw in kw_inputs for entry in json.loads(raw)]
max_keywords = max([0] + keyword_ids)
max_keywords

238222

In [20]:
def tokenize(data):
    """Convert one combined JSON document into a list of string tokens.

    ``data`` is a JSON array of exactly two lists, keywords first and genres
    second, whose entries are objects carrying an ``id``. Keyword ids are
    emitted as-is; genre ids are shifted by the notebook-level
    ``max_keywords`` before being stringified, so a genre does not end up
    with the same token as a keyword (assuming genre ids are positive).
    """
    keyword_entries, genre_entries = json.loads(data)
    tokens = [str(entry['id']) for entry in keyword_entries]
    tokens.extend(str(entry['id'] + max_keywords) for entry in genre_entries)
    return tokens

In [21]:
# tokenize() parses each document as JSON itself, so the vectorizer must not
# touch the text first: lowercase=False skips the default lowercasing pass
# over the raw JSON, and token_pattern=None silences scikit-learn's warning
# that the default pattern is unused when a custom tokenizer is supplied.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    lowercase=False,
    token_pattern=None,
)

# One document per movie: a JSON array holding [keywords, genres].
documents = ['[' + x + ',' + y + ']' for x, y in zip(kw_inputs, gr_inputs)]
vectors = vectorizer.fit_transform(documents)



In [22]:
def query(idx, top_n=5, matrix=None):
    """Return the rows most similar to row ``idx``.

    Similarity is the dot product between sparse rows. With rows that are
    L2-normalised (TfidfVectorizer's default) this is the cosine similarity.

    Parameters
    ----------
    idx : int
        Row to find neighbours for. Negative values count from the end.
    top_n : int, default 5
        Maximum number of results to return.
    matrix : scipy sparse matrix, optional
        Matrix whose rows are compared. Defaults to the notebook-level
        ``vectors``.

    Returns
    -------
    list of (similarity, row_index) tuples, best match first, never
    containing ``idx`` itself. Equal scores are ordered by descending index.

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    IndexError
        If ``idx`` is outside the matrix.
    """
    if matrix is None:
        matrix = vectors
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    n_rows = matrix.shape[0]
    if not -n_rows <= idx < n_rows:
        raise IndexError(f"row index {idx} out of range for {n_rows} rows")
    # Map a negative index to its positive position; otherwise the `i != idx`
    # filter below would never match and the row would recommend itself.
    idx = idx % n_rows

    vector = matrix[idx].transpose()
    sim = matrix.dot(vector).transpose().toarray()[0]
    recommend = [(sim[i], i) for i in range(n_rows) if i != idx]
    recommend.sort(reverse=True)
    return recommend[:top_n]

In [23]:
# Top matches for row 0, returned as (similarity, row index) pairs.
query(0)

[(0.24292495296943448, 47),
 (0.19919895504223384, 1287),
 (0.19211953448814917, 61),
 (0.1780810748621272, 1201),
 (0.17400067175268868, 3724)]

In [24]:
def query_title(title):
    """Recommend movies similar to the one with the given title.

    The title is matched case-insensitively against ``df['title']``; when
    several rows share the title, the first one is used.

    Parameters
    ----------
    title : str
        Movie title to look up.

    Returns
    -------
    list of (similarity, title) tuples as produced by ``query``, with row
    indices replaced by movie titles.

    Raises
    ------
    ValueError
        If no movie has that title.
    """
    wanted = title.lower()
    # Only rows that have a TF-IDF vector are searchable. Positional access
    # keeps title lookups aligned with the rows of `vectors`.
    titles = df['title'].iloc[:vectors.shape[0]]
    # Vectorised comparison; missing titles compare as False, not an error.
    hits = np.flatnonzero((titles.str.lower() == wanted).to_numpy())
    if hits.size == 0:
        raise ValueError(f"no movie titled {title!r} in the dataset")
    idx = int(hits[0])
    return [(score, df['title'].iloc[i]) for score, i in query(idx)]

In [25]:
# Look up recommendations by title; the match is case-insensitive.
query_title('avatar')

[(0.24292495296943448, 'Star Trek Into Darkness'),
 (0.19919895504223384, 'A Monster in Paris'),
 (0.19211953448814917, 'Jupiter Ascending'),
 (0.1780810748621272, 'Predators'),
 (0.17400067175268868, 'Falcon Rising')]

In [26]:
# Recommendations for the movie titled 'Star Wars'.
query_title('star wars')

[(0.40353824189904475, 'The Empire Strikes Back'),
 (0.2364032756663848, 'Return of the Jedi'),
 (0.2331743401003578, 'Star Wars: Episode III - Revenge of the Sith'),
 (0.1897114237127075, 'Impostor'),
 (0.18573067154871115, 'Star Wars: Episode II - Attack of the Clones')]