In [224]:
import pandas as pd
import os
# from pyspark.sql import SparkSession
import numpy as np
import json
from sklearn.feature_extraction.text import CountVectorizer
import nltk

In [225]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [226]:
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

In [227]:
def process_json(text):
    return ' '.join([genre['name'].lower() for genre in json.loads(text)])

In [228]:
movies.genres = movies.genres.apply(process_json)

In [229]:
movies.keywords = movies.keywords.apply(process_json)

In [230]:
movies = movies[['id', 'title', 'keywords', 'overview', 'vote_average', 'vote_count']]

In [231]:
def process_json_name(text):
    return ' '.join([nome['name'].lower().replace(' ', '') for nome in json.loads(text)])

def process_json_director(text):
    return ' '.join([nome['name'].lower().replace(' ', '') for nome in json.loads(text)
                     if nome['department'] == 'Directing'])

In [232]:
credits.cast = credits.cast.apply(process_json_name)

In [233]:
credits.crew = credits.crew.apply(process_json_director)

In [234]:
movies['overview'] = movies.overview.apply(lambda x: str(x).lower())

In [235]:
credits.rename({'movie_id': 'id'}, inplace=True, axis=1)

In [236]:
final_df = movies.merge(credits, on='id', how='inner')
final_df.head()

Unnamed: 0,id,title_x,keywords,overview,vote_average,vote_count,title_y,cast,crew
0,19995,Avatar,culture clash future space war space colony so...,"in the 22nd century, a paraplegic marine is di...",7.2,11800,Avatar,samworthington zoesaldana sigourneyweaver step...,jamescameron
1,285,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...,"captain barbossa, long believed to be dead, ha...",6.9,4500,Pirates of the Caribbean: At World's End,johnnydepp orlandobloom keiraknightley stellan...,goreverbinski pamelaalch sharronreynolds karen...
2,206647,Spectre,spy based on novel secret agent sequel mi6 bri...,a cryptic message from bond’s past sends him o...,6.3,4466,Spectre,danielcraig christophwaltz léaseydoux ralphfie...,sammendes jayne-anntenggren nicolettamani susi...
3,49026,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...,following the death of district attorney harve...,7.6,9106,The Dark Knight Rises,christianbale michaelcaine garyoldman annehath...,christophernolan stevegehrke matthewreedy sidk...
4,49529,John Carter,based on novel mars medallion space travel pri...,"john carter is a war-weary, former military ca...",6.1,2124,John Carter,taylorkitsch lynncollins samanthamorton willem...,andrewstanton anniepenn tommygormley bendixon ...


In [237]:
final_df.rename({'title_x': 'title'}, inplace=True, axis=1)

In [152]:
final_df.head()

Unnamed: 0,id,title,keywords,overview,vote_average,vote_count,title_y,cast,crew
0,19995,Avatar,culture clash future space war space colony so...,"in the 22nd century, a paraplegic marine is di...",7.2,11800,Avatar,samworthington zoesaldana sigourneyweaver step...,jamescameron
1,285,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...,"captain barbossa, long believed to be dead, ha...",6.9,4500,Pirates of the Caribbean: At World's End,johnnydepp orlandobloom keiraknightley stellan...,goreverbinski pamelaalch sharronreynolds karen...
2,206647,Spectre,spy based on novel secret agent sequel mi6 bri...,a cryptic message from bond’s past sends him o...,6.3,4466,Spectre,danielcraig christophwaltz léaseydoux ralphfie...,sammendes jayne-anntenggren nicolettamani susi...
3,49026,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...,following the death of district attorney harve...,7.6,9106,The Dark Knight Rises,christianbale michaelcaine garyoldman annehath...,christophernolan stevegehrke matthewreedy sidk...
4,49529,John Carter,based on novel mars medallion space travel pri...,"john carter is a war-weary, former military ca...",6.1,2124,John Carter,taylorkitsch lynncollins samanthamorton willem...,andrewstanton anniepenn tommygormley bendixon ...


In [238]:
mean_vote = np.mean(final_df['vote_average'])
minimum_votes  = final_df['vote_count'].quantile(0.70)

def weighted_rating(x, minimum_votes=minimum_votes, mean_vote=mean_vote):
    v = x['vote_count']
    R = x['vote_average']
    return (v / (v + minimum_votes) * R) + (minimum_votes / (v + minimum_votes) * mean_vote)

final_df['weighted_rating'] = final_df.apply(weighted_rating, axis=1)

In [239]:
final_df.drop(['vote_count', 'vote_average'], inplace=True, axis=1)

In [240]:
final_df.keywords = final_df.keywords.apply(lambda x: x.strip() + ' ')
final_df.overview = final_df.overview.apply(lambda x: x.strip())
final_df.cast = final_df.cast.apply(lambda x: ' ' + x.strip())
final_df.crew = final_df.crew.apply(lambda x: ' ' + x.strip())

final_df['related_words'] = final_df.keywords + final_df.overview + final_df.cast + final_df.crew

In [241]:
movies_df = final_df[['id', 'title', 'related_words', 'weighted_rating']]

In [242]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def get_stem(text):
    return ' '.join([stemmer.stem(word) for word in text.split()])

movies_df['related_words'] = movies_df['related_words'].apply(get_stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['related_words'] = movies_df['related_words'].apply(get_stem)


In [243]:
movies_df.head()

Unnamed: 0,id,title,related_words,weighted_rating
0,19995,Avatar,cultur clash futur space war space coloni soci...,7.148013
1,285,Pirates of the Caribbean: At World's End,ocean drug abus exot island east india trade c...,6.807627
2,206647,Spectre,spi base on novel secret agent sequel mi6 brit...,6.276075
3,49026,The Dark Knight Rises,dc comic crime fighter terrorist secret ident ...,7.509565
4,49529,John Carter,base on novel mar medallion space travel princ...,6.098319


In [244]:
count_vec = CountVectorizer(max_features=5000, stop_words='english')
word_vector = count_vec.fit_transform(movies_df['related_words']).toarray()


In [245]:
from sklearn.metrics.pairwise import cosine_similarity

In [246]:
similarity_words = cosine_similarity(word_vector)

In [247]:
movies_df.head()

Unnamed: 0,id,title,related_words,weighted_rating
0,19995,Avatar,cultur clash futur space war space coloni soci...,7.148013
1,285,Pirates of the Caribbean: At World's End,ocean drug abus exot island east india trade c...,6.807627
2,206647,Spectre,spi base on novel secret agent sequel mi6 brit...,6.276075
3,49026,The Dark Knight Rises,dc comic crime fighter terrorist secret ident ...,7.509565
4,49529,John Carter,base on novel mar medallion space travel princ...,6.098319


In [248]:
test = np.array([4,3,2,1])
[order[0] for order in sorted(enumerate(test), key=lambda x: x[1], reverse=True)]

[0, 1, 2, 3]

In [249]:
def get_similar_movies(title):
    idx = movies_df[movies_df['title'] == title].index[0]
    similar_idx = np.array([order[0] for order in 
                            sorted(enumerate(similarity_words[idx]), key=lambda x: x[1], reverse=True)])[1:11]
    similar_movies = movies_df.iloc[similar_idx[1:11]][['title', 'weighted_rating']]\
            .sort_values(by='weighted_rating', ascending=False)

    return similar_movies

In [250]:
movies_df.head()

Unnamed: 0,id,title,related_words,weighted_rating
0,19995,Avatar,cultur clash futur space war space coloni soci...,7.148013
1,285,Pirates of the Caribbean: At World's End,ocean drug abus exot island east india trade c...,6.807627
2,206647,Spectre,spi base on novel secret agent sequel mi6 brit...,6.276075
3,49026,The Dark Knight Rises,dc comic crime fighter terrorist secret ident ...,7.509565
4,49529,John Carter,base on novel mar medallion space travel princ...,6.098319


In [251]:
get_similar_movies('John Carter')

Unnamed: 0,title,weighted_rating
0,Avatar,7.148013
184,Home,6.604167
1959,Star Trek IV: The Voyage Home,6.456382
1068,The Hitchhiker's Guide to the Galaxy,6.444384
1217,Paul,6.400022
1201,Predators,6.029967
939,I Am Number Four,5.951979
754,Planet 51,5.851497
778,Meet Dave,5.666546


In [252]:
import pickle

# Assume `movies` is your DataFrame with 'title' and 'vector' columns
with open('data/movies.pkl', 'wb') as f:
    pickle.dump(movies_df, f)

with open('data/similarity_words.pkl', 'wb') as f:
    pickle.dump(similarity_words, f)