In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer
import re


In [2]:

# Cargar los datos en un DataFrame
df = pd.read_csv("TMDb_updated.csv", usecols=["title", "overview"])

#Eliminar duplicados
df = df.drop_duplicates(subset='title')

# Eliminar valores nulos en title y genres
df = df.dropna(subset=['title','overview'])

df['overview'] = df['overview'].str.lower()
df.head()


Unnamed: 0,title,overview
0,Ad Astra,"the near future, a time when both hope and har..."
1,Bloodshot,"after he and his wife are murdered, marine ray..."
2,Bad Boys for Life,marcus and mike are forced to confront new thr...
3,Ant-Man,armed with the astonishing ability to shrink i...
4,Percy Jackson: Sea of Monsters,"in their quest to confront the ultimate evil, ..."


In [3]:
def quitar_num(argumento):
    s = argumento.lower()
    s = re.sub(r"\d+","",s)
    return s

contador_overview = CountVectorizer(preprocessor = quitar_num,
                                    min_df=5)

overview_bag_of_words = (contador_overview
                        .fit_transform(df['overview'])
                        .toarray()
                        )

columnas_overview = [tup[0] for tup in
                    sorted(contador_overview.vocabulary_.items(),
                          key= lambda x: x[1])]

overview_bag_of_words_df = pd.DataFrame(overview_bag_of_words,
                                       columns=columnas_overview,
                                       index=df.title)



In [4]:
from sklearn.feature_extraction.text import TfidfTransformer

tf_idf = TfidfTransformer()

tf_idf_pelis = tf_idf.fit_transform(overview_bag_of_words_df).toarray()

tf_idf_pelis_df = pd.DataFrame(tf_idf_pelis,
                              index = df.title,
                              columns = columnas_overview)



In [5]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sims = cosine_similarity(tf_idf_pelis_df)

matriz_similaridades_df = pd.DataFrame(cosine_sims,
                                      index=df.title,
                                      columns=df.title)

np.fill_diagonal(matriz_similaridades_df.values, np.nan)


In [6]:
orden_pelis_cosine_sims_por_fila = np.argsort((-cosine_sims), axis = 1)
cosine_sims_ordenadas = np.sort(-cosine_sims, axis = 1)



In [7]:
df.reset_index(drop=True, inplace=True)


In [8]:
def top_k_similares(movie_title, k):
    fila_cosine_sims = dicc_movie_title_indice[movie_title]
    
    lista_ordenada_pelis_sim = orden_pelis_cosine_sims_por_fila[fila_cosine_sims]
    lista_ordenada_sims = cosine_sims_ordenadas[fila_cosine_sims]
    
    top_k = lista_ordenada_pelis_sim[:k]
    cosine_sims_top_k = lista_ordenada_sims[:k]
    
    top_k_df = df.loc[top_k].copy()
    top_k_df["similaridad"] = cosine_sims_top_k
    
    return top_k_df

# Crear un diccionario que asocie el título de la película con su índice en el DataFrame df
dicc_movie_title_indice = {}
for i, movie_title in enumerate(df.title):
    dicc_movie_title_indice[movie_title] = i


In [12]:
top_k_similares('Pulp Fiction',10)

Unnamed: 0,title,overview,similaridad
3716,Sliding Doors,"gwyneth paltrow plays london publicist helen, ...",-0.18982
2518,The Sting,set in the 1930s this intricate caper deals wi...,-0.182264
3074,Marrowbone,a young man and his three younger siblings are...,-0.173455
7397,Kill Your Friends,"in the late 1990s, a drug-addled nihilist reso...",-0.163935
7385,First Love,a young boxer and a call girl get caught up in...,-0.162367
5764,La Jetée,"time travel, still images, a past, present and...",-0.147189
9467,A Million Little Pieces,a young drug-addled writer approaching the bot...,-0.144325
1256,The Fighter,"the fighter, is a drama about boxer ""irish"" mi...",-0.142354
7205,The Wackness,"set in new york city in the sweltering summer,...",-0.141202
33,Digimon Adventure: Last Evolution Kizuna,"tai is now a university student, living alone,...",-0.138897


from flask import Flask, request
app = Flask(__name__)

@app.route('/')
def index():
    return '''
    <form action='/result' method='post'>
    <input type='text' name='title'>
    <input type='submit' value='Submit'>
    </form>
    '''

@app.route('/result', methods=['POST'])
def result():
    title = request.form['title']
    df = pd.read_csv("https://raw.githubusercontent.com/beaunus/movies-unpacked/master/movies.csv")
    df = df[['title', 'overview']]
    text = '<br>'.join(top_k_similares(title, df, k=10))
    return text

if __name__ == '__main__':
    app.run()
