#EDA - Analisis Exploratorio de Datos

#Se importan las librerías a utilizar

In [26]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from itertools import combinations

#Se trabaja sobre el dataframe limpio, con los datos completos.

In [27]:
movies = pd.read_csv('../ETL_API/movies_ETL_API.csv', dtype={'overview': str}, low_memory=False)
movies.head(3)

Unnamed: 0,belongs_to_collection,budget,genres,id,original_language,overview,popularity,production_companies,production_countries,release_date,...,spoken_languages,status,tagline,title,vote_average,vote_count,release_year,return,actors,directors
0,Toy Story Collection,30000000.0,"['Animation', 'Comedy', 'Family']",862,en,"Led by Woody, Andy's toys live happily in his ...",21.946943,Pixar Animation Studios,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,...,English,Released,,Toy Story,7.7,5415.0,1995,12.45,"['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim...",['John Lasseter']
1,,65000000.0,"['Adventure', 'Fantasy', 'Family']",8844,en,When siblings Judy and Peter discover an encha...,17.015539,TriStar Pictures,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,...,English,Released,Roll the dice and unleash the excitement!,Jumanji,6.9,2413.0,1995,4.04,"['Robin Williams', 'Jonathan Hyde', 'Kirsten D...",['Joe Johnston']
2,Grumpy Old Men Collection,0.0,"['Romance', 'Comedy']",15602,en,A family wedding reignites the ancient feud be...,11.7129,Warner Bros.,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,...,English,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,6.5,92.0,1995,,"['Walter Matthau', 'Jack Lemmon', 'Ann-Margret...",['Howard Deutch']


In [28]:
movies.shape[0]

45466

#Se eliminan duplicados y se visuliza como queda el dataframe

In [29]:
movies.drop_duplicates(inplace=True)
movies.drop_duplicates(subset=['id'],inplace=True)
movies=movies.reset_index(drop=True)

In [30]:
movies.shape[0]

45436

#Se visualiza información de datos nulos

In [31]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45436 entries, 0 to 45435
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   belongs_to_collection  4488 non-null   object 
 1   budget                 45433 non-null  float64
 2   genres                 45436 non-null  object 
 3   id                     45436 non-null  object 
 4   original_language      45425 non-null  object 
 5   overview               44482 non-null  object 
 6   popularity             45431 non-null  object 
 7   production_companies   33562 non-null  object 
 8   production_countries   45433 non-null  object 
 9   release_date           45346 non-null  object 
 10  revenue                45436 non-null  int64  
 11  runtime                45173 non-null  float64
 12  spoken_languages       41382 non-null  object 
 13  status                 45349 non-null  object 
 14  tagline                20401 non-null  object 
 15  ti

#Se utiliza el criterio de considerar solamente las películas de los últimos 10 años debido a los problemas de memoria que se generan para correr el código.

In [32]:
# Se obtiene la fecha actual
current_date = datetime.now()

# Se calcula la fecha mínima para los últimos 10 años
ten_years_ago = current_date - pd.DateOffset(years=10)

# Se convierte el campo 'release_date' a marca de tiempo
movies['release_date'] = pd.to_datetime(movies['release_date'])

# Se filtran los datos de los últimos 10 años
movies = movies[movies['release_date'] >= ten_years_ago]

# Se reinician los índices del DataFrame resultante
movies = movies.reset_index(drop=True)

movies.shape

(7126, 22)

#Se crea un dataframe de recomendación 

In [33]:
recomend = movies[['id', 'title', 'genres', 'tagline']].copy()
recomend.head(3)

Unnamed: 0,id,title,genres,tagline
0,141210,The Sleepover,"['Comedy', 'Horror']",
1,31156,Dante's Hell Animated,['Animation'],The real epic animation as written by Dante Al...
2,235271,Addicted,"['Drama', 'Thriller']",Every Woman Needs an Escape


#Se crean tablas de referencias entre el id de la película y la fila del dataframe y a la inversa para uso posterior.

In [34]:
dicc_indice_movieid = recomend['id'].to_dict()
print(dicc_indice_movieid[0])

141210


In [35]:
dicc_movieid_indice ={valor:clave for clave, valor in dicc_indice_movieid.items()}
print(dicc_movieid_indice['141210'])

0


#Se crean vectores a partir del campo tagline para posteriormente utilizar esos vectores como criterios de similaridad.-

#Dado que no aportan información al proceso se eliminan los valores numéricos del campo overview por medio de una función

In [36]:
def quitar_numeros(tagline):
    s = tagline.lower()
    s= re.sub(r"\d+", "",s)
    return s

#Se prueba el funcionamiento de la función... me quita los años :)

In [37]:
quitar_numeros("Hola, soy Gaby y tengo 47 años")

'hola, soy gaby y tengo  años'

#Se eliminan  los nulos

In [38]:
recomend['tagline'] = recomend['tagline'].fillna('')

#Se cuenta la cantidad de veces que se repite una palabra utilizando sklearn

In [39]:
contador = CountVectorizer(preprocessor=quitar_numeros, min_df=5)
tagline_bag_of_words = contador.fit_transform(recomend["tagline"]).toarray()
tagline_bag_of_words

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [40]:
columna_tagline = [tup[0] for tup in sorted(contador.vocabulary_.items(), key=lambda x: x[1])]
tag_bag_of_words_df = pd.DataFrame(tagline_bag_of_words, columns=columna_tagline, index=recomend['title'])
tag_bag_of_words_df.head(3)

Unnamed: 0_level_0,about,act,adventure,after,again,against,age,alive,all,alone,...,worth,would,wrong,year,years,yet,you,young,your,yourself
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Sleepover,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dante's Hell Animated,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Addicted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
def tokenizador_generos(string_generos):
    generos_separados = string_generos.split("|")
    resultado = []
    for tamaño in [1,2]:
        combs = ["Géneros -" + "|".join(sorted(tupla))
                 for tupla in combinations (generos_separados, r=tamaño)]
        resultado = resultado + combs
    return sorted(resultado)

In [42]:
tokenizador_generos("Animación|Comedia|Acción")

['Géneros -Acción',
 'Géneros -Acción|Animación',
 'Géneros -Acción|Comedia',
 'Géneros -Animación',
 'Géneros -Animación|Comedia',
 'Géneros -Comedia']

In [43]:
contador_generos = CountVectorizer(tokenizer=tokenizador_generos, token_pattern=None,lowercase=False)
contador_generos.fit(recomend['genres'])
gen_bag_of_words= contador_generos.fit_transform(recomend['genres']).toarray()
gen_bag_of_words

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [44]:
columnas_gen = [tup[0] for tup in sorted (contador_generos.vocabulary_.items(), key=lambda x :x[1])]
gen_bag_of_words_df = pd.DataFrame(gen_bag_of_words, columns=columnas_gen,index=recomend['title'])
gen_bag_of_words_df

Unnamed: 0_level_0,"Géneros -['Action', 'Adventure', 'Animation', 'Comedy', 'Family']","Géneros -['Action', 'Adventure', 'Animation', 'Documentary', 'Family', 'History']","Géneros -['Action', 'Adventure', 'Animation', 'Family']","Géneros -['Action', 'Adventure', 'Animation', 'Fantasy']","Géneros -['Action', 'Adventure', 'Animation', 'Horror', 'Mystery', 'Science Fiction']","Géneros -['Action', 'Adventure', 'Animation']","Géneros -['Action', 'Adventure', 'Comedy', 'Crime', 'Mystery', 'Thriller']","Géneros -['Action', 'Adventure', 'Comedy', 'Fantasy', 'Science Fiction', 'Thriller']","Géneros -['Action', 'Adventure', 'Comedy', 'Romance', 'Thriller']","Géneros -['Action', 'Adventure', 'Comedy', 'Science Fiction']",...,"Géneros -['Western', 'Action', 'Drama', 'Science Fiction']","Géneros -['Western', 'Action', 'Fantasy', 'Horror']","Géneros -['Western', 'Action', 'Thriller']","Géneros -['Western', 'Documentary']","Géneros -['Western', 'Drama', 'Adventure', 'Thriller']","Géneros -['Western', 'Drama', 'Thriller']","Géneros -['Western', 'Drama']","Géneros -['Western', 'Horror']",Géneros -['Western'],Géneros -[]
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Sleepover,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dante's Hell Animated,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Addicted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The ABCs of Death,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Eternal Return of Antonis Paraskevas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
In a Heartbeat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Blood, Sweat and Tears",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Mom,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Morning After,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
bag_of_words_combinado = np.hstack((tagline_bag_of_words,gen_bag_of_words))
bag_of_words_combinado_df = pd.DataFrame(bag_of_words_combinado,columns=columna_tagline+columnas_gen, index=recomend['title'])
bag_of_words_combinado_df                                                                                                            

Unnamed: 0_level_0,about,act,adventure,after,again,against,age,alive,all,alone,...,"Géneros -['Western', 'Action', 'Drama', 'Science Fiction']","Géneros -['Western', 'Action', 'Fantasy', 'Horror']","Géneros -['Western', 'Action', 'Thriller']","Géneros -['Western', 'Documentary']","Géneros -['Western', 'Drama', 'Adventure', 'Thriller']","Géneros -['Western', 'Drama', 'Thriller']","Géneros -['Western', 'Drama']","Géneros -['Western', 'Horror']",Géneros -['Western'],Géneros -[]
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Sleepover,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Dante's Hell Animated,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Addicted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The ABCs of Death,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Eternal Return of Antonis Paraskevas,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
In a Heartbeat,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"Blood, Sweat and Tears",1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Mom,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Morning After,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
tf_idf = TfidfTransformer()
tf_idf_movies = tf_idf.fit_transform(bag_of_words_combinado_df).toarray()
tf_idf_movies

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [47]:
tf_idf_movies_df = pd.DataFrame(tf_idf_movies,index=recomend['title'],columns=columna_tagline+columnas_gen)
tf_idf_movies_df                                

Unnamed: 0_level_0,about,act,adventure,after,again,against,age,alive,all,alone,...,"Géneros -['Western', 'Action', 'Drama', 'Science Fiction']","Géneros -['Western', 'Action', 'Fantasy', 'Horror']","Géneros -['Western', 'Action', 'Thriller']","Géneros -['Western', 'Documentary']","Géneros -['Western', 'Drama', 'Adventure', 'Thriller']","Géneros -['Western', 'Drama', 'Thriller']","Géneros -['Western', 'Drama']","Géneros -['Western', 'Horror']",Géneros -['Western'],Géneros -[]
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Sleepover,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dante's Hell Animated,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Addicted,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The ABCs of Death,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Eternal Return of Antonis Paraskevas,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
In a Heartbeat,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Blood, Sweat and Tears",0.561134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Mom,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Morning After,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
cosine_sims=cosine_similarity(tf_idf_movies_df)
matriz_similaridad_df = pd.DataFrame(cosine_sims,index=recomend['title'],columns=recomend['title'])
matriz_similaridad_df

title,The Sleepover,Dante's Hell Animated,Addicted,The ABCs of Death,The Eternal Return of Antonis Paraskevas,Blue Caprice,A Band Called Death,Standing Up,Lovelace,Assault on Wall Street,...,Thick Lashes of Lauri Mäntyvaara,All at Once,Corporate Event,Cop and a Half: New Recruit,The Sublet,In a Heartbeat,"Blood, Sweat and Tears",Mom,The Morning After,Queerama
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Sleepover,1.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
Dante's Hell Animated,0.0,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.041343,0.0,...,0.0,0.0,0.0,0.0,0.0,0.052555,0.059101,0.0,0.000000,0.0
Addicted,0.0,0.000000,1.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
The ABCs of Death,0.0,0.000000,0.0,1.0,0.000000,0.062652,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0
The Eternal Return of Antonis Paraskevas,0.0,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.240242,0.211106,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.301784,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
In a Heartbeat,0.0,0.052555,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.054949,0.0,...,0.0,0.0,0.0,0.0,0.0,1.000000,0.078551,0.0,0.078564,0.0
"Blood, Sweat and Tears",0.0,0.059101,0.0,0.0,0.301784,0.000000,0.0,0.072501,0.125501,0.0,...,0.0,0.0,0.0,0.0,0.0,0.078551,1.000000,0.0,0.000000,0.0
Mom,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,1.0,0.000000,0.0
The Morning After,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.078564,0.000000,0.0,1.000000,0.0


In [56]:
np.fill_diagonal(matriz_similaridad_df.values,np.nan)
matriz_similaridad_df.head()

title,The Sleepover,Dante's Hell Animated,Addicted,The ABCs of Death,The Eternal Return of Antonis Paraskevas,Blue Caprice,A Band Called Death,Standing Up,Lovelace,Assault on Wall Street,...,Thick Lashes of Lauri Mäntyvaara,All at Once,Corporate Event,Cop and a Half: New Recruit,The Sublet,In a Heartbeat,"Blood, Sweat and Tears",Mom,The Morning After,Queerama
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Sleepover,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dante's Hell Animated,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.041343,0.0,...,0.0,0.0,0.0,0.0,0.0,0.052555,0.059101,0.0,0.0,0.0
Addicted,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The ABCs of Death,0.0,0.0,0.0,,0.0,0.062652,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Eternal Return of Antonis Paraskevas,0.0,0.0,0.0,0.0,,0.0,0.0,0.240242,0.211106,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.301784,0.0,0.0,0.0


In [73]:
orden_movies_cosine_sims_fila = np.argsort(cosine_sims, axis=1)
orden_movies_cosine_sims_fila

array([[3562, 4752, 4751, ...,  405, 5341,    0],
       [   0, 4635, 4634, ..., 4953, 2926,    1],
       [   0, 4727, 4726, ..., 3621, 1349,    2],
       ...,
       [   0, 4751, 4750, ...,  149,  846, 7123],
       [   0, 4707, 4706, ..., 3813, 1028, 7124],
       [   0, 4713, 4712, ..., 3043, 4778, 7125]], dtype=int64)

In [74]:
cosine_sims_ordenadas = np.sort(cosine_sims,axis=1)
cosine_sims_ordenadas

array([[0.        , 0.        , 0.        , ..., 1.        , 1.        ,
               nan],
       [0.        , 0.        , 0.        , ..., 0.39824235, 0.40478131,
               nan],
       [0.        , 0.        , 0.        , ..., 0.3757276 , 0.43065385,
               nan],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
               nan],
       [0.        , 0.        , 0.        , ..., 0.39586529, 0.5645431 ,
               nan],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
               nan]])

In [84]:
def top_k_similares(moviesId,k):
    orden_movies_cosine_sims_fila = dicc_movieid_indice[moviesId]

    lista_odenada_movies_sim = orden_movies_cosine_sims_fila[orden_movies_cosine_sims_fila]
    lista_odenada_sims = cosine_sims_ordenadas[orden_movies_cosine_sims_fila]

    top_k = lista_odenada_movies_sim[:k]
    cosine_sims_top_k = lista_odenada_sims[:k]

    top_k_df = recomend.loc[top_k].copy()
    top_k_df["similaridad"] = cosine_sims_top_k

    return top_k_df

In [76]:
recomend.head()

Unnamed: 0,id,title,genres,tagline
0,141210,The Sleepover,"['Comedy', 'Horror']",
1,31156,Dante's Hell Animated,['Animation'],The real epic animation as written by Dante Al...
2,235271,Addicted,"['Drama', 'Thriller']",Every Woman Needs an Escape
3,87436,The ABCs of Death,['Horror'],"26 Directors, 26 Ways to Die."
4,205864,The Eternal Return of Antonis Paraskevas,['Drama'],


In [99]:
def obtener_similares(titulo):
    # Obtener el índice de la película según el título
    indice_pelicula = movies[movies['title'] == titulo].index[0]
    
    # Obtener la fila de similitudes de coseno para la película
    fila_similitudes = cosine_sims[indice_pelicula]
    
    # Obtener los índices de las películas ordenadas según la similitud de coseno
    indices_ordenados = fila_similitudes.argsort()[::-1]
    
    # Obtener los índices de las 5 películas más similares (excluyendo la película misma)
    indices_similares = indices_ordenados[1:6]
    
    # Obtener los títulos y géneros de las películas más similares
    titulos_similares = movies.loc[indices_similares, 'title'].values
    generos_similares = movies.loc[indices_similares, 'genres'].values
    
    # Crear el diccionario de tuplas (título, género) de las películas similares
    peliculas_similares = {
        i: (titulo, genero)
        for i, (titulo, genero) in enumerate(zip(titulos_similares, generos_similares), 1)
    }
    
    return peliculas_similares


In [100]:
obtener_similares('Addicted')

{1: ('The D Train', "['Drama', 'Comedy']"),
 2: ('The Boy', "['Horror', 'Mystery', 'Thriller']"),
 3: ('Hours', "['Drama', 'Thriller']"),
 4: ('The Girl on the Train', "['Thriller']"),
 5: ('Accused', "['Drama', 'Thriller']")}