# Importamos los modulos requeridos por la aplicación

In [1]:
import pandas as pd
import string
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any pandas deprecation or chained-assignment warnings that later
# cells may raise.
warnings.filterwarnings('ignore')
# scikit-learn pieces: the TF-IDF vectorizer and the pairwise linear kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
def verificar_coincidencias(data_ver: pd.DataFrame, titulo: str, campo: str, rec: str) -> dict:
    """Print the values of ``campo`` shared by two movies and return each movie's values.

    Args:
        data_ver: Table with a ``title`` column and a ``campo`` column. A movie
            may span several rows (one per value of ``campo``).
        titulo: Title of the reference movie, matched exactly against ``title``.
        campo: Name of the column whose values are compared.
        rec: Title of the movie to compare against, matched exactly.

    Returns:
        A dict mapping each title found in ``data_ver`` to the list of its
        distinct ``campo`` values, in order of first appearance. A title with
        no rows in ``data_ver`` is absent from the dict and is treated as
        having no values, so the printed intersection is empty instead of the
        call raising ``KeyError``.
    """
    # Selecting 'title' twice would produce a frame with duplicated column
    # names, so request it only once when it is also the compared field.
    columnas = ['title'] if campo == 'title' else ['title', campo]

    # .loc with a boolean mask plus a non-inplace drop_duplicates() builds a new
    # frame: the caller's DataFrame is never touched and nothing is written
    # into a slice of it.
    es_titulo_buscado = (data_ver['title'] == titulo) | (data_ver['title'] == rec)
    pares = data_ver.loc[es_titulo_buscado, columnas].drop_duplicates()

    # Group the distinct values by title, preserving row order.
    nuevo_dicc = {}
    for clave, valor in zip(pares['title'], pares[campo]):
        nuevo_dicc.setdefault(clave, []).append(valor)

    # .get keeps the function usable when one of the titles is missing.
    lista1 = nuevo_dicc.get(titulo, [])
    lista2 = nuevo_dicc.get(rec, [])
    elementos_comunes = list(set(lista1) & set(lista2))

    print("Elementos comunes en " + campo + ': ' + str(elementos_comunes))
    return nuevo_dicc

# Cargamos el dataset a utilizar

In [3]:
# Read the movies table from the zipped CSV; the file's first column becomes
# the row index.
data = pd.read_csv('dataset_3.zip', index_col=0)
# Preview the first rows.
data.head()

Unnamed: 0,id,title,tags
0,862,toy story,toy story tom hanks tim allen don rickles jim ...
1,8844,jumanji,jumanji robin williams jonathan hyde kirsten d...
2,15602,grumpier old men,grumpier old men walter matthau jack lemmon an...
3,31357,waiting to exhale,waiting to exhale whitney houston angela basse...
4,11862,father of the bride part ii,father of the bride part ii steve martin diane...


# Hacemos el tratamiento de los datos que vamos a utilizar para la recomendacion

In [4]:
# Keep only the two columns the recommender uses and the first 5001 rows
# (presumably to keep the n x n similarity matrix computed later manageable).
# Chaining selection, slice and reset_index produces a fresh DataFrame, so the
# column assignment below never writes into a slice of `data`.
df = data[['title', 'tags']].iloc[0:5001].reset_index(drop=True)

# Replace every punctuation character with a space. A translation table is used
# instead of Series.str.replace('[...]', ' '): without regex=True that call
# treats the pattern as a literal string in pandas >= 2.0 and leaves the text
# unchanged. Missing tags stay missing here.
punctuation_to_space = str.maketrans({char: ' ' for char in string.punctuation})
df['tags'] = df['tags'].str.translate(punctuation_to_space)
df

Unnamed: 0,title,tags
0,toy story,toy story tom hanks tim allen don rickles jim ...
1,jumanji,jumanji robin williams jonathan hyde kirsten d...
2,grumpier old men,grumpier old men walter matthau jack lemmon an...
3,waiting to exhale,waiting to exhale whitney houston angela basse...
4,father of the bride part ii,father of the bride part ii steve martin diane...
...,...,...
4996,the last man,the last man dan montgomery jr david arnott j...
4997,maryam,maryam mariam parris david ackert shaun toub s...
4998,mean machine,mean machine vinnie jones david kelly david he...
4999,monsoon wedding,monsoon wedding naseeruddin shah lillete dubey...


Usamos el `TfidfVectorizer` de sklearn para calcular el peso TF-IDF de las palabras que aparecen en nuestros tags, y el parámetro `stop_words` para descartar las palabras comunes del idioma inglés que no aportan valor al modelo.

In [None]:
# Replace missing tags with empty strings so the vectorizer only receives text.
# The result is assigned back to the column: calling fillna(inplace=True) on a
# column selection is chained assignment, which pandas deprecates and which
# does not update the DataFrame under copy-on-write.
df['tags'] = df['tags'].fillna('')

# English stop words are discarded while building the vocabulary.
tfidf = TfidfVectorizer(stop_words='english')
# Learn the vocabulary and build the TF-IDF matrix: one row per movie, one
# column per term.
tfidf_matrix = tfidf.fit_transform(df['tags'])
# vocabulary_ maps each term to its column index in tfidf_matrix (it does not
# hold frequencies).
tfidf.vocabulary_

In [6]:
# Show the matrix dimensions as (rows, columns).
tfidf_matrix.shape

(5001, 46386)

# Implementamos la similaridad coseno

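- `linear_kernel` calcula el producto punto entre cada par de filas de la matriz TF-IDF. Como `TfidfVectorizer` normaliza cada fila (norma L2 por defecto), ese producto equivale a la similitud del coseno; así, al consultar una película se la compara con todas las demás y se seleccionan las que tienen más relación.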

In [7]:
# Pairwise dot products between every pair of rows of the TF-IDF matrix, giving
# a square matrix with one row and one column per movie. With the vectorizer's
# default L2 row normalisation this dot product equals the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix) 

- Creamos una serie con el id de las peliculas

In [8]:
# Lookup table from movie title to its row position in df.
# Series.drop_duplicates() compares the *values* (the row positions, which are
# already unique), so it never removed repeated titles. Deduplicate on the
# index instead, keeping the first row of each title, so that indices[title]
# always yields a single integer.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Creamos la funcion que hace la recomendacion

In [9]:
def recomendacion(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Relies on the notebook-level ``indices`` (title -> row position) and ``df``
    (movie table) objects.

    Args:
        title: Movie title, spelled exactly as it appears in ``indices``.
        cosine_sim: Square similarity matrix whose rows and columns follow the
            row order of ``df``. The default is bound when this function is
            defined, so re-run this cell after recomputing the matrix.
        top_n: Number of recommendations to return (5 by default).

    Returns:
        A pandas Series with the ``top_n`` most similar titles, best match
        first, indexed by their row position in ``df``.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    idx = indices[title]
    # A title that appears more than once makes the lookup return a Series;
    # use its first occurrence so cosine_sim[idx] is a single row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Drop the queried movie by position rather than assuming it sorts first:
    # ties (identical tags) or an all-zero row (empty tags) can put another
    # movie at the top of the ranking.
    sim_scores = [
        (posicion, puntaje)
        for posicion, puntaje in enumerate(cosine_sim[idx])
        if posicion != idx
    ]
    sim_scores.sort(key=lambda par: par[1], reverse=True)

    movies_idx = [posicion for posicion, _ in sim_scores[:top_n]]
    return df['title'].iloc[movies_idx]

In [10]:
# Preview the prepared table (title and tags columns).
df.head()

Unnamed: 0,title,tags
0,toy story,toy story tom hanks tim allen don rickles jim ...
1,jumanji,jumanji robin williams jonathan hyde kirsten d...
2,grumpier old men,grumpier old men walter matthau jack lemmon an...
3,waiting to exhale,waiting to exhale whitney houston angela basse...
4,father of the bride part ii,father of the bride part ii steve martin diane...


In [11]:
# Treat the movie stored under row label 0 as the one already watched and ask
# the recommender for its closest matches.
movie = df.loc[0, 'title']
rec = recomendacion(movie)

# Imprimimos la recomendacion

In [12]:
# Show the watched movie, then the recommendations as a 1-based numbered list
# with title-cased names.
print('La pelicula que viste es:', movie.title())
print('Tu recomendacion es: ')
for posicion, titulo_rec in enumerate(rec, start=1):
    print('{} - {}'.format(posicion, titulo_rec.title()))

La pelicula que viste es: Toy Story
Tu recomendacion es: 
1 - Toy Story 2
2 - Rebel Without A Cause
3 - The Thin Red Line
4 - Radio Days
5 - The Sunchaser


# Verificamos cuánta coincidencia tienen la película que vimos y la que se nos recomendó en primer lugar

In [13]:
# Load the table used to check the recommendation; the file's first column
# becomes the row index.
# NOTE(review): the variable name `csv` would shadow the standard-library csv
# module if that module were ever imported in this notebook.
file_csv = 'verificacion_movies.zip'
csv = pd.read_csv(file_csv, index_col=0)
# Preview the first rows.
csv.head()

Unnamed: 0,title,overview,name_actor,genero
0,toy story,"Led by Woody, Andy's toys live happily in his ...",Tom Hanks,Animation
1,toy story,"Led by Woody, Andy's toys live happily in his ...",Tim Allen,Animation
2,toy story,"Led by Woody, Andy's toys live happily in his ...",Don Rickles,Animation
3,toy story,"Led by Woody, Andy's toys live happily in his ...",Jim Varney,Animation
4,toy story,"Led by Woody, Andy's toys live happily in his ...",Wallace Shawn,Animation


In [14]:
# Work on an explicit copy of the needed columns so the lower-casing below
# writes to an independent DataFrame rather than to a slice of `csv`.
data_ver = csv[['title', 'overview', 'name_actor', 'genero']].copy()
# Lower-case the titles so they can be matched exactly against the lower-case
# titles used by the recommender.
data_ver['title'] = data_ver['title'].str.lower()
data_ver.head()

Unnamed: 0,title,overview,name_actor,genero
0,toy story,"Led by Woody, Andy's toys live happily in his ...",Tom Hanks,Animation
1,toy story,"Led by Woody, Andy's toys live happily in his ...",Tim Allen,Animation
2,toy story,"Led by Woody, Andy's toys live happily in his ...",Don Rickles,Animation
3,toy story,"Led by Woody, Andy's toys live happily in his ...",Jim Varney,Animation
4,toy story,"Led by Woody, Andy's toys live happily in his ...",Wallace Shawn,Animation


# Comparamos los actores, overviews y los generos de la pelicula vista y la primera recomendacion

- Actores

In [15]:
# Actors shared by the watched movie and the top-ranked recommendation.
act_com = verificar_coincidencias(data_ver=data_ver, titulo=movie, campo='name_actor', rec=rec.iloc[0])

Elementos comunes en name_actor: ['Jim Varney', 'John Ratzenberger', 'Wallace Shawn', 'R. Lee Ermey', 'Tom Hanks', 'John Morris', 'Don Rickles', 'Laurie Metcalf', 'Annie Potts', 'Tim Allen']


- Overviews

In [16]:
# Overviews shared by the watched movie and the top-ranked recommendation
# (only an exact, full-text match counts as common).
ovr_com = verificar_coincidencias(data_ver=data_ver, titulo=movie, campo='overview', rec=rec.iloc[0])

Elementos comunes en overview: []


- Generos


In [17]:
# Genres shared by the watched movie and the top-ranked recommendation.
gen_com = verificar_coincidencias(data_ver=data_ver, titulo=movie, campo='genero', rec=rec.iloc[0])

Elementos comunes en genero: ['Comedy', 'Animation', 'Family']


- Titulo

In [18]:
# Titles compared as whole strings: each movie's list holds only its own title,
# so the intersection is empty whenever the two titles differ.
tit_com = verificar_coincidencias(data_ver=data_ver, titulo=movie, campo='title', rec=rec.iloc[0])

Elementos comunes en title: []


# Campo utilizado para hacer el NLP

Este campo es una cadena conformada por el título, los actores, los géneros y el overview de la película.

In [19]:
# Display the raw tags string stored under row label 0 of the original dataset.
data.loc[0, 'tags']

"toy story tom hanks tim allen don rickles jim varney wallace shawn john ratzenberger annie potts john morris erik von detten laurie metcalf r. lee ermey sarah freeman penn jillette animation comedy family led by woody, andy's toys live happily in his room until andy's birthday brings buzz lightyear onto the scene. afraid of losing his place in andy's heart, woody plots against buzz. but when circumstances separate buzz and woody from their owner, the duo eventually learns to put aside their differences."