In [1]:
import pandas as pd
import numpy as np


# ratings.jsonl and content.jsonl are JSON Lines files (one record per line);
# both are read with identical options, so load them in a single pass.
treino, conteudo = (
    pd.read_json(jsonl_path, lines=True)
    for jsonl_path in ("ratings.jsonl", "content.jsonl")
)

# The prediction targets come as a plain CSV.
teste = pd.read_csv("targets.csv")

In [2]:
# Peek at the first five plot summaries (head() defaults to n=5)
conteudo['Plot'].head()

0    A man (Edison's assistant) takes a pinch of sn...
1    A man opens the big gates to the Lumière facto...
2    A group of people are standing in a straight l...
3    Although the content of this film is primitive...
4    A bat flies into an ancient castle and transfo...
Name: Plot, dtype: object

In [3]:
# TF-IDF features built from the 'Plot' text of every row in `conteudo`
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing plots (NaN) become empty strings so the vectorizer only sees str values
conteudo['Plot'] = conteudo['Plot'].fillna('')

# Vectorizer that discards English stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary / IDF weights and build the document-term matrix in one step
tfidf_matrix = tfidf.fit_transform(conteudo['Plot'])

# Shape is (number of plots, number of distinct terms)
tfidf_matrix.shape

(38012, 79207)

In [4]:
# linear_kernel computes plain dot products between rows
from sklearn.metrics.pairwise import linear_kernel

# TfidfVectorizer L2-normalises each row by default (norm='l2'), so the dot
# product of two rows already equals their cosine similarity.
# With Y omitted, linear_kernel compares the matrix against itself.
# NOTE: the result is a dense (n_plots x n_plots) array; for the 38012 plots
# above that is on the order of 11 GB in float64.
cosine_sim = linear_kernel(tfidf_matrix)

In [5]:
# Construct a reverse map from movie title to its row label in `conteudo`.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already distinct), so it never removes a repeated title. De-duplicate on the
# title index instead, keeping the first occurrence, so that indices[title]
# always yields a single integer rather than a Series.
indices = pd.Series(conteudo.index, index=conteudo['Title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [6]:
# Show the title -> row mapping; a bare last expression lets the notebook
# render the (automatically truncated) Series instead of going through print()
indices

Title
Edison Kinetoscopic Record of a Sneeze               0
Leaving the Factory                                  1
The Arrival of a Train                               2
The Oxford and Cambridge University Boat Race        3
The House of the Devil                               4
                                                 ...  
Yara                                             38007
Lords of Scam                                    38008
Cash                                             38009
Sompoy                                           38010
The Making of 'Rocky vs. Drago'                  38011
Length: 38012, dtype: int64


In [7]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies whose plots are most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level `indices` Series.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix; row ``i`` holds the similarity of movie
        ``i`` to every movie. Defaults to the module-level `cosine_sim`.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Entries of ``conteudo['Title']`` for the most similar movies, ordered
        from most to least similar. The queried movie itself is never included.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # When several movies share the title the lookup yields a Series instead
    # of a scalar; use the first match so that a single row is selected below.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Rank every movie from most to least similar. A stable sort on the
    # negated scores keeps tied movies in ascending row order, which is the
    # order sorted(..., reverse=True) gives for equal keys.
    ranking = np.argsort(-scores, kind='stable')

    # Drop the queried movie by its row number rather than assuming it is
    # ranked first: with an empty plot (all scores 0) or an identical plot
    # elsewhere in the catalogue it need not be at position 0.
    ranking = ranking[ranking != idx][:top_n]

    # Return the top most similar movies
    return conteudo['Title'].iloc[ranking]

In [8]:
# Plot-based neighbours of one example movie
get_recommendations(title='The Dark Knight Rises')

15199                                      The Dark Knight
37937                                        Dying Is Easy
9003                                        Batman Forever
34967                                         Batman Ninja
36985                       LEGO DC Batman: Family Matters
2277                                      The Seventh Seal
14592                               Get Rich or Die Tryin'
27059    Masterpiece: Frank Miller's the Dark Knight Re...
8435                          Batman: Mask of the Phantasm
26494                   Batman v Superman: Dawn of Justice
Name: Title, dtype: object