Carregando as bibliotecas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import re #biblioteca para manipulação de strings

from sklearn.feature_extraction.text import TfidfVectorizer #Pacote para extração de tokens
from sklearn.metrics.pairwise import linear_kernel #Pacote para calcular a similaridade via cosseno

Importando o arquivo de vendas "movies_ratings.csv"

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = './netflix_titles.csv'
#path = '/content/drive/MyDrive/Data/netflix_titles.csv'
dataRaw = pd.read_csv(path)
dataRaw.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [5]:
dataRaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB


Remover todos os NAN's das colunas.

In [6]:
dataRaw.dropna(subset=['cast','title','description','listed_in'],inplace=True,axis=0)
dataRaw = dataRaw.reset_index(drop=True)

dataRaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5664 entries, 0 to 5663
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       5664 non-null   int64 
 1   type          5664 non-null   object
 2   title         5664 non-null   object
 3   director      3909 non-null   object
 4   cast          5664 non-null   object
 5   country       5271 non-null   object
 6   date_added    5654 non-null   object
 7   release_year  5664 non-null   int64 
 8   rating        5657 non-null   object
 9   duration      5664 non-null   object
 10  listed_in     5664 non-null   object
 11  description   5664 non-null   object
dtypes: int64(2), object(10)
memory usage: 531.1+ KB


Neste exercícios iremos utilizar os seguintes campos para busca de similaridade:

- type: tipo de mídia (filme, tv show, etc.)
- título: nome da mídia
- listed_in: categoria onde é apresentado.
- description: informação texto livre sobre a mídia.

Para aplicar o modelo de recomendação vamos limpar os textos e combiná-los.

In [7]:
dataRaw['listed_in'] = [re.sub(r'[^\w\s]', '', t) for t in dataRaw['listed_in']]
dataRaw['cast'] = [re.sub(',',' ',re.sub(' ','',t)) for t in dataRaw['cast']]
dataRaw['description'] = [re.sub(r'[^\w\s]', '', t) for t in dataRaw['description']]
dataRaw['title'] = [re.sub(r'[^\w\s]', '', t) for t in dataRaw['title']]

Agora vamos combinar os campos textuais numa única coluna chamanda "combined".

In [8]:
dataRaw["combined"] = dataRaw['listed_in'] + '  ' + dataRaw['cast'] + ' ' + dataRaw['title'] + ' ' + dataRaw['description']
dataRaw.drop(['listed_in','cast','description'],axis=1,inplace=True)
dataRaw.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,combined
0,81145628,Movie,Norm of the North King Sized Adventure,"Richard Finn, Tim Maltby","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,Children Family Movies Comedies AlanMarriott...
1,80117401,Movie,Jandino Whatever it Takes,,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,StandUp Comedy JandinoAsporaat Jandino Whatev...
2,70234439,TV Show,Transformers Prime,,United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids TV PeterCullen SumaleeMontano FrankWelke...
3,80058654,TV Show,Transformers Robots in Disguise,,United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids TV WillFriedle DarrenCriss ConstanceZimm...
4,80125979,Movie,realityhigh,Fernando Lebrija,United States,"September 8, 2017",2017,TV-14,99 min,Comedies NestaCooper KateWalsh JohnMichaelHig...


Tokenizaremos a coluna "combined" utilizando o método TF-IDF.

In [10]:
vectorizer = TfidfVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(dataRaw["combined"])
matrix.shape

(5664, 46919)

Calcular a similaridade por cosseno.

In [11]:
cosine_similarities = linear_kernel(matrix,matrix)
cosine_similarities.shape

(5664, 5664)

Criar um índice para os filmes.

In [13]:
indices = pd.Series(dataRaw.index, index=dataRaw['title']).drop_duplicates()
indices

title
Norm of the North King Sized Adventure           0
Jandino Whatever it Takes                        1
Transformers Prime                               2
Transformers Robots in Disguise                  3
realityhigh                                      4
                                              ... 
Kikoriki                                      5659
Red vs Blue                                   5660
Maron                                         5661
A Young Doctors Notebook and Other Stories    5662
Friends                                       5663
Length: 5664, dtype: int64

Criar uma função para retornar itens semelhantes.

In [14]:
def get_similar(title, indices, cosine_sim, num_recommend = 10):

  idx = indices[title]

  # Obtem todas os pares de scores de similaridade de todos os filmes com o filme alvo
  sim_scores = list(enumerate(cosine_sim[idx]))

  # Ordena os filmes com base no score de similaridade
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  # Obtem o score dos num_recommend filmes mais proximos
  top_similar = sim_scores[1:num_recommend+1]

  # Obtem o indice dos filmes
  movie_indices = [i[0] for i in top_similar]

  # Retorna os num_recommend filmes mais proximos
  return movie_indices

In [15]:
idx = get_similar("Naruto", indices, cosine_similarities, 10)

In [16]:
indices[idx]

title
Naruto Shippûden the Movie Bonds                              643
Naruto Shippuden The Movie                                    644
Naruto Shippuden  Blood Prison                                335
Naruto the Movie 2 Legend of the Stone of Gelel               338
Naruto Shippûden the Movie The Will of Fire                   336
Naruto the Movie 3 Guardians of the Crescent Moon Kingdom     339
Naruto Shippuden The Movie The Lost Tower                     337
Naruto the Movie Ninja Clash in the Land of Snow              340
Saint Seiya The Lost Canvas                                  1600
Beyblade Metal Fusion                                        4112
dtype: int64

In [19]:
def RecommendMovies(user_ratings):
    # Carregar os dados e pré-processar
    dataRaw = pd.read_csv('./netflix_titles.csv')
    dataRaw['description'] = dataRaw['description'].fillna('') # Preencher valores nulos com string vazia
    
    # Criar a matriz TF-IDF
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(dataRaw['description'])
    
    # Calcular a similaridade de cosseno
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    
    # Criar uma série mapeando o título dos filmes para seus índices
    indices = pd.Series(dataRaw.index, index=dataRaw['title']).drop_duplicates()
    
    # Função para obter recomendações com base na similaridade
    def get_recommendations(title, cosine_sim=cosine_sim):
        idx = indices[title]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:11]
        movie_indices = [i[0] for i in sim_scores]
        return dataRaw['title'].iloc[movie_indices]

    # Obter recomendações para cada filme avaliado pelo usuário
    all_recommendations = []
    for movie, rating in zip(user_ratings['movie'], user_ratings['rating']):
        recommendations = get_recommendations(movie)
        for rec in recommendations:
            all_recommendations.append((rec, rating))

    # Ordenar as recomendações por rating
    all_recommendations = sorted(all_recommendations, key=lambda x: x[1], reverse=True)

    # Selecionar os top 10
    top_10_recommendations = [rec for rec, rating in all_recommendations[:10]]
    
    return top_10_recommendations

In [20]:
# Exemplo de uso
usr1 = {
    "movie": ["Star Wars", "The Matrix", "Alien"],
    "rating": [1, 4, 5]
}

recommendations = RecommendMovies(usr1)
print("Top 10 Recommendations:", recommendations)

KeyError: 'Star Wars'