**1.0 Introdução**

In [36]:
#https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system/notebook
#https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset?resource=download
#https://www.kaggle.com/code/saritrath/movierecommendersystems
#https://www.kaggle.com/code/faiqueali/tensorflow-movie-recommender-system

**1.1 Pacotes**

In [37]:
# %pip install spacy
# !python -m spacy download en_core_web_lg 
# %pip install sentence-transformers
# %pip install transformers[sentence]
# %pip install nltk
# %pip install surprise

In [38]:
#trat. dados
import pandas as pd
import numpy as np
from ast import literal_eval

#windows
import zipfile
import os

#ml
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split as train_test_split_skl

#lib recomendacao 
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate

#ignorar avisos
import warnings
warnings.filterwarnings("ignore")

#nlp
import spacy
nlp = spacy.load("en_core_web_lg")

**2.0 Dados**

**2.1 Extração**

In [39]:
#!kaggle datasets download -d rounakbanik/the-movies-dataset

In [40]:
#Caminho para o arquivo zip baixado
zip_file_path = 'the-movies-dataset.zip'

#Caminho para a pasta onde os arquivos serão extraídos
extracted_folder_path = 'bases'

#Extrair os arquivos do zip
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder_path)

#Listar os arquivos extraídos
extracted_files = os.listdir(extracted_folder_path)
print(extracted_files)

['credits.csv', 'filmes_final.csv', 'keywords.csv', 'links.csv', 'links_small.csv', 'movies_metadata.csv', 'ratings.csv', 'ratings_small.csv']


**2.2 Carregamento**

In [41]:
credits = pd.read_csv('C:/Users/eduar/OneDrive/Área de Trabalho/Python/recomendador/bases/credits.csv')
keywords = pd.read_csv('C:/Users/eduar/OneDrive/Área de Trabalho/Python/recomendador/bases/keywords.csv')

#dados de filmes
movies_metadata = ['original_language','genres','id', 'original_title','overview','popularity',
'production_companies','production_countries','runtime','tagline','title','vote_average','vote_count']
movies_metadata = pd.read_csv('C:/Users/eduar/OneDrive/Área de Trabalho/Python/recomendador/bases/movies_metadata.csv',
usecols=movies_metadata).drop([19730, 29503, 35587])#.iloc[:100]

#avaliacoes
ratings = pd.read_csv('C:/Users/eduar/OneDrive/Área de Trabalho/Python/recomendador/bases/ratings.csv')
ratings = ratings.drop(['timestamp'], axis=1)

**2.3 Tratamento**

**2.3.1 Dados filmes**

In [42]:
# movies_metadata = movies_metadata[movies_metadata['original_language']=='en']
# movies_metadata['id'] = movies_metadata['id'].astype('int64')
# content_df = pd.merge(movies_metadata, credits, left_on='id', right_on='id', how='left').dropna(subset=['id'])
# content_df = pd.merge(content_df, keywords, left_on='id', right_on='id', how='left').dropna(subset=['id'])
# content_df['genres']= content_df['genres'].fillna('[]').apply(literal_eval).apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])
# content_df['production_companies']= content_df['production_companies'].fillna('[]').apply(literal_eval).apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])
# content_df['production_countries']= content_df['production_countries'].fillna('[]').apply(literal_eval).apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])
# content_df['keywords']= content_df['keywords'].fillna('[]').apply(literal_eval).apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])
# content_df['cast']= content_df['cast'].fillna('[]').apply(literal_eval).apply(lambda x:[i['name'] for i in x] if isinstance(x,list) else [])
# content_df['crew'] = content_df['crew'].apply(lambda x: eval(x) if pd.notnull(x) and x != '[]' else [])
# content_df['director'] = content_df['crew'].apply(lambda x: next((i['name'] for i in x if i['job'] == 'Director'), np.nan))
# text = ['original_title', 'genres', 'overview','production_companies', 'tagline', 'keywords', 'director', 'cast']
# content_df['bag_of_words'] = content_df[text[0:]].astype(str).apply(lambda x: ' '.join(x), axis=1)
# content_df['bag_of_words'] = content_df['bag_of_words'].apply(lambda texto: ' '.join([token.lemma_ for token in nlp(texto) if token.pos_ == 'NOUN']))
# content_df = content_df.dropna(subset=['bag_of_words'])
# content_df['bag_of_words'] = content_df['bag_of_words'].astype(str)
# content_df = content_df[content_df['original_language']=='en']
# content_df = content_df[content_df['vote_average']>=3.5]
# content_df = content_df[content_df['vote_count']>=50]

# #salvar
# content_df.to_csv('bases/filmes_final.csv', index=False,mode='w')
content_df = pd.read_csv('bases/filmes_final.csv')

**2.3 Avaliações**

In [43]:
#df para filtragem de conteudo
avaliacoes_total = pd.merge(ratings, content_df, left_on='movieId', right_on='id', how='inner').dropna(subset=['id'])

**3.0 Filtragem de Conteúdo**

In [44]:
#pre-processamento
avaliacoes = avaliacoes_total[avaliacoes_total['rating']>3]

#Carregue o modelo BERT pré-treinado
bert_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

#Representação em embeddings do texto
texto_embeddings = bert_model.encode(content_df['bag_of_words'].tolist(), convert_to_tensor=True)

#df
embedding_df = pd.DataFrame(texto_embeddings.numpy())

#Calcular a similaridade de cosseno entre os filmes usando os embeddings do BERT
cosine_similarities_bert = cosine_similarity(embedding_df, embedding_df)

#Criar um DataFrame de similaridades
similaridades_df_bert = pd.DataFrame(cosine_similarities_bert, index=content_df['original_title'], columns=content_df['original_title'])

#Função para obter recomendações para um usuário específico usando embeddings do BERT
def obter_recomendacoes_bert(usuario_id):
    #Filtrar avaliações do usuário
    avaliacoes_usuario = avaliacoes[avaliacoes['userId'] == usuario_id]

    #Filtrar filmes que o usuário ainda não avaliou
    filmes_nao_avaliados = content_df[~content_df['original_title'].isin(avaliacoes_usuario['original_title'])]

    #Calcular a pontuação ponderada para cada filme não avaliado usando embeddings do BERT
    filmes_nao_avaliados['pontuacao'] = filmes_nao_avaliados.apply(lambda row: similaridades_df_bert.loc[row['original_title'], 
    avaliacoes_usuario['original_title']].sum(),axis=1)   
    filmes_nao_avaliados['pontuacao'] = pd.to_numeric(filmes_nao_avaliados['pontuacao'], errors='coerce')

    #Ordenar filmes por pontuação em ordem decrescente
    recomendacoes = filmes_nao_avaliados.sort_values(by='pontuacao', ascending=False)

    return recomendacoes,avaliacoes_usuario

In [45]:
#Exemplo de recomendações para o usuário com ID 1 usando embeddings do BERT
recomendacoes,avaliacoes_usuario = obter_recomendacoes_bert(270872)
recomendacoes.head(3)

Unnamed: 0,genres,id,original_language,original_title,overview,popularity,production_companies,production_countries,runtime,tagline,title,vote_average,vote_count,cast,crew,keywords,director,bag_of_words,pontuacao
4078,"['Drama', 'Action', 'Thriller', 'Crime']",13641,en,The Air I Breathe,A drama based on an ancient Chinese proverb th...,8.876157,['NALA Films'],"['Mexico', 'United States of America']",95.0,"The question is not whether we will die, but h...",The Air I Breathe,6.1,115.0,"['Brendan Fraser', 'Andy García', 'Kevin Bacon...","[{'credit_id': '52fe45869251416c75059867', 'de...","['corruption', 'horse race', 'suicide attempt'...",Jieho Lee,drama action thriller crime ' drama proverb li...,23.319563
4874,"['Action', 'Crime', 'Comedy']",40805,en,The Green Hornet,"Britt Reid (Seth Rogen), the heir to the large...",7.888013,"['Original Film', 'Sony Pictures Entertainment...",['United States of America'],119.0,Breaking the Law to Protect It.,The Green Hornet,5.5,1274.0,"['Seth Rogen', 'Jay Chou', 'Christoph Waltz', ...","[{'credit_id': '52fe45a4c3a36847f80d25a9', 'de...","['bomb', 'martial arts', 'assassin', 'vandalis...",Michel Gondry,crime comedy ' heir newspaper fortune playboy ...,22.214787
4059,"['Romance', 'Comedy', 'Drama']",10247,en,He Was a Quiet Man,"A troubled loner, Bob Maconel, imagines blowin...",4.176044,"['Neo Art & Logic', 'Quiet Man Productions']",['United States of America'],95.0,He seemed like such a nice guy.. He pretty muc...,He Was a Quiet Man,6.5,83.0,"['Christian Slater', 'John Gulager', 'Elisha C...","[{'credit_id': '52fe43499251416c7500a8a3', 'de...","['suicide', 'paraplegic', 'office', 'aquarium'...",Frank A. Cappello,romance comedy drama loner tower revolver offi...,22.071098


In [46]:
#Validacao de desempenho do modelo
#listar usuarios
user_counts = avaliacoes_total['userId'].value_counts()
user_list = user_counts[user_counts > 800].index.tolist()
#user_list = [107720]
#criar df vazio
recomendacoes_total = pd.DataFrame(columns=['userId', 'original_title', 'rating'])
filmes = avaliacoes_total[['userId','original_title','rating']]

for user in user_list:
    try:
      usuario_id = user
      #todas as avaliacoes do usuario
      avaliacoes_usuario = filmes[filmes['userId'] == user]
      #Agora é necessario dividir entre treino e teste
      train, test = train_test_split_skl(avaliacoes_usuario, test_size=0.3, random_state=42)
      #no treino, deixaremos apenas avaliacoes positivas
      train = train[train['rating']>4]
      #balancear test
      test_5 = test[test['rating']>=4]
      test_3 = test[test['rating']<3]
      count_3 = len(test_3)
      count_5 = len(test_5)
      diferenca = count_5 - count_3
      if diferenca > 0:
        test = test_5.head(count_3).append(test_3)
        limit = len(test.head(5))
      else:
        test = test_3.head(count_5).append(test_5)
        limit = len(test.head(5))

      #Calcular a pontuação ponderada para cada filme não avaliado usando embeddings do BERT  
      test['pontuacao'] = test.apply(lambda row: similaridades_df_bert.loc[row['original_title'],train['original_title']].sum(),axis=1)
      test['pontuacao'] = pd.to_numeric(test['pontuacao'], errors='coerce')
      #Ordenar filmes por pontuação em ordem decrescente
      recomendacoes = test.sort_values(by='pontuacao', ascending=False).head(limit)
      recomendacoes = recomendacoes[['userId','original_title','rating']]
      recomendacoes_total = pd.concat([recomendacoes, recomendacoes_total])
      print(user)
    except:
      print("Erro: "+user)
      pass

recomendacoes_total['bem_avaliado'] = recomendacoes_total['rating'].apply(lambda x: 1 if x >= 4 else 0)
recall = recomendacoes_total['bem_avaliado'].mean()
print(recall)

8659
107720
179792
45811
229879
243443
0.6333333333333333


**4.0 Filtragem Colaborativa**

- Surprise
- O que usuários parecidos estão assistindo
- Basicamente precisa apenas de user item e rating para rodar esse modelo. simples.

In [47]:
#df para filtragem de colaborativa
df = pd.merge(ratings, content_df, left_on='movieId', right_on='id', how='inner').dropna(subset=['id'])
df = (df[['userId', 'original_title','rating']])
reader = Reader(rating_scale=(1, 5))
avaliacoes = Dataset.load_from_df(df[['userId', 'original_title', 'rating']], reader)

#modelo
train, test = train_test_split(avaliacoes, test_size=0.2)
model = SVD()
model.fit(train)
predictions = model.test(test)
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)
print(f'rmse: {rmse}')
print(f'mae: {rmse}')
results = cross_validate(model, avaliacoes, measures=['RMSE', 'MAE'], cv=5, verbose=True)

#recomendacoes
user_id = 270872
unique_ids = df['original_title'].unique()
items_rated_by_user = df[df['userId'] == user_id]['userId'].unique()
items_to_predict = np.setdiff1d(unique_ids, items_rated_by_user)
predictions_list = [(item, model.predict(user_id, item).est) for item in items_to_predict]
predictions_df = pd.DataFrame(predictions_list, columns=['original_title', 'estimated_rating']).sort_values('estimated_rating', ascending=False)
display(predictions_df.head(5))

RMSE: 0.8570
MAE:  0.6575
rmse: 0.857011571331061
mae: 0.857011571331061
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8551  0.8549  0.8572  0.8568  0.8559  0.8560  0.0009  
MAE (testset)     0.6563  0.6557  0.6576  0.6573  0.6565  0.6567  0.0007  
Fit time          92.99   88.00   86.15   85.40   86.94   87.90   2.69    
Test time         25.98   27.13   18.49   13.65   16.86   20.42   5.25    


Unnamed: 0,original_title,estimated_rating
827,Letters from Iwo Jima,4.651386
1005,On Her Majesty's Secret Service,4.599228
779,Kindergarten Cop,4.562276
950,Mr. Smith Goes to Washington,4.555736
927,Mission: Impossible II,4.464182
