In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem.porter import PorterStemmer
import ast

In [4]:

# Carregar os dados
credits_df = pd.read_csv('credits.csv')
movies_df = pd.read_csv('movies.csv')
movies_df = movies_df.merge(credits_df, on='title')

# Selecionar colunas relevantes
movies_df = movies_df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Tratar valores nulos
movies_df.dropna(inplace=True)

In [5]:

# Funções para converter gêneros, palavras-chave, elenco e diretores em listas de nomes
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

def fetch_directors(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L

In [6]:

# Aplicar as funções de conversão
movies_df['genres'] = movies_df['genres'].apply(convert)
movies_df['keywords'] = movies_df['keywords'].apply(convert)
movies_df['cast'] = movies_df['cast'].apply(convert3)
movies_df['crew'] = movies_df['crew'].apply(fetch_directors)

# Tratar espaços em branco
movies_df['overview'] = movies_df['overview'].apply(lambda x: x.split())
movies_df['genres'] = movies_df['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies_df['cast'] = movies_df['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies_df['crew'] = movies_df['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

# Criar a coluna 'tags'
movies_df['tags'] = movies_df['overview'] + movies_df['genres'] + movies_df['keywords'] + movies_df['cast'] + movies_df['crew']

In [7]:


# Criar um DataFrame final com as colunas necessárias
df = movies_df[['movie_id', 'title', 'tags', 'genres']]

# Converter a lista de tags em uma string
df['tags'] = df['tags'].apply(lambda x: ' '.join(x))

# Converter as tags para minúsculas
df['tags'] = df['tags'].apply(lambda x: x.lower())

# Inicializar CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(df['tags']).toarray()

# Inicializar PorterStemmer
ps = PorterStemmer()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: ' '.join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(lambda x: x.lower())


In [8]:


# Função para aplicar stemming
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return ' '.join(y)

# Aplicar stemming
df['tags'] = df['tags'].apply(stem)

# Calcular a similaridade coseno
similarity = cosine_similarity(vectors)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tags'] = df['tags'].apply(stem)


In [9]:
# Funções de recomendação
def recommend(movie):
    movie_index = df[df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    for i in movies_list:
        print(df.iloc[i[0]].title)


In [10]:
def recomendagenero(movie, genero):
    listamostra = []
    conta = 0
    movie_index = df[df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:]
    for i in movies_list:
        if genero in df.iloc[i[0]].genres:
            listamostra.append(df.iloc[i[0]].title)
            conta += 1
            if conta == 5:
                break
    for i in listamostra:
        print(i)

In [11]:

# Testar recomendações
recommend('Avatar')

genero = 'Action'  
nome = 'Avatar'    
recomendagenero(nome, genero)

Titan A.E.
Independence Day
Small Soldiers
Aliens vs Predator: Requiem
Battle: Los Angeles
Titan A.E.
Independence Day
Small Soldiers
Aliens vs Predator: Requiem
Battle: Los Angeles
