# Movies Recommender System

In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet,stopwords
from surprise import Reader, Dataset, SVD

import warnings; warnings.simplefilter('ignore') 

In [2]:
movies_df = pd. read_csv('movies_metadata.csv')
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
movies_df['genres'] = movies_df['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies_df.head() # Ajustando coluna Gêneros

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
movies_df['year'] = pd.to_datetime(movies_df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan) #definindo coluna com Ano

In [5]:
# Ler banco de dados com menos filmes, devido ao limite de processamento do computador
links_small = pd.read_csv('links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [6]:
movies_df = movies_df.drop([19730, 29503, 35587]) # tirando linhas problemáticas (na values etc)
movies_df['id'] = movies_df['id'].astype('int')
small_df = movies_df[movies_df['id'].isin(links_small)]
small_df.shape

(9099, 25)

## Funções de Pré-Processamento de Texto

In [7]:
def process_text(text):  
    # Tokenização  
    tokens = word_tokenize(text.lower())  
    # Remover stopwords e não alfabéticos  
    tokens = [word for word in tokens if word.isalpha() and word not in stopwords.words('english')]  
    # Lematização  
    lemmatizer = WordNetLemmatizer()  
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]  
    return ' '.join(lemmatized)  

## Carregando Metadados (keywords, Diretor e Atores)

In [8]:
credits = pd.read_csv('credits.csv')
keywords = pd.read_csv('keywords.csv')
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
movies_df['id'] = movies_df['id'].astype('int')
movies_df.shape

(45463, 25)

Juntando os dataframes em um só:

In [9]:
movies_df = movies_df.merge(credits, on='id')
movies_df = movies_df.merge(keywords, on='id')
small_df = movies_df[movies_df['id'].isin(links_small)]
small_df.shape

(9219, 28)

Adicionando coluna de Descrição

In [10]:
# Adicionando coluna de Descrição
small_df['tagline'] = small_df['tagline'].fillna('')
small_df['description'] = small_df['overview'] + small_df['tagline']
small_df['description'] = small_df['description'].fillna('')

## Data Wrangling dos campos de metadados

In [11]:
small_df['cast'] = small_df['cast'].apply(literal_eval)
small_df['crew'] = small_df['crew'].apply(literal_eval)
small_df['keywords'] = small_df['keywords'].apply(literal_eval)
small_df['cast_size'] = small_df['cast'].apply(lambda x: len(x))
small_df['crew_size'] = small_df['crew'].apply(lambda x: len(x))

In [12]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [13]:
small_df['director'] = small_df['crew'].apply(get_director)
small_df['cast'] = small_df['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
small_df['cast'] = small_df['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
small_df['keywords'] = small_df['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
small_df['keywords'] = small_df['keywords'].apply(lambda x: ' '.join(x) if isinstance(x,list) else str(x))
small_df['director'] = small_df['director'].astype('str').apply(lambda x: x.replace(" ", ""))
small_df['cast'] = small_df['cast'].apply(lambda x: [i.replace(" ", "") for i in x]).apply(lambda x: ' '.join(x))
small_df['genres'] = small_df['genres'].apply(lambda x: [i.replace(" ", "") for i in x]).apply(lambda x: ' '.join(x))

## Formação da Sopa de Metadados

In [14]:
small_df['soup'] = small_df['keywords'].apply(process_text) +' '+ small_df['cast'] +' '+small_df['director']+' '+small_df['director']+' '+small_df['director']+' '+ small_df['genres']
small_df['soup'].head()

0    jealousy toy boy friendship friend rivalry boy...
1    board game disappearance based child book new ...
2    fishing best friend duringcreditsstinger old m...
3    based novel interracial relationship single mo...
4    baby midlife crisis confidence aging daughter ...
Name: soup, dtype: object

## Vetorização Textual

In [15]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(small_df['soup'])
tfidf_matrix.shape

(9219, 120158)

## Calculando a Matriz de similaridade (Produto Interno)

In [16]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim[0]


array([1.        , 0.00859809, 0.00962056, ..., 0.        , 0.        ,
       0.        ])

In [17]:
small_df = small_df.reset_index()
titles = small_df['title']
indices = pd.Series(small_df.index, index=small_df['title'])

## Recomendações Baseada em Conteúdo (Diretor, Atores, Descrição e Gênero)

In [92]:
def cont_recom(title, n=3):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [93]:
cont_recom('The Godfather').head(10)

3616    Tucker: The Man and His Dream
1346                    The Rainmaker
3300                 Gardens of Stone
Name: title, dtype: object

In [94]:
cont_recom('The Dark Knight').head(10)

8031    The Dark Knight Rises
6218            Batman Begins
6623             The Prestige
Name: title, dtype: object

## Recomendações baseada em Ratings de Usuários 

In [21]:
ratings = pd.read_csv('ratings_small.csv')

In [22]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [23]:
ratings_matrix = pd.pivot_table(ratings,values='rating',columns=['movieId'], index=['userId'])
ratings_matrix.fillna(0,inplace=True)

In [24]:
users_cosine_sim = linear_kernel(ratings_matrix,ratings_matrix)
users_cosine_sim = pd.DataFrame(users_cosine_sim, index=ratings_matrix.index, columns=ratings_matrix.index)
users_cosine_sim.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,662,663,664,665,666,667,668,669,670,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,145.0,0.0,0.0,57.0,8.0,0.0,34.0,0.0,4.0,0.0,...,0.0,0.0,15.25,37.5,0.0,0.0,0.0,16.0,0.0,9.0
2,0.0,985.0,101.5,237.0,128.5,0.0,225.0,151.0,92.0,35.0,...,400.0,38.0,213.5,367.0,289.0,412.0,47.0,16.0,119.0,152.0
3,0.0,101.5,677.0,135.0,155.75,36.5,135.5,276.25,90.5,77.0,...,112.0,32.0,401.0,293.5,91.0,100.0,57.5,44.5,79.0,189.5


In [25]:
links_small = pd.read_csv('links_small.csv')[['movieId', 'tmdbId']]
links_small['tmdbId'] = links_small['tmdbId'].fillna(0).astype('int')
links_small.columns = ['movieId', 'id']
ID_table = pd.merge(right=small_df[['id','title']], left=links_small, on='id')

In [26]:
small_df=small_df.merge(ID_table[['id','movieId']], on='id')
small_df.columns

Index(['index', 'adult', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year', 'cast', 'crew', 'keywords',
       'description', 'cast_size', 'crew_size', 'director', 'soup', 'movieId'],
      dtype='object')

In [154]:
def users_recom(userId,n=3):
    similar_users = users_cosine_sim[userId].sort_values(ascending=False).index[1:6] # 5 usuários mais parecidos
    recom = ratings_matrix.loc[similar_users].mean().sort_values(ascending=False)
    recom = recom.head(n).index.tolist() 
    final_users=[]
    for mid in recom:
        final_users.extend(small_df[small_df['movieId']==mid]['title'])
    return pd.Series(final_users)

In [155]:
users_recom(1)

0    The French Connection
1          The Deer Hunter
2                   Gandhi
dtype: object

## Sistema Híbrido

In [164]:
def hibrid_recom(userId, n=5):
    # Recomendação por usuário
    recom_user = users_recom(userId)
    # Refinada com conteúdo
    final_rec = []
    for movie_id in recom_user:
        final_rec.extend(cont_recom(movie_id))
    # Eliminar duplicatas(set) e filmes já vistos pelo usuário
    vistos = ratings[ratings['userId'] == userId]['movieId'].tolist()
    final_rec = pd.Series([m for m in final_rec if m not in vistos])
    return final_rec.drop_duplicates()

## Testando para Usuários:
- Recomendação por Usuário:

In [165]:
users_recom(1,10)

0     The French Connection
1           The Deer Hunter
2                    Gandhi
3                     Dumbo
4            Apocalypse Now
5        Lawrence of Arabia
6                GoodFellas
7               Rear Window
8    The Godfather: Part II
9         Full Metal Jacket
dtype: object

- Recomendação por Conteúdo

In [168]:
cont_recom(users_recom(1)[0],10) # French Connection

2409               The Guardian
4998    To Live and Die in L.A.
118                        Jade
374                  Blue Chips
2823        Rules of Engagement
3807                   Cruising
5849                   Sorcerer
8176                 Killer Joe
7606               12 Angry Men
4566                 The Hunted
Name: title, dtype: object

- Recomendação Híbrida: (Usuário + Conteúdo)

In [167]:
hibrid_recom(1)

0                 The Guardian
1      To Live and Die in L.A.
2                         Jade
3    Thunderbolt and Lightfoot
4               Uncommon Valor
5             Gardens of Stone
6             A Bridge Too Far
7                  Cry Freedom
8              In Love and War
dtype: object