In [28]:
# install the necessary packages
!python -m pip install -U scikit-learn numpy unidecode nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 3.7 MB/s eta 0:00:00
Collecting click
  Using cached click-8.1.7-py3-none-any.whl (97 kB)
Collecting regex>=2021.8.3
  Downloading regex-2023.12.25-cp311-cp311-win_amd64.whl (269 kB)
     -------------------------------------- 269.5/269.5 kB 8.4 MB/s eta 0:00:00
Installing collected packages: regex, click, nltk
Successfully installed click-8.1.7 nltk-3.8.1 regex-2023.12.25



[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
# import libraries

import numpy as np
import pandas as pd

from ast import literal_eval

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from unidecode import unidecode

import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Tiago
[nltk_data]     Machado\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [7]:
# import movies dataset

movies = pd.read_csv('../datasets/movies.tsv', sep='\t')
movies.head()

Unnamed: 0,tconst,titleType,primaryTitle,primaryTitle_ptBr,overview,genres,principals,crew,startYear,runtimeMinutes,averageRating,numVotes,isAdult,imdb_link,image_url
0,tt0002130,movie,Dante's Inferno,Inferno,Loosely adapted from Dante's Divine Comedy and...,"Adventure,Drama,Fantasy","[{'nconst': 'nm0660139', 'primaryName': 'Salva...","[{'nconst': 'nm0078205', 'primaryName': 'Franc...",1911,71,7.0,3430,0,https://www.imdb.com/title/tt0002130/?ref_=sr_t_1,https://m.media-amazon.com/images/M/MV5BMzY0NT...
1,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,Fantômas - À l'ombre de la guillotine,Inspector Juve is tasked to investigate and ca...,"Crime,Drama","[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis...",1913,54,6.9,2477,0,https://www.imdb.com/title/tt0002844/?ref_=sr_t_2,https://m.media-amazon.com/images/M/MV5BMTQxND...
2,tt0003014,movie,Ingeborg Holm,Ingeborg Holm,Financial struggles separate a single mother f...,Drama,"[{'nconst': 'nm0096737', 'primaryName': 'Hilda...","[{'nconst': 'nm0803705', 'primaryName': 'Victo...",1913,96,7.0,1422,0,https://www.imdb.com/title/tt0003014/?ref_=sr_t_5,https://m.media-amazon.com/images/M/MV5BMTQyND...
3,tt0003037,movie,Fantomas: The Man in Black,Juve contre Fantômas,In Part Two of Louis Feuillade's 5 1/2-hour ep...,"Crime,Drama","[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis...",1913,61,6.9,1676,0,https://www.imdb.com/title/tt0003037/?ref_=sr_t_4,https://m.media-amazon.com/images/M/MV5BMTFkM2...
4,tt0003165,movie,Fantômas: The Dead Man Who Killed,Le mort qui tue,After a body disappears from inside the prison...,"Crime,Drama,Mystery","[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis...",1913,90,6.9,1330,0,https://www.imdb.com/title/tt0003165/?ref_=sr_t_6,https://m.media-amazon.com/images/M/MV5BMjQwMT...


### Pre process data

In [8]:
# check if there's any duplicated movie in the dataset
movies.duplicated(subset='tconst').sum()

0

In [10]:
# checking if there's any null value in the dataset
print(movies.isnull().sum())

movies = movies.dropna(subset='overview')
movies.isnull().sum()

tconst                0
titleType             0
primaryTitle          0
primaryTitle_ptBr     0
overview             64
genres                0
principals            0
crew                  0
startYear             0
runtimeMinutes        0
averageRating         0
numVotes              0
isAdult               0
imdb_link             0
image_url             8
dtype: int64


tconst               0
titleType            0
primaryTitle         0
primaryTitle_ptBr    0
overview             0
genres               0
principals           0
crew                 0
startYear            0
runtimeMinutes       0
averageRating        0
numVotes             0
isAdult              0
imdb_link            0
image_url            5
dtype: int64

In [11]:
# 'unpack' the crew and the principals 
features = ['crew', 'principals']

for feature in features:
  movies[feature] = movies[feature].apply(literal_eval)

In [12]:
# get the primary director for the title
def get_director(x):
  for i in x:
    if i['category'] == 'director':
      return i['primaryName']
    return np.nan
  # return

movies['director'] = movies['crew'].apply(get_director)

In [13]:
# get the actors for the titles
def get_list(x, filter_actors=True):
  if isinstance(x, list):
    if filter_actors: names = [i['primaryName'] for i in x if i['category'] == 'actor' or i['category'] == 'actress' or i['category'] == 'self']
    else: names = [i['primaryName'] for i in x]
    return names
  return []

movies['cast'] = movies['principals'].apply(get_list)

In [15]:
# cleaning our data
def clean_data(x):
  if isinstance(x, list):
    return [str.lower(i.replace(' ', '')) for i in x]

  else:
    if isinstance(x, str):
      return str.lower(x.replace(' ', ''))
    else: return ''

features = ['cast', 'director', 'genres']

for feature in features:
  movies[feature] = movies[feature].apply(clean_data)

In [17]:
def create_soup(x):
  return ' '.join(x['genres'].split(',')) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + x['overview']

movies['soup'] = movies.apply(create_soup, axis=1)
movies['soup']

0        adventure drama fantasy salvatorepapa arturopi...
1        crime drama renénavarre renénavarre edmundbreo...
2        drama hildaborgström georggrönroos aronlindgre...
3        crime drama renénavarre renénavarre renénavarr...
4        crime drama mystery renénavarre renénavarre ed...
                               ...                        
44982    documentary game-show reality-tv johnhannah ro...
44983    drama fikretkuskan denizugur ozandolunay serak...
44984    comedy drama hermanfinkers johannatersteege le...
44985    thriller sergiocastellitto lorenzorichelmy ann...
44986    drama history amaiaaberasturi alexbrendemühl d...
Name: soup, Length: 44923, dtype: object

In [20]:
averages = movies.averageRating.unique()
averages.sort()
averages

array([5. , 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2,
       6.3, 6.4, 6.5, 6.6, 6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.5,
       7.6, 7.7, 7.8, 7.9, 8. , 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8,
       8.9, 9. , 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7, 9.8])

In [21]:
# C = mean votes across all report
c = movies['averageRating'].mean()

# R = minimum votes to be listed
m = movies['numVotes'].quantile(0.80)

print(c, m)

# get the most voted movies on the list
df_movies = movies.copy().loc[movies['numVotes'] >= m]
df_movies.shape

6.726923847472342 17176.199999999997


(8985, 18)

In [22]:
# function to calculate the weighted ratings from imdb formula
def weighted_ratings(x, m=m, c=c):
  v = x['numVotes']
  R = x['averageRating']

  return (v / (v+m) * R) + (m / (m+v) * c)

# defining a new feature 'score' and calculating its value with
df_movies['score'] = df_movies.apply(weighted_ratings, axis=1)

# sort values from scores and show the top 10 movies
df_movies = df_movies.sort_values('score', ascending=False)
df_movies[['primaryTitle', 'averageRating', 'numVotes', 'score']].head(10)

Unnamed: 0,primaryTitle,averageRating,numVotes,score
21560,Breaking Bad,9.5,2126399,9.47778
11844,The Shawshank Redemption,9.3,2880568,9.284748
16713,The Wire,9.3,375771,9.187528
18862,Avatar: The Last Airbender,9.3,369790,9.185789
21733,Game of Thrones,9.2,2276108,9.181477
5788,The Godfather,9.2,2006150,9.179006
13735,The Sopranos,9.2,468069,9.112461
27454,Aspirants,9.2,310480,9.070358
27843,Sherlock,9.1,994697,9.059718
35233,Rick and Morty,9.1,599078,9.033858


In [27]:
def remove_special_chars(x: str) -> str:
    return unidecode(x)

df_movies['soup'] = df_movies.soup.apply(remove_special_chars)

### Create Model

In [30]:
stopwords = nltk.corpus.stopwords.words('portuguese')

vectorizer = CountVectorizer(stop_words=stopwords)
matrix = vectorizer.fit_transform(df_movies['soup'])

cousine_sim2 = cosine_similarity(matrix, matrix)

In [32]:
df_movies.reset_index(drop=True, inplace=True)

In [41]:
from typing import List, TypeVar
T = TypeVar('T')

indices = pd.Series(df_movies.index, index=df_movies['tconst'])

def get_recomendation(tconst, cosine_sim=cousine_sim2):
  idx = indices[tconst]

  sim_scores = list(enumerate(cosine_sim[idx]))

  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  sim_scores = sim_scores[1:11]

  movie_indices = [i[0] for i in sim_scores]

  return df_movies[['primaryTitle_ptBr', 'titleType']].iloc[movie_indices]

In [44]:
get_recomendation('tt1160419')

Unnamed: 0,primaryTitle_ptBr,titleType
44,Duna: Parte 2,movie
5726,Melanie - A Última Esperança,movie
2199,Planeta dos Macacos: A Guerra,movie
5505,O Projeto Adam,movie
1095,Da Vinci's Demons,tvSeries
0,Breaking Bad,tvSeries
2850,Perdidos no Espaço,tvSeries
3234,Diário do Futuro,tvSeries
4901,Enola Holmes 2,movie
1450,Minority Report - A Nova Lei,movie


In [43]:
df_movies.loc[df_movies.primaryTitle.str.contains('Dune')]

Unnamed: 0,tconst,titleType,primaryTitle,primaryTitle_ptBr,overview,genres,principals,crew,startYear,runtimeMinutes,averageRating,numVotes,isAdult,imdb_link,image_url,director,cast,soup,score
44,tt15239678,movie,Dune: Part Two,Duna: Parte 2,Diante da difícil escolha entre o amor de sua ...,"action,adventure,drama","[{'nconst': 'nm3154303', 'primaryName': 'Timot...","[{'nconst': 'nm0898288', 'primaryName': 'Denis...",2024,166,8.8,314953,0,https://www.imdb.com/title/tt15239678/?ref_=sr...,https://m.media-amazon.com/images/M/MV5BNDdjMD...,denisvilleneuve,"[timothéechalamet, zendaya, rebeccaferguson, j...",action adventure drama timotheechalamet zenday...,8.69279
536,tt1160419,movie,Dune: Part One,Duna,Paul Atreides é um jovem prodígio com um futur...,"action,adventure,drama","[{'nconst': 'nm3154303', 'primaryName': 'Timot...","[{'nconst': 'nm0898288', 'primaryName': 'Denis...",2021,155,8.0,839855,0,https://www.imdb.com/title/tt1160419/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYWNmZG...,denisvilleneuve,"[timothéechalamet, rebeccaferguson, zendaya, o...",action adventure drama timotheechalamet rebecc...,7.974486
992,tt0058625,movie,Woman in the Dunes,A Mulher da Areia,Um entomologista de férias é forçado pelos mor...,"drama,thriller","[{'nconst': 'nm0645402', 'primaryName': 'Eiji ...","[{'nconst': 'nm0856267', 'primaryName': 'Hiros...",1964,147,8.5,22601,0,https://www.imdb.com/title/tt0058625/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BN2RlNT...,hiroshiteshigahara,"[eijiokada, kyôkokishida, kôjimitsui, hirokoit...",drama thriller eijiokada kyokokishida kojimits...,7.734368
1673,tt1935156,movie,Jodorowsky's Dune,Duna de Jodorowsky,"A história da ambiciosa, mas condenada adaptaç...",documentary,"[{'nconst': 'nm0423524', 'primaryName': 'Aleja...","[{'nconst': 'nm0667650', 'primaryName': 'Frank...",2013,90,8.0,27451,0,https://www.imdb.com/title/tt1935156/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BMTU0Mz...,frankpavich,"[alejandrojodorowsky, michelseydoux, h.r.giger...",documentary alejandrojodorowsky michelseydoux ...,7.510016
7106,tt0087182,movie,Dune,Duna,O filho de um duque lidera guerreiros do deser...,"action,adventure,sci-fi","[{'nconst': 'nm0001492', 'primaryName': 'Kyle ...","[{'nconst': 'nm0000186', 'primaryName': 'David...",1984,137,6.3,178024,0,https://www.imdb.com/title/tt0087182/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BMzNhZj...,davidlynch,"[kylemaclachlan, virginiamadsen, francescaanni...",action adventure sci-fi kylemaclachlan virgini...,6.337566


In [46]:
import joblib

joblib.dump((movies, cousine_sim2), '../models/model_v1.joblib', compress=True)

['../models/model_v1.joblib']