# Criando um dataset de Filmes e Séries de TV a partir do IMDB

O objetivo deste notebook é realizar a coleta e a compilação dos dados de filmes e séries de tv do site IMDB.

In [None]:
# install libraries
# !pip install pandas selenium tqdm ipywidgets

In [1]:
# import libraries
from time import sleep
from typing import List, Optional, TypeVar

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from tqdm.notebook import tqdm

T = TypeVar('T')

## Obtendo a descrição dos filmes do site IMDB via Web Scraping

Nesta primeira parte, coletaremos os resumos dos filmes e séries de TV diretamente do site IMDB utilizando web scraping, uma técnica que obtém dados de páginas web de forma automatizada, controlando o navegador por meio de código.

O IMDB possui uma ampla base de dados, com cerca de XXX filmes e séries de TV.

Como o objetivo final deste trabalho é desenvolver um algoritmo de recomendação de filmes e/ou séries, optamos por aplicar filtros a fim de diminuir o tamanho geral do dataset e evitar problemas de lentidão e memória.

Buscaremos dados de filmes e séries de TV que tenham avaliação geral maior que 7 e que tenham recebido mais de mil votos.

In [2]:
# Start the Selenium WebDriver. `driver` is a module-level global that the
# scraping helpers defined in the following cells read directly.
driver = webdriver.Firefox() # opens a Firefox session

# Open the IMDb home page.
driver.get('https://www.imdb.com')

# Sanity check on the page title.
# NOTE(review): the expected title below is the Portuguese (pt-BR) one, so this
# assert only passes when IMDb serves the Portuguese page; the original comment
# said "assert the language is english" - confirm which language is intended.
assert driver.title == 'IMDb: Classificações, avaliações e onde assistir os melhores filmes e séries'

In [3]:
def select_english_language():
    """Switch the IMDb interface to English unless the navbar already shows 'EN'.

    Uses the module-level `driver`. The language menu is located through
    absolute XPaths, so this depends on the current IMDb page layout.
    """
    label_xpath = '/html/body/div[2]/nav/div[2]/div[6]/label/span'
    menu_xpath = '/html/body/div[2]/nav/div[2]/div[6]'
    english_xpath = '/html/body/div[2]/nav/div[2]/div[6]/div/div/div/span/ul[1]/li[3]/span[2]'

    # Nothing to do when the language label already reads "EN".
    current_label = driver.find_element(By.XPATH, label_xpath).text
    if current_label == 'EN':
        return

    # Open the language dropdown and give it a moment to render.
    driver.find_element(By.XPATH, menu_xpath).click()
    sleep(1)

    # Pick the English entry in the list, then wait for the page to react.
    english_entry = driver.find_element(By.XPATH, english_xpath)
    english_entry.click()
    sleep(1)

# select_english_language()

In [4]:
# create a base url to get the movies

def create_base_url(min_ratings: float,
                    max_ratings: float,
                    min_votes: int = 1000,
                    title_types: Optional[List[str]] = None,
                    filter_type: str = 'release_date',
                    order_type: str = 'desc',
                    max_votes: int = 999999999999) -> str:
    """Build the IMDb title-search URL for the given filters.

    Args:
        min_ratings: Lower bound of the user rating filter.
        max_ratings: Upper bound of the user rating filter.
        min_votes: Minimum number of votes a title must have.
        title_types: IMDb title types to include. Defaults to
            ``['feature', 'tv_series']``. A single string is accepted and
            treated as a one-element list.
        filter_type: Field used to sort the results.
        order_type: Sort direction, ``'asc'`` or ``'desc'``.
        max_votes: Upper bound of the vote filter. The default keeps the
            very large value that was previously hard-coded in the URL.

    Returns:
        The search URL as a string. Values are interpolated as given, so
        numbers and their string forms produce the same URL.
    """
    # Use a None sentinel instead of a mutable list as the default argument.
    if title_types is None:
        title_types = ['feature', 'tv_series']
    elif isinstance(title_types, str):
        # ",".join("feature") would split the string into single characters.
        title_types = [title_types]

    return (
        'https://www.imdb.com/search/title/'
        f'?title_type={",".join(title_types)}'
        f'&user_rating={min_ratings},{max_ratings}'
        f'&sort={filter_type},{order_type}'
        f'&num_votes={min_votes},{max_votes}'
    )

# navigate to imdb url
# driver.get(create_base_url(min_ratings=5, max_ratings=6))

In [5]:
def get_total_titles():
    """Return how many titles match the search page currently open in `driver`.

    Reads the result counter shown above the title list and parses the last
    number in it. Thousands separators are stripped whether they are "."
    (as on the Portuguese page) or "," so the parsing does not depend on the
    interface language.

    Returns:
        The total number of titles as an int.

    Raises:
        ValueError: If the counter text contains no digits.
    """
    import re  # local import so this cell does not depend on the imports cell

    counter_xpath = '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[1]/div[1]'
    counter_text = driver.find_element(By.XPATH, counter_xpath).text

    # Walk the whitespace-separated tokens from the end and use the first one
    # that contains digits; this tolerates trailing whitespace or a trailing word.
    for token in reversed(counter_text.split()):
        digits = re.sub(r'\D', '', token)
        if digits:
            return int(digits)

    raise ValueError(f'Could not read the total number of titles from {counter_text!r}')

# NOTE: this reads the counter from whatever page `driver` is currently showing.
# The driver.get(create_base_url(...)) call in the previous cell is commented out,
# so a search-results page must already be open for this call to succeed.
get_total_titles()

10415

In [29]:
# since the imdb shows only 50 titles by time, we have to scroll the page to load all titles
def load_page_items(batch_size: int = 10000, max_idle_rounds: int = 60):
    """Click the "load more" button until enough titles are listed on the page.

    Uses the module-level `driver`, which must already be on a search-results
    page.

    Args:
        batch_size: Number of titles that should be listed before returning.
            It is capped at the total number of titles the search reports, so
            asking for more than exists no longer loops forever.
        max_idle_rounds: Give up after this many consecutive rounds (about one
            second each) in which the number of listed titles did not grow.

    Returns:
        None. The effect is the extra titles loaded in the browser page.
    """
    items_xpath = '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'
    load_more_selector = 'button.ipc-btn--single-padding:nth-child(1)'

    # Never wait for more titles than the search actually has.
    target = min(batch_size, get_total_titles())

    # Titles already listed on the page before any click.
    loaded = len(driver.find_elements(By.XPATH, items_xpath))
    idle_rounds = 0

    with tqdm(total=target, initial=min(loaded, target),
              desc='Carregando títulos', leave=False) as pbar:
        while loaded < target and idle_rounds < max_idle_rounds:
            try:
                try:
                    driver.find_element(By.CSS_SELECTOR, load_more_selector).click()
                    current = len(driver.find_elements(By.XPATH, items_xpath))
                except Exception:
                    # Best effort: the button may be missing or not clickable
                    # while the page is still loading. `Exception` (not a bare
                    # except) lets KeyboardInterrupt reach the handler below.
                    current = loaded

                if current > loaded:
                    # Advance the bar by the real number of new titles.
                    pbar.update(min(current, target) - min(loaded, target))
                    loaded = current
                    idle_rounds = 0
                else:
                    idle_rounds += 1

                sleep(1)
            except KeyboardInterrupt:
                # Allow the user to stop the loading manually.
                break

# load_page_items()

In [30]:
# create a dataset with the titles loaded on screen
def get_elements_from_screen(df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    """Collect every title currently listed on the page into a DataFrame.

    Uses the module-level `driver`. For each listed title it reads the IMDb
    id (`tconst`), the title text, the overview, the IMDb link and the poster
    URL. Overview and poster are optional and default to an empty string.

    Args:
        df: Previously collected titles to extend. When omitted, an empty
            DataFrame is used. The frame passed in is not modified.

    Returns:
        A DataFrame with the rows of `df` followed by the scraped rows,
        de-duplicated on `tconst` (first occurrence wins) and re-indexed
        from 0. If no titles are listed, `df` is returned as is.
    """
    items_xpath = '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'
    link_xpath = './div/div/div/div[1]/div[2]/div[1]/a'
    overview_xpath = './div/div/div/div[2]/div[1]/div'
    image_xpath = './div/div/div/div[1]/div[1]/div/div[2]/img'

    # Create the default per call instead of sharing one frame across calls.
    if df is None:
        df = pd.DataFrame()

    elements = driver.find_elements(By.XPATH, items_xpath)

    # Build plain records first and create the DataFrame once at the end;
    # concatenating and de-duplicating per row is quadratic in the row count.
    records = []
    for element in tqdm(elements, desc="Criando dataset", leave=False):
        # One lookup of the title anchor provides id, title text and link.
        link = element.find_element(By.XPATH, link_xpath)
        imdb_link = link.get_attribute('href')
        tconst = imdb_link.split('/')[4]  # .../title/<tconst>/...
        primaryTitle = link.text

        try:
            overview = element.find_element(By.XPATH, overview_xpath).text
        except Exception:
            overview = ""  # some titles have no description

        try:
            image_url = element.find_element(By.XPATH, image_xpath).get_attribute('src')
        except Exception:
            image_url = ""  # some titles have no poster

        records.append({
            'tconst': tconst,
            'primaryTitle': primaryTitle,
            'overview': overview,
            'imdb_link': imdb_link,
            'image_url': image_url,
        })

    if not records:
        return df

    combined = pd.concat([df, pd.DataFrame(records)], axis=0)
    return combined.drop_duplicates(subset=['tconst']).reset_index(drop=True)

# get_elements_from_screen()

In [8]:
def make_backup(df: pd.DataFrame, path: str) -> bool:
    """Write `df` to `path` as a tab-separated file.

    The data is first written to ``<path>.tmp`` and then moved over `path`,
    so an interruption in the middle of the write cannot leave a truncated
    backup behind.

    Args:
        df: DataFrame to save.
        path: Destination file path.

    Returns:
        True when the backup was written, False when writing failed.
    """
    import os  # local import so this cell does not depend on the imports cell

    tmp_path = f'{path}.tmp'
    try:
        df.to_csv(tmp_path, sep='\t', index=False)
        os.replace(tmp_path, path)
        return True
    except Exception:
        # Keep the boolean contract for ordinary failures, but (unlike a bare
        # except) let KeyboardInterrupt propagate. Clean up any partial file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        return False
    
def load_backup(path: str) -> pd.DataFrame:
    """Load a tab-separated backup, or start with an empty DataFrame.

    Args:
        path: Path of the backup file.

    Returns:
        The stored DataFrame, or an empty DataFrame when the file does not
        exist or is completely empty.

    Raises:
        Any other read error (permissions, malformed file, ...) is propagated.
        Returning an empty frame in those cases would let the next backup
        silently overwrite the existing data.
    """
    try:
        return pd.read_csv(path, sep='\t')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No backup yet: start from scratch.
        return pd.DataFrame()


In [28]:
import math 

overview_dataset_path = '../datasets/titles.overviews.pt-br.tsv'

def run(min_rating: int = 6, max_rating: int = 10):
    """Scrape title overviews for each one-point rating interval and back them up.

    For every integer ``r`` in ``[min_rating, max_rating)`` the search for
    ratings between ``r`` and ``r + 1`` is opened, its titles are loaded on
    the page, appended to the dataset and saved to `overview_dataset_path`.

    Args:
        min_rating: First rating interval to scrape (inclusive).
        max_rating: End of the rating range (exclusive).

    Returns:
        The DataFrame with the backup's rows plus every title scraped.
    """
    # Load whatever a previous run already saved.
    df = load_backup(overview_dataset_path)

    for rating in range(min_rating, max_rating):
        # Open the interval once to find out how many titles it has.
        driver.get(create_base_url(rating, rating + 1, order_type='asc'))
        total_items = get_total_titles()

        # Intervals with 10000 titles or more are scraped in two halves, one
        # from each end of the release-date ordering.
        # NOTE(review): presumably a single page cannot list that many titles - confirm.
        if total_items >= 10000:
            batch_size = math.ceil(total_items / 2)
            sort_orders = ['desc', 'asc']
        else:
            batch_size = 10000
            sort_orders = ['desc']

        for order_type in sort_orders:
            # Reload the page with the ordering for this half.
            driver.get(create_base_url(rating, rating + 1, order_type=order_type))

            # Never ask for more titles than the search has; requesting
            # batch_size + 1 on a smaller result set would never be satisfied.
            load_page_items(min(batch_size + 1, total_items))

            # Append the titles on screen (duplicates are dropped on tconst).
            df = get_elements_from_screen(df)

            # Persist after every batch and report failures instead of
            # silently ignoring the result.
            if not make_backup(df, overview_dataset_path):
                print(f'WARNING: could not write backup to {overview_dataset_path}')

    return df
df = run()

Total size is 10415


Carregando títulos:   0%|          | 0/5209 [00:00<?, ?it/s]

Total length of elements loaded on screen is 5250


Criando dataset:   0%|          | 0/5250 [00:00<?, ?it/s]

Total size is 10415


Carregando títulos:   0%|          | 0/5209 [00:00<?, ?it/s]

Total length of elements loaded on screen is 5250


Criando dataset:   0%|          | 0/5250 [00:00<?, ?it/s]

True

In [44]:
## to do - translate the titles whos not in pt-br

!pip uninstall googletrans