# Criando um dataset de Filmes e Séries de TV a partir do IMDB

O objetivo deste notebook é realizar a coleta e a compilação dos dados de filmes e séries de tv do site IMDB.

In [None]:
# install libraries
# !pip install pandas selenium tqdm ipywidgets

In [1]:
# import libraries
import pandas as pd
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

from tqdm.notebook import tqdm
from typing import List, TypeVar

T = TypeVar('T')

## Obtendo a descrição dos filmes do site IMDB via WebScrapping

Nesta primeira parte, estaremos coletando os resumos dos filmes e séries de TV diretamente do site IMDB utilizando a técnica de web scrapping, que é uma técnica de obtenção de dados via automação de código diretamente do navegador.

O IMDB possui uma ampla base dados, com cerca de XXX filmes e séries de TV. 

Como o objetivo final deste trabalho é desenvolver um algoritmo de recomendação de filmes e/ou séries, optamos por aplicar filtros a fim de diminuir o tamanho geral do dataset e evitar problemas de lentidão e memória.

Estaremos buscando dados de filmes e séries de TV que tenham avaliação geral maior que 7 e que tenham recebidos mais de 1 mil votos. 

In [2]:
# start webriver
driver = webdriver.Firefox() # start driver

# go to imdb site and assert the language is english
driver.get('https://www.imdb.com')

assert driver.title == 'IMDb: Classificações, avaliações e onde assistir os melhores filmes e séries'

In [3]:
def select_english_language():
    if not driver.find_element(By.XPATH, '/html/body/div[2]/nav/div[2]/div[6]/label/span').text == 'EN':
        driver.find_element(By.XPATH, '/html/body/div[2]/nav/div[2]/div[6]').click() # click on dropdown language set
        sleep(1)
        driver.find_element(By.XPATH, '/html/body/div[2]/nav/div[2]/div[6]/div/div/div/span/ul[1]/li[3]/span[2]').click() # click on English language on the list
        sleep(1)

# select_english_language()

In [4]:
# create a base url to get the movies

def create_base_url(min_ratings: str,
                    max_ratings: str,
                    min_votes: str = 1000,
                    title_types: List[str] = ['feature', 'tv_series'],
                    filter_type: str = 'release_date',
                    order_type: str= 'desc') -> str:
    # create a url based on the given filters
    return f'https://www.imdb.com/search/title/?title_type={",".join(title_types)}&user_rating={min_ratings},{max_ratings}&sort={filter_type},{order_type}&num_votes={min_votes},999999999999'

# navigate to imdb url
# driver.get(create_base_url(min_ratings=5, max_ratings=6))

In [5]:
def get_total_titles():
    total_size = int(driver.find_element(By.XPATH, '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[1]/div[1]').text.split(' ')[-1].replace('.', ''))
    return total_size

get_total_titles()

10415

In [46]:
# since the imdb shows only 50 titles by time, we have to scroll the page to load all titles
def load_page_items(batch_size: int= 10000):
    """Scoll the page until reach the batch size given"""
    # grab total titles with the given filter
    total_size = int(driver.find_element(By.XPATH, '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[1]/div[1]').text.split(' ')[-1].replace('.', ''))

    # check how many titles are loaded on screen
    size = len(driver.find_elements(By.XPATH, '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'))

    with tqdm(range(batch_size), desc='Carregando títulos', leave=False) as pbar:
        # while the size of titles in screen is less then the total, we click on "load more" button to load the other titles
        while not size >= batch_size:
            try:
                try:
                    driver.find_element(By.CSS_SELECTOR, 'button.ipc-btn--single-padding:nth-child(1)').click()
                    size = len(driver.find_elements(By.XPATH, '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'))
                    pbar.update(50)
                except:
                    pass
                sleep(1)
            except KeyboardInterrupt:
                break

# load_page_items()

In [47]:
# create a dataset with the titles loaded on screen
def get_elements_from_screen(df: List[T] = pd.DataFrame()) -> List[T]:
    """Return all the elements loaded on screen in a dadaset"""
    # store the elements in a list
    elements = driver.find_elements(By.XPATH, '/html/body/div[2]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li')

    with tqdm(range(len(elements)), desc="Criando dataset", leave=False) as pbar:
        # for each element we grab the infos we need and add to main df
        for i, element in enumerate(elements):
            tconst = element.find_element(By.XPATH, './div/div/div/div[1]/div[2]/div[1]/a').get_attribute('href').split('/')[4]     # tconst
            primaryTitle = element.find_element(By.XPATH, './div/div/div/div[1]/div[2]/div[1]/a').text                              # primaryTitle
            imdb_link = element.find_element(By.XPATH, './div/div/div/div[1]/div[2]/div[1]/a').get_attribute('href')                # imdb link
            
            try:
                overview = element.find_element(By.XPATH, './div/div/div/div[2]/div[1]/div').text                                   # description
            except:
                overview = ""
            
            try:
                image_url = element.find_element(By.XPATH, './div/div/div/div[1]/div[1]/div/div[2]/img').get_attribute('src')       # image_url
            except:
                image_url = ""

            item = {
                'tconst': [tconst],
                'primaryTitle': [primaryTitle],
                'overview': [overview],
                'imdb_link': [imdb_link],
                'image_url': [image_url]

            }

            df = pd.concat([df, pd.DataFrame(item)], axis=0)
            df.drop_duplicates(subset=['tconst'], inplace=True)
            df.reset_index(drop=True, inplace=True)

            pbar.update(1)

    return df

# get_elements_from_screen()

In [8]:
def make_backup(df: List[T], path: str) -> bool:
    """Make a backup for the given df and path"""
    try:
        df.to_csv(path, sep='\t', index=False)
        return True
    except:
        return False
    
def load_backup(path: str) -> List[T]:
    """Load a backup or create a empty dataset"""
    try:
        return pd.read_csv(path, sep='\t')
    except:
        return pd.DataFrame()


In [49]:
import math 

overview_dataset_path = '../datasets/titles.overviews.pt-br.tsv'

def run():
    MIN_RATING = 9
    MAX_RATING = 10

    # carrega o backup
    df = load_backup(overview_dataset_path)

    # for each rating interval (by one) in the min max rating interval
    for i in range(MIN_RATING, MAX_RATING):
        # create a base url for that interval
        base_url = create_base_url(i, i+1)
        driver.get(base_url)

        # discover the total titles in the given parameters
        total_items = get_total_titles()

        # set the default batch size
        batch_size = 10000
        batch_range = 1

        # if total size is larger then 10000, split the interval in two parts
        if total_items >= 10000:
            batch_size = math.ceil(total_items / 2)
            batch_range = 2
        else:
            batch_size = total_items

        # for each interval
        for b in range(batch_range):
            # modify the base url to match the interval
            if b % 2 == 0:
                base_url = create_base_url(i, i+1, order_type='desc')
            else:
                base_url = create_base_url(i, i+1, order_type='asc')

            # assert the page is in the correct interval
            driver.get(base_url)

            # load all items for the given interval and batch size
            load_page_items(batch_size+1)

            # load elements in the screen
            df = get_elements_from_screen(df)

            # save a backup
            make_backup(df, overview_dataset_path)
            
    return df

df = run()

Carregando títulos:   0%|          | 0/193 [00:00<?, ?it/s]

Criando dataset:   0%|          | 0/192 [00:00<?, ?it/s]

In [50]:
driver.close()

## Part II - Obtain the data from non-comercial IMDB

O imdb disponibiliza gratuitamente alguns datasets não comerciais. O download está disponível nos links abaixo, e para obter os arquivos não é necessário estar logado na plataforma ou ter uma api específica.

Esta parte do código tem como objetivo fazer o download dos datasets diretamento do site do imdb, descompactá-los e deixar disponíveis para que possamos obter as informações que precisamos e agregar ao nosso dataset final.

https://developer.imdb.com/non-commercial-datasets/

https://datasets.imdbws.com/



In [55]:
path_to_datasets = 'C:\\Users\\Tiago Machado\\Projetos\\Machine Learning\\movie-recomendation-system\\datasets' # location of datasets folder for this project

O código abaixo utiliza bibliotecas nativas do Python para baixar e descompactar os datasets disponíveis no imdb diretamente para a pasta que definimos na célula anterior.

In [53]:
!pip install wget

Collecting wget
  Using cached wget-3.2.zip (10 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Installing collected packages: wget
  Running setup.py install for wget: started
  Running setup.py install for wget: finished with status 'done'
Successfully installed wget-3.2


  DEPRECATION: wget is being installed using the legacy 'setup.py install' method, because it does not have a 'pyproject.toml' and the 'wheel' package is not installed. pip 23.1 will enforce this behaviour change. A possible replacement is to enable the '--use-pep517' option. Discussion can be found at https://github.com/pypa/pip/issues/8559

[notice] A new release of pip available: 22.3.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [56]:
import shutil, os, gzip
import wget

imdb_public_datasets = ['title.principals.tsv.gz', 'title.ratings.tsv.gz', 'title.basics.tsv.gz'] # items to be downloaded

for item in imdb_public_datasets:
    item = item[:-3] # remove .gz extension

    wget.download(f'https://datasets.imdbws.com/{item}.gz', path_to_datasets) # download dataset from imdb database

    file = f'{path_to_datasets}/{item}'

    try:
        os.remove(file) # remove older dataset if it exists
    except:
        pass

    with gzip.open(f'{file}.gz', 'rb') as f_in:
        with open(file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out) # extract and move the dataset to datasets folder

    os.remove(f'{file}.gz') # remove compressed file
    sleep(1)

## Part III - Merge scrapped data with public datasets provided from IMDB

Reference
https://realpython.com/flask-project/


In [59]:
df_ratings = pd.read_csv(f'{path_to_datasets}/title.ratings.tsv', sep='\t') # load ratings dataset
df_ratings = df_ratings.loc[df_ratings.tconst.isin(df.tconst)] # keep only scraped items

df_ratings.reset_index(drop=True, inplace=True)
df_ratings.to_csv(f'{path_to_datasets}/title.ratings.tsv', sep='\t', index=False) # save filtered dataset to disk

df_ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0002130,7.0,3430
1,tt0002844,6.9,2477
2,tt0003014,7.0,1422
3,tt0003037,6.9,1676
4,tt0003165,6.9,1330
...,...,...,...
44982,tt9909248,8.3,1068
44983,tt9910728,5.8,1251
44984,tt9911196,7.4,3318
44985,tt9916270,5.8,1477


In [60]:
df_titles = pd.read_csv(f'{path_to_datasets}/title.basics.tsv', sep='\t') # load titles dataset
df_titles = df_titles.loc[df_titles.tconst.isin(df.tconst)] # keep only scraped items

df_titles.reset_index(drop=True, inplace=True)
df_titles.to_csv(f'{path_to_datasets}/title.basics.tsv', sep='\t', index=False) # save filtered dataset to disk

df_titles

  df_titles = pd.read_csv(f'{path_to_datasets}/title.basics.tsv', sep='\t') # load titles dataset


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0002130,movie,Dante's Inferno,L'Inferno,0,1911,\N,71,"Adventure,Drama,Fantasy"
1,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,Fantômas - À l'ombre de la guillotine,0,1913,\N,54,"Crime,Drama"
2,tt0003014,movie,Ingeborg Holm,Ingeborg Holm,0,1913,\N,96,Drama
3,tt0003037,movie,Fantomas: The Man in Black,Juve contre Fantômas,0,1913,\N,61,"Crime,Drama"
4,tt0003165,movie,Fantômas: The Dead Man Who Killed,Le mort qui tue,0,1913,\N,90,"Crime,Drama,Mystery"
...,...,...,...,...,...,...,...,...,...
44982,tt9909248,tvSeries,Race Across the World,Race Across the World,0,2019,\N,59,"Documentary,Game-Show,Reality-TV"
44983,tt9910728,tvSeries,Cruel Istanbul,Zalim Istanbul,0,2019,2020,120,Drama
44984,tt9911196,movie,The Marriage Escape,De beentjes van Sint-Hildegard,0,2020,\N,103,"Comedy,Drama"
44985,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,\N,84,Thriller


In [61]:
# merge title and ratings datasets
df_movies = df_titles.merge(right=df, on='tconst')
df_movies = df_movies.merge(right=df_ratings, on='tconst')

# filter only movies in the dataset
# df = df.loc[df.titleType == 'movie']
df_movies.reset_index(drop=True, inplace=True)

df_movies

Unnamed: 0,tconst,titleType,primaryTitle_x,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,primaryTitle_y,overview,imdb_link,image_url,averageRating,numVotes
0,tt0002130,movie,Dante's Inferno,L'Inferno,0,1911,\N,71,"Adventure,Drama,Fantasy",1. Inferno,Loosely adapted from Dante's Divine Comedy and...,https://www.imdb.com/title/tt0002130/?ref_=sr_t_1,https://m.media-amazon.com/images/M/MV5BMzY0NT...,7.0,3430
1,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,Fantômas - À l'ombre de la guillotine,0,1913,\N,54,"Crime,Drama",2. Fantômas - À l'ombre de la guillotine,Inspector Juve is tasked to investigate and ca...,https://www.imdb.com/title/tt0002844/?ref_=sr_t_2,https://m.media-amazon.com/images/M/MV5BMTQxND...,6.9,2477
2,tt0003014,movie,Ingeborg Holm,Ingeborg Holm,0,1913,\N,96,Drama,5. Ingeborg Holm,Financial struggles separate a single mother f...,https://www.imdb.com/title/tt0003014/?ref_=sr_t_5,https://m.media-amazon.com/images/M/MV5BMTQyND...,7.0,1422
3,tt0003037,movie,Fantomas: The Man in Black,Juve contre Fantômas,0,1913,\N,61,"Crime,Drama",4. Juve contre Fantômas,In Part Two of Louis Feuillade's 5 1/2-hour ep...,https://www.imdb.com/title/tt0003037/?ref_=sr_t_4,https://m.media-amazon.com/images/M/MV5BMTFkM2...,6.9,1676
4,tt0003165,movie,Fantômas: The Dead Man Who Killed,Le mort qui tue,0,1913,\N,90,"Crime,Drama,Mystery",6. Le mort qui tue,After a body disappears from inside the prison...,https://www.imdb.com/title/tt0003165/?ref_=sr_t_6,https://m.media-amazon.com/images/M/MV5BMjQwMT...,6.9,1330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44982,tt9909248,tvSeries,Race Across the World,Race Across the World,0,2019,\N,59,"Documentary,Game-Show,Reality-TV",838. Race Across the World,5 pairs of travellers race from London to the ...,https://www.imdb.com/title/tt9909248/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYzY0ZT...,8.3,1068
44983,tt9910728,tvSeries,Cruel Istanbul,Zalim Istanbul,0,2019,2020,120,Drama,1901. Cidade Cruel,Seher vive em Istambul com três filhos. A reun...,https://www.imdb.com/title/tt9910728/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BN2NkMz...,5.8,1251
44984,tt9911196,movie,The Marriage Escape,De beentjes van Sint-Hildegard,0,2020,\N,103,"Comedy,Drama",2027. De beentjes van Sint-Hildegard,"A middle-aged veterinary surgeon, believing th...",https://www.imdb.com/title/tt9911196/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYjNjZm...,7.4,3318
44985,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,\N,84,Thriller,1246. Il talento del calabrone,"Dj Steph is a young radio deejay on the rise, ...",https://www.imdb.com/title/tt9916270/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BNmVjMT...,5.8,1477


In [63]:
# to avoid memory issues i'll be loading the datasets bellow in chunks, and savind with only the items in scraped dataset

chunksize = 5000

principals_chunks = pd.read_csv(f'{path_to_datasets}/title.principals.tsv', sep='\t', chunksize=chunksize)

df_principals = pd.DataFrame()

for _, chunk in enumerate(principals_chunks):
  dff = chunk.loc[chunk.tconst.isin(df_movies.tconst)]
  df_principals = pd.concat([df_principals, dff])

df_principals.reset_index(drop=True, inplace=True)
df_principals.to_csv(f'{path_to_datasets}/title.principals.tsv', sep='\t', index=False)

df_principals

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0002130,1,nm0660139,actor,\N,"[""Dante Alighieri""]"
1,tt0002130,2,nm0685283,actor,\N,"[""Virgilio""]"
2,tt0002130,3,nm0209738,actor,\N,"[""Farinata degli Uberti""]"
3,tt0002130,4,nm0209738,actor,\N,"[""Pier delle Vigne""]"
4,tt0002130,5,nm0209738,actor,\N,"[""Il conte Ugolino""]"
...,...,...,...,...,...,...
851672,tt9916362,20,nm0284765,editor,\N,\N
851673,tt9916362,21,nm2857592,casting_director,\N,\N
851674,tt9916362,22,nm1486091,casting_director,casting_director,\N
851675,tt9916362,23,nm7266158,casting_director,\N,\N


In [64]:
principals_unique = df_principals.nconst.unique()
principals_total = df_principals.nconst

print(f'There is {len(principals_total)} principals being {len(principals_unique)} unique values')

There is 851677 principals being 279870 unique values


In [65]:
# load only names whos in principals dataset
df_names_chunk = pd.read_csv(f'{path_to_datasets}/name.basics.tsv', sep='\t', chunksize=chunksize)

df_names = pd.DataFrame()

for chunk in df_names_chunk:
  dff = chunk.loc[chunk.nconst.isin(principals_unique)]
  df_names = pd.concat([df_names, dff], ignore_index=True)

df_names.to_csv(f'{path_to_datasets}/name.basics.tsv', sep='\t', index=False)

In [66]:
# create a dict to hold the par nconst: primaryName
names = {}

for _, p in enumerate(principals_unique):
  names[p] = df_names.loc[df_names.nconst == p, 'primaryName'].values[0]

del(df_names)

names

{'nm0660139': 'Salvatore Papa',
 'nm0685283': 'Arturo Pirovano',
 'nm0209738': 'Giuseppe de Liguoro',
 'nm3942815': 'Pier Delle Vigne',
 'nm1375863': 'Augusto Milla',
 'nm1375916': 'Attilio Motta',
 'nm1374534': 'Emilise Beretta',
 'nm0078205': 'Francesco Bertolini',
 'nm0655824': 'Adolfo Padovan',
 'nm0019604': 'Dante Alighieri',
 'nm1374692': 'Raffaele Caravaglios',
 'nm1376296': 'Emilio Roncarolo',
 'nm1376180': 'Sandro Properzi',
 'nm0622772': 'René Navarre',
 'nm0107631': 'Edmund Breon',
 'nm0577476': 'Georges Melchior',
 'nm0137288': 'Renée Carl',
 'nm0264491': 'Jane Faber',
 'nm0901357': 'André Volbert',
 'nm1207319': 'Naudier',
 'nm0537575': 'Maillard',
 'nm0029301': 'Yvette Andréyor',
 'nm0525283': 'André Luguet',
 'nm0275421': 'Louis Feuillade',
 'nm0019855': 'Marcel Allain',
 'nm0816232': 'Pierre Souvestre',
 'nm0097918': 'Romeo Bosetti',
 'nm0287628': 'Paul Fosse',
 'nm1666136': 'Georges Guérin',
 'nm0307872': 'Robert-Jules Garnier',
 'nm0096737': 'Hilda Borgström',
 'nm034

In [67]:
# for each movie we iterate over crew dataset and create a list of dict containing nconst, name, category
# then we iterate over principals dataset and create a list of dict containing nconst, name, category with the names whos not in crew

# create a helper to filter who is principal and who is crew member
is_principals = ['actor', 'actress', 'self']

# create a dataset to hold teams members
df_team = pd.DataFrame(columns=['tconst', 'principals', 'crew'])

for index, movie in df_movies.iterrows():
    movie_principals = df_principals.loc[df_principals.tconst == movie['tconst']] # get the principals for the given movie tconst

    # Reset the lists for each movie
    principals_team = []
    crew_team = []

    for _, p in movie_principals.iterrows():
        # for each member we create a item with his/her infos
        _principal = {
            'nconst': p['nconst'],
            'primaryName': names[p['nconst']],
            'category': p['category'],
            'characters': p['characters']
        }

        # check if the member is a principal or crew member and then append to the right list
        if p['category'] in is_principals:
            principals_team.append(_principal)
        else:
            crew_team.append(_principal)

    # Assign the lists to the teams dataset
    df_team.loc[len(df_team)] = [movie['tconst'], principals_team, crew_team]

    # No need to break here, if you want to process all movies
    # if index == 37: break

df_team.to_csv(f'{path_to_datasets}/title.teams.tsv', sep='\t', index=False)

df_team

Unnamed: 0,tconst,principals,crew
0,tt0002130,"[{'nconst': 'nm0660139', 'primaryName': 'Salva...","[{'nconst': 'nm0078205', 'primaryName': 'Franc..."
1,tt0002844,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
2,tt0003014,"[{'nconst': 'nm0096737', 'primaryName': 'Hilda...","[{'nconst': 'nm0803705', 'primaryName': 'Victo..."
3,tt0003037,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
4,tt0003165,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
...,...,...,...
44982,tt9909248,"[{'nconst': 'nm0001314', 'primaryName': 'John ...",[]
44983,tt9910728,"[{'nconst': 'nm0476334', 'primaryName': 'Fikre...",[]
44984,tt9911196,"[{'nconst': 'nm0277932', 'primaryName': 'Herma...","[{'nconst': 'nm0631590', 'primaryName': 'Johan..."
44985,tt9916270,"[{'nconst': 'nm0144812', 'primaryName': 'Sergi...","[{'nconst': 'nm1480867', 'primaryName': 'Giaco..."


In [68]:

# merge movies dataset with team dataset
df_movies = df_movies.merge(right=df_team, on='tconst')
# assert only unique values in dataset
df_movies.drop_duplicates(subset=['tconst'], inplace=True)

df_movies

Unnamed: 0,tconst,titleType,primaryTitle_x,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,primaryTitle_y,overview,imdb_link,image_url,averageRating,numVotes,principals,crew
0,tt0002130,movie,Dante's Inferno,L'Inferno,0,1911,\N,71,"Adventure,Drama,Fantasy",1. Inferno,Loosely adapted from Dante's Divine Comedy and...,https://www.imdb.com/title/tt0002130/?ref_=sr_t_1,https://m.media-amazon.com/images/M/MV5BMzY0NT...,7.0,3430,"[{'nconst': 'nm0660139', 'primaryName': 'Salva...","[{'nconst': 'nm0078205', 'primaryName': 'Franc..."
1,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,Fantômas - À l'ombre de la guillotine,0,1913,\N,54,"Crime,Drama",2. Fantômas - À l'ombre de la guillotine,Inspector Juve is tasked to investigate and ca...,https://www.imdb.com/title/tt0002844/?ref_=sr_t_2,https://m.media-amazon.com/images/M/MV5BMTQxND...,6.9,2477,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
2,tt0003014,movie,Ingeborg Holm,Ingeborg Holm,0,1913,\N,96,Drama,5. Ingeborg Holm,Financial struggles separate a single mother f...,https://www.imdb.com/title/tt0003014/?ref_=sr_t_5,https://m.media-amazon.com/images/M/MV5BMTQyND...,7.0,1422,"[{'nconst': 'nm0096737', 'primaryName': 'Hilda...","[{'nconst': 'nm0803705', 'primaryName': 'Victo..."
3,tt0003037,movie,Fantomas: The Man in Black,Juve contre Fantômas,0,1913,\N,61,"Crime,Drama",4. Juve contre Fantômas,In Part Two of Louis Feuillade's 5 1/2-hour ep...,https://www.imdb.com/title/tt0003037/?ref_=sr_t_4,https://m.media-amazon.com/images/M/MV5BMTFkM2...,6.9,1676,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
4,tt0003165,movie,Fantômas: The Dead Man Who Killed,Le mort qui tue,0,1913,\N,90,"Crime,Drama,Mystery",6. Le mort qui tue,After a body disappears from inside the prison...,https://www.imdb.com/title/tt0003165/?ref_=sr_t_6,https://m.media-amazon.com/images/M/MV5BMjQwMT...,6.9,1330,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44982,tt9909248,tvSeries,Race Across the World,Race Across the World,0,2019,\N,59,"Documentary,Game-Show,Reality-TV",838. Race Across the World,5 pairs of travellers race from London to the ...,https://www.imdb.com/title/tt9909248/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYzY0ZT...,8.3,1068,"[{'nconst': 'nm0001314', 'primaryName': 'John ...",[]
44983,tt9910728,tvSeries,Cruel Istanbul,Zalim Istanbul,0,2019,2020,120,Drama,1901. Cidade Cruel,Seher vive em Istambul com três filhos. A reun...,https://www.imdb.com/title/tt9910728/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BN2NkMz...,5.8,1251,"[{'nconst': 'nm0476334', 'primaryName': 'Fikre...",[]
44984,tt9911196,movie,The Marriage Escape,De beentjes van Sint-Hildegard,0,2020,\N,103,"Comedy,Drama",2027. De beentjes van Sint-Hildegard,"A middle-aged veterinary surgeon, believing th...",https://www.imdb.com/title/tt9911196/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYjNjZm...,7.4,3318,"[{'nconst': 'nm0277932', 'primaryName': 'Herma...","[{'nconst': 'nm0631590', 'primaryName': 'Johan..."
44985,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,\N,84,Thriller,1246. Il talento del calabrone,"Dj Steph is a young radio deejay on the rise, ...",https://www.imdb.com/title/tt9916270/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BNmVjMT...,5.8,1477,"[{'nconst': 'nm0144812', 'primaryName': 'Sergi...","[{'nconst': 'nm1480867', 'primaryName': 'Giaco..."


In [84]:
# df_movies.rename({'primaryTitle_x': 'primaryTitle', 'primaryTitle_y': 'primaryTitle_ptBr'}, axis=1, inplace=True)
df_movies.primaryTitle_ptBr = df_movies.primaryTitle_ptBr.apply(lambda x: x.split('. ')[-1])

df_movies

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,primaryTitle_ptBr,overview,imdb_link,image_url,averageRating,numVotes,principals,crew
0,tt0002130,movie,Dante's Inferno,L'Inferno,0,1911,\N,71,"Adventure,Drama,Fantasy",Inferno,Loosely adapted from Dante's Divine Comedy and...,https://www.imdb.com/title/tt0002130/?ref_=sr_t_1,https://m.media-amazon.com/images/M/MV5BMzY0NT...,7.0,3430,"[{'nconst': 'nm0660139', 'primaryName': 'Salva...","[{'nconst': 'nm0078205', 'primaryName': 'Franc..."
1,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,Fantômas - À l'ombre de la guillotine,0,1913,\N,54,"Crime,Drama",Fantômas - À l'ombre de la guillotine,Inspector Juve is tasked to investigate and ca...,https://www.imdb.com/title/tt0002844/?ref_=sr_t_2,https://m.media-amazon.com/images/M/MV5BMTQxND...,6.9,2477,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
2,tt0003014,movie,Ingeborg Holm,Ingeborg Holm,0,1913,\N,96,Drama,Ingeborg Holm,Financial struggles separate a single mother f...,https://www.imdb.com/title/tt0003014/?ref_=sr_t_5,https://m.media-amazon.com/images/M/MV5BMTQyND...,7.0,1422,"[{'nconst': 'nm0096737', 'primaryName': 'Hilda...","[{'nconst': 'nm0803705', 'primaryName': 'Victo..."
3,tt0003037,movie,Fantomas: The Man in Black,Juve contre Fantômas,0,1913,\N,61,"Crime,Drama",Juve contre Fantômas,In Part Two of Louis Feuillade's 5 1/2-hour ep...,https://www.imdb.com/title/tt0003037/?ref_=sr_t_4,https://m.media-amazon.com/images/M/MV5BMTFkM2...,6.9,1676,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
4,tt0003165,movie,Fantômas: The Dead Man Who Killed,Le mort qui tue,0,1913,\N,90,"Crime,Drama,Mystery",Le mort qui tue,After a body disappears from inside the prison...,https://www.imdb.com/title/tt0003165/?ref_=sr_t_6,https://m.media-amazon.com/images/M/MV5BMjQwMT...,6.9,1330,"[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44982,tt9909248,tvSeries,Race Across the World,Race Across the World,0,2019,\N,59,"Documentary,Game-Show,Reality-TV",Race Across the World,5 pairs of travellers race from London to the ...,https://www.imdb.com/title/tt9909248/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYzY0ZT...,8.3,1068,"[{'nconst': 'nm0001314', 'primaryName': 'John ...",[]
44983,tt9910728,tvSeries,Cruel Istanbul,Zalim Istanbul,0,2019,2020,120,Drama,Cidade Cruel,Seher vive em Istambul com três filhos. A reun...,https://www.imdb.com/title/tt9910728/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BN2NkMz...,5.8,1251,"[{'nconst': 'nm0476334', 'primaryName': 'Fikre...",[]
44984,tt9911196,movie,The Marriage Escape,De beentjes van Sint-Hildegard,0,2020,\N,103,"Comedy,Drama",De beentjes van Sint-Hildegard,"A middle-aged veterinary surgeon, believing th...",https://www.imdb.com/title/tt9911196/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYjNjZm...,7.4,3318,"[{'nconst': 'nm0277932', 'primaryName': 'Herma...","[{'nconst': 'nm0631590', 'primaryName': 'Johan..."
44985,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,\N,84,Thriller,Il talento del calabrone,"Dj Steph is a young radio deejay on the rise, ...",https://www.imdb.com/title/tt9916270/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BNmVjMT...,5.8,1477,"[{'nconst': 'nm0144812', 'primaryName': 'Sergi...","[{'nconst': 'nm1480867', 'primaryName': 'Giaco..."


In [85]:
# organize the dataset columns and then save it to disk

df_movies = df_movies[['tconst', 'titleType', 'primaryTitle', 'primaryTitle_ptBr', 'overview', 'genres', 'principals', 'crew', 'startYear', 'runtimeMinutes', 'averageRating', 'numVotes', 'isAdult', 'imdb_link', 'image_url']]
df_movies.to_csv(f'{path_to_datasets}/movies.tsv', sep='\t', index=False)

df_movies


Unnamed: 0,tconst,titleType,primaryTitle,primaryTitle_ptBr,overview,genres,principals,crew,startYear,runtimeMinutes,averageRating,numVotes,isAdult,imdb_link,image_url
0,tt0002130,movie,Dante's Inferno,Inferno,Loosely adapted from Dante's Divine Comedy and...,"Adventure,Drama,Fantasy","[{'nconst': 'nm0660139', 'primaryName': 'Salva...","[{'nconst': 'nm0078205', 'primaryName': 'Franc...",1911,71,7.0,3430,0,https://www.imdb.com/title/tt0002130/?ref_=sr_t_1,https://m.media-amazon.com/images/M/MV5BMzY0NT...
1,tt0002844,movie,Fantômas: In the Shadow of the Guillotine,Fantômas - À l'ombre de la guillotine,Inspector Juve is tasked to investigate and ca...,"Crime,Drama","[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis...",1913,54,6.9,2477,0,https://www.imdb.com/title/tt0002844/?ref_=sr_t_2,https://m.media-amazon.com/images/M/MV5BMTQxND...
2,tt0003014,movie,Ingeborg Holm,Ingeborg Holm,Financial struggles separate a single mother f...,Drama,"[{'nconst': 'nm0096737', 'primaryName': 'Hilda...","[{'nconst': 'nm0803705', 'primaryName': 'Victo...",1913,96,7.0,1422,0,https://www.imdb.com/title/tt0003014/?ref_=sr_t_5,https://m.media-amazon.com/images/M/MV5BMTQyND...
3,tt0003037,movie,Fantomas: The Man in Black,Juve contre Fantômas,In Part Two of Louis Feuillade's 5 1/2-hour ep...,"Crime,Drama","[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis...",1913,61,6.9,1676,0,https://www.imdb.com/title/tt0003037/?ref_=sr_t_4,https://m.media-amazon.com/images/M/MV5BMTFkM2...
4,tt0003165,movie,Fantômas: The Dead Man Who Killed,Le mort qui tue,After a body disappears from inside the prison...,"Crime,Drama,Mystery","[{'nconst': 'nm0622772', 'primaryName': 'René ...","[{'nconst': 'nm0275421', 'primaryName': 'Louis...",1913,90,6.9,1330,0,https://www.imdb.com/title/tt0003165/?ref_=sr_t_6,https://m.media-amazon.com/images/M/MV5BMjQwMT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44982,tt9909248,tvSeries,Race Across the World,Race Across the World,5 pairs of travellers race from London to the ...,"Documentary,Game-Show,Reality-TV","[{'nconst': 'nm0001314', 'primaryName': 'John ...",[],2019,59,8.3,1068,0,https://www.imdb.com/title/tt9909248/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYzY0ZT...
44983,tt9910728,tvSeries,Cruel Istanbul,Cidade Cruel,Seher vive em Istambul com três filhos. A reun...,Drama,"[{'nconst': 'nm0476334', 'primaryName': 'Fikre...",[],2019,120,5.8,1251,0,https://www.imdb.com/title/tt9910728/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BN2NkMz...
44984,tt9911196,movie,The Marriage Escape,De beentjes van Sint-Hildegard,"A middle-aged veterinary surgeon, believing th...","Comedy,Drama","[{'nconst': 'nm0277932', 'primaryName': 'Herma...","[{'nconst': 'nm0631590', 'primaryName': 'Johan...",2020,103,7.4,3318,0,https://www.imdb.com/title/tt9911196/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BYjNjZm...
44985,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,"Dj Steph is a young radio deejay on the rise, ...",Thriller,"[{'nconst': 'nm0144812', 'primaryName': 'Sergi...","[{'nconst': 'nm1480867', 'primaryName': 'Giaco...",2020,84,5.8,1477,0,https://www.imdb.com/title/tt9916270/?ref_=sr_...,https://m.media-amazon.com/images/M/MV5BNmVjMT...
