### Expansion of the Netflix dataset with the people's birthplace

In [1]:
# Imports
import pandas as pd
import numpy as np
import json
import requests
import aiohttp
import asyncio
import urllib.parse
from tqdm.auto import tqdm
import threading
import time

tqdm.pandas()

In [2]:
def load_list_from_json(filename):
    """
    Load list from json file
    :param filename: name of the file
    :return data: data from the json file
    """
    with open(filename, 'r') as f:
        data = json.load(f)
    return data


def save_list_to_json(data, filename):
    """
    Save list to json file
    :param data: data to save
    :param filename: name of the file
    """
    with open(filename, 'w') as f:
        json.dump(data, f)

In [3]:
TMBD_API_KEY = 'eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJiNGE2NTk5ZjMyODRjZjkzOGQ4YzMwMmM1NGY3ZTIyZiIsInN1YiI6IjY2MWZkMzVlYTM5ZDBiMDE0YTU1MGU5ZSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.8um8fPsP5yKxzXYZHZy-5oxv_2EEeAVoX8Rz7J2aUKA'

In [4]:
def retrieve_birthplaces_from_tmdb_api(names, results, thread_num, api_key=TMBD_API_KEY):
    """
    Retrieve birthplaces of people
    :param names: list of names
    :param results: dictionary to store results
    :param thread_num: number of the thread
    :return places_of_birth: list of birthplaces 
    """
    places_of_birth = []
    for name in tqdm(names):
        name_url = urllib.parse.quote(name)
        url = f"https://api.themoviedb.org/3/search/person?query={name_url}&include_adult=false&language=en-US&page=1"
        headers = {
            "accept": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        try:
            response = requests.get(url, headers=headers)
            response_dict = json.loads(response.text)
            person_id = response_dict['results'][0]['id']
            url = f"https://api.themoviedb.org/3/person/{str(person_id)}?language=en-US"
            headers = {
                "accept": "application/json",
                "Authorization": f"Bearer {api_key}"
            }
            response = requests.get(url, headers=headers)
            response_dict = json.loads(response.text)
            places_of_birth.append(response_dict['place_of_birth'])
        except Exception as e:
            print(f"An error occurred for {name}: {str(e)}")
            places_of_birth.append(None)

    results[thread_num] = places_of_birth

In [5]:
def start_multithreading(func, array_list, num_thread):
    """
    Start multithreading 
    :param func: function to be executed
    :param array_list: list of arrays to be passed to the function
    :param num_thread: number of threads
    :return: list of results
    """
    birthplaces_list = []
    results = {}
    start = time.perf_counter()
    threads = []

    for i in range(num_thread):
        t = threading.Thread(target=func, args=(array_list[i]['name'], results, i))
        t.start()
        threads.append(t)

    for thread in threads:
        thread.join()

    for i in range(num_thread):
        birthplaces_list.extend(results[i])

    finish = time.perf_counter()
    print(f'Finished in {round(finish - start, 2)} second(s)')
    return birthplaces_list

In [15]:
# Read actual file
df_credits = pd.read_csv('data/credits.csv')  # person_id, id, name, character, role

In [7]:
NUM_THREADS = 8
df_credits_splitted = np.array_split(df_credits, NUM_THREADS)
birthplaces = start_multithreading(retrieve_birthplaces_from_tmdb_api, df_credits_splitted, NUM_THREADS)
save_list_to_json(birthplaces, './birthplaces.json')

  0%|          | 0/9726 [00:00<?, ?it/s]

  0%|          | 0/9725 [00:00<?, ?it/s]

  0%|          | 0/9725 [00:00<?, ?it/s]

  0%|          | 0/9725 [00:00<?, ?it/s]

  0%|          | 0/9725 [00:00<?, ?it/s]

  0%|          | 0/9725 [00:00<?, ?it/s]

  0%|          | 0/9725 [00:00<?, ?it/s]

  0%|          | 0/9725 [00:00<?, ?it/s]

An error occurred for Murali Sharma: list index out of range
An error occurred for Copper Cunningham: list index out of range
An error occurred for Motaz El Demerdash: list index out of range
An error occurred for Aly Mourad: list index out of range
An error occurred for Dany Verissimo-Petit: list index out of range
An error occurred for Pushtiie Shakti: list index out of range
An error occurred for Pushtiie Shakti: list index out of range
An error occurred for Chhota Bheem: list index out of range
An error occurred for Chutki: list index out of range
An error occurred for Bholu: list index out of range
An error occurred for Nanami Kawakami: list index out of range
An error occurred for Loutfi El Hakim: list index out of range
An error occurred for F. El Demerdache: list index out of range
An error occurred for Said El Araby: list index out of range
An error occurred for Naima Wasfy: list index out of range
An error occurred for Ahmed El Tantawi: list index out of range
An error occurr

In [23]:
birthplaces = load_list_from_json('data/birthplaces.json')
birthplaces = [birthplace if birthplace != 'None' else None for birthplace in birthplaces]
df_credits['birthplace'] = birthplaces
df_credits.to_csv('data/credits_expanded.csv')