### Expanding the Netflix credits dataset with each person's birthplace

In [1]:
# Imports
import json
import os
import threading
import time
import urllib.parse

import numpy as np
import pandas as pd
import requests
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
def load_list_from_json(filename):
    """
    Load the content of a JSON file
    :param filename: path of the JSON file to read
    :return data: the deserialised JSON content (a list or a dict, depending on the file)
    :raises OSError: if the file cannot be opened
    :raises json.JSONDecodeError: if the file does not contain valid JSON
    """
    # JSON text is UTF-8; without an explicit encoding open() falls back to the platform
    # locale, which can fail on non-ASCII place names (e.g. on Windows).
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_list_to_json(data, filename):
    """
    Serialise data as JSON and write it to a file (an existing file is overwritten)
    :param data: JSON-serialisable object to write
    :param filename: path of the destination file
    """
    with open(filename, mode='w') as out_file:
        json.dump(data, fp=out_file)

In [3]:
# Lookup table indexed by country name; place_parser uses it to get a capital city when a
# birthplace consists of a country only.
country_to_capital_dict = load_list_from_json('../data/country_to_capital_city.json')

In [4]:
# TMDB API token, sent as a Bearer token by retrieve_birthplaces_from_tmdb_api.
# It is a secret, so it is read from the TMDB_API_KEY environment variable instead of being
# stored in the notebook. The token that used to be hardcoded here has been exposed and
# should be revoked.
TMBD_API_KEY = os.environ.get('TMDB_API_KEY')
if not TMBD_API_KEY:
    print("Warning: the TMDB_API_KEY environment variable is not set, TMDB requests will be rejected.")

In [5]:
# API Ninjas key, sent in the X-Api-Key header by retrieve_latlong_from_ninja_api.
# It is a secret, so it is read from the NINJA_API_KEY environment variable instead of being
# stored in the notebook. The key that used to be hardcoded here has been exposed and
# should be revoked.
NINJA_API_KEY = os.environ.get('NINJA_API_KEY')
if not NINJA_API_KEY:
    print("Warning: the NINJA_API_KEY environment variable is not set, geocoding requests will be rejected.")

In [6]:
def retrieve_birthplaces_from_tmdb_api(names, results, thread_num, api_key=TMBD_API_KEY, timeout=10):
    """
    Retrieve birthplaces of people
    For each name, the TMDB person search is queried and the place of birth of the first match
    is kept. None is stored whenever a lookup fails, so the output has one entry per name.
    :param names: list of names
    :param results: dictionary to store results (the list of birthplaces is stored under thread_num)
    :param thread_num: number of the thread
    :param api_key: TMDB API token, sent as a Bearer token
    :param timeout: maximum number of seconds to wait for each HTTP response
    """
    # The same headers are used for the search request and the person details request
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    places_of_birth = []
    for name in tqdm(names):
        try:
            # Quoting is done inside the try block: a missing name (NaN) raises a TypeError that
            # would otherwise kill the thread before anything is stored in results.
            name_url = urllib.parse.quote(name)
            search_url = f"https://api.themoviedb.org/3/search/person?query={name_url}&include_adult=false&language=en-US&page=1"
            # Without a timeout a stalled connection would block this thread (and the join) forever
            response = requests.get(search_url, headers=headers, timeout=timeout)
            response.raise_for_status()
            # Only the first search result is considered; no result raises an IndexError
            person_id = json.loads(response.text)['results'][0]['id']

            person_url = f"https://api.themoviedb.org/3/person/{str(person_id)}?language=en-US"
            response = requests.get(person_url, headers=headers, timeout=timeout)
            response.raise_for_status()
            places_of_birth.append(json.loads(response.text)['place_of_birth'])
        except Exception as e:
            # Best effort: log the failure and keep a placeholder so that indexes stay aligned
            print(f"An error occurred for {name}: {str(e)}")
            places_of_birth.append(None)

    results[thread_num] = places_of_birth

In [7]:
def start_multithreading_birthplaces(func, array_list, num_thread):
    """
    Start multithreading
    One thread is started per chunk; each thread receives the 'name' column of its chunk, the
    shared results dictionary and its own index, and is expected to store its output under
    that index.
    :param func: function to be executed
    :param array_list: list of arrays to be passed to the function
    :param num_thread: number of threads
    :return: list of results, concatenated in chunk order
    """
    results = {}
    start = time.perf_counter()

    threads = []
    for i in range(num_thread):
        t = threading.Thread(target=func, args=(array_list[i]['name'], results, i))
        t.start()
        threads.append(t)

    for thread in threads:
        thread.join()

    birthplaces_list = []
    for i in range(num_thread):
        if i in results:
            birthplaces_list.extend(results[i])
        else:
            # A worker that crashed never stored its output. Its chunk is filled with None so
            # that the work of the other threads is kept and the list stays aligned with the
            # input rows.
            print(f'Thread {i} did not return any result, its birthplaces are set to None')
            birthplaces_list.extend([None] * len(array_list[i]['name']))

    finish = time.perf_counter()
    print(f'Finished in {round(finish - start, 2)} second(s)')
    return birthplaces_list

In [8]:
# Read the original credits file; its 'name' column provides the names looked up on TMDB
df_credits = pd.read_csv('../data/credits.csv')  # person_id, id, name, character, role

In [None]:
# Look up the birthplace of every credited person, one chunk of the credits per thread
NUM_THREADS = 8
df_credits_splitted = np.array_split(df_credits, NUM_THREADS)
birthplaces = start_multithreading_birthplaces(retrieve_birthplaces_from_tmdb_api, df_credits_splitted, NUM_THREADS)
# Saved in the data folder, which is where the next cell reloads the list from
save_list_to_json(birthplaces, '../data/birthplaces.json')

In [None]:
birthplaces = load_list_from_json('../data/birthplaces.json')
# Entries equal to the string 'None' are treated as missing values
birthplaces = [None if birthplace == 'None' else birthplace for birthplace in birthplaces]
df_credits['birthplace'] = birthplaces
# Written to the data folder, which is where the geocoding section reads the expanded credits from
df_credits.to_csv('../data/credits_expanded.csv')

### Retrieving latitude and longitude of the birthplaces

In [9]:
def place_parser(place):
    """
    Parse a string containing a place of birth
    The components are separated by commas, or by dashes when the string contains no comma.
    The last component is the country: 'UK' is returned as 'GB' (without state) and the
    USA synonyms as 'US'. With a single component, the place is a country and its capital
    is returned as the city. With four components or more, the city and the state are the
    two components that precede the country.
    :param place: the place to parse (None or NaN when unknown)
    :return: city, state, country (each of them can be None)
    """
    USA_SYNONYMES = ['USA', 'United States', 'United States of America', 'US', 'U.S.A.', 'U.S.A', 'U.S.', 'U.S', 'America', 'United States of America (USA)', 'Estados Unidos']

    # Missing values can be None or NaN (a float): neither can be split
    if not isinstance(place, str):
        return None, None, None

    separator = '-' if '-' in place and ',' not in place else ','
    parts = [part.strip() for part in place.split(separator)]
    country = parts[-1]

    if len(parts) == 1:
        try:
            capital_city = country_to_capital_dict[country]
        except Exception as e:
            print(f"An error occurred in the place_parser: the country {str(e)} is not found !")
            capital_city = None
        return capital_city, None, country

    if country == 'UK':
        return parts[0], None, 'GB'
    if country in USA_SYNONYMES:
        country = 'US'

    if len(parts) == 2:
        return parts[0], None, country
    if len(parts) == 3:
        return parts[0], parts[1], country
    # Four components or more: index from the end so that the country is always the last
    # component (the one tested above), whatever the number of leading components.
    return parts[-3], parts[-2], country

In [10]:


def retrieve_latlong_from_ninja_api(places, results, errors, lock_res, lock_error, api_key=NINJA_API_KEY,
                                    max_retries=5, retry_delay=5, timeout=10):
    """
    Retrieve latitude and longitude of a place
    Each place is parsed with place_parser, then geocoded unless its parsed form is already
    in results. Places without both a city and a country are skipped.
    :param places: list of place names
    :param results: dictionary to store results (concurrent access), parsed place -> (latitude, longitude)
    :param errors: list to store errors (concurrent access), holds the places that were not found
    :param lock_res: lock for the results
    :param lock_error: lock for the errors
    :param api_key: the Ninja API key
    :param max_retries: maximum number of attempts for one place before giving up
    :param retry_delay: number of seconds to wait between two attempts
    :param timeout: maximum number of seconds to wait for each HTTP response
    """
    api_url = 'https://api.api-ninjas.com/v1/geocoding'

    def record_error(place):
        """
        Add a place to the shared list of errors
        :param place: the place that could not be geocoded
        """
        with lock_error:
            errors.append(place)

    def make_api_call(params, place, place_parsed_string):
        """
        Make an API call to retrieve the latitude and longitude of a place
        :param params: query parameters of the request (city, country and optionally state)
        :param place: the original place string, reported in case of error
        :param place_parsed_string: the parsed place, used as key in the results
        """
        # The number of attempts is bounded: a permanent failure (e.g. a rejected API key)
        # would otherwise be retried forever.
        for _ in range(max_retries):
            try:
                # Passing params lets requests URL-encode the values (spaces, '&', '#', ...)
                response = requests.get(api_url, params=params, headers={'X-Api-Key': api_key}, timeout=timeout)
            except requests.exceptions.RequestException as e:
                print("Error:", e, f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                continue

            if response.status_code != requests.codes.ok:
                print("Error:", response.status_code, response.text, f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)  # Wait before retrying because of API Throttled
                continue

            try:
                # An empty answer raises an IndexError: the place is unknown to the API
                city_info = json.loads(response.text)[0]
                latlong = (city_info['latitude'], city_info['longitude'])
            except Exception as e:
                record_error(place)
                print(f"Error {e}: {place} was not found (was parsed into {place_parsed_string})")
            else:
                # The lock is only taken once the value is known, and 'with' always releases it
                with lock_res:
                    results[place_parsed_string] = latlong
            return

        record_error(place)
        print(f"Error: giving up on {place} after {max_retries} attempts (was parsed into {place_parsed_string})")

    for place in tqdm(places):
        city, state, country = place_parser(place)
        if not (city and country):
            continue

        params = {'city': city, 'country': country}
        if state:
            params['state'] = state
            place_parsed_string = ', '.join([city, state, country])
        else:
            place_parsed_string = ', '.join([city, country])

        # Several birthplaces can be parsed into the same place: only query it once
        if place_parsed_string in results:
            continue
        make_api_call(params, place, place_parsed_string)

In [11]:
def start_multithreading_latlong(func, array_list, results, errors, num_thread):
    """
    Run func in several threads and wait for all of them to finish
    Each thread receives one element of array_list, the shared results and errors
    containers, and two locks (the first one for the results, the second one for the errors).
    :param func: function executed by every thread
    :param array_list: list of arrays, the i-th array is passed to the i-th thread
    :param results: dictionary shared by the threads to store results
    :param errors: list shared by the threads to store errors
    :param num_thread: number of threads to start
    """
    started_at = time.perf_counter()
    results_lock = threading.Lock()
    errors_lock = threading.Lock()

    workers = []
    for index in range(num_thread):
        worker = threading.Thread(
            target=func,
            args=(array_list[index], results, errors, results_lock, errors_lock),
        )
        worker.start()
        workers.append(worker)

    # Block until every worker is done before reporting the elapsed time
    for worker in workers:
        worker.join()

    elapsed = time.perf_counter() - started_at
    print(f'Finished in {round(elapsed, 2)} second(s)')

In [12]:
# Number of threads used for the geocoding requests (one chunk of birthplaces per thread)
NUM_THREADS = 4
df = pd.read_csv('../data/credits_expanded.csv')
# Only distinct, non-missing birthplaces are kept
birthplaces = df["birthplace"].dropna().unique()
print(f'Number of unique birthplaces: {len(birthplaces)}')
# One chunk per thread
birthplaces_splitted = np.array_split(birthplaces, NUM_THREADS)

Number of unique birthplaces: 8309


In [13]:
# Shared containers filled by the threads:
# results maps a parsed place string to its (latitude, longitude) pair,
# errors collects the original place strings for which no coordinates were found.
results = {}
errors = []
start_multithreading_latlong(retrieve_latlong_from_ninja_api, birthplaces_splitted, results, errors, NUM_THREADS)
# Persist both outputs in the data folder
save_list_to_json(results, '../data/place_to_latlong.json')
save_list_to_json(errors, '../data/places_not_found.json')

  0%|          | 0/2077 [00:00<?, ?it/s]

  0%|          | 0/2077 [00:00<?, ?it/s]

  0%|          | 0/2077 [00:00<?, ?it/s]

  0%|          | 0/2078 [00:00<?, ?it/s]

Error list index out of range: South East London - England - UK was not found (was parsed into South East London, GB)
Error list index out of range: Agouk, Ivory Coast was not found (was parsed into Agouk, Ivory Coast)
Error list index out of range: Voskresensk, Moscow Oblast, Russian SFSR, USSR [now Russia] was not found (was parsed into Moscow Oblast, Russian SFSR, USSR [now Russia])
Error list index out of range: County Cork, Ireland was not found (was parsed into County Cork, Ireland)
Error list index out of range: Shreveport. Louisiana, USA was not found (was parsed into Shreveport. Louisiana, US)
An error occurred in the place_parser: the country 'America' is not found !
Error list index out of range: Rabun County, Georgia, USA was not found (was parsed into Rabun County, Georgia, US)
Error list index out of range: South Carolina, USA was not found (was parsed into South Carolina, US)
Error list index out of range: Tangiers, Morocco was not found (was parsed into Tangiers, Morocc