### Expanding the Netflix credits dataset with each person's birthplace

In [51]:
# Imports
import pandas as pd
import numpy as np
import json
import requests
import urllib.parse
from tqdm.auto import tqdm
import threading
import time
import gpt_wrapper
from gpt_wrapper.chat import Chat
import os
import ast

# Register tqdm with pandas (adds the `progress_apply` family of methods).
# NOTE(review): no visible cell uses `progress_apply` — confirm this call is still needed.
tqdm.pandas()

In [2]:
def load_list_from_json(filename):
    """
    Load the content of a json file
    :param filename: name of the file
    :return data: data from the json file (a list or a dict, depending on the file)
    """
    # JSON text is UTF-8; without an explicit encoding the platform default is used,
    # which mis-decodes non-ASCII place names on some systems.
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)


def save_list_to_json(data, filename):
    """
    Save data to a json file
    :param data: data to save (anything json can serialise, e.g. a list or a dict)
    :param filename: name of the file
    :raises TypeError: if data is not json-serialisable; the existing file is then left untouched
    """
    # Serialise first: opening the file in 'w' mode truncates it, so a serialisation
    # error after the open would destroy the previous content.
    serialised = json.dumps(data)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(serialised)

In [3]:
# Country name -> capital city lookup; place_parser reads it when a birthplace
# consists of a single component (treated as a country name).
country_to_capital_dict = load_list_from_json('../data/country_to_capital_city.json')

In [4]:
# Read the TMDB bearer token from the environment so that it never has to be written
# into the notebook; the previous placeholder value is kept as the fallback.
TMBD_API_KEY = os.environ.get('TMDB_API_KEY', 'None')

In [5]:
# Read the API Ninjas key from the environment so that it never has to be written
# into the notebook; the previous placeholder value is kept as the fallback.
NINJA_API_KEY = os.environ.get('NINJA_API_KEY', 'None')

In [6]:
def retrieve_birthplaces_from_tmdb_api(names, results, thread_num, api_key=TMBD_API_KEY):
    """
    Retrieve birthplaces of people through the TMDB API
    :param names: list of names
    :param results: dictionary to store results; results[thread_num] receives one entry per name
                    (the place of birth, or None when the lookup failed)
    :param thread_num: number of the thread
    :param api_key: TMDB bearer token
    """
    # Both endpoints use the same headers, so build them once instead of twice per name.
    headers = {
        "accept": "application/json",
        "Authorization": f"Bearer {api_key}"
    }
    places_of_birth = []
    for name in tqdm(names):
        try:
            # Quoting happens inside the try: a missing (NaN) name makes quote() raise, and that
            # must be recorded as None rather than kill the thread before results is written.
            name_url = urllib.parse.quote(name)
            search_url = f"https://api.themoviedb.org/3/search/person?query={name_url}&include_adult=false&language=en-US&page=1"
            response = requests.get(search_url, headers=headers)
            # Only the first search hit is considered.
            person_id = json.loads(response.text)['results'][0]['id']

            details_url = f"https://api.themoviedb.org/3/person/{str(person_id)}?language=en-US"
            response = requests.get(details_url, headers=headers)
            places_of_birth.append(json.loads(response.text)['place_of_birth'])
        except Exception as e:
            # Best effort: keep one entry per name so the output stays aligned with the input.
            print(f"An error occurred for {name}: {str(e)}")
            places_of_birth.append(None)

    results[thread_num] = places_of_birth

In [7]:
def start_multithreading_birthplaces(func, array_list, num_thread):
    """
    Run func in several threads, one per chunk, and merge what the threads collected
    :param func: worker called as func(chunk['name'], results, thread_index); it is expected
                 to store its output under results[thread_index]
    :param array_list: list of chunks, each of them indexable by 'name'
    :param num_thread: number of threads
    :return: the outputs of all threads concatenated in thread order
    """
    results = {}
    started_at = time.perf_counter()

    workers = []
    for index in range(num_thread):
        worker = threading.Thread(target=func, args=(array_list[index]['name'], results, index))
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    # Merge in thread order so that the output lines up with the input chunks.
    birthplaces_list = [birthplace for index in range(num_thread) for birthplace in results[index]]

    elapsed = time.perf_counter() - started_at
    print(f'Finished in {round(elapsed, 2)} second(s)')
    return birthplaces_list

In [18]:
# Load the original credits table; the retrieved birthplaces are attached to it below
df_credits = pd.read_csv('../data/credits.csv')  # person_id, id, name, character, role

In [None]:
NUM_THREADS = 8
# One chunk of the credits table per worker thread
df_credits_splitted = np.array_split(df_credits, NUM_THREADS)
birthplaces = start_multithreading_birthplaces(retrieve_birthplaces_from_tmdb_api, df_credits_splitted, NUM_THREADS)
# Saved under ../data because that is where the following cell reloads the file from
# (it used to be written to './birthplaces.json', which that cell never reads).
save_list_to_json(birthplaces, '../data/birthplaces.json')

In [None]:
birthplaces = load_list_from_json('../data/birthplaces.json')
# Normalise the literal string 'None' to a real missing value
birthplaces = [birthplace if birthplace != 'None' else None for birthplace in birthplaces]
# Positional assignment: birthplaces must hold one entry per row of df_credits, in row order
df_credits['birthplace'] = birthplaces
# Written under ../data, which is where every later cell reads 'credits_expanded.csv' from
# (it used to be written to 'data/credits_expanded.csv').
df_credits.to_csv('../data/credits_expanded.csv')

### Retrieving latitude and longitude of the birthplaces

In [101]:
def place_parser(place):
    """
    Parse a string containing a place of birth
    :param place: the place to parse, e.g. "Memphis, Tennessee, USA"; components are separated
                  by ',' (or by '-' when the string contains no ',')
    :return: city, state, country; (None, None, None) when place is missing.
             'UK' is returned as 'GB' (without state) and every USA synonym as 'US', except for a
             single-component place, whose country is returned as written together with the
             capital city found in country_to_capital_dict (None when unknown).
    """
    USA_SYNONYMES = ['USA', 'United States', 'United States of America', 'US', 'U.S.A.', 'U.S.A', 'U.S.',
                     'U.S', 'America', 'United States of America (USA)', 'Estados Unidos']

    # Missing values arrive as None or as a float NaN (pandas); neither can be split.
    if not isinstance(place, str):
        return None, None, None

    separator = '-' if ('-' in place and ',' not in place) else ','
    parts = [part.strip() for part in place.split(separator)]
    last = parts[-1]

    if len(parts) == 1:
        # Only a country is given: use its capital as the city.
        lookup_name = 'United States of America' if last in USA_SYNONYMES else last
        try:
            capital_city = country_to_capital_dict[lookup_name]
        except Exception as e:
            print(f"An error occurred in the place_parser: the country {str(e)} is not found !")
            capital_city = None
        return capital_city, None, last

    if last == 'UK':
        # For the UK the first component is kept as the city and the state is dropped.
        return parts[0], None, 'GB'

    # The country is always the LAST component. With five or more components the fourth one
    # used to be returned as the country, which is a region rather than a country.
    country = 'US' if last in USA_SYNONYMES else last

    if len(parts) == 2:
        return parts[0], None, country
    if len(parts) == 3:
        return parts[0], parts[1], country
    # Four or more components: the first one is skipped (in the 4-part case it is e.g. a
    # neighbourhood); the second and third are returned as city and state.
    return parts[1], parts[2], country

In [86]:
def retrieve_latlong_from_ninja_api(places, results, errors, lock_res, lock_error, api_key=NINJA_API_KEY):
    """
    Retrieve latitude and longitude of places through the API Ninjas geocoding endpoint
    :param places: list of place names
    :param results: dictionary to store results (concurrent access), keyed by the parsed place
                    string "city[, state], country"; values are (latitude, longitude)
    :param errors: list to store the raw places that could not be geocoded (concurrent access)
    :param lock_res: lock for the results
    :param lock_error: lock for the errors
    :param api_key: the Ninja API key
    """
    GEOCODING_URL = 'https://api.api-ninjas.com/v1/geocoding'

    def make_api_call(params, api_key, place, place_parsed_string):
        """
        Make an API call to retrieve the latitude and longitude of a place
        :param params: query parameters (city, optional state, country)
        :param api_key: the Ninja API key
        :param place: the raw place, recorded in errors when nothing is found
        :param place_parsed_string: the key under which the coordinates are stored
        """
        # NOTE(review): every non-OK status is retried forever, including ones that will never
        # succeed (e.g. an invalid key); kept as is because it is how throttling is handled.
        while True:
            # Passing params lets requests URL-encode the values (names may contain '&', '#', ...).
            response = requests.get(GEOCODING_URL, params=params, headers={'X-Api-Key': api_key})
            if response.status_code == requests.codes.ok:
                try:
                    city_info = json.loads(response.text)[0]
                    latlong = (city_info['latitude'], city_info['longitude'])
                except Exception as e:
                    with lock_error:
                        errors.append(place)
                    print(f"Error {e}: {place} was not found (was parsed into {place_parsed_string})")
                else:
                    # The tuple is built before the lock is taken and the lock is held through a
                    # context manager: a failure can no longer leave lock_res acquired forever.
                    with lock_res:
                        results[place_parsed_string] = latlong
                break
            else:
                print("Error:", response.status_code, response.text, "Retrying in 5 seconds...")
                time.sleep(5)  # Wait for 5 seconds before retrying because of API Throttled

    for place in tqdm(places):
        city, state, country = place_parser(place)
        if not (city and country):
            continue

        place_parsed_string = ', '.join((city, state, country)) if state else ', '.join((city, country))

        # Unlocked read: at worst two threads geocode the same place and store the same value.
        if place_parsed_string in results:
            continue

        params = {'city': city}
        if state:
            params['state'] = state
        params['country'] = country
        make_api_call(params, api_key, place, place_parsed_string)

In [10]:
def start_multithreading_latlong(func, array_list, results, errors, num_thread):
    """
    Run func concurrently on every chunk, all threads sharing one results dict and one errors list
    :param func: worker called as func(chunk, results, errors, lock_res, lock_error)
    :param array_list: list of chunks, one per thread
    :param results: dictionary handed to every worker together with a lock created here
    :param errors: list handed to every worker together with a second lock created here
    :param num_thread: number of threads
    """
    started_at = time.perf_counter()
    results_lock, errors_lock = threading.Lock(), threading.Lock()

    workers = []
    for index in range(num_thread):
        worker = threading.Thread(target=func,
                                  args=(array_list[index], results, errors, results_lock, errors_lock))
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    elapsed = time.perf_counter() - started_at
    print(f'Finished in {round(elapsed, 2)} second(s)')

In [17]:
NUM_THREADS = 4  # replaces the value of 8 set for the TMDB calls
df = pd.read_csv('../data/credits_expanded.csv')
# Geocode every distinct birthplace only once; missing birthplaces are dropped
birthplaces = df["birthplace"].dropna().unique()
print(f'Number of unique birthplaces: {len(birthplaces)}')
# One chunk of places per worker thread
birthplaces_splitted = np.array_split(birthplaces, NUM_THREADS)

Number of unique birthplaces: 8309


In [None]:
# Shared by all worker threads: parsed place string -> (latitude, longitude) ...
results = {}
# ... and the raw places for which the API returned nothing usable
errors = []
start_multithreading_latlong(retrieve_latlong_from_ninja_api, birthplaces_splitted, results, errors, NUM_THREADS)
# Persist both: the places that were not found are processed with GPT further down
save_list_to_json(results, '../data/place_to_latlong.json')
save_list_to_json(errors, '../data/places_not_found.json')

### Retrieving the country and coordinates of the birthplaces that were not found

In [11]:
gpt_wrapper.api_base = "http://mnlp-backend-938795011.eu-central-1.elb.amazonaws.com"
# Read the key from the environment so that it never has to be written into the notebook;
# the previous placeholder value is kept as the fallback.
gpt_wrapper.api_key = os.environ.get('GPT_WRAPPER_API_KEY', 'None')

In [12]:
def gpt_retrieve(list, counts, thread_num, chuck=20):
    """
    Ask the GPT wrapper to geocode places chunk by chunk, checkpointing the answers after every chunk
    :param list: places to process (the name shadows the builtin `list`; kept for existing callers)
    :param counts: per-thread number of chunks already processed, used to resume an interrupted run
    :param thread_num: number of the thread (selects the entry of counts and names the checkpoint files)
    :param chuck: number of chunks the places are split into
    :raises FileNotFoundError: when resuming (count > 0) and the checkpoint of that count is missing
    """
    chat = None
    list_split = np.array_split(list, chuck)
    count = counts[thread_num]

    # Every chunk is appended to the checkpoint written for the previous one. Check that this
    # checkpoint exists BEFORE spending any budget: a missing file used to raise inside the retry
    # loop below, which then re-sent the same request every 15 seconds forever.
    if count < len(list_split):
        checkpoint = f'../data/gpt_data_thread_{thread_num}_count_{count}.json'
        if not os.path.exists(checkpoint):
            if count == 0:
                save_list_to_json([], checkpoint)
            else:
                # Starting from an empty list here would silently drop the earlier chunks and
                # shift every later answer, so fail loudly instead.
                raise FileNotFoundError(
                    f'Checkpoint {checkpoint} is missing: cannot resume thread {thread_num} from chunk {count}')

    for i, places in tqdm(enumerate(list_split), total=len(list_split)):
        # Skip already processed places
        if i < count:
            continue

        string_of_places = '§'.join('None' if not place else str(place) for place in places)

        new_chat = True
        error = ""
        # Defined up front so that the diagnostics in the except branch cannot fail on an unbound name.
        message = None

        while True:
            # After a failed attempt the error description is prepended to the same list of places.
            content = error + string_of_places
            error = ""
            try:
                if new_chat:
                    chat = Chat.create("Chat Data Viz" + ' ' + str(thread_num) + ' ' + str(i))
                    message = chat.ask(content=content,
                                       instruction="Given a list of places separated by '§', You'll return only a list of dict where each place is processed in the format: (city, state, country, country in ISO 3166-1 alpha-2 format, (latitude, longitude)) with 6 decimal precision for numbers. Put the floats into strings. All names will be in English. For example: {\"city\": \"Paris\", \"state\": \"Île-de-France\", \"country\": \"France\", \"country_code\": \"FR\", \"coordinates\": [\"48.864716\", \"2.349014\"]}. Provide data in json format, as if you did a json.dump with the list, without the ```json ... ``` and with ',' delimiters. The response must have EXACTLY as many elements as the input list. If the place is 'nan' return 'None'. If a particular place appears consecutively multiple times, you should return it the same number of times it occurs in the sequence. If a country doesn't exist anymore, put today's equivalence (ex: Russia instead of USSR).")
                    new_chat = False
                else:
                    message = chat.ask(content=content)

                response = json.loads(str(message))

                if len(response) == len(places):
                    path = f'../data/gpt_data_thread_{thread_num}_count_{count}.json'
                    new_path = f'../data/gpt_data_thread_{thread_num}_count_{count + 1}.json'
                    data = load_list_from_json(path)
                    data.extend(response)
                    # Write the new checkpoint before deleting the old one: a failed write must
                    # not lose the answers collected so far.
                    save_list_to_json(data, new_path)
                    os.remove(path)
                    count += 1
                    break
                else:
                    error = f'You made an error in the previous generation: wrong generated length {len(response)} instead of {len(places)}. Do it again: \n\n\n'
                    print(f'Error: wrong generated length {len(response)} instead of {len(places)} :(.  Retrying...')
                    print(f'Input message: {string_of_places}')
                    print(f'Generated message: {response}')

            except Exception as e:
                if 'context_length_exceeded' in str(e):
                    # The conversation grew too long: start over in a fresh chat.
                    new_chat = True
                elif "Expecting \',\' delimiter" in str(e) or "Expecting value" in str(e):
                    # The answer was not valid json: ask again, quoting the parser error.
                    error = f'You made an error in the previous generation: {str(e)}. Do it again with the same list WITHOUT the fucking error: \n\n\n'
                    print(f"An error occurred: {str(e)}")
                    print(places)
                    print(' what you gave me: ' + str(message) + '\n\n')
                else:
                    print(f"An error occurred: {str(e)}")
                    print('Retrying in 15 seconds...')
                    time.sleep(15)

In [13]:
def start_multithreading_gpt(func, array_list, counts, num_thread):
    """
    Run func concurrently on every chunk and wait for all threads to finish
    :param func: worker called as func(chunk, counts, thread_index)
    :param array_list: list of chunks, one per thread
    :param counts: object handed unchanged to every worker
    :param num_thread: number of threads
    """
    started_at = time.perf_counter()

    workers = []
    for index in range(num_thread):
        worker = threading.Thread(target=func, args=(array_list[index], counts, index))
        worker.start()
        workers.append(worker)

    for worker in workers:
        worker.join()

    elapsed = time.perf_counter() - started_at
    print(f'Finished in {round(elapsed, 2)} second(s)')

In [14]:
# Show the budget of the GPT wrapper account (the recorded output has 'limit' and 'usage' entries)
Chat.budget()

{'limit': 10000000, 'usage': 3607323}

In [37]:
NUM_THREADS = 4
# Places the geocoding API could not resolve; they are sent to GPT instead
places = load_list_from_json('../data/places_not_found.json')
# One chunk of places per worker thread
places_split = np.array_split(places, NUM_THREADS)

638

In [24]:
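# One-off initialisation for a fresh GPT run (uncomment, run once, comment out again):
# gpt_retrieve appends every chunk to the checkpoint file
# '../data/gpt_data_thread_{i}_count_{count}.json', and this creates the empty 'count_0' files.
#for i in range(4):
#    path = f'../data/gpt_data_thread_{i}_count_0.json'
#    save_list_to_json([], path)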

In [34]:
# Per-thread number of CHUNKS already processed (gpt_retrieve skips chunk i while i < counts[thread]);
# lets an interrupted run resume where it stopped
counts = [8, 20, 20, 20]
start_multithreading_gpt(gpt_retrieve, places_split, counts, NUM_THREADS)

  0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [44]:
# Fetch the threads' result and concatenate them
# (20 is the number of chunks per thread, i.e. the name of the final checkpoint file)
list_gpt = []
for i in range(NUM_THREADS):
    data = load_list_from_json(f'../data/gpt_data_thread_{i}_count_{20}.json')
    list_gpt.extend(data)

# The answers are matched to the places by position, so both must have the same length
assert len(list_gpt) == len(places), f'{len(list_gpt)} answers for {len(places)} places'

# Not called `dict`: that name would shadow the builtin for the rest of the notebook
place_to_gpt_info = {place: info for place, info in zip(places, list_gpt)}

# File name aligned with the cells that load it ('place_not_found_to_latlong.json');
# it used to be written as 'places_not_found_to_latlong.json', which nothing reads.
save_list_to_json(place_to_gpt_info, '../data/place_not_found_to_latlong.json')

### Constructing the new data frame summarizing all the data


In [87]:
# Geocoding results of the API ...
place_to_latlong = load_list_from_json('../data/place_to_latlong.json')
# ... and GPT answers for the places the API could not resolve
place_not_found_to_latlong = load_list_from_json('../data/place_not_found_to_latlong.json')

# Credits table with the raw 'birthplace' column
df_credits_expanded = pd.read_csv('../data/credits_expanded.csv')
df_credits_expanded

Unnamed: 0.1,Unnamed: 0,person_id,id,name,character,role,birthplace
0,0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR,"Greenwich Village, New York City, New York, USA"
1,1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR,"Los Angeles, California, USA"
2,2,7064,tm84618,Albert Brooks,Tom,ACTOR,"Beverly Hills, California, USA"
3,3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR,"Brooklyn, New York City, New York, USA"
4,4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR,"Memphis, Tennessee, USA"
...,...,...,...,...,...,...,...
77796,77796,736339,tm1059008,Adelaida Buscato,María Paz,ACTOR,
77797,77797,399499,tm1059008,Luz Stella Luengas,Karen Bayona,ACTOR,"Medellin, Antioquia, Colombia"
77798,77798,373198,tm1059008,Inés Prieto,Fanny,ACTOR,
77799,77799,378132,tm1059008,Isabel Gaona,Cacica,ACTOR,


In [106]:
# Append ISO to place_to_latlong: re-key the geocoding results so that every key ends with
# the ISO code of its country
data = load_list_from_json('../data/place_to_latlong.json')
# Country name -> ISO code lookup read by append_iso
country_to_ISO = load_list_from_json('../data/country_to_ISO.json')
# Receives the re-keyed entries
new_data = {}

def append_iso(string_place):
    """
    Append the ISO country code to a comma-separated place string
    :param string_place: place string whose last component is the country
    :return: for a place ending in 'US' or 'GB', the stripped components that precede the first
             'US'/'GB' component followed by the full country name and the code; otherwise the
             input string followed by the code found in country_to_ISO ("None" when unknown)
    """
    expanded_names = {'US': 'United States', 'GB': 'United Kingdom'}
    components = [component.strip() for component in string_place.split(',')]
    country = components[-1]

    if country not in expanded_names:
        return string_place + ', ' + country_to_ISO.get(country, "None")

    # Everything before the first occurrence of the code is kept, each part followed by ', '.
    leading = components[:components.index(country)]
    return ''.join(component + ', ' for component in leading) + expanded_names[country] + ', ' + country


# Rebuild the mapping under the ISO-suffixed keys; the coordinates are copied unchanged
for k, v in data.items():
    new_k = append_iso(k)
    new_data[new_k] = v
    
    
save_list_to_json(new_data, '../data/new_place_to_latlong.json')

In [107]:
# Start again from the original credits table; the new columns are added to it below
df_credits = pd.read_csv('../data/credits.csv')
# GPT answers; get_place_info looks them up by the raw birthplace string
place_not_found_to_latlong = load_list_from_json('../data/place_not_found_to_latlong.json')
# Geocoding results under the ISO-suffixed keys produced by append_iso
new_place_to_latlong = load_list_from_json('../data/new_place_to_latlong.json')

# One entry per row of df_credits_expanded, filled by the loop below
cities = []
states = []
countries = []
isos = []
latlongs = []

def decode_latlong(latlong):
    """
    Convert a stored coordinate pair into a tuple of floats
    :param latlong: pair (latitude, longitude) of numbers or numeric strings, or a missing
                    marker (None, "None", empty)
    :return: (latitude, longitude) as floats, or None when the coordinates are missing or
             cannot be converted
    """
    if not latlong or latlong == "None":
        return None
    try:
        return float(latlong[0]), float(latlong[1])
    except (TypeError, ValueError, IndexError, KeyError):
        # Covers ["None", "None"], a pair with a single missing component (this used to raise)
        # and any malformed entry.
        return None

def get_place_info(place):
    """
    Collect everything known about a birthplace
    :param place: the raw birthplace string
    :return: city, state, country, iso, latlong; five None when the place is missing, cannot be
             parsed into a country, or has no usable GPT answer
    """
    missing = (None, None, None, None, None)
    if not place:
        return missing

    if place in place_not_found_to_latlong:
        infos = place_not_found_to_latlong[place]
        # GPT was instructed to answer 'None' for some inputs, so an entry is not always a dict.
        if not isinstance(infos, dict):
            return missing
        return (infos.get('city', None),
                infos.get('state', None),
                infos.get('country', None),
                infos.get('country_code', None),
                decode_latlong(infos.get('coordinates')))

    city, state, country = place_parser(place)
    if not country:
        return missing

    # Keep only the components that are present. They must stay a sequence of strings: joining
    # a bare country string used to join its characters ("F, r, a, n, c, e").
    if not city:
        components = (country,)
    elif not state:
        components = (city, country)
    else:
        components = (city, state, country)

    key = append_iso(', '.join(components))
    # append_iso ends the key with "<country name>, <ISO code>"
    key_parts = key.split(', ')
    country = key_parts[-2]
    iso = key_parts[-1]

    if key in new_place_to_latlong:
        latlong = decode_latlong(new_place_to_latlong[key])
    else:
        latlong = None

    return city, state, country, iso, latlong


# Iterate over each row in the DataFrame
# NOTE(review): the rows come from df_credits_expanded while the columns are written to
# df_credits (re-read from credits.csv); this assumes both files hold the same rows in the
# same order — confirm.
for index, row in df_credits_expanded.iterrows():
    # No birthplace: keep the row aligned with an empty entry in every column
    if pd.isna(df_credits_expanded.at[index, 'birthplace']):
        cities.append(None)
        states.append(None)
        countries.append(None)
        isos.append(None)
        latlongs.append(None)
        continue
    
    place = row['birthplace']
    # (city, state, country, iso, latlong)
    place_info = get_place_info(place)
    cities.append(place_info[0])
    states.append(place_info[1])
    countries.append(place_info[2])
    isos.append(place_info[3])
    latlongs.append(place_info[4])
    
df_credits['city'] = cities
df_credits['state'] = states
df_credits['country'] = countries
df_credits['iso'] = isos
df_credits['latlong'] = latlongs

df_credits.to_csv('../data/new_credits.csv')

An error occurred in the place_parser: the country 'Hong Kong' is not found !
An error occurred in the place_parser: the country 'Peshawar' is not found !
An error occurred in the place_parser: the country 'مصر' is not found !
An error occurred in the place_parser: the country 'Peshawar' is not found !
An error occurred in the place_parser: the country 'New York City' is not found !
An error occurred in the place_parser: the country 'Como (Italy)' is not found !
An error occurred in the place_parser: the country 'USSR [now Russia]' is not found !
An error occurred in the place_parser: the country '日本，静冈县，磐田市' is not found !
An error occurred in the place_parser: the country 'Hong Kong' is not found !
An error occurred in the place_parser: the country 'Prayagraj' is not found !
An error occurred in the place_parser: the country 'Delhi' is not found !
An error occurred in the place_parser: the country 'Hong Kong' is not found !
An error occurred in the place_parser: the country 'Arizona'

In [108]:
# Reload the file that was just written to inspect the new columns
df = pd.read_csv('../data/new_credits.csv')
df

Unnamed: 0.1,Unnamed: 0,person_id,id,name,character,role,city,state,country,iso,latlong
0,0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR,New York City,New York,United States,US,"(40.7127281, -74.0060152)"
1,1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR,Los Angeles,California,United States,US,"(34.0536909, -118.242766)"
2,2,7064,tm84618,Albert Brooks,Tom,ACTOR,Beverly Hills,California,United States,US,"(34.0696501, -118.3963062)"
3,3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR,New York City,New York,United States,US,"(40.7127281, -74.0060152)"
4,4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR,Memphis,Tennessee,United States,US,"(35.1335022, -89.9668758)"
...,...,...,...,...,...,...,...,...,...,...,...
77796,77796,736339,tm1059008,Adelaida Buscato,María Paz,ACTOR,,,,,
77797,77797,399499,tm1059008,Luz Stella Luengas,Karen Bayona,ACTOR,Medellin,Antioquia,Colombia,CO,"(6.2443382, -75.573553)"
77798,77798,373198,tm1059008,Inés Prieto,Fanny,ACTOR,,,,,
77799,77799,378132,tm1059008,Isabel Gaona,Cacica,ACTOR,,,,,
