In [5]:
import os

import numpy as np
import pandas as pd
from geopy.distance import geodesic

The datasets here were collected from 2 different sources:
- https://simplemaps.com/data/tz-cities
- https://worldpopulationreview.com/countries/cities/tanzania

In [6]:
# Folder that holds the raw CSV files, relative to this notebook
directory_to_data = os.path.join('..', '..', 'data')

# Tanzania cities: coordinates & population dataset
tanzania_cities = pd.read_csv(os.path.join(directory_to_data, 'tz.csv'))

# 2021 population dataset: keep only the name and 2021 columns and rename
# them so the city name lines up with the `city` column used for the merge
tanzania_pop = (
    pd.read_csv(os.path.join(directory_to_data, 'csvData.csv'))
    .loc[:, ['name', '2021']]
    .rename(columns={'name': 'city', '2021': 'population_2021'})
)

# Left-merge on city name: every row of the 2021 population table is kept,
# and cities without a match in tz.csv end up with missing values
tz_df = pd.merge(tanzania_pop, tanzania_cities, how='left', on='city')
tz_df.head()

# Drop the two population columns that came from `tanzania_cities` (tz.csv),
# then add a column that stores each city's coordinates as a (lat, lng) tuple
tz_df = (
    tz_df
    .drop(columns=['population', 'population_proper'])
    .assign(coordinates=lambda frame: list(zip(frame['lat'], frame['lng'])))
)

Next, I select the cities with a 2021 population above 100,000 (this threshold is currently arbitrary).

In [8]:
# Minimum 2021 population for a city to count as "big".
# The cut-off is arbitrary; change it here to tune the selection.
MIN_BIG_CITY_POPULATION = 100000

# Keep the big cities that also have usable coordinates: rows with a missing
# latitude and/or longitude (e.g. cities left unmatched by the left merge)
# are dropped. The trailing copy() makes the result an independent frame.
tz_pop_above100k = (
    tz_df
    .loc[tz_df['population_2021'] > MIN_BIG_CITY_POPULATION]
    .dropna(subset=['lat', 'lng'])
    .copy()
)

# Persist the filtered cities next to the raw data
tz_pop_above100k.to_pickle(os.path.join(directory_to_data, 'tanzania_big_cities.pkl'))

In [9]:
def closest_point(x, cities=None):
    '''
    Find the big city nearest to point x.

    Parameters:
    - x: a point accepted by `geodesic`, e.g. a (latitude, longitude) tuple
      like the values stored in the `coordinates` column
    - cities: optional DataFrame with a `city` column (names) and a
      `coordinates` column (points); defaults to the notebook-level
      `tz_pop_above100k` frame (cities with population over 100000)

    Returns: a tuple, or None when `cities` has no rows
    - tuple[0]: name of nearest big city
    - tuple[1]: geodesic distance in km from point x to nearest big city (tuple[0])
    '''

    if cities is None:
        # Looked up at call time, so the function always sees the current frame
        cities = tz_pop_above100k

    # Zip only the two needed columns instead of iterrows(), which builds a
    # full Series for every row on every call
    candidates = (
        (city_name, geodesic(x, coordinates).km)
        for city_name, coordinates in zip(cities['city'], cities['coordinates'])
    )

    # min() keeps the first of several equally close cities, matching a
    # strict "<" comparison in a manual loop
    return min(candidates, key=lambda candidate: candidate[1], default=None)