In [7]:
## Imports
import os
from pprint import pprint

import pandas as pd
from bs4 import BeautifulSoup as soup
from splinter import Browser

Source:
https://ghdx.healthdata.org/countries

In [8]:
## Web Scraping:
# Set up Splinter
browser = Browser('chrome')

# Visit the IHME site with regions and countries
url = 'https://ghdx.healthdata.org/countries'

# Only the page's HTML string is needed from the browser, so it is captured
# inside try/finally: the Chrome session is closed even when the visit fails.
try:
    browser.visit(url)

    # Scrape the website
    html = browser.html
finally:
    # Quits the browsing session
    browser.quit()

# Create a BeautifulSoup object from the scraped HTML
countries_page = soup(html, 'html.parser')

# Get the container with relevant data
all_columns = countries_page.find('div', class_='view')

# List with html classes containing desired data
columns = ["country-col1","country-col2","country-col3"]

# List of {'country': ..., 'region': ...} records, one per scraped country link
regions_with_countries = []

# Loops through list of html sections
for column in columns:
    # Grabs the html section of interest
    html_column = all_columns.find("div", class_=column)

    # Region names (<h3> headings) in the order they appear in this section
    regions = [region.text for region in html_column.find_all("h3")]

    # The n-th <ul> in the section is paired with the n-th <h3> region name
    for region_index, country_set in enumerate(html_column.find_all("ul")):
        for country in country_set.find_all("a"):
            regions_with_countries.append({
                # Keeps only the link text that precedes the first " ("
                'country': country.text.split(" (")[0],
                'region': regions[region_index],
            })

In [19]:
# Makes a DataFrame of country/ region pairs
df = pd.DataFrame(regions_with_countries)

# Establishes the base filepath to find and write the data
## NOTE: This will be different for each computer. Set the PROJECT_3_DATA_DIR
## environment variable to point at your own copy of the data folder; when it
## is unset (or empty) the original default location below is used.
filepath = (
    os.environ.get("PROJECT_3_DATA_DIR")
    or "C:/Users/kronh/OneDrive/Documents/UofTCoding_bootcamp/project_3_data"
)

# Imports the regions csv in order to add a region id column to the countries DataFrame
regions_df = pd.read_csv(f"{filepath}/data_csv/regions.csv")

# Builds the country -> region_id table in one non-mutating chain:
#   1. left join on the region name, so every scraped country is kept
#      (region_id is missing for a region name that is absent from regions.csv)
#   2. rename the regions table's id column to region_id
#   3. keep only the FK (region_id) and the country name, which also discards
#      the textual region column
joined_df = (
    df.merge(regions_df, how='left', on='region')
    .rename(columns={'id': 'region_id'})
    .loc[:, ['region_id', 'country']]
)

Source: 
https://www.kaggle.com/datasets/paultimothymooney/latitude-and-longitude-for-every-country-and-state?resource=download

In [25]:
## Gets country coordinates and adds them to the dataframe

# Columns kept from the coordinates csv; everything else in the file is dropped
coordinate_columns = ["country_code", "country", "latitude", "longitude"]

# Reads the csv with coordinates and trims it to the columns of interest
coordinates_path = f"{filepath}/data_csv/world_country_and_usa_states_latitude_and_longitude.csv"
coordinates_df = pd.read_csv(coordinates_path)[coordinate_columns]

# Left-joins on the country name so every row of joined_df is kept and gains
# the country_code, latitude and longitude columns
final_df = joined_df.merge(coordinates_df, how='left', on='country')

# Adds a 1-based sequential country ID as the first column
country_ids = range(1, len(final_df) + 1)
final_df.insert(loc=0, column='id', value=country_ids)

In [26]:
# Writes the finished countries table to csv (without the DataFrame index)
# so it can be loaded into a database
countries_csv_path = f"{filepath}/data_csv/countries.csv"
final_df.to_csv(countries_csv_path, index=False)