In [None]:
import pandas as pd
import pycountry
from fuzzywuzzy import fuzz, process

In [None]:
# Read the outbreak records from the CSV file and print pandas' summary of the frame.
outbreak_data_df = pd.read_csv(filepath_or_buffer="data/outbreak_data.csv")
outbreak_data_df.info()

In [None]:
# Tally how many records carry each spatial_scale label.
outbreak_data_df.loc[:, "spatial_scale"].value_counts()

In [None]:
# Show only the records whose spatial_scale is exactly "admin2".
outbreak_data_df.loc[outbreak_data_df["spatial_scale"].eq("admin2")]

ADM0 - Country
ADM1 - Province/State
ADM2 - District/Region
ADM3 - Local Government/Area Councils
ADM4 - Ward/Village

Using the outbreak_data_df DataFrame, I am creating a location DataFrame by splitting each 'location' string on the '::' separator, so that every administrative level gets its own column.

In [None]:
# Split every "::"-delimited location string into one column per administrative level.
column_names = ["ADM0", "ADM1", "ADM2", "ADM3", "ADM4"]  # last label was misspelled "AMD4"
location_df = outbreak_data_df["location"].str.split("::", expand=True)

# `expand=True` only creates as many columns as the deepest location string needs.
# Pad with empty columns when fewer than five levels are present so the rename
# below cannot fail on shallow data; if MORE than five levels appear, the column
# assignment still raises, which surfaces the unexpected input instead of hiding it.
location_df = location_df.reindex(
    columns=range(max(len(column_names), location_df.shape[1]))
)
location_df.columns = column_names
location_df

In [None]:
# Keep the ADM0 column's underlying values for later use.
adm0 = location_df.loc[:, "ADM0"].values

Using the pycountry package, I am creating a DataFrame of the countries that match the values of the location_df['ADM1'] column.

In [None]:
# Resolve every ADM1 value to a country record using pycountry's fuzzy search.
# One output row is produced per input row, in the same order as location_df.
adm1 = location_df["ADM1"].values

# Cache of name -> best match (or None). The same names repeat across many rows,
# so each distinct name is searched only once.
country_lookup = {}
fuzzy_countries_data = []

# A distinct loop variable keeps `adm1` bound to the full array after the loop
# (the original `for adm1 in adm1` overwrote it with the last element).
for adm1_name in adm1:
    if pd.isna(adm1_name):
        # Location strings with fewer levels leave this column empty; nothing to search.
        best_country = None
    else:
        if adm1_name not in country_lookup:
            try:
                country_lookup[adm1_name] = pycountry.countries.search_fuzzy(adm1_name)[0]
            except LookupError:
                # search_fuzzy raises LookupError when no country matches;
                # record the miss instead of aborting the whole cell.
                country_lookup[adm1_name] = None
        best_country = country_lookup[adm1_name]

    fuzzy_countries_data.append(
        {
            "ADM1": adm1_name,
            "Country": best_country.name if best_country is not None else None,
            "Alpha 3": best_country.alpha_3 if best_country is not None else None,
        }
    )

fuzzy_countries_df = pd.DataFrame(fuzzy_countries_data)
fuzzy_countries_df

Using the pycountry.subdivisions data, I am creating a new DataFrame from which I can extract the ADM2 names, the country, and its corresponding code.

In [None]:
# Flatten pycountry's subdivision records into a table with one row per subdivision.
subdivisions = pycountry.subdivisions

# Each record keeps the subdivision's name and code plus its parent country's
# name and two-letter code.
subdivision_data = [
    {
        "ADM2 Name": entry.name,
        "Code": entry.code,
        "Country": entry.country.name,
        "Country Code": entry.country.alpha_2,
    }
    for entry in subdivisions
]

# Build the DataFrame in one shot from the collected records.
subdivision_df = pd.DataFrame(subdivision_data)
subdivision_df

In [None]:
# Join each subdivision to the fuzzy-matched country rows that share its "Country" value.
merged_df = subdivision_df.merge(fuzzy_countries_df, on="Country")
merged_df

In [None]:
# Fuzzy-match every ADM2 value against the pycountry subdivision names and keep
# the best candidate together with its similarity score.
adm2 = location_df["ADM2"].values

# Hoisted out of the loop: the candidate list is the same for every row.
subdivision_names = subdivision_df["ADM2 Name"].values

# Cache of ADM2 value -> extractOne result, so repeated names are scored only once.
match_cache = {}
subdivison_matches = []

# location_df was derived row-for-row from outbreak_data_df["location"], so zipping
# the two pairs each ADM2 value with its own source string. (The original stored the
# ENTIRE location Series in every row's "Location String" cell.)
for location_string, adm2_name in zip(outbreak_data_df["location"], adm2):
    if pd.isna(adm2_name):
        # Location has no ADM2 level; there is nothing to match.
        matched_subdivision = None
    else:
        if adm2_name not in match_cache:
            match_cache[adm2_name] = process.extractOne(adm2_name, subdivision_names)
        matched_subdivision = match_cache[adm2_name]

    subdivison_matches.append(
        {
            "Location String": location_string,
            "ADM2": adm2_name,  # key was misspelled "AMD2"
            "ADM2 Name": matched_subdivision[0] if matched_subdivision else None,
            "Name Score": matched_subdivision[1] if matched_subdivision else None,
        }
    )

subdivison_matches_df = pd.DataFrame(subdivison_matches)
subdivison_matches_df

In [None]:
# Attach the fuzzy ADM2 matches to the country/subdivision table via the shared "ADM2 Name" column.
final_merged_df = merged_df.merge(subdivison_matches_df, on="ADM2 Name")
final_merged_df