In [None]:
import pandas as pd
import pycountry
import folium
import folium.map
from folium.plugins import TimeSliderChoropleth
import geopandas as gpd
import plotly.express as px
from fuzzywuzzy import process

Let's begin by reading our outbreak data and creating a dataframe.

In [None]:
# Show every column when a DataFrame is rendered, then load the raw outbreak records.
pd.options.display.max_columns = None
outbreak_data_df = pd.read_csv("data/outbreak_data.csv")
outbreak_data_df.columns

Looking at the value counts of our spatial_scale column, administrative level 2 has the largest count. This helps us determine the granularity of our data exploration.

In [None]:
# Count how many outbreak records exist at each spatial scale.
spatial_scale_counts = outbreak_data_df["spatial_scale"].value_counts()
spatial_scale_counts

In [None]:
# Keep only the records whose spatial scale is "admin2".
is_admin2 = outbreak_data_df["spatial_scale"].eq("admin2")
admin2_rows_df = outbreak_data_df.loc[is_admin2]
admin2_rows_df

ADM0 - Country
ADM1 - Province/State
ADM2 - District/Region
ADM3 - Local Government/Area Councils
ADM4 - Ward/Village

Using the admin2_rows_df DataFrame (the admin2 subset of outbreak_data_df), I am creating a location DataFrame by splitting the 'location' string on the '::' delimiter.

In [None]:
# Break each "::"-delimited location string into one column per level,
# then label the resulting columns.
column_names = ["Continent", "ISO3", "admin1", "admin2"]
location_df = admin2_rows_df["location"].str.split("::", expand=True)
location_df.columns = column_names
location_df

Using the pycountry package, I am creating a DataFrame of the countries that match the values of the location_df['ISO3'] column.

In [None]:
# Resolve every location's country value to a pycountry record.
# The output keeps one row per location row (same order), but each distinct value
# is only searched once: search_fuzzy is expensive and the values repeat heavily.
fuzzy_countries_data = []
adm0 = location_df["ISO3"].values
country_lookup = {}
for country in adm0:
    if country not in country_lookup:
        # search_fuzzy returns candidates ordered best-first; keep the top hit.
        country_lookup[country] = pycountry.countries.search_fuzzy(country)[0]
    fuzzy_search = country_lookup[country]
    fuzzy_countries_data.append(
        {
            "ADM0": country,
            "Country": fuzzy_search.name,
            "ISO3": fuzzy_search.alpha_3,
        }
    )
fuzzy_countries_df = pd.DataFrame(fuzzy_countries_data)
fuzzy_countries_df

Using the pycountry.subdivisions data, I am creating a new DataFrame from which I can extract the ADM2 names, the country, and its corresponding code.

In [None]:
subdivisions = pycountry.subdivisions

# One record per pycountry subdivision: its name, code, parent country and type.
subdivision_data = [
    {
        "ADM2 Name": entry.name,
        "Code": entry.code,
        "Country": entry.country.name,
        "Country Code": entry.country.alpha_2,
        "Type": entry.type,
    }
    for entry in subdivisions
]

# Tabulate the collected records.
subdivision_df = pd.DataFrame(subdivision_data)
subdivision_df

Let's cross-reference the country info using the CountryInfo package.

In [None]:
from countryinfo import CountryInfo

# Look up the provinces that CountryInfo lists for the DRC.
drc_info = CountryInfo("Democratic Republic of the Congo")
provinces = drc_info.provinces()
provinces

In [None]:
# Parse the start/end date strings, then derive the start month and year
# so the outbreaks can later be grouped by them.
for parsed_col, raw_col in (("s_Date", "start_date"), ("e_Date", "end_date")):
    outbreak_data_df[parsed_col] = pd.to_datetime(
        outbreak_data_df[raw_col], format="%m/%d/%Y"
    )

start_dates = outbreak_data_df["s_Date"]
outbreak_data_df["s_month"] = start_dates.dt.month
outbreak_data_df["s_year"] = start_dates.dt.year
outbreak_data_df

In [None]:
# Title-case the admin2 names and attach them to the country frame.
# The vectorised .str accessor passes missing values through instead of raising,
# and avoids reusing `adm2` as both the Series and the loop variable.
adm2 = location_df["admin2"]
adm2_list = adm2.str.title().tolist()
# A plain list is assigned positionally, so the differing indexes do not matter.
fuzzy_countries_df["ADM2"] = adm2_list
fuzzy_countries_df

In [None]:
import requests
import zipfile
import io

# Base URL of the GADM 4.1 JSON downloads; a per-country archive name is appended to it.
gadm_url_prefix = "https://geodata.ucdavis.edu/gadm/gadm4.1/json/"


def get_gadm_data(url: str, dest_dir: str = "./data/geojson/", timeout: float = 60):
    """Download a zip archive from ``url`` and extract it into ``dest_dir``.

    Parameters
    ----------
    url : str
        Full URL of the zip archive to download.
    dest_dir : str, optional
        Directory the archive members are extracted into.
    timeout : float, optional
        Seconds to wait for the server before giving up, so an unresponsive
        host cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        Names of the extracted archive members; empty when the download failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Report connection problems the same way as a bad status code.
        print("Error: Failed to download data from URL:", url, f"({error})")
        return []

    if response.status_code != 200:
        # Unsuccessful response
        print("Error: Failed to download data from URL:", url)
        return []

    # Successful response: unpack the in-memory archive.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_file:
        zip_file.extractall(dest_dir)
        return zip_file.namelist()


# Try the download for a single country first (the ISO3 value at index label 2).
sample_iso3 = fuzzy_countries_df["ISO3"][2]
url_suffix = "gadm41_{}_2.json.zip".format(sample_iso3)
sample_url = gadm_url_prefix + url_suffix
print(sample_url)
get_gadm_data(sample_url)


def build_suffix(country: str) -> str:
    """Return the ``gadm41_<country>_2.json.zip`` archive name for ``country``."""
    return "gadm41_{}_2.json.zip".format(country)

In [None]:
import json

# Download the level-2 boundary archive for every country in the outbreak data.
countries = fuzzy_countries_df["ISO3"].unique()
files = []
for country in countries:
    get_gadm_data(gadm_url_prefix + build_suffix(country))
    files.append(f"gadm41_{country}_2.json")

# Gather the features of every country file. Previously each iteration overwrote
# admin2_gdf (so only the last country survived) and the feature collection was
# built through an undefined FeatureCollection name.
features = []
for file_name in files:
    try:
        with open(f"./data/geojson/{file_name}", "r", encoding="utf-8") as file:
            # Each file is loaded as a GeoJSON FeatureCollection.
            features.extend(json.load(file)["features"])
    except FileNotFoundError:
        # The download may have failed for this country; skip it.
        print("Warning: no boundary file found for", file_name)

# A single GeoJSON dict covering all countries, reusable by the plotting cells.
feature_collection = {"type": "FeatureCollection", "features": features}

# GID_0 is renamed to ISO3 so it lines up with the column name used in location_df.
admin2_gdf = gpd.GeoDataFrame.from_features(feature_collection).rename(
    columns={"GID_0": "ISO3"}
)
admin2_gdf

In [None]:
# NOTE(review): not referenced by any of the visible cells — confirm before removing.
subdivison_matches = []


def admin2_match(df_row):
    """Attach the best fuzzy-matched ``NAME_2`` from ``admin2_gdf`` to a location row.

    Parameters
    ----------
    df_row : pd.Series
        A row holding at least the ``ISO3`` and ``admin2`` labels.

    Returns
    -------
    pd.Series
        ``df_row`` extended with a ``NAME_2`` entry: the closest name among the
        ``admin2_gdf`` rows of the same ``ISO3`` scoring at least 70, or ``None``
        when there is no such candidate or ``admin2`` is missing.
    """
    iso3, admin2 = df_row.loc[["ISO3", "admin2"]]
    name_2 = None

    # Missing or empty admin2 values cannot be scored, so they are left unmatched.
    if isinstance(admin2, str) and admin2:
        # A boolean mask avoids building a query string from data values,
        # which would break on values containing quotes.
        choices = admin2_gdf.loc[admin2_gdf["ISO3"] == iso3, "NAME_2"]
        best_match = process.extractOne(admin2, choices, score_cutoff=70)
        if best_match is not None:
            # extractOne returns a tuple whose first element is the matched choice.
            name_2 = best_match[0]

    return pd.concat([df_row, pd.Series({"NAME_2": name_2})])


# Match each distinct location once, in a stable (ISO3, admin2) order.
unique_locations_df = location_df.drop_duplicates().sort_values(["ISO3", "admin2"])
matched_admin2s_df = unique_locations_df.apply(admin2_match, axis=1)

# Persist the matches so they can be inspected outside the notebook.
matched_admin2s_df.to_csv("matched_admin2s.csv", index=False)
matched_admin2s_df

In [None]:
# fuzzy_countries_df holds one row per admin2 record, in the same order as the
# admin2 subset of outbreak_data_df. concat(axis=1) aligns on index labels, so
# pairing it with the unfiltered frame matched country info to the wrong rows.
# Pair the two positionally instead.
admin2_outbreaks_df = outbreak_data_df.loc[
    outbreak_data_df["spatial_scale"] == "admin2"
].reset_index(drop=True)
assert len(admin2_outbreaks_df) == len(
    fuzzy_countries_df
), "country lookup rows must correspond one-to-one with admin2 outbreak rows"

cholera_df_combined = pd.concat(
    [admin2_outbreaks_df, fuzzy_countries_df.reset_index(drop=True)], axis=1
)
cholera_df_combined

In [None]:
# List the distinct outbreak numbers present in the combined data.
outbreak_numbers = cholera_df_combined["outbreak_number"]
outbreak_numbers.unique()

In [None]:
# Highest outbreak number per (year, country).
# The column is selected before aggregating so max() is not computed over every
# other column, which is wasteful and can fail on mixed-type text columns.
repeated_outbreaks = cholera_df_combined.groupby(["s_year", "ISO3"])[
    "outbreak_number"
].max()
repeated_outbreaks

In [None]:
# Total suspected cases per (year, country, admin2 area).
# Only the needed column is summed instead of summing every numeric column
# and discarding all but one.
yearly_cases = (
    cholera_df_combined.groupby(["s_year", "ISO3", "ADM2"])["total_suspected_cases"]
    .sum(numeric_only=True)
    .reset_index()
)
yearly_cases.info()

In [None]:
# Attach each outbreak row's yearly total for its own (year, country, admin2) group.
# Joining on ISO3 alone paired every outbreak row with every yearly aggregate of
# the same country, multiplying the rows. yearly_cases has exactly one row per key
# combination, which validate= enforces.
merged_df = cholera_df_combined.merge(
    yearly_cases,
    on=["s_year", "ISO3", "ADM2"],
    suffixes=("", "_yearly"),  # row-level column keeps its name; aggregate gets a suffix
    validate="many_to_one",
)
merged_df

In [None]:
# yearly_cases holds one row per (year, country, admin2 area). A country-level map
# needs a single value per country and year, so roll the admin2 rows up first;
# otherwise the same location is supplied several times within one animation frame.
country_yearly_cases = yearly_cases.groupby(["s_year", "ISO3"], as_index=False)[
    "total_suspected_cases"
].sum()

yearly_snapshot = px.choropleth(
    country_yearly_cases,
    locations="ISO3",
    color="total_suspected_cases",
    hover_name="ISO3",
    color_continuous_scale=px.colors.sequential.Plasma,
    animation_frame="s_year",
    animation_group="ISO3",
    range_color=[0, 100000],
    title="Total suspected cases by country and year",
)

yearly_snapshot.update_geos(scope="africa")

yearly_snapshot.show()

In [None]:
def africa_map():
    """Create a folium map centred on (-4.61216, 23.32187) at zoom level 2.

    Scroll-wheel zooming is disabled.

    NOTE(review): the built-in "Stamen Terrain" tiles appear to have been removed
    from recent folium releases — confirm against the installed version.
    """
    map_options = {
        "location": [-4.61216, 23.32187],
        "tiles": "Stamen Terrain",
        "zoom_start": 2,
        "scroll_wheel_zoom": False,
    }
    return folium.Map(**map_options)

In [None]:
# Create the base map and register the terrain tiles as a named overlay layer.
m = africa_map()
terrain_layer = folium.TileLayer("Stamen Terrain", overlay=True, name="Stamen Terrain")
terrain_layer.add_to(m)

In [None]:
# Read the Benin boundaries from the directory the archives are extracted into.
# A path relative to the notebook replaces the user-specific absolute path so the
# cell runs on any machine.
benin_geodata = gpd.read_file("./data/geojson/gadm41_BEN_2.json")

# GeoJSON text of the geometries indexed by NAME_1. The stray trailing comma that
# wrapped this string in a 1-tuple has been removed.
geo_data = gpd.GeoSeries(benin_geodata.set_index("NAME_1")["geometry"]).to_json(
    indent=2
)
geo_data

In [None]:
# ADM2 names of the records whose ISO3 code is "BEN".
is_benin = cholera_df_combined["ISO3"] == "BEN"
counties = cholera_df_combined.loc[is_benin, "ADM2"]
counties

In [None]:
def _cases_to_hex(cases: float, max_cases: float) -> str:
    """Map a case count onto a light-yellow to dark-green hex colour."""
    low_rgb, high_rgb = (255, 255, 229), (0, 104, 55)
    # Clamp the share of the maximum into [0, 1]; a non-positive maximum maps to the low colour.
    fraction = 0.0 if max_cases <= 0 else min(max(cases / max_cases, 0.0), 1.0)
    channels = (
        round(low + (high - low) * fraction) for low, high in zip(low_rgb, high_rgb)
    )
    return "#{:02x}{:02x}{:02x}".format(*channels)


# TimeSliderChoropleth expects {feature_id: {timestamp: {"color": ..., "opacity": ...}}}
# with epoch-second timestamps as strings. The previous dict was keyed by ISO3 and
# held one static style per country, which does not follow that structure.
benin_yearly_cases = (
    yearly_cases.loc[yearly_cases["ISO3"] == "BEN"]
    .groupby(["ADM2", "s_year"], as_index=False)["total_suspected_cases"]
    .sum()
)
max_benin_cases = benin_yearly_cases["total_suspected_cases"].max()

# Assumes folium derives GeoJSON feature ids from the GeoDataFrame index and that
# ADM2 names equal the NAME_2 values of benin_geodata — TODO confirm; areas whose
# name has no exact counterpart are skipped.
feature_id_by_name = {
    name: str(feature_id) for feature_id, name in benin_geodata["NAME_2"].items()
}

time_indexed_style = {}
for row in benin_yearly_cases.itertuples(index=False):
    feature_id = feature_id_by_name.get(row.ADM2)
    if feature_id is None:
        continue
    # Each year is represented by the timestamp of its 1 January.
    timestamp = str(
        int(pd.Timestamp(year=int(row.s_year), month=1, day=1).timestamp())
    )
    time_indexed_style.setdefault(feature_id, {})[timestamp] = {
        "color": _cases_to_hex(row.total_suspected_cases, max_benin_cases),
        "opacity": 0.7,
    }

In [None]:
# Add the time-slider layer for the Benin boundaries to the map.
time_slider = TimeSliderChoropleth(
    benin_geodata,
    styledict=time_indexed_style,
    name="Outbreaks over time",
    overlay=True,
)
time_slider.add_to(m)

In [None]:
# Inspect the Benin geometries indexed by their NAME_1 value.
benin_geometry_by_region = benin_geodata.set_index("NAME_1")["geometry"]
print(benin_geometry_by_region)

In [None]:
# One value per Benin admin2 area: suspected cases summed over all years.
# The unfiltered yearly_cases mixed every country and repeated each area per year.
benin_cases = (
    yearly_cases.loc[yearly_cases["ISO3"] == "BEN"]
    .groupby("ADM2", as_index=False)["total_suspected_cases"]
    .sum()
)

choropleth = folium.Choropleth(
    name="Cholera Outbreaks in Benin",
    geo_data=benin_geodata,
    data=benin_cases,
    columns=["ADM2", "total_suspected_cases"],
    # areas without data are drawn in blue
    nan_fill_color="blue",
    nan_fill_opacity=0.4,
    # YlGn refers to yellow and green
    fill_color="YlGn",
    fill_opacity=0.7,
    line_opacity=0.1,  # type: ignore
    # Join on the boundary name; "feature.id" could never equal an ADM2 name.
    # NOTE(review): ADM2 names that differ in spelling from NAME_2 stay unmatched.
    key_on="feature.properties.NAME_2",
    legend_name="Cholera Outbreaks in Benin",
    overlay=False,
).add_to(m)


m

In [None]:
# Serialise the Benin geometries (indexed by NAME_1) to a GeoJSON string.
benin_geometries = gpd.GeoSeries(benin_geodata.set_index("NAME_1")["geometry"])
benin_geojson = benin_geometries.to_json()
benin_geojson

In [None]:
# yearly_cases has no NAME_2 column, so the boundary names found by the fuzzy
# matching step are attached first. ADM2 is the title-cased admin2 value, hence
# the same transformation on the lookup side.
admin2_name_lookup = (
    matched_admin2s_df.assign(ADM2=matched_admin2s_df["admin2"].str.title())
    .dropna(subset=["NAME_2"])
    .drop_duplicates(subset=["ISO3", "ADM2"])[["ISO3", "ADM2", "NAME_2"]]
)

# One value per boundary name and year, since NAME_2 is the key the map joins on.
# NOTE(review): identically named areas in different countries share a value here.
yearly_admin2_cases = (
    yearly_cases.merge(admin2_name_lookup, on=["ISO3", "ADM2"], how="inner")
    .groupby(["s_year", "NAME_2"], as_index=False)["total_suspected_cases"]
    .sum()
)

yearly_snapshot = px.choropleth(
    yearly_admin2_cases,
    locations="NAME_2",
    color="total_suspected_cases",
    hover_name="NAME_2",
    color_continuous_scale=px.colors.sequential.Plasma,
    animation_frame="s_year",
    animation_group="NAME_2",
    range_color=[0, 100000],
    # feature_collection must be a GeoJSON dict whose features carry NAME_2.
    geojson=feature_collection,
    featureidkey="properties.NAME_2",
)

yearly_snapshot.update_geos(scope="africa")

yearly_snapshot.show()