# Load and Merge Alcohol Consumption Data with GeoData

In [1]:
# Notebook-wide IPython extensions.
# lab_black: loads the extension that auto-formats code cells with Black.
%load_ext lab_black
# autoreload (mode 2): re-import edited modules before each cell execution,
# so changes to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import glob
import os
import requests

import geopandas as gpd
import pandas as pd
import pycountry

In [3]:
# Raise pandas' display limits so wide/long frames are shown without truncation.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, 500)

In [4]:
# Columns read from each consumption CSV in addition to the requested year column.
usecols = ["Country", "Beverage Types"]
# Column names assigned to the reshaped consumption frame.
new_col_names = ["Year", "Country", "Value"]

# Shapefile column -> name used for it in this notebook (order is preserved).
_shapefile_column_map = {
    "ADMIN": "country",
    "ADM0_A3": "country_code",
    "geometry": "geometry",
}
shapefile_col_names = list(_shapefile_column_map.keys())
new_shapefile_col_names = list(_shapefile_column_map.values())

In [5]:
# All paths are resolved relative to the kernel's current working directory.
PROJ_ROOT_DIR = os.getcwd()
raw_data_dir = os.path.join(PROJ_ROOT_DIR, "app", "data", "raw")

# The shapefile sits in a folder that carries the same base name as the file.
_shapefile_stem = "ne_110m_admin_0_countries"
shapefile_filepath = os.path.join(
    raw_data_dir, _shapefile_stem, _shapefile_stem + ".shp"
)

In [6]:
def download_data(raw_data_dir):
    """Download the WHO alcohol-consumption CSV exports into ``raw_data_dir``.

    One CSV is requested per block of years (1960-1979, 1980-1999, 2000-2009,
    2010-2020) and saved as ``data2_<first>_<last>.csv``. A block whose file
    is already present is skipped, so the function can be re-run cheaply.

    Parameters
    ----------
    raw_data_dir : str
        Directory that receives the CSV files. It is created if missing.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status. Nothing is written in that
        case, so a failed download is never mistaken for a cached file later.
    requests.RequestException
        On connection failures or when the request times out.
    """
    years_ranges = [
        [1960, 1979],
        [1980, 1999],
        [2000, 2009],
        [2010, 2020],
    ]
    os.makedirs(raw_data_dir, exist_ok=True)

    for s_e in years_ranges:
        first_year, last_year = min(s_e), max(s_e)
        # The year filter lists the years newest-first.
        year_filter = "".join(
            f";YEAR:{year}" for year in range(last_year, first_year - 1, -1)
        )
        url = (
            "https://apps.who.int/gho/athena/data/xmart.csv"
            "?target=GHO/SA_0000001400&profile=crosstable"
            f"&filter=COUNTRY:*{year_filter}"
            "&x-sideaxis=COUNTRY;DATASOURCE;ALCOHOLTYPE&x-topaxis=GHO;YEAR"
        )

        filepath = os.path.join(raw_data_dir, f"data2_{first_year}_{last_year}.csv")
        if os.path.exists(filepath):
            print(
                f"Found file for years in range - {first_year}-{last_year}. Doing nothing."
            )
            continue

        print(f"Downloading years in range - {first_year}-{last_year}...", end="")
        # (connect, read) timeouts: fail instead of hanging the kernel forever.
        r = requests.get(url, timeout=(10, 300))
        # Check the status BEFORE writing: otherwise an error page would be
        # saved as the CSV and treated as a valid cached download next time.
        r.raise_for_status()
        with open(filepath, "wb") as f:
            f.write(r.content)
        print("Done")

In [15]:
def get_data_by_year(
    year,
    raw_data_dir,
    usecols=["Country", "Beverage Types"],
    new_col_names=["Year", "Country", "Value"],
):
    """Load the "All types" consumption values of one year from the raw CSVs.

    The CSV holding ``year`` is selected from the four ``data2_<first>_<last>.csv``
    files in ``raw_data_dir``. Only the rows whose ``Beverage Types`` equals
    ``" All types"`` are kept, and the result is reshaped to one row per country.

    Parameters
    ----------
    year : int
        Year to load; must lie within 1960-2020.
    raw_data_dir : str
        Directory containing the downloaded CSV files.
    usecols : sequence of str
        Columns read in addition to the year column. Must include
        ``"Country"`` and ``"Beverage Types"``. Never mutated.
    new_col_names : sequence of str
        Names assigned to the three output columns. Never mutated.

    Returns
    -------
    pandas.DataFrame
        Three columns named after ``new_col_names``. The first holds the CSV
        column label of the year (a string with a leading space, e.g.
        ``" 2016"``), the second the country and the third the value. Rows
        with missing values are dropped.

    Raises
    ------
    ValueError
        If ``year`` is not covered by any of the known files.
    """
    # File stem -> years stored in that file.
    years_by_file = {
        "data2_1960_1979": range(1960, 1979 + 1),
        "data2_1980_1999": range(1980, 1999 + 1),
        "data2_2000_2009": range(2000, 2009 + 1),
        "data2_2010_2020": range(2010, 2020 + 1),
    }
    filename = None
    for stem, years in years_by_file.items():
        if year in years:
            filename = f"{stem}.csv"
            break
    if filename is None:
        # Without this guard the path would be the bare directory and
        # read_csv would fail with an unrelated OS error.
        raise ValueError(f"No data file covers year {year!r}; expected 1960-2020.")

    filepath = os.path.join(raw_data_dir, filename)
    # Year columns are addressed as " <year>", with a leading space.
    year_col = " " + str(year)
    # The column names are on the second line of the file (header=1).
    df = pd.read_csv(filepath, header=1, usecols=list(usecols) + [year_col])

    df = df[df["Beverage Types"] == " All types"].drop(columns=["Beverage Types"])
    # Wide -> long: (year label, country) index, then back to plain columns.
    df = df.set_index(["Country"]).unstack().reset_index()
    df.columns = list(new_col_names)
    return df.dropna()

In [8]:
# Fetch any year-range CSV that is not yet present in the raw data folder.
download_data(raw_data_dir=raw_data_dir)

Downloading years in range - 1960-1979...Done
Downloading years in range - 1980-1999...Done
Downloading years in range - 2000-2009...Done
Downloading years in range - 2010-2020...Done


In [10]:
# Load the 2016 "All types" figures and preview the first rows.
df = get_data_by_year(
    year=2016,
    raw_data_dir=raw_data_dir,
    usecols=usecols,
    new_col_names=new_col_names,
)
display(df.head())

Unnamed: 0,Year,Country,Value
0,2016,Afghanistan,0.02
1,2016,Albania,4.67
2,2016,Algeria,0.6
3,2016,Andorra,9.94
4,2016,Angola,5.38


In [11]:
# Read the country shapes, keep only the needed columns, then rename them.
gdf = gpd.read_file(shapefile_filepath)
gdf = gdf[shapefile_col_names]
gdf.columns = new_shapefile_col_names
display(gdf.head())

Unnamed: 0,country,country_code,geometry
0,Fiji,FJI,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,United Republic of Tanzania,TZA,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,Canada,CAN,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,United States of America,USA,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


In [12]:
# Fallback alpha-3 codes for country names, used only when the pycountry
# lookup by name does not find a match.
d = {
    "Bolivia (Plurinational State of)": "BOL",
    "Democratic People's Republic of Korea": "PRK",
    "Democratic Republic of the Congo": "COD",
    "Iran (Islamic Republic of)": "IRN",
    "Micronesia (Federated States of)": "FSM",
    "Republic of Korea": "KOR",
    "Republic of Moldova": "MDA",
    "United Kingdom of Great Britain and Northern Ireland": "GBR",
    "United Republic of Tanzania": "TZA",
    "United States of America": "USA",
    "Venezuela (Bolivarian Republic of)": "VEN",
}


def _alpha3_for(country_name):
    """Return the alpha-3 code for ``country_name``.

    Lookup order: pycountry by name (with any trailing " (...)" qualifier
    removed), then the manual table ``d`` keyed by the full name, and finally
    the unchanged name itself so that unmatched countries stay visible.
    """
    short_name = country_name.split(r" (")[0]
    try:
        match = pycountry.countries.get(name=short_name)
    except LookupError:
        # NOTE(review): some pycountry versions appear to raise KeyError on a
        # miss instead of returning None - both cases are treated as "not found".
        match = None
    if match is not None:
        return match.alpha_3
    return d.get(country_name, country_name)


codes = [_alpha3_for(name) for name in df["Country"]]
df["Code"] = codes

In [13]:
# Left join: every shape is kept, whether or not consumption data exists for it.
df_merged = gdf.merge(
    df,
    how="left",
    left_on="country_code",
    right_on="Code",
)

In [14]:
# Show the merged rows that contain at least one missing value.
has_missing_value = df_merged.isna().any(axis=1)
df_merged.loc[has_missing_value]

Unnamed: 0,country,country_code,geometry,Year,Country,Value,Code
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",,,,
20,Falkland Islands,FLK,"POLYGON ((-61.20000 -51.85000, -60.00000 -51.2...",,,,
22,Greenland,GRL,"POLYGON ((-46.76379 82.62796, -43.40644 83.225...",,,,
23,French Southern and Antarctic Lands,ATF,"POLYGON ((68.93500 -48.62500, 69.58000 -48.940...",,,,
45,Puerto Rico,PRI,"POLYGON ((-66.28243 18.51476, -65.77130 18.426...",,,,
79,Palestine,PSX,"POLYGON ((35.39756 31.48909, 34.92741 31.35344...",,,,
134,New Caledonia,NCL,"POLYGON ((165.77999 -21.08000, 166.59999 -21.7...",,,,
140,Taiwan,TWN,"POLYGON ((121.77782 24.39427, 121.17563 22.790...",,,,
159,Antarctica,ATA,"MULTIPOLYGON (((-48.66062 -78.04702, -48.15140...",,,,
160,Northern Cyprus,CYN,"POLYGON ((32.73178 35.14003, 32.80247 35.14550...",,,,
