In [75]:
import pandas as pd
import googlemaps
import os
# Directory (with trailing slash) holding the raw per-country CSV files; later
# cells build file paths by concatenating a file name onto this string.
base = "subnational_raw/"

# Read the Google Maps API key from the environment rather than embedding it in
# the notebook, so the secret never lands in version control or shared copies.
# A missing variable raises KeyError here, before any geocoding request is made.
# NOTE(review): the key that used to be hard-coded on this line should be
# treated as leaked and rotated.
gmaps = googlemaps.Client(os.environ["GOOGLE_MAPS_API_KEY"])

## Define useful functions

In [76]:
def melt(df, country, indicator, indicator_category, unit_of_measure, columns, latlng):
    """
    Reshape one wide indicator table into long format and attach coordinates.

     -- Input:
    df = wide dataframe with a "States" column plus one column per entry in `columns`
    country = value written to the 'country' column of every row
    indicator = value written to the 'indicator' column of every row
    indicator_category = value written to the 'legend' column of every row
    unit_of_measure = value written to the 'unit_of_measure' column of every row
    columns = labels of the columns to unpivot; each label ends up in 'year'
              and the matching cell in 'value'
    latlng = dataframe that is joined on ['country', 'province']

     -- Output:
    Long dataframe with "States" renamed to 'province', the rows whose province
    is 'National' removed, and the remaining columns of `latlng` appended.
    The join is pandas' default (inner) merge, so provinces that have no match
    in `latlng` are dropped.
    """
    # Unpivot first, then stamp the per-file constants onto every row.
    long_form = pd.melt(df, id_vars="States", value_vars=columns, var_name="year")
    labelled = long_form.assign(
        country=country,
        legend=indicator_category,
        indicator=indicator,
        unit_of_measure=unit_of_measure,
    ).rename(columns={"States": "province"})

    # Keep sub-national rows only, then attach whatever latlng carries.
    subnational = labelled[labelled["province"] != 'National']
    return subnational.merge(latlng, on=['country', 'province'])

def get_meta_data(file_name):
    """
    Split a raw file name into its metadata fields.

    Every ".csv" and "-1st" occurrence is removed from the name, the remainder
    is split on "_" and the first four pieces are returned as a dict with the
    keys 'country', 'indicator', 'unit_of_measure' and 'category'. Extra pieces
    are ignored; fewer than four pieces raises IndexError.
    """
    parts = file_name.replace(".csv", "").replace('-1st', '').split("_")
    field_names = ("country", "indicator", "unit_of_measure", "category")
    return {field: parts[position] for position, field in enumerate(field_names)}


# def get_lat_lng(row):
#     """Finds the center of each province"""
#     resp = gmaps.geocode('{}, {}'.format(row['province'], row['country']))
#     location = resp[0]['geometry']['location']
#     return pd.Series([location['lat'], location['lng']])

## Get unique country + province

In [10]:
# Collect every (country, province) pair that occurs in the raw files.
# `error` lists the files that could not be processed; the parsing cell further
# down skips them.
error = []
province_frames = []
for f in os.listdir(base):
    try:
        df = pd.read_csv(os.path.join(base, f))
        df = df[df.States != "National"]
        # get_meta_data already strips the '-1st' suffix from the country.
        country = get_meta_data(f)["country"]
        locations = pd.DataFrame({'province': df['States'], 'country': country})
        province_frames.append(locations)
    except Exception as exc:
        # Best effort: keep going, but show why the file was rejected.
        print('Got error for: ', f, '-', repr(exc))
        error.append(f)

# Concatenate once instead of growing a frame inside the loop
# (DataFrame.append is quadratic and no longer exists in pandas 2.x).
# ignore_index gives the result a unique row index.
if province_frames:
    latlng = pd.concat(province_frames, ignore_index=True)[["country", "province"]]
else:
    latlng = pd.DataFrame(columns=["country", "province"])

Got error for:  Nigeria-1st_AIDS mortality per 1000 population_Percentage_All population.csv
Got error for:  Nigeria-1st_AIDS mortality per 1000 population_Percentage_Females.csv
Got error for:  Nigeria-1st_AIDS mortality per 1000 population_Percentage_Males.csv
Got error for:  Nigeria-1st_AIDS orphans_Number_AIDS orphans (0-17).csv
Got error for:  Nigeria-1st_AIDS-related deaths_Number_All adults (15-49).csv
Got error for:  Nigeria-1st_AIDS-related deaths_Number_All people aged 50 and over.csv
Got error for:  Nigeria-1st_AIDS-related deaths_Number_All population.csv
Got error for:  Nigeria-1st_AIDS-related deaths_Number_All young people (15-24).csv
Got error for:  Nigeria-1st_AIDS-related deaths_Number_Children (0-14).csv
Got error for:  Nigeria-1st_AIDS-related deaths_Number_Female adults (15-49).csv
Got error for:  Nigeria-1st_AIDS-related deaths_Number_Females.csv
Got error for:  Nigeria-1st_AIDS-related deaths_Number_Male adults (15-49).csv
Got error for:  Nigeria-1st_AIDS-related

In [None]:
# import regex as re
# for fn in error:
#     f = open(base + fn, 'r')
# #     new = open(base + fn.replace('.csv', '.new') + '.csv', 'w')
#     new = open(base + '.csv', 'w')
#     content = f.read()
#     content = re.sub(r'FCT, Abuja', r'FCT Abuja', content)
#     new.write(content)

## Get geolocation for all the province/country combinations

In [22]:
import numpy as np
def get_lat_lng(row):
    """
    Geocode one province and return its coordinates.

    Parameters:
        row: mapping/Series with string entries 'province' and 'country'.
             A '-2nd' marker in the country is removed for the query only;
             `row` itself is left untouched.

    Returns:
        pd.Series([lat, lng]) taken from the first geocoding result, or
        pd.Series([nan, nan]) when the response holds no usable result
        (the query string is printed in that case).

    Errors raised by the geocoding request itself are not caught here.
    """
    # Build the query from a local value: writing back into the row handed over
    # by DataFrame.apply mutates an object pandas may be reusing.
    country = row['country'].replace('-2nd', '')
    query = '{}, {}'.format(row['province'], country)
    resp = gmaps.geocode(query)
    try:
        location = resp[0]['geometry']['location']
    except (IndexError, KeyError, TypeError):
        # Empty or malformed response: report the query and fall back to NaN.
        print(query)
        location = {'lat': np.nan, 'lng': np.nan}
    return pd.Series([location['lat'], location['lng']])

# One geocoding request per distinct row, so drop duplicates first.
latlng = latlng.drop_duplicates()
# get_lat_lng returns a two-element Series per row; apply stacks them into a
# two-column frame whose first column is the latitude and second the longitude.
coordinates = latlng.apply(get_lat_lng, axis=1)
latlng['lat'] = coordinates.iloc[:, 0]
latlng['lng'] = coordinates.iloc[:, 1]

Nyanza, Kenya
Nyanza, Kenya
Kikuube, Uganda


In [None]:
# lat = 9.7277758
# lng = 6.0951867
# latlng.loc[latlng.province == 'Niger', ['lat', 'lng']] = lat, lng
# latlng

## Parse data

In [72]:
# files = os.listdir(base)
# f = files[0]
# meta_data = get_meta_data(f)
# country = meta_data["country"]
# indicator = meta_data["indicator"]
# unit_of_measure = meta_data["unit_of_measure"]
# category = meta_data["category"]
# df_o = pd.read_csv(base + f)
# df_o.columns = df_o.columns.str.strip()
# columns = df_o.columns
# columns = [x for x in columns if x.isdigit()] ## Get only digit columns as these are years
# df = melt(df_o, country, indicator, category, unit_of_measure, columns, latlng)
# df

In [62]:
# Parse every raw file that survived the province-collection step into the long
# format produced by melt(). Files that fail are recorded in `error_files`.
files = [item for item in os.listdir(base) if item not in error]
data = []
error_files = []
for f in files:
    try:
        meta_data = get_meta_data(f)
        country = meta_data["country"]
        indicator = meta_data["indicator"]
        unit_of_measure = meta_data["unit_of_measure"]
        category = meta_data["category"]
        df_o = pd.read_csv(base + f)
        df_o.columns = df_o.columns.str.strip()
        ## Get only digit columns as these are years
        columns = [x for x in df_o.columns if x.isdigit()]
        df = melt(df_o, country, indicator, category, unit_of_measure, columns, latlng)
        data.append(df)
    except Exception as exc:
        # Keep the run going, but say why the file was skipped instead of
        # swallowing the error silently.
        print('Could not parse: ', f, '-', repr(exc))
        error_files.append(f)

error_files

[]

## Aggregate all the separate dataframes

In [79]:
# At this point `df` is still the last frame produced by the parsing loop, i.e.
# the very object stored at the end of `data`. Overwriting its column labels in
# place made that single frame disagree with every other frame in `data`, so the
# aggregation step placed its rows in separate columns ('', 'date',
# 'mixed value') and left their province/year/value cells empty.
# The labels are kept as a mapping instead of being applied here; use
# df.rename(columns=export_column_names) on the aggregated frame, after the
# cleaning cell that still reads df['value'], if the export needs these headers.
# NOTE(review): confirm the intended export schema (an empty string as the
# province header looks unintended).
export_column_names = {'province': '', 'year': 'date', 'value': 'mixed value'}

In [80]:
# Stack all per-file frames in a single pass. Growing a frame with
# DataFrame.append inside a loop copies the accumulated data on every iteration
# and the method no longer exists in pandas 2.x. An empty `data` list still
# yields an empty frame, as before.
df = pd.concat(data, ignore_index=True) if data else pd.DataFrame()

In [81]:
# Sanity check: show the distinct countries present in the aggregated frame.
df['country'].unique()

array(['Ethiopia', 'Kenya', 'Malawi', 'Mozambique', 'Nigeria', 'Tanzania',
       'Uganda', 'Zambia', 'Zimbabwe'], dtype=object)

In [82]:
import regex as re
# Remove every character that is not a digit, comma or period from string
# values. The result is assigned back to the column: calling
# replace(..., inplace=True) on df['value'] acts on an intermediate Series and
# is not guaranteed to update `df` (it does not under pandas Copy-on-Write).
df['value'] = df['value'].replace(to_replace="[^0-9,.]", value="", regex=True)

# Append the ' - SN' suffix to every indicator name (presumably marking the
# series as sub-national - confirm). Re-running this cell appends it again.
df['indicator'] = df['indicator'] + ' - SN'

In [83]:
# Write the aggregated frame to disk without the row index.
output_path = "unaids_sn_15-09-2020.csv"
df.to_csv(output_path, index=False)