###Post processes Gaul Locations with different buffer sizes after manually adding them in QGIS
##Harmonizes country names
##Selects the best match for multiple rows with the same DHSID and differing country names

In [1]:
import difflib
import numpy as np
import pandas as pd

In [2]:
file = '/mnt/datadisk/data/Projects/water/inputs/final_locations_gaul.csv'
df = pd.read_csv(file)
df


Unnamed: 0,country,DHSID,DHSCC,DHSYEAR,DHSCLUST,CCFIPS,ADM1FIPS,ADM1FIPSNA,ADM1SALBNA,ADM1SALBCO,...,adm0_code_,adm0_name_,Shape_Ar_1,adm2_cod_1,adm2_nam_1,adm1_cod_1,adm1_nam_1,adm0_cod_1,adm0_nam_1,Shape_Ar_2
0,Burundi,BU201200000070,BU,2012.0,70.0,BY,,,,,...,43.0,Burundi,0.013315,40621.0,Ryansoro,40548.0,Gitega,43.0,Burundi,0.013315
1,Burundi,BU201600000414,BU,2016.0,414.0,BY,,,,,...,43.0,Burundi,0.013315,40621.0,Ryansoro,40548.0,Gitega,43.0,Burundi,0.013315
2,Burundi,BU201600000247,BU,2016.0,247.0,BY,,,,,...,43.0,Burundi,0.013315,40621.0,Ryansoro,40548.0,Gitega,43.0,Burundi,0.013315
3,Burundi,BU201600000329,BU,2016.0,329.0,BY,,,,,...,43.0,Burundi,0.013315,40621.0,Ryansoro,40548.0,Gitega,43.0,Burundi,0.013315
4,Burundi,BU201600000309,BU,2016.0,309.0,BY,,,,,...,43.0,Burundi,0.013315,40621.0,Ryansoro,40548.0,Gitega,43.0,Burundi,0.013315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250499,Mali,ML201800000319,ML,2018.0,319.0,ML,,,,,...,,,,,,,,,,
250500,Mali,ML201800000320,ML,2018.0,320.0,ML,,,,,...,,,,,,,,,,
250501,Mozambique,MZ201100000610,MZ,2011.0,610.0,MZ,MZ11,Maputo (city),Maputo (city),MOZ006,...,,,,,,,,,,
250502,Nigeria,NG200800000773,NG,2008.0,773.0,NG,NI50,RIVERS,RIVERS,NGA033,...,,,,,,,,,,


In [3]:
countries = df["country"].unique()
adm0_names = df["adm0_name"].unique()
unmatched_countries = [c for c in countries if c not in adm0_names]
unmatched_adm0_names = [c for c in adm0_names if c not in countries]
print(unmatched_countries)
print(unmatched_adm0_names)
country_rename = {'Congo Democratic Republic': 'Democratic Republic of the Congo', 'Tanzania': 'United Republic of Tanzania', "Cote d'Ivoire": "Côte d'Ivoire"}
df["country"] = df["country"].replace(country_rename)

['Congo Democratic Republic', 'Tanzania', "Cote d'Ivoire"]
['Democratic Republic of the Congo', 'United Republic of Tanzania', 'South Sudan', 'Sudan', nan, 'Somalia', 'Western Sahara', 'Guinea-Bissau', 'Mauritania', "Côte d'Ivoire"]


In [4]:
#combine multiple columns if values are missing
for adm in ['adm0_name', 'adm1_name', 'adm2_name']:
    #select rows where admX_name is missing and fill in with admX_name_
    df.loc[df[adm].isnull(), adm] = df.loc[df[adm].isnull(), adm+'_']
    # missing_adm = df[df[adm].isnull()]

#compare 2 columns for overlapping values
#not one single match...
print(len(df))
print(len(df[df['ADM1NAME'] != df['adm1_name']]))
print(len(df[df['ADM1NAME'] == df['adm1_name']]))
print(len(df[df['DHSREGNA'] != df['adm1_name']]))
print(len(df[df['DHSREGNA'] == df['adm1_name']]))
df['ADM1NAME'] = df['ADM1NAME'].fillna('').astype(str)
df['adm1_name'] = df['adm1_name'].fillna('').astype(str)
df['DHSREGNA'] = df['DHSREGNA'].fillna('').astype(str)

# difflib.get_close_matches(df['adm1_name'], df['ADM1NAME'])

250504
175294
75210
210093
40411


In [5]:
def select_row(group):
    """Cleanup multiple rows with the same dhsid but differing country names"""
    # Check if any row in the group has country == adm0_name
    matching_rows = group[group['country'] == group['adm0_name']]
    if not matching_rows.empty:
        #compare adm1_name and ADM1NAME
        if len(matching_rows) > 1:
            best_match_l = []
            for _, row in matching_rows.iterrows():
                if row['ADM1NAME'] in difflib.get_close_matches(row['adm1_name'], matching_rows['ADM1NAME']):
                    best_match_l.append(row)
                if row['DHSREGNA'] in difflib.get_close_matches(row['adm1_name'], matching_rows['DHSREGNA']):
                    best_match_l.append(row)
                if len(best_match_l) >= 2:
                    # match_adm = [row['adm1_name'] for row in best_match_l]
                    match_values_l = [best_match_l[0]['ADM1NAME'], best_match_l[1]['DHSREGNA']]
                    for row1, match in zip(best_match_l, match_values_l):
                        if match in difflib.get_close_matches(row1['adm1_name'], match_values_l):
                            best_match_l.append(row1)
                            # update match count
                            select_row.match_count += 1
                            return row1
            else:
                # If no close match, return the first row in the group
                return matching_rows.iloc[0]
        else:
            return matching_rows.iloc[0]
    else:
        # Otherwise, return the first row in the group
        return group.iloc[0]

# initialize match count
select_row.match_count = 0
result_df = df.groupby('DHSID').apply(select_row)

print("Number of matches found:", select_row.match_count)
print(len(result_df))

Number of matches found: 24526
70756


In [8]:
#drop columns
cols = ['adm0_name_', 'adm1_name_', 'adm2_name_', 'adm0_code_', 'adm1_code_', 'adm2_code_' ]
final_df = result_df.drop(columns=cols)
final_df

Unnamed: 0_level_0,country,DHSID,DHSCC,DHSYEAR,DHSCLUST,CCFIPS,ADM1FIPS,ADM1FIPSNA,ADM1SALBNA,ADM1SALBCO,...,adm0_name,Shape_Area,Shape_Ar_1,adm2_cod_1,adm2_nam_1,adm1_cod_1,adm1_nam_1,adm0_cod_1,adm0_nam_1,Shape_Ar_2
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AO200600000001,Angola,AO200600000001,AO,2006.0,1.0,AO,,,,,...,Angola,0.289515,0.289515,4222.0,Lobito,399.0,Benguela,8.0,Angola,0.289515
AO200600000002,Angola,AO200600000002,AO,2006.0,2.0,AO,,,,,...,Angola,0.289515,0.289515,4222.0,Lobito,399.0,Benguela,8.0,Angola,0.289515
AO200600000003,Angola,AO200600000003,AO,2006.0,3.0,AO,,,,,...,Angola,0.196329,0.196329,4216.0,Benguela,399.0,Benguela,8.0,Angola,0.196329
AO200600000004,Angola,AO200600000004,AO,2006.0,4.0,AO,,,,,...,Angola,0.329282,0.329282,4254.0,Seles,403.0,Cuanza Sul,8.0,Angola,0.330459
AO200600000005,Angola,AO200600000005,AO,2006.0,5.0,AO,,,,,...,Angola,0.569600,0.569600,4208.0,Dande,398.0,Bengo,8.0,Angola,0.569600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZW201500000396,Zimbabwe,ZW201500000396,ZW,2015.0,396.0,ZW,,,,,...,Zimbabwe,0.081806,0.212495,33067.0,Goromonzi,69550.0,Mashonaland East,271.0,Zimbabwe,0.212495
ZW201500000397,Zimbabwe,ZW201500000397,ZW,2015.0,397.0,ZW,,,,,...,Zimbabwe,0.671006,0.671006,33056.0,Makoni,3437.0,Manicaland,271.0,Zimbabwe,0.671006
ZW201500000398,Zimbabwe,ZW201500000398,ZW,2015.0,398.0,ZW,,,,,...,Zimbabwe,1.681446,1.681446,33076.0,Hurungwe,3440.0,Mashonaland West,271.0,Zimbabwe,1.681446
ZW201500000399,Zimbabwe,ZW201500000399,ZW,2015.0,399.0,ZW,,,,,...,Zimbabwe,0.081806,0.081806,33080.0,Zvimba,3440.0,Mashonaland West,271.0,Zimbabwe,0.518954


In [9]:
final_df[final_df["country"] != final_df["adm0_name"]][["adm0_name", "country", "DHSID", "adm1_name", "ADM1NAME", "adm2_name"]]

Unnamed: 0_level_0,adm0_name,country,DHSID,adm1_name,ADM1NAME,adm2_name
DHSID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BJ199651402023,,Benin,BJ199651402023,,OUEME,
BJ201200000126,,Benin,BJ201200000126,,Atlantique,
BJ201200000128,,Benin,BJ201200000128,,Atlantique,
BJ201200000500,,Benin,BJ201200000500,,Mono,
BJ201200000569,,Benin,BJ201200000569,,Oueme,
...,...,...,...,...,...,...
ZM201800000179,,Zambia,ZM201800000179,,Luapula,
ZM201800000387,Democratic Republic of the Congo,Zambia,ZM201800000387,Katanga,Luapula,Haut-Katanga
ZW200500000450,Mozambique,Zimbabwe,ZW200500000450,Tete,Mashonaland Central,Magoe
ZW200500001060,Mozambique,Zimbabwe,ZW200500001060,Tete,Mashonaland Central,Magoe


In [10]:
#write country into adm0_name if adm0_name is missing or country differs from adm0_name
final_df.loc[final_df["country"] != final_df["adm0_name"], "adm0_name"] = final_df.loc[final_df["country"] != final_df["adm0_name"], "country"]

In [11]:
final_df.describe()

Unnamed: 0,DHSYEAR,DHSCLUST,ADM1DHS,DHSREGCO,LATNUM,LONGNUM,ALT_GPS,ALT_DEM,adm2_code,adm1_code,adm0_code,Shape_Area,Shape_Ar_1,adm2_cod_1,adm1_cod_1,adm0_cod_1,Shape_Ar_2
count,70756.0,70756.0,70756.0,70756.0,70756.0,70756.0,70756.0,70756.0,70537.0,70537.0,70537.0,70537.0,70721.0,70732.0,70732.0,70732.0,70732.0
mean,2010.405817,383943.5,1583.887275,33.393267,3.536423,18.061645,7385.895991,827.449021,63013.97658,25257.370486,4616.82259,0.539649,0.542024,62128.11767,25301.761508,4613.572556,0.570762
std,7.60249,2306894.0,3623.487541,491.277119,14.745476,18.725756,4215.088461,1493.371782,56143.363957,29295.233661,12703.917535,1.486384,1.482488,55796.468123,29362.900877,12699.152143,1.491075
min,1986.0,1.0,0.0,0.0,-34.463232,-17.503485,-306.0,-92.0,1379.0,398.0,6.0,5.5e-05,0.0001,1379.0,398.0,8.0,0.000101
25%,2005.0,121.0,4.0,2.0,-4.30604,0.318297,1468.0525,66.0,21987.0,1824.0,94.0,0.033509,0.034948,21977.0,1828.0,94.0,0.041683
50%,2012.0,258.0,9.0,4.0,6.052601,28.241568,9999.0,356.0,42215.0,3427.0,169.0,0.161193,0.16284,42175.0,3426.5,170.0,0.178381
75%,2016.0,505.0,37.0,8.0,12.126208,33.23919,9999.0,1181.0,66017.0,48368.0,243.0,0.53386,0.5398,65959.0,48369.0,243.0,0.568114
max,2022.0,61508060.0,10228.0,9999.0,35.858894,50.457728,9999.0,9999.0,191425.0,190430.0,40765.0,28.762152,28.762152,191424.0,190431.0,40765.0,28.762152


In [12]:
final_df.to_csv('/mnt/datadisk/data/Projects/water/inputs/all_locations_gaul_final2.csv', index=False)
