# Aggregate All Water System Data

Load, clean, and aggregate all Water Systems data. Include only piped water systems (leave non-piped systems to different analysis)

In [1]:
import pandas as pd
import geopandas as gpd

## Config

In [2]:
afc_file = "./data/AFPW - Community Water Systems.xlsx"
af_file = "./data/AFPW-PipedWaterSystems-AssuranceFund.xlsx"
hh_file = "./data/AFPW-PipedWaterSystems-HHSurvey.xlsx"
imp_file = "./data/AFPW-PipedWaterSystems-REALIMP.xlsx"
swn_file = "./data/AFPW-PipedWaterSystems-REALSWN.xlsx"

mwater_ba_file = "./data/AFPW-PipedWaterSystems-mWater-BrongAhafo.xlsx"
mwater_aev_file = "./data/AFPW-PipedWaterSystems-mWater-AshantiEasternVolta.xlsx"

In [3]:
# Regions and Select region
region_pcodes = {
    "Ashanti": "GH02",
    "Eastern": "GH06",
    "Volta": "GH14",
    "Bono": "GH03",
    "Ahafo": "GH01"
}

## Load and Clean

#### Assurance Fund - Communities

In [26]:
afc_df = pd.read_excel(afc_file, sheet_name="All Community Systems")
afc_df = afc_df.rename(columns={"Region": "REGION", "District": "DISTRICT", "Community": "COMMUNITY",
                                "Source": "SOURCE", "System Managed By": "OPERATOR",
                                "Latitude": "LATITUDE", "Longitude": "LONGITUDE",
                                "System Type": "SYSTEM_TYPE"})
afc_df["REGION"] = afc_df["REGION"].str.strip()
afc_df["COUNTRY"] = "Ghana"
afc_df = afc_df[afc_df["REGION"].isin(region_pcodes.keys())].copy()

print(afc_df.shape)
afc_df.head(1)

(56, 17)


Unnamed: 0,REGION,DISTRICT,COMMUNITY,Community (Alt. Name),SOURCE,SYSTEM_TYPE,Source of System Type,OPERATOR,Source of System Managed By,LATITUDE,LONGITUDE,Location Description,Source Comments,Population,Source of Population,Comments,COUNTRY
0,Ahafo,Asunafo North,Akrodie,,Aquaya - AF,Piped Water,Aquaya,AF,,6.692092,-2.554405,WSMT Office,WSMTs OFFICES AND STANDPIPES LOCATIONS - 2024-...,,,,Ghana


#### Assurance Fund - System Components

In [30]:
af_df = pd.read_excel(af_file)
af_df = af_df.rename(columns={"OPERATOR_MANAGER": "OPERATOR"})
af_df = af_df.dropna(subset=["LATITUDE", "LONGITUDE"])
af_df["REGION"] = af_df["REGION"].str.strip()
af_df["COUNTRY"] = "Ghana"
af_df = af_df[af_df["REGION"].isin(region_pcodes.keys())].copy()

print(af_df.shape)
af_df.head(1)

(374, 13)


Unnamed: 0,SYSTEMID,REGION,DISTRICT,COMMUNITY,SOURCE,OPERATOR,SYSTEM_TYPE,COMPONENT_TYPE,DESCRIPTION,LATITUDE,LONGITUDE,COMMENT,COUNTRY
0,7,Ahafo,Asunafo North,Akrodie,Aquaya-AF,Unknown,Piped Water,Office,,6.692092,-2.554405,,Ghana


#### Aquaya - Household Survey

In [92]:
hh_df = pd.read_excel(hh_file)
hh_df = hh_df.rename(columns={"Water System": "COMMUNITY", "Operator": "OPERATOR"})
hh_df = hh_df.drop(columns=["Unnamed: 6"])
hh_df = hh_df.dropna(subset=["LATITUDE", "LONGITUDE"])
hh_df["REGION"] = hh_df["REGION"].str.strip()
hh_df["SYSTEM_TYPE"] = "Piped Water"
hh_df = hh_df[hh_df["REGION"].isin(region_pcodes.keys())].copy()

print(hh_df.shape)
hh_df.head(1)

(135, 19)


Unnamed: 0,COUNTRY,REGION,DISTRICT,COMMUNITY,LATITUDE,LONGITUDE,Year of birth,Years in Ops(Age),TreatmentType,Number of Connections,Payment Method,OPERATOR,Total Std,Funct Std,Priv Con,Sch,Inst.,SOURCE,SYSTEM_TYPE
0,Ghana,Ashanti,Bekwai Municipal,Abodom Water System,6.442147,-1.55075,2022.0,21.0,SSF,233,Postpaid,CWSA,18.0,14.0,215.0,0.0,0.0,Aquaya-HHSurvey,Piped Water


#### REALWater IMP Study

In [99]:
imp_df = pd.read_excel(imp_file)
imp_df["REGION"] = imp_df["REGION"].str.strip()
imp_df["OPERATOR"] = "Unknown"
imp_df = imp_df.rename(columns={"Water System": "COMMUNITY"})
imp_df = imp_df[imp_df["REGION"].isin(region_pcodes.keys())].copy()

print(imp_df.shape)
imp_df.head(1)

(477, 10)


Unnamed: 0,COMMUNITY,LATITUDE,LONGITUDE,Type,LocationDescription,COUNTRY,REGION,DISTRICT,SOURCE,OPERATOR
50,Gbadzeme Water and Sanitation Management Team,6.598277,0.502011,System Location,,Ghana,Volta,Ho West,Aquaya-REALIMP,Unknown


#### Safe Water Network

In [98]:
swn_df = pd.read_excel(swn_file)
swn_df = swn_df.rename(columns={"MANAGED / OPERATED BY": "OPERATOR"})
swn_df = swn_df[swn_df["REGION"].isin(region_pcodes.keys())].copy()

print(swn_df.shape)
swn_df.head(1)

(69, 11)


Unnamed: 0,COUNTRY,REGION,DISTRICT,COMMUNITY,SOURCE,OPERATOR,COMPONENT TYPE,DESCRIPTION,LATITUDE,LONGITUDE,COMMENT
15,Ghana,Ahafo,Tano North,Adrobaa,Aquaya-REAL-SWN,SWN,Unknown,Sample storage tank or other component,7.26635,-2.137014,


#### mWater

In [24]:
mba_df = pd.read_excel(mwater_ba_file)
maev_df = pd.read_excel(mwater_aev_file)
mwater_df = pd.concat([mba_df, maev_df])

mwater_df = mwater_df.rename(columns={"Water System": "COMMUNITY", "Managed By": "OPERATOR"})
mwater_df["REGION"] = mwater_df["REGION"].str.strip()
mwater_df = mwater_df[mwater_df["REGION"].isin(region_pcodes.keys())].copy()

print(mwater_df.shape)
mwater_df.head(1)

(2276, 27)


Unnamed: 0,COMMUNITY,Description,Type,DISTRICT,REGION,COUNTRY,LATITUDE,LONGITUDE,Location Accuracy,Location Altitude,...,Date added,Date last modified,Drilling method,Pump/lifting device,Supply for groundwater system,Supply for piped system,Treatment works,Depth (m),Photos,SOURCE
0,Maame Ebuah,,Piped into public tap / standpipe / basin,Ahafo Ano North,Ashanti,Ghana,6.933563,-2.23958,0.0,,...,2021-03-23T09:35:10.565Z,2023-10-25T21:44:01.904Z,,,,,,,,Public-mWater


## Compile master of all Systems (and Components) Data

In [1]:
sorted((set(afc_df.columns) & set(af_df.columns) & set(hh_df.columns) & set(imp_df.columns) & set(swn_df.columns) & set(mwater_df.columns)))

NameError: name 'afc_df' is not defined

In [101]:
col_superset = ['COUNTRY', 'REGION', 'DISTRICT', 'COMMUNITY', 'SOURCE', 'OPERATOR', 'LATITUDE', 'LONGITUDE']

In [102]:
systems_df = pd.concat([afc_df[col_superset].copy(),
                        af_df[col_superset].copy(),
                        hh_df[col_superset].copy(),
                        imp_df[col_superset].copy(),
                        swn_df[col_superset].copy(),
                        mwater_df[col_superset].copy()])

systems_gdf = gpd.GeoDataFrame(systems_df, geometry=gpd.points_from_xy(systems_df["LONGITUDE"], systems_df["LATITUDE"]), crs="EPSG:4326")
systems_gdf.shape

(3387, 9)

In [103]:
systems_gdf.head(1)

Unnamed: 0,COUNTRY,REGION,DISTRICT,COMMUNITY,SOURCE,OPERATOR,LATITUDE,LONGITUDE,geometry
0,Ghana,Ahafo,Asunafo North,Akrodie,Aquaya - AF,AF,6.692092,-2.554405,POINT (-2.55441 6.69209)


In [104]:
systems_gdf.tail(1)

Unnamed: 0,COUNTRY,REGION,DISTRICT,COMMUNITY,SOURCE,OPERATOR,LATITUDE,LONGITUDE,geometry
2275,Ghana,Eastern,Fanteakwa,Gideon Ofori Twumasi,Public-mWater,Safe Water Network,6.323716,-0.498676,POINT (-0.49868 6.32372)


## Explore

In [105]:
systems_gdf["REGION"].value_counts()

REGION
Ashanti    1314
Volta       817
Eastern     781
Bono        314
Ahafo       161
Name: count, dtype: int64

In [116]:
systems_gdf["SOURCE"].value_counts()

SOURCE
Public-mWater      2276
Aquaya-REALIMP      477
Aquaya-AF           430
Aquaya-HHSurvey     135
Aquaya-REAL-SWN      69
Name: count, dtype: int64

In [109]:
systems_gdf["OPERATOR"].value_counts()

OPERATOR
Safe Water Network    1762
Unknown                830
World Vision WASH      401
Akatsi                 161
WSMT                    77
Public                  69
Project Maji            43
CWSA                    22
WaterAid                 5
Name: count, dtype: int64

## Fixes

In [115]:
systems_gdf.loc[systems_gdf["SOURCE"] == "Aquaya - AF", "SOURCE"] = "Aquaya-AF"

systems_gdf.loc[systems_gdf["OPERATOR"] == "ristome (user)", "OPERATOR"] = "Unknown"
systems_gdf.loc[systems_gdf["OPERATOR"] == "Ahafo Region-Ghana", "OPERATOR"] = "Unknown"
systems_gdf.loc[systems_gdf["OPERATOR"] == "SWN, WSMT", "OPERATOR"] = "SWN"
systems_gdf.loc[systems_gdf["OPERATOR"] == "CWSA, WSMT", "OPERATOR"] = "CWSA"
systems_gdf.loc[systems_gdf["OPERATOR"] == "Nana233 (user)", "OPERATOR"] = "Unknown"
systems_gdf.loc[systems_gdf["OPERATOR"] == "World Vision WASH -> WV Ghana WASH", "OPERATOR"] = "World Vision WASH"
systems_gdf.loc[systems_gdf["OPERATOR"] == "AF", "OPERATOR"] = "Unknown"
systems_gdf.loc[systems_gdf["OPERATOR"] == "SWN", "OPERATOR"] = "Safe Water Network"

## Map

In [110]:
systems_gdf.explore(column="SOURCE", cmap="tab10", tiles="Cartodb positron")

In [111]:
systems_gdf.explore(column="OPERATOR", cmap="tab10", tiles="Cartodb positron")

## Export

In [117]:
export_pref = "./export/AFPW-AllPipedSystemsAndComponents"

In [118]:
systems_gdf.to_file(export_pref + ".geojson")

In [119]:
pd.DataFrame(systems_gdf.drop(columns=["geometry"])).to_excel(export_pref + ".xlsx", index=False)