# Data Cleaning: REAL / SWN data

(source: Valerie Bauza, Aquaya REAL Water and SWN)

Two files:

- SWN INTERNAL - Station Location GPS 16 Feb 2023.xlsx (original file from SWN; incomplete and inaccurate, but it includes district and region information)
- SWN INTERNAL - SWN_Storage Tank_GPS.xlsx (updated file from VB with surveyed storage tank locations)

In [42]:
import pandas as pd

In [43]:
# Input workbooks: the original SWN station list and the updated storage-tank survey.
comms_file, tanks_file = (
    "./data/SWN INTERNAL - Station Location GPS 16 Feb 2023.xlsx",
    "./data/SWN INTERNAL - SWN_Storage Tank_GPS.xlsx",
)

## Load and basic clean

In [44]:
# Read the community sheet, skipping its first three rows, and discard the
# "#" column.
comms_df = pd.read_excel(comms_file, skiprows=3).drop(columns=["#"])
comms_df.shape

(115, 5)

In [45]:
# Preview the first two rows of the community table.
comms_df.head(2)

Unnamed: 0,Community,Region,District,GPS LAT.,GPS LONG
0,Beyin,Western,Jomorro,,
1,Eikwe,Western,Ellembelle,4.96498,-2.4666


In [98]:
# Load the storage-tank survey sheet.
tanks_df = pd.read_excel(tanks_file)

# The GPS field is a single space-separated string; its first two tokens are
# used as latitude and longitude and any further tokens are discarded.
gps_parts = tanks_df["form.storageTank_si.gps"].str.split(" ", expand=True)

# str.split produces text, so convert to floats; otherwise the coordinates
# would be written to the export as text cells rather than numbers.
tanks_df[["LATITUDE", "LONGITUDE"]] = gps_parts[[0, 1]].astype(float)
tanks_df = tanks_df.drop(columns=["form.storageTank_si.gps"])
tanks_df.shape

(100, 4)

In [99]:
# Preview the first three tank rows with the split LATITUDE / LONGITUDE columns.
tanks_df.head(3)

Unnamed: 0,form.community.id,community_name,LATITUDE,LONGITUDE
0,1,Beyin,4.9892601,-2.596853
1,2,Eikwe,4.9649546,-2.466758
2,3,Ayisakro,4.8944028,-2.2125789


## Name Match for Admin Info Assignment

In [100]:
# Attach REGION / DISTRICT to every tank by exact match on the community name.
# Rows without a community name are ignored, and when a community is listed
# more than once the first listing wins.
admin_lookup = (
    comms_df
    .dropna(subset=["Community"])
    .drop_duplicates(subset="Community", keep="first")
    .set_index("Community")
)

# Series.map leaves NaN where a name has no entry, and both columns are always
# created, so the completeness check that follows can run even if nothing matched.
tanks_df["REGION"] = tanks_df["community_name"].map(admin_lookup["Region"])
tanks_df["DISTRICT"] = tanks_df["community_name"].map(admin_lookup["District"])

# Report each tank whose community could not be found so the names can be
# reconciled by hand.
unmatched = ~tanks_df["community_name"].isin(admin_lookup.index)
for community in tanks_df.loc[unmatched, "community_name"]:
    print(f"Tank Community {community} missing match")

In [101]:
# Inspect the tank table after the REGION / DISTRICT assignment.
tanks_df

Unnamed: 0,form.community.id,community_name,LATITUDE,LONGITUDE,REGION,DISTRICT
0,1,Beyin,4.9892601,-2.596853,Western,Jomorro
1,2,Eikwe,4.9649546,-2.466758,Western,Ellembelle
2,3,Ayisakro,4.8944028,-2.2125789,Western,Nzema East
3,4,Apataim,4.9183262,-2.2319688,Western,Nzema East
4,5,Agyan,4.8325023,-2.2007041,Western,Nzema East
...,...,...,...,...,...,...
95,99,Tapa Amanya,7.4549086,0.3400183,Oti,Biakoye
96,100,Apesokubi,7.565834,0.403682,Oti,Biakoye
97,101,Aveme,6.9355183,0.2649983,Volta,North Dayi
98,101,Aveme,6.9356602,0.2649907,Volta,North Dayi


In [102]:
# Every tank must have both a region and a district; check the two columns
# separately so that a gap in either one fails the run.
assert tanks_df["REGION"].isna().sum() == tanks_df["DISTRICT"].isna().sum() == 0

## Final Cleanup and Export

In [103]:
# Drop the survey id and rename the community column for the export.
tanks_df = (
    tanks_df
    .drop(columns=["form.community.id"])
    .rename(columns={"community_name": "COMMUNITY"})
)

# Columns that carry the same value for every row of this dataset.
constant_columns = {
    "COUNTRY": "Ghana",
    "SOURCE": "Aquaya-REAL-SWN",
    "MANAGED / OPERATED BY": "SWN",
    "COMPONENT TYPE": "Unknown",
    "DESCRIPTION": "Sample storage tank or other component",
    "COMMENT": "",
}
for column_name, constant_value in constant_columns.items():
    tanks_df[column_name] = constant_value

In [104]:
# Preview the first two rows after renaming and adding the constant columns.
tanks_df.head(2)

Unnamed: 0,COMMUNITY,LATITUDE,LONGITUDE,REGION,DISTRICT,COUNTRY,SOURCE,MANAGED / OPERATED BY,COMPONENT TYPE,DESCRIPTION,COMMENT
0,Beyin,4.9892601,-2.596853,Western,Jomorro,Ghana,Aquaya-REAL-SWN,SWN,Unknown,Sample storage tank or other component,
1,Eikwe,4.9649546,-2.466758,Western,Ellembelle,Ghana,Aquaya-REAL-SWN,SWN,Unknown,Sample storage tank or other component,


In [105]:
# Column order used for the exported sheet.
export_columns = [
    "COUNTRY",
    "REGION",
    "DISTRICT",
    "COMMUNITY",
    "SOURCE",
    "MANAGED / OPERATED BY",
    "COMPONENT TYPE",
    "DESCRIPTION",
    "LATITUDE",
    "LONGITUDE",
    "COMMENT",
]
tanks_df = tanks_df[export_columns]
tanks_df.tail(2)

Unnamed: 0,COUNTRY,REGION,DISTRICT,COMMUNITY,SOURCE,MANAGED / OPERATED BY,COMPONENT TYPE,DESCRIPTION,LATITUDE,LONGITUDE,COMMENT
98,Ghana,Volta,North Dayi,Aveme,Aquaya-REAL-SWN,SWN,Unknown,Sample storage tank or other component,6.9356602,0.2649907,
99,Ghana,Oti,Biakoye,Worawora,Aquaya-REAL-SWN,SWN,Unknown,Sample storage tank or other component,7.5143567,0.38113,


In [106]:
# Write the cleaned table to the data folder, leaving out the DataFrame index.
export_path = "./data/AFPW-PipedWaterSystems-REALSWN.xlsx"
tanks_df.to_excel(export_path, index=False)