# Purpose
This program extracts the part of the DHS survey data that deals with water sources.

In [13]:
import os
import pandas as pd
import pyreadstat
from zipfile import ZipFile
import shutil
from ethiopian_date import EthiopianDateConverter as edc
import datetime

In [14]:
# Optional if ZIP Data with FL.zip ending was downloaded
def remove_FL(dir_name:str):
    """Delete every entry of ``dir_name`` whose name ends in ``FL.zip`` or ``FL.ZIP``.

    Args:
        dir_name: Directory holding the downloaded survey archives.
    """
    flat_suffixes = ("FL.zip", "FL.ZIP")
    # Decide what to delete from a single directory snapshot, then delete.
    doomed = [entry for entry in os.listdir(dir_name) if entry.endswith(flat_suffixes)]
    for entry in doomed:
        os.remove(os.path.join(dir_name, entry))
            
# Optional: only needed if surveys other than Household Surveys (HR) were downloaded
def remove_all_except_HR(dir_name:str):
    """Delete all zip archives in ``dir_name`` that are not HR surveys.

    An archive counts as an HR survey when characters 3-4 of its file name
    are ``HR``. Entries that are not ``.zip``/``.ZIP`` files are left alone.

    Args:
        dir_name: Directory holding the downloaded survey archives.
    """
    for entry in os.listdir(dir_name):
        is_archive = entry.endswith((".zip", ".ZIP"))
        if is_archive and entry[2:4] != "HR":
            os.remove(os.path.join(dir_name, entry))

In [15]:
def get_sav(listOfFileNames:list, newpath:str, zipObject: ZipFile):
    """Extract only the SAV members of an open archive into ``newpath``.

    Args:
        listOfFileNames: Member names of the archive.
        newpath: Folder that receives the extracted SAV files.
        zipObject: The already opened archive the members belong to.
    """
    sav_members = (member for member in listOfFileNames
                   if member.endswith(('.sav', '.SAV')))
    for member in sav_members:
        zipObject.extract(member, newpath)
            
            
def extract_bigger_zip(zip_dir:str,filenames:list,newpath:str):
    """Unpack an archive that contains archives and continue with the largest one.

    The outer archive is extracted into a temporary folder next to it (same
    path without the extension). Among the nested ``.zip``/``.ZIP`` members
    the one with the largest size in bytes is handed to ``check_sav``, which
    either extracts its SAV file or descends another level.

    Args:
        zip_dir: Path of the outer zip file.
        filenames: Member names of the outer zip file.
        newpath: Folder that finally receives the SAV file.
    """
    # splitext strips only the final extension, so dots in parent folders
    # (or a missing extension) no longer produce a wrong folder name.
    zips_dir = os.path.splitext(zip_dir)[0]
    if zips_dir == zip_dir:
        # No extension to strip: avoid colliding with the archive itself.
        zips_dir = zip_dir + '_extracted'
    os.makedirs(zips_dir, exist_ok=True)

    # Extract the nested archives into the temporary folder
    with ZipFile(zip_dir, 'r') as zipObj:
        zipObj.extractall(zips_dir)

    big_size = 0
    big_zip = None

    # Pick the largest nested archive; on equal size the later member wins
    for item in filenames:
        if item.endswith(('.zip', '.ZIP')):
            file_dir = os.path.join(zips_dir, item)
            curr_size = os.stat(file_dir).st_size
            if curr_size >= big_size:
                big_size = curr_size
                big_zip = file_dir

    if big_zip is None:
        # Nothing to descend into; calling check_sav(None, ...) would fail.
        return

    # check_sav decides whether the chosen archive holds a SAV file or
    # yet another level of archives
    check_sav(big_zip, newpath)
    
#Second mainpart for single zip file        
def check_sav(zip_dir:str, newpath:str):
    """Route one archive: extract its SAV files, or descend into nested archives.

    If at least one member ends in ``.sav``/``.SAV`` the SAV members are
    extracted via ``get_sav``. Otherwise, if at least one member is itself a
    zip file, ``extract_bigger_zip`` takes over. Archives with neither are
    ignored.

    Args:
        zip_dir: Path of the zip file to inspect.
        newpath: Folder that receives the SAV files.
    """
    with ZipFile(zip_dir, 'r') as archive:
        members = archive.namelist()

        holds_sav = any(name.endswith(('.sav', '.SAV')) for name in members)
        if holds_sav:
            get_sav(members, newpath, archive)
            return

        holds_zip = any(name.endswith(('.zip', '.ZIP')) for name in members)
        if holds_zip:
            extract_bigger_zip(zip_dir, members, newpath)
            
def delete_zip_folders(dir_name:str,newpath:str):
    """Remove every sub-folder of ``dir_name`` except ``newpath``.

    Used to clean up the temporary folders created while unpacking nested
    archives. Note that ALL other sub-folders of ``dir_name`` are deleted,
    not only the temporary ones.

    Args:
        dir_name: Directory whose sub-folders are removed.
        newpath: Folder to keep (the SAV output folder).
    """
    # Compare normalised absolute paths: a trailing separator, "./" or a
    # relative spelling of newpath must not cause the kept folder to be deleted.
    keep = os.path.normcase(os.path.abspath(newpath))
    # Snapshot the sub-folders first so deletion does not disturb the scan.
    subfolders = [entry.path for entry in os.scandir(dir_name) if entry.is_dir()]
    for folder in subfolders:
        if os.path.normcase(os.path.abspath(folder)) != keep:
            shutil.rmtree(folder)
            
#Main part-> runs through all zip files in a directory  
def extract_sav(dir_name:str, newpath:str):
    '''Collect the SAV file of every survey archive in one folder.

    The HR surveys retrieved (for SPSS) usually contain a SAV file holding
    all relevant information. Each ``.zip``/``.ZIP`` in ``dir_name`` is
    passed to ``check_sav``; afterwards the temporary folders created while
    unpacking are removed via ``delete_zip_folders``.

    Args:
        dir_name: Directory holding the survey archives.
        newpath: Folder that receives the SAV files (created if missing).

    Returns:
        ``newpath``.
    '''
    # Make sure the SAV output folder exists
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    # Survey archives found in dir_name
    archives = [entry for entry in os.listdir(dir_name)
                if entry.endswith(('.zip', '.ZIP'))]

    # Pull the SAV file (if any) out of each archive
    for archive in archives:
        check_sav(os.path.join(dir_name, archive), newpath)

    # Clean up the folders created during unpacking
    delete_zip_folders(dir_name, newpath)

    return newpath

In [24]:
# Create csv data with information of year, water source, and region type for each cluster

# Find the label number for the question regarding water source
def find_water_var(df: pd.DataFrame, meta_dict: dict):
    """Find the column that holds the drinking-water source question.

    A column qualifies when its label contains "source of drinking water"
    (case-insensitive) and the column has at least one non-null value. The
    first qualifying column is returned; further candidates are only reported.

    Args:
        df: Survey data.
        meta_dict: Mapping of column name to column label (a label may be None).

    Returns:
        The column name, or None if no column qualifies.
    """
    if not meta_dict:
        return None

    water = None
    for column, label in meta_dict.items():
        if label is None:
            # Unlabelled column: report it and move on
            print('Water', label)
        elif "source of drinking water" in label.lower():
            # Skip candidates that carry no data at all
            if df[column].notna().to_numpy().any():
                if water is None:
                    water = column
                else:
                    print('Water...2nd possibility', column)
    return water

# Find the label number for the cluster number
def find_cluster_var(df: pd.DataFrame, meta_dict: dict):
    """Find the column that holds the cluster number.

    A column qualifies when its label contains "cluster number"
    (case-insensitive) and the column has at least one non-null value. The
    first qualifying column is returned.

    Args:
        df: Survey data.
        meta_dict: Mapping of column name to column label (a label may be None).

    Returns:
        The column name, or None if no column qualifies.
    """
    if not meta_dict:
        return None

    for column, label in meta_dict.items():
        if label is None:
            # Unlabelled column: report it and move on
            print(label)
        elif "cluster number" in label.lower():
            # Skip candidates that carry no data at all
            if df[column].notna().to_numpy().any():
                return column

    return None

# Find the label number for the year the survey was conducted
def find_year_var(df: pd.DataFrame, meta_dict: dict):
    """Find the column that holds the year of the interview.

    A column qualifies when its label contains "year of interview"
    (case-insensitive) and the column has at least one non-null value. The
    first qualifying column is returned.

    Args:
        df: Survey data.
        meta_dict: Mapping of column name to column label (a label may be None).

    Returns:
        The column name, or None if no column qualifies.
    """
    if not meta_dict:
        return None

    for column, label in meta_dict.items():
        if label is None:
            # Unlabelled column: report it and move on
            print(label)
        elif "year of interview" in label.lower():
            # Skip candidates that carry no data at all
            if df[column].notna().to_numpy().any():
                return column
    return None

# Find the label number for the residence type
def find_residence_var(df: pd.DataFrame, meta_dict: dict):
    """Find the column that holds the type of place of residence.

    A column qualifies when its label contains "type of place of residence"
    (case-insensitive) and the column has at least one non-null value. The
    first qualifying column is returned.

    Args:
        df: Survey data.
        meta_dict: Mapping of column name to column label (a label may be None).

    Returns:
        The column name, or None if no column qualifies.
    """
    if not meta_dict:
        return None

    for column, label in meta_dict.items():
        if label is None:
            # Unlabelled column: report it and move on
            print(label)
        elif "type of place of residence" in label.lower():
            # Skip candidates that carry no data at all
            if df[column].notna().to_numpy().any():
                return column
    return None

# Convert the Ethiopian calendar years used in HR surveys to Gregorian years (not needed for GE data)
def get_eth_to_gregorian(df_year:pd.DataFrame, year_HV:str, len_row:int):
    """Build a column of Gregorian years for a survey dated in the Ethiopian calendar.

    One Ethiopian year spans two Gregorian years. The earliest and latest
    Ethiopian year of the survey are each translated into their two Gregorian
    years and these are spread over ``len_row`` rows in equally sized blocks:
    four blocks when the survey covers two different Ethiopian years, two
    blocks when it covers a single one. The assignment is by position only;
    it is not derived from the interview year of the individual row.

    Rows left over by the integer division are filled with the latest
    Gregorian year, so the returned list never mixes Ethiopian and Gregorian
    values.

    Args:
        df_year: Survey data containing the year column.
        year_HV: Name of the column holding the (Ethiopian) interview year.
        len_row: Number of rows the resulting list must have.

    Returns:
        A list of ``len_row`` Gregorian years.
    """

    def gregorian_span(year):
        # Gregorian year of the first day of the Ethiopian year
        first = edc.to_gregorian(int(year), 1, 1).year
        # Probe day 5 of months 1-12 and keep the last Gregorian year that
        # differs from the first one
        second = None
        for month in range(1, 13):
            probe = edc.to_gregorian(int(year), month, 5).year
            if probe != first:
                second = probe
        return first, second

    # Earliest and latest Ethiopian year in which the survey was conducted
    year_min = df_year[year_HV].min()
    year_max = df_year[year_HV].max()

    if year_max != year_min:
        spans = [gregorian_span(year_min), gregorian_span(year_max)]
    else:
        spans = [gregorian_span(year_max)]

    # Two blocks per Ethiopian year, all of the same length
    block_len = len_row // (2 * len(spans))
    gregorian_list = []
    for first, second in spans:
        gregorian_list.extend([first] * block_len)
        gregorian_list.extend([second] * block_len)

    # Pad the rows lost to integer division with the latest Gregorian year
    latest_first, latest_second = spans[-1]
    fill_year = latest_second if latest_second is not None else latest_first
    gregorian_list.extend([fill_year] * (len_row - len(gregorian_list)))

    return gregorian_list

def get_csv(file:str, export_path:str) -> None:
    """Write the per-cluster water-source table of one SAV survey file as CSV.

    The output has one row per cluster, one count column per water-source
    label, plus the columns 'year' and 'residence'. It is written to
    ``<export_path>/<survey name>-water_source.csv``.

    Errors raised while building or exporting the table are caught and
    printed; errors raised while reading the SAV file are not caught.

    Args:
        file: Path of the SAV file.
        export_path: Folder that receives the CSV file.
    """
    # Read the SAV file into the data frame df and its metadata object meta
    df, meta = pyreadstat.read_sav(file, encoding = 'LATIN1')
    # Map every column name to its column label
    meta_dict = dict(zip(meta.column_names, meta.column_labels))
    
    cluster = None
    water = None
    year = None
    residence = None
    # The relevant questions usually have the same column name (e.g. HV201) in all surveys. Where the
    # usual name is missing, the matching find_*_var function searches the column labels instead and
    # the result is printed together with the file name.
    
    # Water source: usually HV201, otherwise search via find_water_var
    if 'HV201' in meta_dict.keys():
        water = 'HV201'
    else:
        water = find_water_var(df, meta_dict)
        print('Water',water,file[file.rfind('/'):])
    # Cluster number: usually HV001, otherwise search via find_cluster_var
    if 'HV001' in meta_dict.keys():
        cluster = 'HV001'
    else:
        cluster = find_cluster_var(df, meta_dict)
        print('Cluster',cluster, file[file.rfind('/'):])
    
    # Year of interview: usually HV007, otherwise search via find_year_var
    if 'HV007' in meta_dict.keys():
        year = 'HV007'
    else:
        year = find_year_var(df, meta_dict)
        print('Year',year, file[file.rfind('/'):])
        
    # Type of place of residence: usually HV025, otherwise search via find_residence_var
    if 'HV025' in meta_dict.keys():
        residence = 'HV025'
    else:
        residence = find_residence_var(df, meta_dict)
        print('residence', residence, file[file.rfind('/'):])
    
    
    try:      
        # Cross tab: clusters as rows, one column per water-source label (codes mapped to their value labels)
        crosstab = pd.crosstab(df[cluster], df[water].map(meta.variable_value_labels[water]),rownames = ["cluster"],colnames = ["Properties"], dropna=True)
        # Survey name: file name without directory and without extension
        # NOTE(review): if `file` contains no '.', rfind returns -1 and the last character is cut off
        filename = os.path.basename(file[:file.rfind(".")])
        # Add the year column to the cross tab
        
        # In Ethiopian HR surveys the year is given in the Ethiopian calendar, so it has to be
        # translated into the Gregorian calendar
        if filename.startswith('ETHR'):
            # Translation into the Gregorian calendar (one value per cross-tab row)
            gregorian_list = get_eth_to_gregorian(df, year, len(crosstab))
            # Append the translated years as the last column
            crosstab.insert(loc = len(crosstab.columns), column = "year", value = gregorian_list)  
        else:               
            # Cross tab cluster x year, e.g. cluster 24 -> 2014: 1, 2015: 0 [column name: count]
            table = pd.crosstab(df[cluster] , df[year], rownames = ["cluster"], colnames= ["year"], dropna=True)
            # Collapse the per-year columns into one value per cluster: idxmax picks the year with the
            # highest count (e.g. 2014: 0, 2015: 1 for cluster 24 gives year 2015)
            years = table.idxmax(axis=1)
            # Add the years to the cross tab as column 'year'
            crosstab['year'] = years

        # Add the residence type
        # Cross tab cluster x residence label, e.g. cluster 24 -> urban: 1, rural: 0 [column name: count]
        residence_tab = pd.crosstab(df[cluster], df[residence].map(meta.variable_value_labels[residence]), rownames = ["Cluster"], dropna=True)
        # Collapse the per-type columns into one value per cluster: idxmax picks the residence type
        # with the highest count
        residences = residence_tab.idxmax(axis=1)
        # Add the residence types to the cross tab as column 'residence'
        crosstab['residence'] = residences
        
        # Rename a column called 'Unnamed: 0' to 'cluster'
        # NOTE(review): the cluster numbers are the index (already named "cluster" via rownames), so
        # this rename looks like a no-op here - confirm before relying on it
        crosstab.rename( columns={'Unnamed: 0':'cluster'}, inplace=True )
        # Path of the exported file
        export = os.path.join(export_path, filename+'-water_source.csv')
        # Export the cross tab as CSV file
        crosstab.to_csv(export)
    except Exception as e:
        # Best effort per survey: report the failure and continue with the next file
        print('Error', os.path.basename(file), e)

def create_csv(dir_sav:str, dir_csv:str):
    '''Create one water-source CSV file per survey SAV file.

    Every ``.sav`` file found in ``dir_sav`` (including sub-folders, which
    appear when a SAV file was stored inside a folder of its archive) is
    passed to ``get_csv``. The resulting CSV contains the columns 'cluster'
    (number), 'year', 'residence' (type) and one column per water source,
    with a separate row for each cluster of the survey.

    Args:
        dir_sav: Folder holding the extracted SAV files.
        dir_csv: Folder that receives the CSV files (created if missing).
    '''

    # Create the general CSV directory where all CSV files are stored
    if not os.path.exists(dir_csv):
        os.makedirs(dir_csv)

    # Only real SAV files are processed: a sub-folder or stray file handed to
    # get_csv would abort the whole run.
    for root, _subdirs, files in os.walk(dir_sav):
        for name in files:
            if name.lower().endswith('.sav'):
                get_csv(os.path.join(root, name), dir_csv)
    return

In [17]:
'''As the Sentinel-2 data used as input for the neural network is only available since May 2015, we decided to
only use surveys which were conducted from 2013 onwards. Hence, this part moves all surveys conducted before
2013 to a separate folder. Note: as soon as one cluster has the year 2013 or later, the survey is not moved to
the separate folder.'''

def split_before_2013 (export_path:str):
    """Move surveys conducted entirely before 2013 into ``before_2013``.

    Every ``.csv`` file in ``export_path`` whose largest value in the
    'year' column is below 2013 is moved into the sub-folder ``before_2013``
    (created if missing).

    Args:
        export_path: Folder holding the per-survey CSV files.
    """
    # Target folder for the old surveys
    archive_dir = os.path.join(export_path, "before_2013")
    if not os.path.exists(archive_dir):
        os.makedirs(archive_dir)

    for name in os.listdir(export_path):
        if not name.endswith('.csv'):
            continue
        source = os.path.join(export_path, name)
        latest_year = pd.read_csv(source, usecols = ['year'])['year'].max()
        # A single cluster from 2013 or later keeps the survey in place
        if latest_year < 2013:
            os.rename(source, os.path.join(archive_dir, name))
    

In [18]:
'''As we use Sentinel data as input for the NN, we need the locations of the clusters in order to retrieve it.
The locations are given in the DHS GeoData files, but not every HR survey has a corresponding GeoData file. Hence,
all surveys which do not have a GeoData file are moved to a separate folder. This step requires that
the program satellite_images_gee was already executed, or at least that the csv files for the GeoData were already
created.'''
def check_year_country (gps_dir:str, water_file:str):
    """Tell whether a water-source CSV has NO matching GeoData CSV.

    A GeoData CSV in ``gps_dir`` matches when its reference year occurs in
    the 'year' column of the water file, both files have the same number of
    rows, and both file names start with the same two-letter country code.
    On the first match both survey names are printed.

    Args:
        gps_dir: Folder holding the GeoData CSV files.
        water_file: Path of the water-source CSV of one HR survey.

    Returns:
        False if a matching GeoData file exists, True otherwise (the caller
        then moves the survey away).
    """
    country_water = os.path.basename(water_file)[:2]
    # Read lazily and only once instead of once per GeoData file
    water_years = None

    for gps_csv in os.listdir(gps_dir):
        if not gps_csv.endswith('.csv'):
            continue
        gps_file = os.path.join(gps_dir, gps_csv)
        # Only the year column of the GeoData file is needed
        gps_years = pd.read_csv(gps_file, usecols = ['year'])
        if gps_years.empty:
            # No row to take a reference year from
            continue
        # One arbitrary row serves as reference year: row 1 as before, row 0
        # when the file has a single row (row 1 would raise there)
        reference_row = 1 if len(gps_years) > 1 else 0
        gps_year = gps_years['year'].iloc[reference_row]

        if water_years is None:
            water_years = pd.read_csv(water_file, usecols = ['year'])

        # The reference year must occur somewhere in the survey
        if not (water_years['year'] == gps_year).any():
            continue
        # Same number of clusters
        if len(gps_years) != len(water_years):
            continue
        # Same country (first two letters of the file names)
        if os.path.basename(gps_file)[:2] != country_water:
            continue

        water = os.path.basename(water_file).rsplit('-', 1)[0]
        gps = os.path.splitext(os.path.basename(gps_file))[0]
        print(water, gps)
        return False
    return True

#Main function of this part
def split_no_gps(dir_csv:str, dir_no_gps:str, dir_gps_zips:str, dir_gps_csv:str):
    """Move HR surveys without a corresponding GeoData file into ``dir_no_gps``.

    For every ``.csv`` in ``dir_csv`` the survey name with 'HR' replaced by
    'GE' is looked up in the names of the GeoData CSV files. If no name
    contains it, ``check_year_country`` decides whether a GeoData file
    matches by year, size and country; if that also finds nothing, the
    survey CSV is moved.

    Args:
        dir_csv: Folder holding the water-source CSV files.
        dir_no_gps: Target folder for surveys without GeoData (created if missing).
        dir_gps_zips: Folder holding the GeoData archives.
        dir_gps_csv: Folder holding the GeoData CSV files.
    """
    # Target folder for the HR surveys without GeoData
    if not os.path.exists(dir_no_gps):
        os.makedirs(dir_no_gps)

    survey_files = os.listdir(dir_csv)
    # The listing itself is not used; kept so that a missing GeoData archive
    # folder still raises here.
    os.listdir(dir_gps_zips)
    gps_csv_names = os.listdir(dir_gps_csv)

    for survey in survey_files:
        if not survey.endswith('.csv'):
            continue
        # Some HR and GE files differ only in the 'HR'/'GE' abbreviation, so
        # try the direct name match first (name clipped at the last '-').
        candidate = survey.replace('HR', 'GE')[:survey.rfind('-')]
        if any(candidate in gps_name for gps_name in gps_csv_names):
            continue
        origin = os.path.join(dir_csv, survey)
        # check_year_country returns True when no GeoData file matches
        if check_year_country(dir_gps_csv, origin):
            os.rename(origin, os.path.join(dir_no_gps, survey))

In [19]:
# Join all single csv files into one big csv file. This simplifies the later code that creates the categorical
# labels for the CNN. Furthermore, it makes it easier to analyze the water sources and their naming across all surveys.

def create_single_csv(dir_csv:str, path:str):
    """Join all per-survey CSV files of ``dir_csv`` into one CSV file.

    Each file gets a leading 'ID' column holding the survey name, i.e. the
    file name up to the first '-' (or the name without extension if it has
    no '-'). Files are processed in sorted order so the output is
    reproducible.

    Args:
        dir_csv: Folder holding the per-survey CSV files.
        path: Path of the joined CSV file. An existing file is replaced.
    """
    # Delete a previous result BEFORE listing the folder: if it lives inside
    # dir_csv it would otherwise be listed, then removed, then fail to load.
    if os.path.exists(path):
        os.remove(path)

    frames = []
    for file in sorted(os.listdir(dir_csv)):
        if not file.endswith('.csv'):
            continue
        current_csv = pd.read_csv(os.path.join(dir_csv, file))
        # Survey name: clip at '-water_source.csv'
        if '-' in file:
            survey_id = file.split('-', 1)[0]
        else:
            survey_id = os.path.splitext(file)[0]
        current_csv.insert(loc=0, column='ID', value=[survey_id] * len(current_csv))
        frames.append(current_csv)

    # One concat at the end instead of growing the frame inside the loop
    big_csv = pd.concat(frames) if frames else pd.DataFrame()

    # Export as CSV file
    big_csv.to_csv(path, index = False)


In [20]:
# We manually joined columns as discussed with Sven & Yvonne (June 2021) to minimize the number of possible labels

# Coarse water-source labels and the survey categories that are summed into
# each of them. The grouping was agreed manually to reduce the number of labels.
_WATER_SOURCE_GROUPS = {
    'piped': [
        'Piped into dwelling', 'Piped to yard/plot', "Piped to neighbour's house",
        'Piped to neighbor', 'Public fountain', 'Public tap/standpipe',
        'Piped from the neighbor', 'Public to neighborhood', "Neighbour's tap",
        "Neighbor's house",
    ],
    'groundwater': [
        'Tube well or borehole', 'Hand pump / Tube well or borehole',
        'Borehole in yard/plot', 'Public borehole', 'Borehole with pump',
        'Protected well', 'Unprotected well', 'Protected spring',
        'Unprotected spring',
    ],
    'surface water': [
        'Lake/pond/river/channel/irrigation channel',
        'River/dam/lake/ponds/stream/canal/irrigation channel',
        'Gravity flow scheme',
    ],
    'rain': ['Rainwater'],
    'external source': [
        'Tanker truck', 'Cart with small tank', 'Vendor',
        'Motorcycle with three wheels', 'Bicycle with jerrycans',
    ],
    'bottled water': [
        'Bottled water', 'Bottled water or sachets', 'Water sachets',
        'Bag water', 'Sachet water (in a bag)', 'Sachet',
        'Water in plastic bag', 'Water in sachet', 'Sachet water',
        'Mineral water in sachet',
    ],
}


def merge_columns_big_csv(dir_big_csv:str, path:str):
    """Collapse the many water-source columns of the joined CSV into six groups.

    The output keeps 'ID', 'cluster', 'residence' and 'year' unchanged and
    adds one column per group ('piped', 'groundwater', 'surface water',
    'rain', 'external source', 'bottled water') holding the row-wise sum of
    the group's source columns. Missing values count as 0. Source columns
    that do not occur in the input are skipped, so the function also works
    when the downloaded surveys do not cover every known category.

    Args:
        dir_big_csv: Path of the joined CSV file.
        path: Path of the grouped CSV file. An existing file is overwritten.

    Raises:
        KeyError: If one of 'ID', 'cluster', 'residence', 'year' is missing.
    """
    df = pd.read_csv(dir_big_csv)

    # Identification columns are taken over before missing values are replaced
    df_grouped = df[['ID', 'cluster', 'residence', 'year']].copy()

    counts = df.fillna(0)
    for group, members in _WATER_SOURCE_GROUPS.items():
        present = [column for column in members if column in counts.columns]
        df_grouped[group] = counts[present].sum(axis=1)

    df_grouped.to_csv(path, index = False)

In [1]:
# Main part: runs the whole pipeline step by step. The order of the calls matters,
# each step works on the files produced by the previous one.

# Paths (machine-specific: adjust dir_corr before running)
dir_corr = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/'
# Folder with the downloaded survey archives
dir_zip = os.path.join(dir_corr,'SAV_Data')
# Folder for the extracted SAV files
dir_sav = os.path.join(dir_zip, 'SAV_file')
# Folder with the GeoData archives and, inside it, the GeoData csv files
dir_gps_zips = os.path.join(dir_corr, 'GPS_Data')
dir_gps_csv = os.path.join(dir_gps_zips, 'gps_csv')
# Folder for the per-survey water-source csv files
# NOTE(review): dir_csv is a sub-folder of dir_zip, and extract_sav deletes every sub-folder of
# dir_zip except dir_sav - results of an earlier run are removed when the pipeline is run again
dir_csv =  os.path.join(dir_zip, 'water-source')
# Folder for surveys without corresponding GeoData
dir_no_gps = os.path.join(dir_csv, 'no_GPS_from_2013')
# Joined csv file and joined csv file with grouped water-source columns
dir_joined_csv = os.path.join(dir_csv, 'joined-surveys-2013.csv')
dir_joined_csv_merged_columns = os.path.join(dir_csv, 'joined-surveys-2013-grouped.csv')

# Pipeline
remove_FL(dir_zip)
print('remove FL done')
remove_all_except_HR(dir_zip)
print('remove_all_except_HR done')
extract_sav(dir_zip, dir_sav)
print('created sav files')
create_csv(dir_sav, dir_csv)
print('Create water source csv files')
split_before_2013(dir_csv)
print('Split into two subsets (before and after 2013)')
# PLEASE NOTE: split_no_gps needs the GeoData csv files in dir_gps_csv; the step is
# skipped with a message if that folder does not exist yet
if os.path.isdir(dir_gps_csv):
    split_no_gps(dir_csv, dir_no_gps, dir_gps_zips, dir_gps_csv)
    print('Moved all files without gps data into seperate subfolder')
else:
    print('No GPS-CSV created yet and hence, water_csv cannot be classified into without or with GPS data')
    
# Join the remaining per-survey files into one csv file
create_single_csv(dir_csv, dir_joined_csv) 
print('Created single big csv')

# Reduce the water-source columns of the joined file to a few grouped columns
merge_columns_big_csv(dir_joined_csv, dir_joined_csv_merged_columns)

NameError: name 'os' is not defined

HV001 Cluster number
HV201 Water source
HV007 Year of interview
HV025 Residence


Please note that the column names are rather diverse: (1) different names may indicate the same source (e.g. *River/dam/lake/ponds/stream/canal/irrigation channel* and *Lake/pond/river/channel/irrigation channel*), and (2) some surveys use categories that others do not have (e.g. UGHR7IFL-water_source has the category *Bicycle with jerrycans*).