In [1]:
import os
import pandas as pd
import pyreadstat
from zipfile import ZipFile
import shutil
from ethiopian_date import EthiopianDateConverter as edc
import datetime

In [2]:
# Optional: only needed if ZIP archives ending in "FL.zip" were downloaded
def remove_FL(dir_name):
    """Delete every entry of ``dir_name`` whose name ends in ``FL.zip`` or ``FL.ZIP``.

    Parameters
    ----------
    dir_name : str
        Folder that is scanned (non-recursively).

    Returns
    -------
    None
        The function only has the side effect of removing files.
    """
    # Both spellings of the suffix are checked; any other casing is left alone.
    unwanted_suffixes = ("FL.zip", "FL.ZIP")
    for entry in os.listdir(dir_name):
        if entry.endswith(unwanted_suffixes):
            os.remove(os.path.join(dir_name, entry))
            
# Optional: only needed if more than the Household Surveys (HR) were downloaded
def remove_all_except_HR(dir_name):
    """Delete every zip archive in ``dir_name`` that is not a Household Recode (HR) file.

    An archive counts as HR when characters 3-4 of its file name are ``HR``.
    Entries that do not end in ``.zip`` / ``.ZIP`` are never touched.

    Parameters
    ----------
    dir_name : str
        Folder that is scanned (non-recursively).
    """
    for entry in os.listdir(dir_name):
        is_archive = entry.endswith((".zip", ".ZIP"))
        is_household = entry[2:4] == "HR"
        if is_archive and not is_household:
            os.remove(os.path.join(dir_name, entry))

In [3]:
# Extract only the sav files from the zip and save them into a separate folder
def get_sav(listOfFileNames, newpath, zipObject):
    """Extract all ``.sav`` / ``.SAV`` members of an open zip archive.

    Parameters
    ----------
    listOfFileNames : iterable of str
        Member names to consider (as returned by ``ZipFile.namelist()``).
    newpath : str
        Destination folder passed to ``ZipFile.extract``.
    zipObject : zipfile.ZipFile
        The already opened archive the members are read from.
    """
    sav_members = (member for member in listOfFileNames
                   if member.endswith(('.sav', '.SAV')))
    for member in sav_members:
        # Extract one member at a time; everything else stays in the archive.
        zipObject.extract(member, newpath)
            
            
def extract_bigger_zip(zip_dir,filenames,newpath):
    """Unpack a zip that contains further zips and process the largest nested one.

    The outer archive is extracted into a folder named like the archive
    without its extension. Among the listed members that end in ``.zip`` /
    ``.ZIP`` the one with the largest file size is handed to ``check_sav``
    (on a size tie the later member wins).

    Parameters
    ----------
    zip_dir : str
        Path of the outer zip archive.
    filenames : iterable of str
        Member names of the outer archive.
    newpath : str
        Destination folder for the SAV files, forwarded to ``check_sav``.
    """
    # Strip only the extension. Cutting at the first '.' of the whole path
    # breaks for dotted folder names and for './relative' paths.
    zips_dir = os.path.splitext(zip_dir)[0]
    os.makedirs(zips_dir, exist_ok=True)
    with ZipFile(zip_dir, 'r') as zipObj:
        # Extract all the contents of the zip file into the working folder
        zipObj.extractall(zips_dir)

    big_size = 0
    big_zip = None
    for item in filenames:
        if item.endswith(('.zip', '.ZIP')):
            file_dir = os.path.join(zips_dir, item)
            curr_size = os.stat(file_dir).st_size
            if curr_size >= big_size:
                big_size = curr_size
                big_zip = file_dir

    if big_zip is None:
        # No nested archive among the listed members: nothing to process.
        return
    check_sav(big_zip, newpath)
    
# Second main part: handles a single zip file
def check_sav(zip_dir, newpath):
    """Extract the SAV files of one archive, descending into nested zips if needed.

    If the archive lists at least one ``.sav`` / ``.SAV`` member, those are
    extracted via ``get_sav``. Otherwise, if it lists ``.zip`` / ``.ZIP``
    members, ``extract_bigger_zip`` takes over. Archives with neither are
    left untouched.

    Parameters
    ----------
    zip_dir : str
        Path of the zip archive to inspect.
    newpath : str
        Destination folder for the SAV files.
    """
    with ZipFile(zip_dir, 'r') as zipObject:
        members = zipObject.namelist()

        contains_sav = any(name.endswith(('.sav', '.SAV')) for name in members)
        if contains_sav:
            get_sav(members, newpath, zipObject)
            return

        contains_zip = any(name.endswith(('.zip', '.ZIP')) for name in members)
        if contains_zip:
            extract_bigger_zip(zip_dir, members, newpath)
            
def delete_zip_folders(dir_name,newpath):
    """Remove every direct subfolder of ``dir_name`` except ``newpath``.

    Note that *all* other subfolders are deleted recursively, not only the
    temporary extraction folders.

    Parameters
    ----------
    dir_name : str
        Folder whose subfolders are cleaned up.
    newpath : str
        Folder to keep (the SAV output folder).
    """
    # Compare normalised absolute paths: a trailing separator or a relative
    # spelling of ``newpath`` must not cause the output folder to be deleted.
    keep = os.path.normcase(os.path.abspath(newpath))
    with os.scandir(dir_name) as entries:
        subfolders = [entry.path for entry in entries if entry.is_dir()]
    for folder in subfolders:
        if os.path.normcase(os.path.abspath(folder)) != keep:
            shutil.rmtree(folder)
# Main part: runs through all zip files in the directory
def extract_sav(dir_name, newpath):
    """Extract the SAV files of every zip archive found in ``dir_name``.

    Parameters
    ----------
    dir_name : str
        Folder containing the downloaded zip archives.
    newpath : str
        Folder that receives the SAV files; created if it does not exist.

    Returns
    -------
    str
        ``newpath``, unchanged.
    """
    # Create the folder for the SAV files
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    archives = [name for name in os.listdir(dir_name)
                if name.endswith(('.zip', '.ZIP'))]
    for archive in archives:
        check_sav(os.path.join(dir_name, archive), newpath)

    # Clean up the folders created while unpacking nested archives
    delete_zip_folders(dir_name, newpath)

    return newpath

In [4]:
# Create csv data with information of year, water source, and region type for each cluster

def find_water_var(df, meta_dict):
    """Return the first non-empty column whose label mentions the drinking-water source.

    The label match is case-insensitive on the phrase
    ``"source of drinking water"``. Further matching columns are only
    reported on stdout.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey data; columns are looked up by the keys of ``meta_dict``.
    meta_dict : dict
        Maps column name to column label (label may be ``None``).

    Returns
    -------
    str or None
        Name of the chosen column, or ``None`` if nothing matched.
    """
    chosen = None
    for column in meta_dict:
        label = meta_dict[column]
        if label is None:
            print('Water', label)
            continue
        if "source of drinking water" not in label.lower():
            continue
        if df[column].isnull().all().all():
            # Column carries no data at all, so it is useless as a source.
            continue
        if chosen is None:
            chosen = column
        else:
            print('Water...2nd possibility', column)
    return chosen
def find_cluster_var(df, meta_dict):
    """Return the last non-empty column whose label contains "Cluster number".

    Only the spellings ``"Cluster number"`` and ``"cluster number"`` are
    recognised. Labels that are ``None`` are printed and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey data; columns are looked up by the keys of ``meta_dict``.
    meta_dict : dict
        Maps column name to column label (label may be ``None``).

    Returns
    -------
    str or None
        Name of the matching column, or ``None`` if nothing matched.
    """
    phrases = ("Cluster number", "cluster number")
    match = None
    for column in meta_dict:
        label = meta_dict[column]
        if label is None:
            print(label)
            continue
        if any(phrase in label for phrase in phrases):
            has_data = not df[column].isnull().all().all()
            if has_data:
                # Later matches overwrite earlier ones.
                match = column
    return match

def find_year_var(df, meta_dict):
    """Return the last non-empty column whose label contains "Year of interview".

    Only the spellings ``"Year of interview"`` and ``"year of interview"``
    are recognised. Labels that are ``None`` are printed and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey data; columns are looked up by the keys of ``meta_dict``.
    meta_dict : dict
        Maps column name to column label (label may be ``None``).

    Returns
    -------
    str or None
        Name of the matching column, or ``None`` if nothing matched.
    """
    phrases = ("Year of interview", "year of interview")
    match = None
    for column in meta_dict:
        label = meta_dict[column]
        if label is None:
            print(label)
            continue
        if any(phrase in label for phrase in phrases):
            has_data = not df[column].isnull().all().all()
            if has_data:
                # Later matches overwrite earlier ones.
                match = column
    return match

def find_residence_var(df, meta_dict):
    """Return the last non-empty column whose label contains "Type of place of residence".

    Only the spellings ``"Type of place of residence"`` and
    ``"type of place of residence"`` are recognised. Labels that are
    ``None`` are printed and skipped.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey data; columns are looked up by the keys of ``meta_dict``.
    meta_dict : dict
        Maps column name to column label (label may be ``None``).

    Returns
    -------
    str or None
        Name of the matching column, or ``None`` if nothing matched.
    """
    phrases = ("Type of place of residence", "type of place of residence")
    match = None
    for column in meta_dict:
        label = meta_dict[column]
        if label is None:
            print(label)
            continue
        if any(phrase in label for phrase in phrases):
            has_data = not df[column].isnull().all().all()
            if has_data:
                # Later matches overwrite earlier ones.
                match = column
    return match

# Convert Ethiopian years in HR surveys (not GE data) to Gregorian years
def get_eth_to_gregorian(df_year, year_HV, len_row):
    """Build a list of ``len_row`` Gregorian years for an Ethiopian-calendar survey.

    The list is filled by position only: it uses the minimum and maximum of
    the year column, not the interview year of each individual row. Every
    Ethiopian year overlaps two Gregorian years; each of those gets an
    equally sized slice of the list (a quarter of ``len_row`` when the
    survey spans two Ethiopian years, a half otherwise). Rows left over
    from the integer division are filled with the last Gregorian year of
    the maximum Ethiopian year.

    Parameters
    ----------
    df_year : pandas.DataFrame
        Survey data containing the year column.
    year_HV : str
        Name of the column holding the Ethiopian interview year.
    len_row : int
        Required length of the returned list.

    Returns
    -------
    list
        Gregorian years, exactly ``len_row`` entries long.
    """

    def gregorian_years(year):
        """Return the (first, second) Gregorian year overlapped by an Ethiopian year."""
        first = edc.to_gregorian(int(year), 1, 1).year
        second = None
        # Day 5 is probed in months 1-12; the original author picked day 5
        # because even the short 13th month has at least 5 days.
        for month in range(1, 13):
            candidate = edc.to_gregorian(int(year), month, 5).year
            if candidate != first:
                second = candidate
        return first, second

    def gregorian_dates(year, num_year):
        """Return ``num_year`` entries for each Gregorian year of ``year``."""
        first, second = gregorian_years(year)
        return [first] * num_year + [second] * num_year

    # Main
    year_min = df_year[year_HV].min()
    year_max = df_year[year_HV].max()

    if year_max != year_min:
        num_year = len_row // 4
        gregorian_list = gregorian_dates(year_min, num_year)
        gregorian_list.extend(gregorian_dates(year_max, num_year))
    else:
        num_year = len_row // 2
        gregorian_list = gregorian_dates(year_max, num_year)

    # The integer division can leave a few rows without a value. They used
    # to be filled with the raw (Ethiopian) year_max, which mixed calendars
    # in the output; fill them with the converted year instead.
    missing = len_row - len(gregorian_list)
    if missing > 0:
        first, second = gregorian_years(year_max)
        filler = second if second is not None else first
        gregorian_list.extend([filler] * missing)

    return gregorian_list

def get_csv(file, export_path):
    """Write a per-cluster water-source table for one SAV file.

    The table counts, per cluster, how often each water-source label
    occurs, and adds a ``year`` and a ``residence`` column (the most
    frequent value per cluster). The standard variable names HV201 (water
    source), HV001 (cluster number), HV007 (year of interview) and HV025
    (residence) are used when present; otherwise the columns are searched
    by label and the result is printed. Any error while building or writing
    the table is printed and swallowed, so one bad file does not stop a
    batch run.

    Parameters
    ----------
    file : str
        Path of the SAV file.
    export_path : str
        Folder that receives ``<name>-water_source.csv``.
    """
    df, meta = pyreadstat.read_sav(file, encoding = 'LATIN1')
    meta_dict = dict(zip(meta.column_names, meta.column_labels))

    def resolve(standard_name, finder, tag):
        """Use the standard variable name if available, else search by label."""
        if standard_name in meta_dict:
            return standard_name
        found = finder(df, meta_dict)
        print(tag, found, file[file.rfind('/'):])
        return found

    water = resolve('HV201', find_water_var, 'Water')
    cluster = resolve('HV001', find_cluster_var, 'Cluster')
    year = resolve('HV007', find_year_var, 'Year')
    residence = resolve('HV025', find_residence_var, 'residence')

    try:
        # Rows: clusters, columns: water-source labels
        crosstab = pd.crosstab(
            df[cluster],
            df[water].map(meta.variable_value_labels[water]),
            rownames = ["cluster"],
            colnames = ["Properties"],
            dropna = True,
        )
        filename = os.path.basename(file[:file.rfind(".")])

        # Add years
        if filename.startswith('ETHR'):
            # File names starting with ETHR get their years converted from
            # the Ethiopian calendar.
            gregorian_list = get_eth_to_gregorian(df, year, len(crosstab))
            crosstab.insert(loc = len(crosstab.columns), column = "year", value = gregorian_list)
        else:
            year_counts = pd.crosstab(df[cluster], df[year], rownames = ["cluster"], colnames = ["year"], dropna = True)
            crosstab['year'] = year_counts.idxmax(axis=1)

        # Add residence values (Rural, Urban)
        residence_counts = pd.crosstab(
            df[cluster],
            df[residence].map(meta.variable_value_labels[residence]),
            rownames = ["Cluster"],
            dropna = True,
        )
        crosstab['residence'] = residence_counts.idxmax(axis=1)

        crosstab = crosstab.rename(columns = {'Unnamed: 0': 'cluster'})
        crosstab.to_csv(os.path.join(export_path, filename + '-water_source.csv'))
    except Exception as e:
        print('Error', os.path.basename(file), e)

In [5]:
def create_csv(dir_sav, dir_csv):
    """Run ``get_csv`` on every entry of ``dir_sav``.

    Parameters
    ----------
    dir_sav : str
        Folder containing the SAV files.
    dir_csv : str
        Output folder for the CSV files; created if it does not exist.
    """
    if not os.path.exists(dir_csv):
        os.makedirs(dir_csv)

    sav_paths = [os.path.join(dir_sav, entry) for entry in os.listdir(dir_sav)]
    for sav_path in sav_paths:
        get_csv(sav_path, dir_csv)
    return

In [6]:
def split_before_2013 (export_path):
    """Move CSV files whose latest ``year`` value is below 2013 into a subfolder.

    The subfolder ``before_2013`` is created inside ``export_path`` if
    needed. Only entries ending in ``.csv`` are inspected; each must have a
    ``year`` column.

    Parameters
    ----------
    export_path : str
        Folder containing the water-source CSV files.
    """
    archive_dir = os.path.join(export_path, "before_2013")
    if not os.path.exists(archive_dir):
        os.makedirs(archive_dir)

    csv_names = [name for name in os.listdir(export_path) if name.endswith('.csv')]
    for name in csv_names:
        source = os.path.join(export_path, name)
        latest_year = pd.read_csv(source, usecols = ['year'])['year'].max()
        if latest_year < 2013:
            os.rename(source, os.path.join(archive_dir, name))
    

In [7]:
def check_year_country (gps_dir, water_file):
    """Tell whether a water-source CSV has NO matching GPS CSV in ``gps_dir``.

    A GPS file matches when all of the following hold:

    * its reference year occurs in the water file's ``year`` column,
    * both files have the same number of rows,
    * both file names start with the same two characters (country code).

    The first match is printed as ``<water name> <gps name>``.

    Parameters
    ----------
    gps_dir : str
        Folder containing the GPS CSV files (each with a ``year`` column).
    water_file : str
        Path of the water-source CSV (with a ``year`` column).

    Returns
    -------
    bool
        ``False`` if a matching GPS file was found, ``True`` otherwise.
    """
    country_water = os.path.basename(water_file)[:2]
    water_years = None  # read lazily and only once; it does not change per GPS file

    for gps_csv in os.listdir(gps_dir):
        if not gps_csv.endswith('.csv'):
            continue
        gps_file = os.path.join(gps_dir, gps_csv)
        gps_years = pd.read_csv(gps_file, usecols = ['year'])
        if gps_years.empty:
            # Nothing to compare against.
            continue
        # The reference year is taken from the second row as before; fall
        # back to the first row for single-row files instead of raising.
        reference_row = 1 if len(gps_years) > 1 else 0
        gps_year = gps_years['year'].iloc[reference_row]

        if water_years is None:
            water_years = pd.read_csv(water_file, usecols = ['year'])

        if not (water_years['year'] == gps_year).any():
            continue
        if len(gps_years) != len(water_years):
            continue
        if os.path.basename(gps_file)[:2] != country_water:
            continue

        water = os.path.basename(water_file[:water_file.rfind('-')])
        gps = os.path.basename(gps_file[:gps_file.rfind(".")])
        print(water, gps)
        return False
    return True

def split_no_gps(dir_csv, dir_no_gps, dir_gps_zips, dir_gps_csv):
    """Move water-source CSVs that have no GPS counterpart into ``dir_no_gps``.

    A water file is kept in place when a GPS CSV carries the expected name
    (water name with ``HR`` replaced by ``GE``, cut at the last ``-``), or
    when ``check_year_country`` finds a GPS file matching by year, row
    count and country.

    Parameters
    ----------
    dir_csv : str
        Folder containing the water-source CSV files.
    dir_no_gps : str
        Destination for files without GPS data; created if missing.
    dir_gps_zips : str
        Folder of the GPS downloads (only listed, not otherwise used).
    dir_gps_csv : str
        Folder containing the GPS CSV files.
    """
    if not os.path.exists(dir_no_gps):
        os.makedirs(dir_no_gps)

    water_names = os.listdir(dir_csv)
    # Result is unused; the call is kept so a missing folder still raises here.
    os.listdir(dir_gps_zips)
    gps_csv_names = os.listdir(dir_gps_csv)

    for water_name in water_names:
        if not water_name.endswith('.csv'):
            continue
        stem_end = water_name.rfind('-')
        expected_gps = water_name.replace('HR', 'GE')[:stem_end]
        named_match = any(expected_gps in gps_name for gps_name in gps_csv_names)
        if named_match:
            continue
        current_path = os.path.join(dir_csv, water_name)
        if check_year_country(dir_gps_csv, current_path):
            os.rename(current_path, os.path.join(dir_no_gps, water_name))

In [8]:
def create_single_csv(dir_csv):
    """Join all CSV files of ``dir_csv`` into ``joined-surveys-after-2003.csv``.

    Each input file contributes its rows plus a leading ``ID`` column
    holding the survey name (file name up to the first ``-``, or the name
    without extension if it contains no ``-``). Files are processed in
    sorted name order so the output is reproducible.

    Parameters
    ----------
    dir_csv : str
        Folder containing the per-survey CSV files; also receives the
        joined file.

    Returns
    -------
    str
        Path of the joined CSV file.
    """
    path = os.path.join(dir_csv, 'joined-surveys-after-2003.csv')
    # Remove a joined file from an earlier run BEFORE listing the folder,
    # otherwise the stale name is still in the listing and read_csv fails.
    if os.path.exists(path):
        os.remove(path)

    frames = []
    for file in sorted(os.listdir(dir_csv)):
        if not file.endswith('.csv'):
            continue
        current_csv = pd.read_csv(os.path.join(dir_csv, file))
        # Survey ID: the name is clipped at "-water_source.csv"
        dash = file.find('-')
        survey_id = file[:dash] if dash != -1 else os.path.splitext(file)[0]
        current_csv.insert(loc=0, column='ID', value=[survey_id] * len(current_csv))
        frames.append(current_csv)

    # Concatenate once instead of growing a frame inside the loop.
    big_csv = pd.concat(frames) if frames else pd.DataFrame()
    big_csv.to_csv(path, index = False)

    return path

In [9]:
# Main part: runs the whole pipeline on the folders configured below.
# NOTE(review): dir_corr is a machine-specific absolute path; adjust it before
# running this cell on another machine.
dir_corr = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/'
# Folder with the downloaded survey zip archives
dir_zip = os.path.join(dir_corr,'SAV_Data')
# Folder holding the extracted SAV files
dir_sav = os.path.join(dir_zip, 'SAV_file')
# Folder with the GPS downloads and, inside it, the GPS CSV files
dir_gps_zips = os.path.join(dir_corr, 'GPS_Data')
dir_gps_csv = os.path.join(dir_gps_zips, 'gps_csv')
# Output folder for the water-source CSV files
dir_csv =  os.path.join(dir_zip, 'water-source')
# Subfolder for water-source CSV files without matching GPS data
dir_no_gps = os.path.join(dir_csv, 'no_GPS_from_2013')

# Both clean-up steps delete files from dir_zip permanently.
remove_FL(dir_zip)
print('FL_done')
remove_all_except_HR(dir_zip)
print('remove_all_except_HR done')
# The extraction step is disabled here, so dir_sav must already contain the
# SAV files; the message below is printed regardless.
#extract_sav(dir_zip, dir_sav)
print('created sav files')
create_csv(dir_sav, dir_csv)
print('Create water source csv files')
split_before_2013(dir_csv)
print('Split into two subsets (before and after 2013)')
# PLEASE NOTE: the next function needs the GPS CSV folder (dir_gps_csv) to
# exist; if it does not, the step is skipped with a message.
if os.path.isdir(dir_gps_csv):
    split_no_gps(dir_csv, dir_no_gps, dir_gps_zips, dir_gps_csv)
    print('Moved all files without gps data into seperate subfolder')
else:
    print('No GPS-CSV created yet and hence, water_csv cannot be classified into without or with GPS data')
    
# Join the CSV files remaining directly in dir_csv into a single file
big_csv_path = create_single_csv(dir_csv) 

FL_done
remove_all_except_HR done
created sav files
Error NGHR21FL.SAV 'HV201'
Water...2nd possibility sh100
Water hv201 /UGHR6AFL.SAV
Cluster hv001 /UGHR6AFL.SAV
Year hv007 /UGHR6AFL.SAV
residence hv025 /UGHR6AFL.SAV
Error JOHR21FL.SAV 'HV201'
Create water source csv files
Split into two subsets (before and after 2013)
GHHR7BFL GHGE7AFL
GHHR72FL GHGE71FL
GHHR82FL GHGE81FL
KEHR72FL KEGE71FL
MLHR72FL MLGE71FL
MWHR72FL MWGE71FL
NGHR7AFL NGGE7BFL
RWHR70FL RWGE72FL
SLHR72FL SLGE71FL
SNHR7HFL SNGE7AFL
SNHR8BFL SNGE8AFL
TGHR61FL TGGE62FL
TZHR7BFL TZGE7AFL
UGHR7BFL UGGE7AFL
UGHR72FL UGGE71FL
JOHR73FL JOGE71FL
Moved all files without gps data into seperate subfolder


Please note that the column names are rather diverse: (1) different names may denote the same source (e.g. *River/dam/lake/ponds/stream/canal/irrigation channel* and *Lake/pond/river/channel/irrigation channel*), and (2) some surveys use categories that others do not have (e.g. UGHR7IFL-water_source has the category *Bicycle with jerrycans*).


HV001 Cluster number
HV201 Water source
HV007 Year of interview
HV025 Residence

PCR -> PCOM Package