In [1]:
import os
import pandas as pd
import pyreadstat
from zipfile import ZipFile
import shutil

In [2]:
# Optional step: only needed if archives ending in FL.zip were downloaded as well
def remove_FL(dir_name):
    """Delete every entry of ``dir_name`` whose name ends in ``FL.zip`` or ``FL.ZIP``.

    According to the original note these archives do not contain .sav data.

    Parameters
    ----------
    dir_name : str
        Folder holding the downloaded archives. Only its direct entries are
        inspected; matching files are removed permanently.
    """
    fl_suffixes = ("FL.zip", "FL.ZIP")
    obsolete = [entry for entry in os.listdir(dir_name) if entry.endswith(fl_suffixes)]
    for entry in obsolete:
        os.remove(os.path.join(dir_name, entry))
            
# Optional step: only needed if not only Household Surveys (HR) were downloaded
def remove_all_except_HR(dir_name):
    """Delete every zip archive in ``dir_name`` that is not a household (HR) archive.

    An archive counts as a household archive when characters 3-4 of its file
    name are exactly ``HR`` (case-sensitive). Entries that do not end in
    ``.zip`` / ``.ZIP`` are never touched.

    Parameters
    ----------
    dir_name : str
        Folder holding the downloaded archives; matching files are removed
        permanently.
    """
    for entry in os.listdir(dir_name):
        is_archive = entry.endswith((".zip", ".ZIP"))
        is_household = entry[2:4] == "HR"
        if is_archive and not is_household:
            os.remove(os.path.join(dir_name, entry))

In [3]:
# Extract solely the .sav files from the zip and save them into a separate folder
def get_sav(listOfFileNames, newpath, zipObject):
    """Extract all ``.sav`` / ``.SAV`` members of an open zip archive.

    Parameters
    ----------
    listOfFileNames : iterable of str
        Member names of the archive (e.g. the result of ``namelist()``).
    newpath : str
        Destination folder handed to ``zipObject.extract``.
    zipObject : zipfile.ZipFile
        The opened archive the members are extracted from.
    """
    sav_suffixes = ('.sav', '.SAV')
    sav_members = [member for member in listOfFileNames if member.endswith(sav_suffixes)]
    for member in sav_members:
        # Extract one member at a time so that everything else stays in the zip
        zipObject.extract(member, newpath)
            
            
def extract_bigger_zip(zip_dir,filenames,newpath):
    """Unpack a zip that contains further zips and continue with the largest one.

    The archive ``zip_dir`` is extracted into a folder next to it (same path
    without the file extension). Among the members listed in ``filenames``
    that end in ``.zip`` / ``.ZIP``, the one with the largest size on disk is
    handed to ``check_sav`` (on ties the later member wins).

    Parameters
    ----------
    zip_dir : str
        Path of the outer zip archive.
    filenames : iterable of str
        Member names of the outer archive.
    newpath : str
        Destination folder for the extracted .sav files (passed through to
        ``check_sav``).
    """
    # Strip only the file extension. Cutting at the first '.' of the whole path
    # breaks as soon as a directory name contains a dot (e.g. "./data/x.zip").
    zips_dir = os.path.splitext(zip_dir)[0]
    os.makedirs(zips_dir, exist_ok=True)
    with ZipFile(zip_dir, 'r') as zipObj:
        # Extract all the contents of the zip file into its own directory
        zipObj.extractall(zips_dir)

    big_size = 0
    big_zip = None
    for item in filenames:
        if item.endswith(('.zip', '.ZIP')):
            file_dir = os.path.join(zips_dir, item)
            curr_size = os.stat(file_dir).st_size
            if curr_size >= big_size:
                big_size = curr_size
                big_zip = file_dir

    # Nothing to descend into when no inner zip was listed
    if big_zip is None:
        return
    check_sav(big_zip, newpath)
    
#Second main part: handles a single zip file
def check_sav(zip_dir, newpath):
    """Pull the .sav files out of one zip archive.

    If the archive lists at least one ``.sav`` / ``.SAV`` member, those members
    are extracted via ``get_sav``. Otherwise, if it lists at least one
    ``.zip`` / ``.ZIP`` member, the work is delegated to ``extract_bigger_zip``.
    Archives with neither kind of member are left alone.

    Parameters
    ----------
    zip_dir : str
        Path of the zip archive to inspect.
    newpath : str
        Destination folder for the extracted .sav files.
    """
    sav_suffixes = ('.sav', '.SAV')
    zip_suffixes = ('.zip', '.ZIP')

    with ZipFile(zip_dir, 'r') as archive:
        members = archive.namelist()

        if any(member.endswith(sav_suffixes) for member in members):
            get_sav(members, newpath, archive)
            return

        if any(member.endswith(zip_suffixes) for member in members):
            extract_bigger_zip(zip_dir, members, newpath)
            
def delete_zip_folders(dir_name,newpath):
    """Remove every direct sub-folder of ``dir_name`` except ``newpath``.

    WARNING: this deletes ALL other sub-folders of ``dir_name`` recursively,
    not only the ones created while unpacking archives.

    Parameters
    ----------
    dir_name : str
        Folder whose sub-folders are cleaned up.
    newpath : str
        Folder that must survive (the .sav output folder).
    """
    def _canonical(path):
        # Make "a/b", "a/b/" and "./a/b" compare equal
        return os.path.normcase(os.path.abspath(path))

    keep = _canonical(newpath)
    # Collect first, delete afterwards, so the directory scan is not disturbed
    with os.scandir(dir_name) as entries:
        subfolders = [entry.path for entry in entries if entry.is_dir()]
    for folder in subfolders:
        # Compare canonical paths: a plain string comparison would delete the
        # output folder whenever newpath is spelled slightly differently
        # (trailing separator, relative vs. absolute form).
        if _canonical(folder) != keep:
            shutil.rmtree(folder)
#Main part -> runs through all zip files in the directory
def extract_sav(dir_name, newpath):
    """Extract the .sav files of every zip archive found in ``dir_name``.

    Each direct entry of ``dir_name`` ending in ``.zip`` / ``.ZIP`` is passed
    to ``check_sav``. Afterwards ``delete_zip_folders`` cleans up the
    sub-folders of ``dir_name`` (keeping ``newpath``).

    Parameters
    ----------
    dir_name : str
        Folder containing the downloaded zip archives.
    newpath : str
        Folder that receives the .sav files; created when missing.

    Returns
    -------
    str
        ``newpath``, unchanged.
    """
    # Make sure the target folder for the SAV files is there
    if not os.path.exists(newpath):
        os.makedirs(newpath)

    archives = [entry for entry in os.listdir(dir_name) if entry.endswith(('.zip', '.ZIP'))]
    for archive in archives:
        check_sav(os.path.join(dir_name, archive), newpath)

    delete_zip_folders(dir_name, newpath)

    return newpath

In [4]:
# Create csv data with information of year, water source, and region type for each cluster
def get_csv(file, export_path):
    """Write a per-cluster water-source table for one survey ``.sav`` file.

    The relevant columns are located through their column labels (the last
    matching column that contains at least one non-null value wins):

    * "Source of drinking water"                              -> water source
    * "Cluster number" / "cluster number"                     -> cluster
    * "Year of interview" / "year of interview"               -> year
    * "Type of place of residence" / "type of place of ..."   -> residence

    The result is a cross-tabulation cluster x water-source label, extended by
    a ``Year`` and a ``Residence`` column, stored as
    ``<export_path>/<survey name>-water_source.csv``.

    Parameters
    ----------
    file : str
        Path of the .sav file.
    export_path : str
        Folder the CSV is written to.

    Notes
    -----
    Nothing is written when one of the four columns cannot be found. Errors
    raised while building or writing the table are printed, not raised, so
    that one bad survey does not stop a batch run.
    """
    df, meta = pyreadstat.read_sav(file, encoding = 'LATIN1')
    meta_dict = dict(zip(meta.column_names, meta.column_labels))

    # V113 Source of drinking water, V115 Time to get to water source
    # V001 Cluster number
    # year of interview
    # Type of place of residence
    columns = {'water': None, 'cluster': None, 'year': None, 'residence': None}
    for name, label in meta_dict.items():
        if label is None:
            # Unlabelled columns cannot be identified
            continue
        if "Source of drinking water" in label:
            role = 'water'
        elif "Cluster number" in label or "cluster number" in label:
            role = 'cluster'
        elif "Year of interview" in label or "year of interview" in label:
            role = 'year'
        elif "Type of place of residence" in label or "type of place of residence" in label:
            role = 'residence'
        else:
            continue
        # Ignore columns that are present but completely empty
        if df[name].notna().to_numpy().any():
            columns[role] = name

    cluster = columns['cluster']
    water = columns['water']
    year = columns['year']
    residence = columns['residence']

    if cluster is None or water is None:
        return
    if year is None or residence is None:
        print('Skipping ' + file + ': no year or residence column found')
        return

    try:
        crosstab = pd.crosstab(df[cluster], df[water].map(meta.variable_value_labels[water]),rownames = ["Cluster"],colnames = ["Properties"], dropna=True)
        # The crosstab is indexed by cluster number, df by row number. Assigning
        # df[year] directly would pair cluster k with *row* k, so the values are
        # reduced to one per cluster first (first non-null entry of the cluster).
        per_cluster = df.groupby(cluster)
        crosstab['Year'] = per_cluster[year].first()
        crosstab['Residence'] = per_cluster[residence].first().map(meta.variable_value_labels[residence])
        # Build the output name with os.path instead of searching for '/' by hand
        survey_name = os.path.splitext(os.path.basename(file))[0]
        crosstab.to_csv(os.path.join(export_path, survey_name + '-water_source.csv'))

    except Exception as e:
        print(e)

In [5]:
def create_csv(dir_sav, dir_csv):
    """Create one water-source CSV per .sav file found in ``dir_sav``.

    Parameters
    ----------
    dir_sav : str
        Folder containing the extracted .sav files.
    dir_csv : str
        Folder that receives the CSV files; created when missing.
    """
    os.makedirs(dir_csv, exist_ok=True)

    for file in sorted(os.listdir(dir_sav)):
        sav_path = os.path.join(dir_sav, file)
        # Only real .sav files are handed on: sub-folders or stray files would
        # make the reader in get_csv raise and abort the whole run.
        if not file.endswith(('.sav', '.SAV')) or not os.path.isfile(sav_path):
            continue
        get_csv(sav_path, dir_csv)
    return

In [6]:
def split_before_2013 (export_path):
    """Move every CSV whose newest ``Year`` value is below 2013 into ``before_2013``.

    Parameters
    ----------
    export_path : str
        Folder with the per-survey CSV files. The sub-folder ``before_2013``
        is created inside it when missing. Each CSV must contain a ``Year``
        column.
    """
    archive_dir = os.path.join(export_path, "before_2013")
    if not os.path.exists(archive_dir):
        os.makedirs(archive_dir)

    csv_names = [name for name in os.listdir(export_path) if name.endswith('.csv')]
    for name in csv_names:
        source = os.path.join(export_path, name)
        # Only the Year column is needed to decide where the file belongs
        latest_year = pd.read_csv(source, usecols = ['Year'])['Year'].max()
        if latest_year < 2013:
            os.rename(source, os.path.join(archive_dir, name))
    

In [7]:
def split_no_gps(dir_csv, dir_no_gps, dir_gps_zips):
    """Move every survey CSV without a matching GPS file into ``dir_no_gps``.

    The expected GPS name is derived from the CSV name by replacing ``HR``
    with ``GE`` and cutting the name at its last ``-``. A CSV counts as
    matched when that name occurs inside any entry name of ``dir_gps_zips``.

    Parameters
    ----------
    dir_csv : str
        Folder with the per-survey CSV files.
    dir_no_gps : str
        Folder that receives the unmatched CSV files; created when missing.
    dir_gps_zips : str
        Folder whose entry names are searched for the GPS counterpart.
    """
    if not os.path.exists(dir_no_gps):
        os.makedirs(dir_no_gps)

    survey_files = os.listdir(dir_csv)
    gps_files = os.listdir(dir_gps_zips)

    for survey_file in survey_files:
        if not survey_file.endswith('.csv'):
            continue
        cut = survey_file.rfind('-')
        gps_stem = survey_file.replace('HR', 'GE')[:cut]
        matches = [gps_file for gps_file in gps_files if gps_stem in gps_file]
        if matches:
            continue
        source = os.path.join(dir_csv, survey_file)
        target = os.path.join(dir_no_gps, survey_file)
        os.rename(source, target)

In [8]:
def create_single_csv(dir_csv):
    """Join all CSV files of ``dir_csv`` into one file with a leading ``ID`` column.

    ``ID`` is the source file name up to its first ``-`` (i.e. the name
    without ``-water_source.csv``). The result is written to
    ``<dir_csv>/joined-surveys-after-2003.csv``; an older version of that file
    is removed first and never becomes part of the new one.

    Parameters
    ----------
    dir_csv : str
        Folder with the per-survey CSV files.

    Returns
    -------
    str
        Path of the joined CSV file.
    """
    # NOTE(review): the name says 2003 although the notebook splits at 2013 -
    # kept unchanged because other code may rely on this path; please confirm.
    path = os.path.join(dir_csv, 'joined-surveys-after-2003.csv')
    # Remove a stale result BEFORE listing the folder. Listing first left the
    # deleted file in the list and made the read below fail on every re-run.
    if os.path.exists(path):
        os.remove(path)

    frames = []
    for file in sorted(os.listdir(dir_csv)):
        if file.endswith('.csv'):
            current_csv = pd.read_csv(os.path.join(dir_csv, file))
            # Add ID as first column; the name is clipped at -water_source.csv
            survey_id = os.path.basename(file)[:file.find('-')]
            current_csv.insert(loc=0, column='ID', value=[survey_id] * len(current_csv))
            frames.append(current_csv)

    # Concatenate once instead of growing the frame inside the loop
    big_csv = pd.concat(frames) if frames else pd.DataFrame()
    big_csv.to_csv(path,index = False)

    return path

In [10]:
# Main part: runs the complete pipeline, step by step, on the folders below.
# NOTE(review): dir_corr is an absolute path on one specific machine - adjust it
# before running the notebook anywhere else.
# WARNING: the first three steps delete files and folders inside dir_zip.
dir_corr = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/'
# Folder with the downloaded survey zip archives
dir_zip = os.path.join(dir_corr,'SAV_Data')
# Target folder for the extracted .sav files (sub-folder of dir_zip)
dir_sav = os.path.join(dir_zip, 'SAV_file')
# Folder whose entry names are searched for matching GPS files
dir_gps_zips = os.path.join(dir_corr, 'GPS_Data')
# Target folder for the per-survey water-source CSV files (sub-folder of dir_zip)
dir_csv =  os.path.join(dir_zip, 'water-source')
# Target folder for CSV files without a GPS counterpart (sub-folder of dir_csv)
dir_no_gps = os.path.join(dir_csv, 'no_GPS_from_2013')

# 1) Delete archives whose name ends in FL.zip / FL.ZIP
remove_FL(dir_zip)
print('FL_done')
# 2) Delete zip archives that do not have "HR" at characters 3-4 of their name
remove_all_except_HR(dir_zip)
print('remove_all_except_HR done')
# 3) Extract the .sav files; afterwards all sub-folders of dir_zip except dir_sav are removed
extract_sav(dir_zip, dir_sav)
print('created sav files')
# 4) One water-source CSV per .sav file
create_csv(dir_sav, dir_csv)
print('Create water source csv files')
# 5) Move CSV files whose newest Year is below 2013 into dir_csv/before_2013
split_before_2013(dir_csv)
print('Split into two subsets (before and after 2013)')
# 6) Move CSV files without a matching entry in dir_gps_zips into dir_no_gps
split_no_gps(dir_csv, dir_no_gps, dir_gps_zips)
print('Moved all files without gps data into seperate subfolder')
# 7) Join the CSV files remaining in dir_csv into one file and keep its path
big_csv_path = create_single_csv(dir_csv) 

FL_done
remove_all_except_HR done
created sav files
None
Create water source csv files
Split into two subsets (before and after 2013)
Moved all files without gps data into seperate subfolder


Please note that the column names are rather diverse: (1) differently worded names may indicate the same source (e.g. *River/dam/lake/ponds/stream/canal/irrigation channel* and *Lake/pond/river/channel/irrigation channel*), and (2) some surveys use categories that others do not have (e.g. UGHR7IFL-water_source has the category *Bicycle with jerrycans*, which no other survey has).

TZGE7AFL exists as a GPS file, but the corresponding household (HR) survey TZHR7ASV does not. This is not an isolated case -> there is an imbalance between the GPS data (more files) and the DHS survey data (fewer files).