In [1]:
import os
import pandas as pd
import pyreadstat
from zipfile import ZipFile


In [2]:
# Optional if ZIP Data with FL.zip ending was downloaded
def remove_FL(dir_name):
    """Delete every archive in ``dir_name`` whose name ends in ``FL.zip`` / ``FL.ZIP``.

    Only the two exact spellings of the suffix are matched; any other entry
    in the directory is left untouched. Deletion is permanent.
    """
    fl_suffixes = ("FL.zip", "FL.ZIP")
    for entry in os.listdir(dir_name):
        if entry.endswith(fl_suffixes):
            os.remove(os.path.join(dir_name, entry))
            
# Optional if not only Household Surveys (HR) were downloaded
def remove_all_except_HR(dir_name):
    """Delete every ``.zip`` / ``.ZIP`` archive in ``dir_name`` that is not an HR file.

    An archive is kept only when characters 3-4 of its file name are exactly
    ``HR``. Entries that are not zip archives are never removed.
    Deletion is permanent.
    """
    zip_suffixes = (".zip", ".ZIP")
    for entry in os.listdir(dir_name):
        is_archive = entry.endswith(zip_suffixes)
        if is_archive and entry[2:4] != "HR":
            os.remove(os.path.join(dir_name, entry))

In [3]:
# Extract only the .sav files from each zip archive and save them into a separate folder
def extract_sav(dir_name):
    """Unpack the SPSS members of every zip archive found in ``dir_name``.

    A sub-folder ``SAV_file`` is created inside ``dir_name`` (if missing) and
    every archive member whose name ends in ``.sav`` / ``.SAV`` is extracted
    into it. Other members are ignored.

    Returns the path of the ``SAV_file`` folder.
    """
    sav_dir = os.path.join(dir_name, 'SAV_file')
    if not os.path.exists(sav_dir):
        os.makedirs(sav_dir)

    archive_suffixes = ('.zip', '.ZIP')
    sav_suffixes = ('.sav', '.SAV')

    archives = [entry for entry in os.listdir(dir_name) if entry.endswith(archive_suffixes)]
    for archive in archives:
        with ZipFile(os.path.join(dir_name, archive), 'r') as bundle:
            sav_members = [member for member in bundle.namelist() if member.endswith(sav_suffixes)]
            for member in sav_members:
                # Pull out just this one member; the rest of the archive stays packed.
                bundle.extract(member, sav_dir)

    return sav_dir

In [9]:
# Create csv data with information of year, water source, and region type for each cluster
def get_csv(file, export_path):
    """Write a per-cluster water-source table for one SPSS survey file.

    The survey is read with ``pyreadstat.read_sav`` and the four columns of
    interest are located through their column *labels* (e.g. V113 / V001 in
    some recodes):

    * "Source of drinking water"
    * "Cluster number"
    * "Year of interview"
    * "Type of place of residence"

    A column that contains only missing values is ignored. If several columns
    carry a matching label, the last usable one wins.

    The result is a cross-tabulation cluster x water source (normalised per
    water-source column) plus one ``Year`` and one ``Residence`` value per
    cluster, written to ``<export_path>/<survey name>-water_source.csv``.

    Parameters
    ----------
    file : str
        Path of the ``.sav`` file to read.
    export_path : str
        Existing directory that receives the CSV.

    Returns
    -------
    None
        Nothing is written when a required column is missing or when the
        table cannot be built; a message is printed instead.
    """
    df, meta = pyreadstat.read_sav(file, encoding = 'LATIN1')
    meta_dict = dict(zip(meta.column_names, meta.column_labels))

    # Label fragments identifying each required column. The order matters:
    # a label is assigned to the first role it matches, as in an if/elif chain.
    wanted = {
        'water': ("Source of drinking water",),
        'cluster': ("Cluster number", "cluster number"),
        'year': ("Year of interview", "year of interview"),
        'residence': ("Type of place of residence", "type of place of residence"),
    }
    found = dict.fromkeys(wanted)

    for column, label in meta_dict.items():
        if label is None:
            # Unlabelled columns cannot be identified; skip them quietly.
            continue
        for role, fragments in wanted.items():
            if any(fragment in label for fragment in fragments):
                # Only accept columns that actually carry data.
                if not df[column].isnull().values.all():
                    found[role] = column
                break

    missing = [role for role, column in found.items() if column is None]
    if missing:
        print('Skipping {}: no usable column for {}'.format(file, ', '.join(missing)))
        return

    cluster = found['cluster']
    water = found['water']
    year = found['year']
    residence = found['residence']

    try:
        water_labels = df[water].map(meta.variable_value_labels[water])
        # NOTE(review): normalize='columns' makes every water-source column sum
        # to 1 across clusters. If per-cluster shares are wanted, this should be
        # normalize='index' - confirm the intent before changing.
        crosstab = pd.crosstab(df[cluster], water_labels, rownames = ["Cluster"],
                               colnames = ["Properties"], dropna=True, normalize='columns')

        # The crosstab is indexed by cluster number while df is indexed by
        # household row, so the household-level columns must be collapsed to
        # one value per cluster before they can be aligned with the crosstab.
        crosstab['Year'] = df.groupby(cluster)[year].first()
        residence_labels = df[residence].map(meta.variable_value_labels[residence])
        crosstab['Residence'] = residence_labels.groupby(df[cluster]).first()

        # Survey name = file name without directory and extension.
        export = os.path.splitext(os.path.basename(file))[0]
        crosstab.to_csv(os.path.join(export_path, export + '-water_source.csv'))

    except Exception as e:
        # Best effort: one unreadable survey must not abort the whole batch.
        print('Could not create CSV for {}: {}'.format(file, e))

In [5]:
def create_csv(dir_sav, dir_csv):
    """Run ``get_csv`` on every entry of ``dir_sav``, writing results to ``dir_csv``.

    ``dir_csv`` is created first when it does not exist yet. Every directory
    entry is passed on unfiltered. Returns ``None``.
    """
    if not os.path.exists(dir_csv):
        os.makedirs(dir_csv)

    for entry in os.listdir(dir_sav):
        get_csv(os.path.join(dir_sav, entry), dir_csv)

In [6]:
def split_before_2013 (export_path):
    """Move survey CSVs whose latest ``Year`` is below 2013 into ``before_2013``.

    The ``before_2013`` sub-folder is created inside ``export_path`` when
    missing. Only entries ending in ``.csv`` are inspected; each must have a
    ``Year`` column. Files whose maximum year is 2013 or later stay in place.
    """
    archive_dir = os.path.join(export_path, "before_2013")
    if not os.path.exists(archive_dir):
        os.makedirs(archive_dir)

    csv_names = [name for name in os.listdir(export_path) if name.endswith('.csv')]
    for name in csv_names:
        source = os.path.join(export_path, name)
        latest_year = pd.read_csv(source, usecols = ['Year'])['Year'].max()
        if latest_year < 2013:
            os.rename(source, os.path.join(archive_dir, name))
    

In [7]:
def create_single_csv(dir_csv):
    """Concatenate all survey CSVs in ``dir_csv`` into one file.

    Every ``*.csv`` in ``dir_csv`` is read, given a leading ``ID`` column
    holding the survey name (the file name up to the first ``-``, e.g.
    ``AOHR51FL`` for ``AOHR51FL-water_source.csv``), and appended to the
    combined table, which is written to
    ``<dir_csv>/joined-surveys-after-2003.csv`` without the index.

    The combined file itself is skipped on input, so the function can be run
    repeatedly on the same folder. Files are processed in sorted order to make
    the output deterministic. With no input files an empty CSV is written.
    """
    # NOTE(review): the name says 2003 while the split step uses 2013 - kept
    # unchanged because other code may rely on this file name; confirm.
    output_name = 'joined-surveys-after-2003.csv'

    frames = []
    for file in sorted(os.listdir(dir_csv)):
        # Skip non-CSV entries and the result of a previous run; re-reading the
        # latter would fail because it already contains an 'ID' column.
        if not file.endswith('.csv') or file == output_name:
            continue
        current_csv = pd.read_csv(os.path.join(dir_csv, file))
        # Survey ID = file name without extension, clipped at "-water_source".
        survey_id = os.path.splitext(file)[0].partition('-')[0]
        current_csv.insert(loc=0, column='ID', value=survey_id)
        frames.append(current_csv)

    # Concatenate once instead of growing a frame inside the loop.
    big_csv = pd.concat(frames) if frames else pd.DataFrame()

    path = os.path.join(dir_csv, output_name)
    big_csv.to_csv(path, index = False)



In [10]:
# Main part: run the whole pipeline on one download folder.
# The steps depend on each other's filesystem output, so they must run in this order.
# NOTE(review): dir_zip is an absolute, machine-specific path - adjust it before running elsewhere.
dir_zip = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/SAV_Data'
# All generated CSV files go into a 'water-source' sub-folder of the download folder.
dir_csv =  os.path.join(dir_zip, 'water-source')

# WARNING: the two remove_* steps permanently delete archives from dir_zip.
remove_FL(dir_zip)
print('FL_done')
remove_all_except_HR(dir_zip)
print('remove_all_except_HR done')
# Unpack the .sav members of the remaining archives; returns the folder holding them.
dir_sav_file = extract_sav(dir_zip)
print('created sav files')
# One '<survey>-water_source.csv' per .sav file.
create_csv(dir_sav_file, dir_csv)
print('Create water source csv files')
# Surveys whose latest interview year is below 2013 are moved to 'before_2013'.
split_before_2013(dir_csv)
print('Splitted into two subsets (before and after 2013)')
# Merge the CSVs still left in dir_csv into a single file.
create_single_csv(dir_csv) 

FL_done
remove_all_except_HR done
created sav files
This is the file AOHR51FL.SAV
Want to create CSV
This is the file AOHR62FL.SAV
Want to create CSV
This is the file AOHR71FL.SAV
Want to create CSV
This is the file BFHR7AFL.SAV
Want to create CSV
This is the file BFHR21FL.SAV
Want to create CSV
This is the file BFHR31FL.SAV
Want to create CSV
This is the file BFHR43FL.SAV
Want to create CSV
This is the file BFHR62FL.SAV
Want to create CSV
This is the file BFHR71FL.SAV
Want to create CSV
This is the file BJHR31FL.SAV
Want to create CSV
This is the file BJHR41FL.SAV
Want to create CSV
This is the file BJHR51FL.SAV
Want to create CSV
This is the file BJHR61FL.SAV
Want to create CSV
This is the file BJHR71FL.SAV
Want to create CSV
This is the file BUHR6AFL.SAV
Want to create CSV
This is the file BUHR61FL.SAV
Want to create CSV
This is the file BUHR71FL.SAV
Want to create CSV
This is the file CDHR51FL.SAV
Want to create CSV
This is the file CDHR61FL.SAV
Want to create CSV
This is the file 

Want to create CSV
This is the file ZMHR61FL.SAV
Want to create CSV
This is the file ZMHR71FL.SAV
Want to create CSV
This is the file ZWHR31FL.SAV
Want to create CSV
This is the file ZWHR42FL.SAV
Want to create CSV
This is the file ZWHR52FL.SAV
Want to create CSV
This is the file ZWHR62FL.SAV
Want to create CSV
This is the file ZWHR72FL.SAV
Want to create CSV
Create water source csv files
AOHR51FL-water_source.csv
AOHR62FL-water_source.csv
AOHR71FL-water_source.csv
BFHR7AFL-water_source.csv
BFHR21FL-water_source.csv
BFHR31FL-water_source.csv
BFHR43FL-water_source.csv
BFHR62FL-water_source.csv
BFHR71FL-water_source.csv
BJHR31FL-water_source.csv
BJHR41FL-water_source.csv
BJHR51FL-water_source.csv
BJHR61FL-water_source.csv
BJHR71FL-water_source.csv
BUHR6AFL-water_source.csv
BUHR61FL-water_source.csv
BUHR71FL-water_source.csv
CDHR51FL-water_source.csv
CDHR61FL-water_source.csv
CFHR31FL-water_source.csv
CGHR5AFL-water_source.csv
CGHR51FL-water_source.csv
CGHR61FL-water_source.csv
CIHR3AFL-w

In [None]:
#pd.crosstab(df['V115'].map(meta.variable_value_labels['V115']), df['V113'].map(meta.variable_value_labels['V113']) \
#, dropna=True, normalize='columns')