# Purpose

This code creates a CSV file that links the HR file and the GE file of the same survey to each other.
The final file will store for each survey the HR-File name, the GE-file name, the year (derived from the GE-file) and the country abbreviation.

This code assumes that the program ***creating_water_source_files*** has already been executed successfully. Hence, all HR-Files that do not have a corresponding GE-File have already been saved in a separate folder.


In [1]:
import os
import pandas as pd
import numpy as np

In [4]:
def check_year_country (gps_dir:str, water_file:str):
    """Find the GE survey whose year, cluster count and country match an HR file.

    A GE csv file in ``gps_dir`` is accepted as the counterpart of
    ``water_file`` when all of the following hold:

    * both file names start with the same two-character country abbreviation,
    * the reference year of the GE file occurs at least once in the ``year``
      column of the HR file,
    * both files contain the same number of rows (clusters).

    Args:
        gps_dir: Directory that contains the GE csv files.
        water_file: Path of the HR (water-source) csv file.

    Returns:
        The GE file name without its extension for the first match (candidates
        are visited in sorted order so the result is reproducible), or ``None``
        if no GE file matches.
    """
    country_water = os.path.basename(water_file)[:2]
    # Loaded lazily so the HR file is read once, and only if a candidate exists
    water_years = None

    for gps_csv in sorted(os.listdir(gps_dir)):
        if not gps_csv.endswith('.csv'):
            continue
        # Cheapest test first: a different country abbreviation can never match
        if gps_csv[:2] != country_water:
            continue

        gps_years = pd.read_csv(os.path.join(gps_dir, gps_csv), usecols=['year'])['year']
        if gps_years.empty:
            # A GE file without clusters has no reference year to compare
            continue
        # Reference year is taken from the second row, as before; files with a
        # single row fall back to that row instead of raising IndexError
        gps_year = gps_years.iloc[min(1, len(gps_years) - 1)]

        if water_years is None:
            water_years = pd.read_csv(water_file, usecols=['year'])['year']

        # The reference year has to appear at least once in the HR file
        if not (water_years == gps_year).any():
            continue
        # The number of clusters has to be identical
        if len(gps_years) != len(water_years):
            continue

        # Survey name of the GE file = file name without the '.csv' extension
        return os.path.splitext(gps_csv)[0]

    # No corresponding GE file was found for the current HR file
    return None

#Get the year and the country for the final data frame from the GE (GeoData) file
def ge_year_country(ge_dir:str, ge_name:str):
    """Return the reference year and country abbreviation of a GE survey.

    Args:
        ge_dir: Directory that contains the GE csv files.
        ge_name: GE survey name, i.e. the csv file name without '.csv'.

    Returns:
        A tuple ``(year, country)``: ``year`` is the value of the ``year``
        column in the second row of the file (or in the first row if the file
        has only one) converted to ``int``; ``country`` is the first two
        characters of the survey name.

    Raises:
        IndexError: If the csv file contains no data rows.
    """
    ge_file = os.path.join(ge_dir, ge_name + '.csv')
    # Only the first two rows are needed to determine the reference year
    ge_years = pd.read_csv(ge_file, usecols=['year'], nrows=2)['year']
    if ge_years.empty:
        raise IndexError(f"GE file '{ge_file}' contains no rows to read a year from")
    # Second row is the reference; single-row files fall back to their only row
    ge_year = ge_years.iloc[min(1, len(ge_years) - 1)]
    # The country abbreviation is the two-character prefix of the survey name
    ge_country = os.path.basename(ge_name)[:2]

    return int(ge_year), ge_country

def create_correspondes_file(hr_dir:str,ge_dir:str):
    """Build the table that links every HR survey to its GE survey.

    For each csv file in ``hr_dir`` the GE counterpart is looked up in two steps:

    1. by name: the first 'HR' in the HR survey name is replaced with 'GE' and
       a file with exactly that name plus '.csv' must exist in ``ge_dir``;
    2. otherwise via ``check_year_country()`` (year, cluster count, country).

    HR files without a counterpart are left out of the result.

    Args:
        hr_dir: Directory that contains the HR csv files.
        ge_dir: Directory that contains the GE csv files.

    Returns:
        A ``pandas.DataFrame`` with the columns 'HR', 'GE', 'country' and
        'year', one row per matched survey, ordered by HR file name. The frame
        is empty (but has all four columns) if nothing matched. 'year' is kept
        as an integer.
    """
    corr_columns = ['HR', 'GE', 'country', 'year']
    # Set lookup for the exact-name test below
    ge_files = set(os.listdir(ge_dir))
    corr_list = []

    # Sorted so that the row order of the result is reproducible
    for hr_file in sorted(os.listdir(hr_dir)):
        if not hr_file.endswith('.csv'):
            continue

        # HR survey name = file name without extension and without the part
        # after the last '-'; names without '-' are kept whole
        hr_name = os.path.splitext(hr_file)[0].rsplit('-', 1)[0]
        # NOTE(review): replaces the first 'HR' anywhere in the name, including
        # inside the country prefix - confirm no survey name starts with 'HR'
        possible_gps_name = hr_name.replace('HR', 'GE', 1)

        # Require the exact file, since ge_year_country() opens '<name>.csv'
        if possible_gps_name + '.csv' in ge_files:
            ge_name = possible_gps_name
        else:
            # Fall back to matching on year, cluster number and country
            ge_name = check_year_country(ge_dir, os.path.join(hr_dir, hr_file))

        if ge_name is None:
            # No correspondence was found for this HR file
            continue

        year, country = ge_year_country(ge_dir, ge_name)
        corr_list.append([hr_name, ge_name, country, year])

    # Built straight from the list: works for an empty list and keeps the year
    # numeric instead of coercing every value to a string
    return pd.DataFrame(corr_list, columns=corr_columns)

In [5]:
#Main
# Directory handed to create_correspondes_file() as ge_dir
ge_dir = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/GPS_Data/gps_csv'
# Directory handed to create_correspondes_file() as hr_dir
hr_dir = '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/SatelliteImage__GEE/correlation/SAV_Data/water-source'
# Destination of the correspondence file
corr_path = os.path.join(
    '/home/shannon/Dokumente/Dokumente/studium/ASA/Projekt/NN/sentinel',
    'corresponding_ge_hr_survey.csv',
)

# Build the correspondence table and write it as csv without the index column
corr_df = create_correspondes_file(hr_dir, ge_dir)
corr_df.to_csv(corr_path, index=False)
