# Ensemble Machine Learning for Void Filling in Glacier Elevation Change Maps

## 01 - Data Preprocessing

### Imports

In [34]:
import pandas as pd
import geopandas as gpd
import numpy as np
import sys
import os

### Load the data

In [8]:
# Shapefile locations for the Eastern and Western Himalaya regions
ehim_path = './data/HIMAP_RGI/ehim_mb6.shp'
whim_path = './data/HIMAP_RGI/whim_mb6.shp'

# Read both shapefiles with geopandas (Eastern first, then Western)
ehim, whim = (gpd.read_file(shp_path) for shp_path in (ehim_path, whim_path))

### Create the DatasetBuilder Class

In [41]:
save_data = False # Set to true if you would like to save new copies of the data

class DatasetBuilder:
    """
    Build one preprocessed glacier table per region from raw region DataFrames.

    Constructing the object loads the surge and debris-cover CSV files and
    immediately processes every region, so ``regs`` is already populated when
    ``DatasetBuilder(...)`` returns.

    Attributes:
        dict (dict): Dictionary mapping region names to pandas DataFrames containing glacier data.
        regs (dict): Dictionary mapping region names to their processed DataFrames.
        surge_path (str): File path to the surge glacier CSV file.
        dc_path (str): File path to the debris-cover ratio CSV file.
        save (bool): Flag indicating whether to save processed datasets to disk.
        surge (pd.DataFrame): DataFrame containing surge glacier information.
        dc (pd.DataFrame): DataFrame containing debris-cover ratio information.

    Methods:
        __init__(region_dict, save=save_data)
            Initializes the DatasetBuilder with region data, loads auxiliary datasets
            and processes every region.
        iter_regs()
            Iterates over all regions in the dictionary, processes each DataFrame,
            and optionally saves the processed dataset to a CSV file.
        _build_ds_from_df(c_df)
            Processes a single region DataFrame by trimming columns, filtering surge glaciers,
            adding debris-cover ratio, computing hypsometric index, and converting aspect.
        _load_surge_debris()
            Loads the surge glacier and debris-cover ratio datasets from their respective CSV files.
        _trim_df(reg_df)
            Selects and renames relevant columns from the input DataFrame for further processing.
        _filter_surge(reg_df)
            Removes glaciers identified as surge-type from the input DataFrame.
        _add_dc(reg_df)
            Joins the debris-cover ratio data to the input DataFrame based on glacier IDs.
        _add_HI(reg_df)
            Computes the hypsometric index (HI) for each glacier.
        _conv_aspect(reg_df)
            Decomposes the 'Aspect' column into sine and cosine components.
    """
    # NOTE: the default for `save` is evaluated once, when this class statement
    # runs. Changing `save_data` afterwards has no effect unless the class is
    # re-defined or `save` is passed explicitly.
    def __init__(self, region_dict, save = save_data):
        """
        Args:
            region_dict (dict): Region name -> raw region DataFrame.
            save (bool): When true, each processed region is written to
                ``./data/raw/<region>_preprocessed.csv``.
        """
        self.dict = region_dict
        self.surge_path = "./data/other/surge_glaciers.csv"
        self.dc_path = "./data/other/dc_ratio.csv"
        self.regs = {}
        self.save = save
        self._load_surge_debris()
        self.iter_regs()

    def iter_regs(self):
        """Process every region in ``self.dict`` and store the result in ``self.regs``.

        Each call starts again from the raw frames in ``self.dict``, so calling
        it more than once repeats the work (and the CSV export when ``save``
        is true) and overwrites the previous entries in ``self.regs``.
        """
        for reg, raw_df in self.dict.items():
            c_df = self._build_ds_from_df(raw_df)
            self.regs[reg] = c_df
            if self.save:
                c_df.to_csv(f"./data/raw/{reg}_preprocessed.csv", index = False)

    def _build_ds_from_df(self, c_df):
        """Run the full preprocessing chain on one region frame and return the result."""
        c_df = self._trim_df(c_df)
        c_df = self._filter_surge(c_df)
        c_df = self._add_dc(c_df)
        c_df = self._add_HI(c_df)
        c_df = self._conv_aspect(c_df)
        return c_df

    def _load_surge_debris(self):
        """Read the surge and debris-cover CSV files into ``self.surge`` and ``self.dc``."""
        self.surge = pd.read_csv(self.surge_path)
        self.dc = pd.read_csv(self.dc_path)

    def _trim_df(self, reg_df):
        """Return a new frame with only the columns used downstream.

        'RGIId_x' is renamed to 'RGIId_Full'. A KeyError is raised by pandas
        if any of the listed columns is missing from ``reg_df``.
        """
        cols = ['RGIId', 'GLIMSId','CenLon', 'CenLat', 'Area', 'Zmin', 'Zmax', 'Zmed', 'Slope', 'Aspect', 'dhdt_ma',
        'dhdt_ma_si', 'mb_mwea', 'mb_mwea_si', 'RGIId_x']
        return reg_df[cols].rename(columns = {'RGIId_x': 'RGIId_Full'})

    def _filter_surge(self, reg_df):
        """Return the rows of ``reg_df`` whose 'GLIMSId' is not listed in ``self.surge['Glac_ID']``."""
        # isin() with no matching IDs simply keeps every row, so no special
        # case is needed for "no overlap".
        is_surge = reg_df['GLIMSId'].isin(self.surge['Glac_ID'])
        return reg_df[~is_surge]

    def _add_dc(self, reg_df):
        """Attach the 'dc_ratio' column by matching 'RGIId_Full' against ``self.dc['RGIId']``.

        This is a left join: every row of ``reg_df`` is kept, and glaciers with
        no entry in ``self.dc`` get NaN for 'dc_ratio'.
        """
        dc_ratio = self.dc.set_index('RGIId')['dc_ratio']
        new_df = reg_df.set_index('RGIId_Full').join(dc_ratio)
        return new_df.reset_index()

    def _add_HI(self, reg_df):
        """Return a copy of ``reg_df`` with the hypsometric index in a new 'HI' column.

        The raw ratio is ``(Zmax - Zmed) / (Zmed - Zmin)``. Ratios below 1 are
        replaced by ``-1 / ratio``; all other ratios (and NaN) are kept as is.

        Degenerate glaciers do not raise: a ratio of exactly 0 (Zmax == Zmed)
        becomes -inf, and a zero denominator (Zmed == Zmin) follows pandas'
        division-by-zero result (inf, or NaN for 0/0). The input frame is not
        modified.
        """
        ratio = (reg_df['Zmax'] - reg_df['Zmed']) / (reg_df['Zmed'] - reg_df['Zmin'])
        # Vectorised on purpose: a per-element `-1 / x` raises
        # ZeroDivisionError as soon as one ratio is exactly 0.
        hi = ratio.mask(ratio < 1, -1 / ratio)
        return reg_df.assign(HI = hi)

    def _conv_aspect(self, reg_df):
        """Return a copy of ``reg_df`` with 'sin_Aspect' and 'cos_Aspect' columns added.

        'Aspect' is converted from degrees to radians before taking the sine
        and cosine. The input frame is not modified.
        """
        aspect_rad = np.deg2rad(reg_df['Aspect'])
        return reg_df.assign(sin_Aspect = np.sin(aspect_rad), cos_Aspect = np.cos(aspect_rad))
        

### Instantiate DatasetBuilder object and iterate through each region

In [42]:
regions = {'ehim': ehim, 'whim': whim} # Create a dictionary of the regions of interest

# Instantiate the DatasetBuilder class with the dictionary. The constructor
# already calls iter_regs(), so every region is pre-processed (and saved, if
# enabled) at this point; a second explicit call would only repeat the work.
db = DatasetBuilder(regions)

In [43]:
# Pull the processed tables out of the builder:
# Eastern Himalaya first, Western Himalaya second
ehim_preprocessed, whim_preprocessed = db.regs['ehim'], db.regs['whim']


Unnamed: 0,RGIId_Full,RGIId,GLIMSId,CenLon,CenLat,Area,Zmin,Zmax,Zmed,Slope,Aspect,dhdt_ma,dhdt_ma_si,mb_mwea,mb_mwea_si,dc_ratio,HI,sin_Aspect,cos_Aspect
0,RGI60-15.03230,15.03230,G088281E27501N,88.280970,27.501009,0.298,5048,5393,5268,24.7,234,-0.849,0.870,-0.722,0.742,0.129866,-1.760000,-0.809017,-0.587785
1,RGI60-15.03231,15.03231,G088279E27505N,88.278790,27.505355,0.140,5144,5490,5336,25.0,208,-0.729,0.600,-0.620,0.512,0.244286,-1.246753,-0.469472,-0.882948
2,RGI60-15.03229,15.03229,G088282E27509N,88.281902,27.508518,0.207,5192,5568,5397,33.4,113,-0.380,2.162,-0.323,1.838,0.060870,-1.198830,0.920505,-0.390731
3,RGI60-15.03204,15.03204,G088274E27511N,88.273849,27.510660,0.737,5096,5764,5382,32.4,222,-0.609,0.680,-0.518,0.580,0.189281,1.335664,-0.669131,-0.743145
4,RGI60-15.02755,15.02755,G088239E27510N,88.238893,27.510350,0.168,5069,5340,5179,24.0,206,-0.761,0.923,-0.647,0.787,0.251786,1.463636,-0.438371,-0.898794
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2685,RGI60-15.10968,15.10968,G089582E28156N,89.582000,28.156000,7.299,4917,6130,5573,20.6,297,-0.384,0.275,-0.326,0.237,0.033539,-1.177738,-0.891007,0.453990
2686,RGI60-15.10969,15.10969,G089589E28169N,89.589000,28.169000,0.267,5393,5858,5675,25.3,318,-0.339,0.184,-0.288,0.158,0.057303,-1.540984,-0.669131,0.743145
2687,RGI60-13.26441,13.26441,G089652E28173N,89.652000,28.173000,1.266,4987,5460,5278,19.5,347,-1.203,0.465,-1.023,0.403,0.057583,-1.598901,-0.224951,0.974370
2688,RGI60-13.26440,13.26440,G089623E28177N,89.623000,28.177000,7.240,4912,5993,5285,23.5,5,-0.500,0.303,-0.425,0.262,0.026229,1.898123,0.087156,0.996195
