In [1]:
import pandas as pd
import numpy as np
import os
import pyproj
from pyproj import Transformer
import string

In [2]:
# Data Directory
# Folder holding the WGMS CSV files; each file name below is joined onto this
# path with os.path.join() when the file is read.
data_dir = "./DOI-WGMS-FoG-2019-12/"

# WGMS Data Files
# A: glacier table (coordinates, names and classification columns)
a_glacier_file = "WGMS-FoG-2019-12-A-GLACIER.csv"
# B: state table (elevation, length and area per YEAR)
b_glacier_file = "WGMS-FoG-2019-12-B-STATE.csv"
# D: change table (source of the thickness-change series)
d_change_file = "WGMS-FoG-2019-12-D-CHANGE.csv"
# E: mass-balance overview (not read by any cell visible in this notebook section)
e_massbalance_file = "WGMS-FoG-2019-12-E-MASS-BALANCE-OVERVIEW.csv"
# EE: mass-balance table (source of the ANNUAL_BALANCE series)
ee_massbalance_file = "WGMS-FoG-2019-12-EE-MASS-BALANCE.csv"


In [3]:
# Main dataframe containing overall information
# NOTE: this empty frame is only a placeholder; df_compiled is reassigned from
# df_A once the relevant WGMS_A columns have been selected further down.
df_compiled = pd.DataFrame()

### Extract relevant Glacial Characteristics from the WGMS_A file

In [4]:
# Load the glacier table and discard every row that lacks a coordinate
glacier_csv = os.path.join(data_dir, a_glacier_file)
df_A = pd.read_csv(glacier_csv).dropna(subset=["LONGITUDE", "LATITUDE"])

In [5]:
# Inspect which columns the glacier table provides
df_A.columns

Index(['POLITICAL_UNIT', 'NAME', 'WGMS_ID', 'GEN_LOCATION', 'SPEC_LOCATION',
       'LATITUDE', 'LONGITUDE', 'PRIM_CLASSIFIC', 'FORM', 'FRONTAL_CHARS',
       'EXPOS_ACC_AREA', 'EXPOS_ABL_AREA', 'PARENT_GLACIER', 'REMARKS',
       'GLACIER_REGION_CODE', 'GLACIER_SUBREGION_CODE'],
      dtype='object')

In [6]:
# Prettify Capitalization
# Capitalize each word of the glacier names and specific locations. Missing
# values are skipped in BOTH columns: string.capwords() only accepts strings,
# so an empty NAME cell (read as NaN) would otherwise raise.
for text_column in ['NAME', 'SPEC_LOCATION']:
    notna = df_A[text_column].notna()
    df_A.loc[notna, text_column] = df_A.loc[notna, text_column].apply(
        lambda x: string.capwords(str(x))
    )

# Change to Float for Consistency
df_A['PRIM_CLASSIFIC'] = df_A['PRIM_CLASSIFIC'].astype(float)
# Single-space entries in FORM are mapped to NaN before the cast to float
df_A['FORM'] = df_A['FORM'].replace(' ', np.nan).astype(float)
df_A['FRONTAL_CHARS'] = df_A['FRONTAL_CHARS'].astype(float)



In [7]:
# Columns of the glacier table that are carried over into the compiled frame
A_columns = (
    ["WGMS_ID", "LONGITUDE", "LATITUDE"]
    + ["POLITICAL_UNIT", "GLACIER_REGION_CODE", "SPEC_LOCATION", "NAME"]
    + ["PRIM_CLASSIFIC", "FORM", "FRONTAL_CHARS"]
    + ["EXPOS_ACC_AREA", "EXPOS_ABL_AREA", "REMARKS"]
)

# Column selection in the order given by A_columns
df_compiled = df_A[A_columns]

### Extract additional data from WGMS_B File

In [8]:
# Read the state table and keep only the records with a positive YEAR
state_csv = os.path.join(data_dir, b_glacier_file)
df_B = pd.read_csv(state_csv)
df_B = df_B.loc[df_B["YEAR"] > 0]
df_B.columns

Index(['POLITICAL_UNIT', 'NAME', 'WGMS_ID', 'YEAR', 'HIGHEST_ELEVATION',
       'MEDIAN_ELEVATION', 'LOWEST_ELEVATION', 'ELEVATION_UNC', 'LENGTH',
       'LENGTH_UNC', 'AREA', 'AREA_UNC', 'SURVEY_DATE',
       'SURVEY_PLATFORM_METHOD', 'INVESTIGATOR', 'SPONS_AGENCY', 'REFERENCE',
       'REMARKS', 'PUB_IN_FOG', 'PUB_IN_GGCB'],
      dtype='object')

In [9]:
# Keep one state record per glacier: the one with the most recent YEAR.
# drop_duplicates(keep="last") on its own keeps whichever row happens to come
# last in the file, so the rows are first ordered by YEAR (stable sort: ties
# keep their file order). sort_index() then restores the original row order
# of the surviving records.
df_B_reduced = (
    df_B.loc[
        :,
        [
            "WGMS_ID",
            "YEAR",
            "HIGHEST_ELEVATION",
            "LOWEST_ELEVATION",
            "INVESTIGATOR",
            "SPONS_AGENCY",
            "REFERENCE",
        ],
    ]
    .sort_values("YEAR", kind="stable")
    .drop_duplicates("WGMS_ID", keep="last")
    .sort_index()
)

### Time Series

Create time series for the following data:
1. Thickness Change
2. Mass Balance
3. Length
4. Area

In [10]:
def ts_helper(df, columns):
    """Collapse a table to one row per key pair.

    The first two entries of ``columns`` act as grouping keys (WGMS_ID and
    YEAR in this notebook). Every other listed column is reduced with
    ``median()``, a crude summary for keys that were measured more than once.

    Returns a new DataFrame with the key columns followed by the aggregated
    columns; columns of ``df`` not listed in ``columns`` are left out.
    """
    group_keys = columns[:2]
    selected = df.loc[:, columns]
    return selected.groupby(group_keys, as_index=False).median()

### Thickness Change

In [11]:
# Read the change table and list its columns
df_D = pd.read_csv(os.path.join(data_dir, d_change_file))
df_D.columns

Index(['POLITICAL_UNIT', 'NAME', 'SURVEY_ID', 'WGMS_ID', 'YEAR', 'LOWER_BOUND',
       'UPPER_BOUND', 'AREA_SURVEY_YEAR', 'AREA_CHANGE', 'AREA_CHANGE_UNC',
       'THICKNESS_CHG', 'THICKNESS_CHG_UNC', 'VOLUME_CHANGE',
       'VOLUME_CHANGE_UNC', 'SURVEY_DATE', 'SD_PLATFORM_METHOD',
       'REFERENCE_DATE', 'RD_PLATFORM_METHOD', 'INVESTIGATOR', 'SPONS_AGENCY',
       'REFERENCE', 'REMARKS', 'PUB_IN_FOG', 'PUB_IN_GGCB', 'REF_ID'],
      dtype='object')

In [12]:
# Columns needed for the thickness-change series
th_columns = ["WGMS_ID", "YEAR", "THICKNESS_CHG", 'REFERENCE_DATE']
# Keep complete records only: a row is dropped when any of the four values is missing
df_D_ = df_D[th_columns].dropna(axis=0, how="any")

In [13]:
# Reduce each reference date to the integer formed by its first four characters
df_D_['REFERENCE_DATE'] = df_D_['REFERENCE_DATE'].map(
    lambda ref_date: int(str(ref_date)[:4])
)

In [14]:
# One row per (WGMS_ID, YEAR). REFERENCE_DATE is part of th_columns, so it is
# median-aggregated together with THICKNESS_CHG.
df_thickness_chg = ts_helper(df_D_, th_columns)

### Area

In [15]:
# Yearly AREA series from the state table, without the rows where AREA is missing
area_columns = ["WGMS_ID", "YEAR", "AREA"]
df_area = ts_helper(df_B, area_columns).dropna(axis="rows")

In [16]:
# Preview the first five rows of the area series
df_area.head(5)

Unnamed: 0,WGMS_ID,YEAR,AREA
0,0,1959,38.9
1,0,1975,38.9
2,0,2014,38.54
4,1,1975,0.63
5,1,2005,0.61


### Length

In [17]:
# Yearly LENGTH series from the state table, without the rows where LENGTH is missing
length_columns = ["WGMS_ID", "YEAR", "LENGTH"]
df_length = ts_helper(df_B, length_columns).dropna(axis="rows")


In [18]:
# Preview the first five rows of the length series
df_length.head(5)

Unnamed: 0,WGMS_ID,YEAR,LENGTH
0,0,1959,15.4
1,0,1975,15.4
2,0,2014,14.0
3,1,1960,1.4
4,1,1975,1.4


### Mass Balance

In [19]:
# Yearly ANNUAL_BALANCE series; records without a balance value are discarded
# before the per-year median is taken
EE_columns = ["WGMS_ID", "YEAR", "ANNUAL_BALANCE"]
mass_balance_csv = os.path.join(data_dir, ee_massbalance_file)
df_EE = pd.read_csv(mass_balance_csv).dropna(axis="rows", subset=["ANNUAL_BALANCE"])
df_mass_balance = ts_helper(df_EE, EE_columns)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### Extract site Time Series Characteristics

In [20]:
# Single-column frame holding the WGMS_ID of every compiled glacier
df_wgms = df_compiled[["WGMS_ID"]]

### First Measurement

In [21]:
# Earliest YEAR per glacier across all four time series
measurement_tables = [df_thickness_chg, df_mass_balance, df_length, df_area]
dfs = [
    table[["WGMS_ID", "YEAR"]].set_index("WGMS_ID") for table in measurement_tables
]
# Per series: first year of each glacier
dfs_ = [indexed["YEAR"].groupby(level="WGMS_ID").min() for indexed in dfs]
# Outer join keeps glaciers that appear in only some series; the row-wise
# minimum then ignores the series in which a glacier is absent
first_year = pd.concat(dfs_, axis=1, join="outer").min(axis=1)
df_first_measurement = first_year.rename("FIRST_MEAS").reset_index()

### Years of Measurements

In [22]:
# Number of distinct years with at least one measurement, per glacier
measurement_tables = [df_thickness_chg, df_mass_balance, df_length, df_area]
dfs = [table[["WGMS_ID", "YEAR"]] for table in measurement_tables]
# A (glacier, year) pair present in several series is counted once
dfs_ = pd.concat(dfs).drop_duplicates(keep="first")
df_year_measurement = (
    dfs_.groupby("WGMS_ID").size().reset_index(name="YEAR_MEASUREMENTS")
)

### Concatenate

In [23]:
# Build the compiled table from the glacier columns each time so that this
# cell can be re-run safely: merging into an already merged df_compiled would
# add suffixed duplicate columns (e.g. YEAR_x / YEAR_y).
# validate="1:1" makes every merge fail loudly if WGMS_ID is not unique on
# either side, instead of silently multiplying rows.
df_compiled = (
    df_A.loc[:, A_columns]
    .merge(df_B_reduced, how="left", on="WGMS_ID", validate="1:1")
    .merge(df_first_measurement, on="WGMS_ID", how="left", validate="1:1")
    .merge(df_year_measurement, on="WGMS_ID", how="left", validate="1:1")
)

# Placeholders for missing values:
#   FIRST_MEAS        -> 2020 (no measurement in any series)
#   YEAR_MEASUREMENTS -> 0
#   PRIM_CLASSIFIC / FORM / FRONTAL_CHARS -> 10
#   free-text columns -> "N/A"
df_compiled = df_compiled.replace(
    {
        "FIRST_MEAS": {np.nan: 2020},
        "YEAR_MEASUREMENTS": {np.nan: 0},
        "PRIM_CLASSIFIC": {np.nan: 10},
        "FORM": {np.nan: 10},
        "FRONTAL_CHARS": {np.nan: 10},
        "SPEC_LOCATION": {np.nan: "N/A"},
        "NAME": {np.nan: "N/A"},
        "INVESTIGATOR": {np.nan: "N/A"},
        "SPONS_AGENCY": {np.nan: "N/A"},
        "REMARKS": {np.nan: "N/A"},
        "REFERENCE": {np.nan: "N/A"},
    }
)

In [24]:
# Preview the first ten compiled glaciers
df_compiled.head(10)

Unnamed: 0,WGMS_ID,LONGITUDE,LATITUDE,POLITICAL_UNIT,GLACIER_REGION_CODE,SPEC_LOCATION,NAME,PRIM_CLASSIFIC,FORM,FRONTAL_CHARS,...,EXPOS_ABL_AREA,REMARKS,YEAR,HIGHEST_ELEVATION,LOWEST_ELEVATION,INVESTIGATOR,SPONS_AGENCY,REFERENCE,FIRST_MEAS,YEAR_MEASUREMENTS
0,3628,73.235,37.1,AF,ASC,Upper Issik Valley,Northern Issik,10.0,10.0,10.0,...,,,,,,,,,2020.0,0.0
1,10452,70.17,35.595,AF,ASW,Chumar Valley,Pir Yakh,6.0,3.0,8.0,...,NE,Local people call this glacier PIR YAKH which ...,2018.0,5070.0,4400.0,"Abeer Ahmad Sajood, Hedayatullah Arian","Hydrometeorology Department, Geoscience Facult...",,2018.0,1.0
2,13308,73.60173,37.28307,AF,ASC,,Unnamed 13308,10.0,10.0,10.0,...,,Information derived from RGI5.0.,2000.0,5662.0,4720.0,,,RGI5.0,2000.0,2.0
3,13310,73.61128,37.25005,AF,ASC,,Unnamed 13310,10.0,10.0,10.0,...,,Information derived from RGI5.0.,2000.0,5373.0,4682.0,,,RGI5.0,2000.0,2.0
4,13311,73.51735,37.21651,AF,ASC,,Unnamed 13311,10.0,10.0,10.0,...,,Information derived from RGI5.0.,2000.0,5348.0,4523.0,,,RGI5.0,2000.0,2.0
5,13312,73.49909,37.19965,AF,ASC,,Unnamed 13312,10.0,10.0,10.0,...,,Information derived from RGI5.0.,2000.0,5615.0,4760.0,,,RGI5.0,2000.0,2.0
6,13582,73.07324,37.04305,AF,ASC,,Unnamed 13582,10.0,10.0,10.0,...,,Information derived from RGI5.0.,2002.0,5584.0,4811.0,,,RGI5.0,2002.0,6.0
7,13583,73.09385,37.0755,AF,ASC,,Unnamed 13583,10.0,10.0,10.0,...,,Information derived from RGI5.0.,2002.0,5522.0,4862.0,,,RGI5.0,2002.0,6.0
8,13584,73.10701,37.09954,AF,ASC,,Unnamed 13584,10.0,10.0,10.0,...,,Information derived from RGI5.0.,2002.0,5686.0,4817.0,,,RGI5.0,2002.0,6.0
9,13585,73.07664,37.1324,AF,ASC,,Unnamed 13585,10.0,10.0,10.0,...,,Information derived from RGI5.0.,2002.0,5580.0,4765.0,,,RGI5.0,2002.0,6.0


### Save Files

In [25]:
# Persist every result table as a pickle file in the working directory
pickle_targets = {
    "wgms_combined": df_compiled,
    "wgms_thickness": df_thickness_chg,
    "wgms_massbalance": df_mass_balance,
    "wgms_length": df_length,
    "wgms_area": df_area,
}
for pickle_name, frame in pickle_targets.items():
    frame.to_pickle(pickle_name)

### WGMS ID of MER DE GLACE

In [28]:
# Case-insensitive search for "glace" in the glacier names
is_glace = df_compiled["NAME"].str.contains("glace", case=False)
df_compiled[is_glace]


Unnamed: 0,WGMS_ID,LONGITUDE,LATITUDE,POLITICAL_UNIT,GLACIER_REGION_CODE,SPEC_LOCATION,NAME,PRIM_CLASSIFIC,FORM,FRONTAL_CHARS,EXPOS_ACC_AREA,EXPOS_ABL_AREA,REMARKS,YEAR,HIGHEST_ELEVATION,LOWEST_ELEVATION,INVESTIGATOR,SPONS_AGENCY,REFERENCE,FIRST_MEAS,YEAR_MEASUREMENTS
16777,353,6.93,45.88,FR,CEU,Mont Blanc Area,Mer De Glace,5.0,1.0,9.0,N,N,Glacier is part of GLACIOCLIM observatory.,2003.0,4100.0,1800.0,"D. Six, C.Vincent",CNRS/Grenoble University,"Berthier, E. and C. Vincent. 2012. J. of Glaci...",1968.0,11.0


In [44]:
# Distinct values of the per-glacier measurement-year count
df_compiled['YEAR_MEASUREMENTS'].unique()


array([  0.,   1.,   2.,   6.,   3.,   4.,   5.,  19.,  21.,   8.,  20.,
        18.,  11.,  13.,  15.,   7.,  22.,  28.,  17.,  32.,  14.,  68.,
        10.,  66.,   9.,  34.,  58.,  40.,  30.,  57.,  41.,  55.,  51.,
        52.,  25.,  16.,  23.,  56.,  64., 104.,  59.,  62.,  65.,  45.,
        43., 100.,  12.,  44.,  47.,  29.,  31.,  27.,  24.,  71.,  26.,
        33.,  48.,  39.,  38.,  35.,  49.,  36.,  37.,  73.,  54.,  53.])

In [34]:
# Scratch check: the integers 0 through 10
list(range(11))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

In [37]:
# Scratch check: a float is found among ints because membership compares with ==
1.0 in list(range(11))

True