In [1]:
import ee
import geemap
import pandas as pd
import geopandas as gpd
import glob
from pathlib import Path
import numpy as np
import time

In [2]:
# Uncomment the next line to run the Earth Engine sign-in step before initializing.
#ee.Authenticate()
# Initialize the Earth Engine client before any ee.* object is built in the cells below.
ee.Initialize()

## Set Parameters

In [3]:
# cloud filter params
# Upper bound (lte) on CLOUDY_PIXEL_PERCENTAGE when filtering the S2 SR collection.
CLOUD_FILTER = 90
# s2cloudless 'probability' values greater than this are flagged as cloud.
CLD_PRB_THRESH = 50
# B8 values below NIR_DRK_THRESH * 1e4 are treated as dark (candidate shadow) pixels.
NIR_DRK_THRESH = 0.15
# Multiplied by 10 and passed to directionalDistanceTransform as the projection distance.
CLD_PRJ_DIST = 1
# Used as BUFFER*2/20 for the focalMax dilation of the combined cloud/shadow mask.
BUFFER = 10

In [4]:
# Date window passed to the Sentinel-2 collection filters.
START_DATE = '2019-06-01'
END_DATE = '2019-08-31'
# COUNTRY / STATE are compared against admin attributes when building admin_fcol.
COUNTRY = ''
STATE = 'AK'
# Only read when ROI == 'BBOX'.
GEOJSON_PATH = ''
ROI = 'WATERSHED' #STATE, COUNTRY, BBOX, or WATERSHED
# Directory that is globbed for *.geojson observation files.
INPUT_DIR = '/mnt/poseidon/remotesensing/arctic/data/vectors/alaska_plot_data/v1'
# NOTE(review): OUTPUT_DIR is not referenced in the cells visible here — confirm it is still needed.
OUTPUT_DIR = '/mnt/poseidon/remotesensing/arctic/data/vectors/data_testing'
# Bands kept by apply_cld_shdw_mask after the cloud/shadow mask is applied.
BANDS = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12']

## 1. Import data

In [5]:
# import admin data and select country to create grid around
# Build the boundary FeatureCollection (admin_fcol) that matches the configured ROI type.
if ROI == 'STATE':
    admin_fcol = (ee.FeatureCollection("FAO/GAUL/2015/level1")
                  .filterMetadata('ADM0_NAME', 'equals', COUNTRY)
                  .filterMetadata('ADM1_NAME', 'equals', STATE))

elif ROI == 'COUNTRY':
    admin_fcol = (ee.FeatureCollection("FAO/GAUL/2015/level1")
                  .filterMetadata('ADM0_NAME', 'equals', COUNTRY))

elif ROI == 'BBOX':
    admin_fcol = geemap.geojson_to_ee(GEOJSON_PATH)

elif ROI == 'WATERSHED':
    admin_fcol = (ee.FeatureCollection("USGS/WBD/2017/HUC06")
                  .filterMetadata('states', 'contains', STATE))

else:
    # Without this branch a typo in ROI leaves admin_fcol undefined and the
    # failure only shows up later as a NameError when the map layer is added.
    raise ValueError(
        f"Unsupported ROI {ROI!r}; expected 'STATE', 'COUNTRY', 'BBOX', or 'WATERSHED'")

### 1.1. Observation Points

In [6]:
# load geojsons
# Paths are sorted so that positions in gj_gpds are stable between runs;
# later cells address individual frames by position.
gjsons = sorted(glob.glob(f'{INPUT_DIR}/*.geojson'))
gj_gpds = [gpd.read_file(geojson_path) for geojson_path in gjsons]

In [7]:
# create UID from file name and index
def set_uid(row, path):
    """Return the unique id '<file stem>_<row id>' for one record.

    row  -- mapping-like record providing an 'id' entry
    path -- path of the file the record came from; only its stem is used
    """
    return f"{Path(path).stem}_{row['id']}"

In [8]:
# apply UID creation function
# Pair every frame with the file it was read from and add a 'uid' column row by row.
for source_path, frame in zip(gjsons, gj_gpds):
    def _uid_for(record, _path=source_path):
        return set_uid(record, _path)

    frame['uid'] = frame.apply(_uid_for, axis=1)

#### 1.1.1. Remove some json cover columns

In [9]:
# gj_gpds[1].head(2)

In [10]:
# atqasuk
# Frame position depends on the sorted file order of INPUT_DIR.
gj_gpds[1] = gj_gpds[1].drop(columns=['cover_total'])

IndexError: list index out of range

In [11]:
# barrow ngee
# Frame position depends on the sorted file order of INPUT_DIR.
gj_gpds[2] = gj_gpds[2].drop(columns=['cover_vegetation', 'cover_of_shrubs'])

In [12]:
# barrow tundra
# Frame position depends on the sorted file order of INPUT_DIR.
gj_gpds[3] = gj_gpds[3].drop(columns=['cover_total'])

In [13]:
# frost boils
# Frame position depends on the sorted file order of INPUT_DIR.
gj_gpds[6] = gj_gpds[6].drop(columns=[
    'total_vegetation_cover',
    'all_forb_cover',
    'total_graminoid_cover',
    'foliose_and_fruticose_lichen_cover',
    'bryophytes_cover',
    'moss_and_liverwort_cover',
    'mean_canopy_cover',
])

In [14]:
# happy valley
# Frame position depends on the sorted file order of INPUT_DIR.
gj_gpds[7] = gj_gpds[7].drop(columns=['cover_evergreen_shrubs', 'cover_deciduous_shrubs'])

In [15]:
# poplar veg
# The stray leading/trailing spaces inside some names are part of the column
# labels being dropped and must be kept exactly as written.
gj_gpds[10] = gj_gpds[10].drop(columns=[
    'tree_saplings_cover ',
    ' shrubs_cover',
    'total_dead_cover',
    'total_vegetation_cover ',
])

In [16]:
# atlas-1
# Frame position depends on the sorted file order of INPUT_DIR.
gj_gpds[16] = gj_gpds[16].drop(columns=['cover_evergreen_shrubs', 'cover_deciduous_shrubs'])

In [17]:
# flux
# Frame position depends on the sorted file order of INPUT_DIR.
gj_gpds[18] = gj_gpds[18].drop(columns=['cover_shrub', 'cover_total_vegetation'])

In [18]:
# oumalik
# Frame position depends on the sorted file order of INPUT_DIR.
gj_gpds[19] = gj_gpds[19].drop(columns=['cover_total', 'cover_shrub_layer'])

In [19]:
# remove several datasets
# Delete from the highest position down so earlier deletions do not shift the
# positions still to be removed: 20 = spine v1, 15 = arctic network v1, 14 = willow v1.
for stale_position in (20, 15, 14):
    del gj_gpds[stale_position]

In [20]:
# add fixed spine rd v2
loc = '/mnt/poseidon/remotesensing/arctic/data/vectors/alaska_plot_data/v2/spine_rd_prudhoe_bay_veg_plots_environmental.geojson'
spine = gpd.read_file(loc)
# Same uid scheme as the v1 frames: '<file stem>_<row id>'.
spine_uids = spine.apply(lambda record: set_uid(record, loc), axis=1)
spine['uid'] = spine_uids
gj_gpds.append(spine)

In [21]:
# add fixed arctic network v2
loc = '/mnt/poseidon/remotesensing/arctic/data/vectors/alaska_plot_data/v2/arctic_network_environmental_data.geojson'
arcnet = gpd.read_file(loc)
# Same uid scheme as the v1 frames: '<file stem>_<row id>'.
arcnet_uids = arcnet.apply(lambda record: set_uid(record, loc), axis=1)
arcnet['uid'] = arcnet_uids
gj_gpds.append(arcnet)

In [22]:
# drop spine rd dead columns and parent shrub columns
# First every column whose name matches the regex 'dead', then the two parent shrub columns.
dead_cols = gj_gpds[19].filter(regex='dead').columns.tolist()
gj_gpds[19] = gj_gpds[19].drop(columns=dead_cols)
gj_gpds[19] = gj_gpds[19].drop(columns=['cover_evergreen_shrubs', 'cover_deciduous_shrubs'])

In [23]:
# Remove the 'cover_litter_alone' column from the frame at position 20.
gj_gpds[20] = gj_gpds[20].drop(columns=['cover_litter_alone'])

### 1.2. HUC06 polygons

In [11]:
# load HUC data
huc_source = '/mnt/poseidon/remotesensing/arctic/data/vectors/supplementary/wbdhu6_a_us_september2022.gdb'
huc = gpd.read_file(huc_source)

In [12]:
# select alaska HUCs and alaska HUCs of interest
# Step 1: rows whose 'states' text contains 'AK'.
is_alaska = huc['states'].str.contains('AK')
ak_huc6 = huc[is_alaska]
# Step 2: keep only the listed huc6 codes.
aois = ['190604', '190603', '190602']
in_aoi_list = ak_huc6['huc6'].isin(aois)
huc_aoi = ak_huc6[in_aoi_list]
#huc_aoi = ak_huc6

In [14]:
# Write the selected HUC6 polygons to disk.
# NOTE(review): no driver is passed; presumably it is inferred from the .geojson extension — confirm for the installed geopandas.
outfp = "/mnt/poseidon/remotesensing/arctic/data/vectors/supplementary/Alaska_watershed_northslope.geojson"
huc_aoi.to_file(outfp)

## 2. Process observation points
- Create a UID from the file name and index
- separate geometry from observation data
- merge coordinate data from every site into one dataframe
- remove null geometry for GEE (only four points)
- select points that intersect our watershed of interest (HUC6: 190604)
- convert points to GEE feature class and visualize

### 2.1. Merge all observation points

In [26]:
# separate dataframes
# Keep only the uid and geometry columns of every observation frame.
geom_gpds = [frame[['uid', 'geometry']] for frame in gj_gpds]

In [27]:
# merge points
# full_data keeps every attribute column; df keeps only geometry, indexed by uid.
full_data = pd.concat(gj_gpds, axis=0, ignore_index=True)
df = pd.concat(geom_gpds, ignore_index=True).set_index('uid')

In [28]:
# remove null geometries so GEE doesn't freak out
has_geometry = df['geometry'].notna()
df_nonull = df[has_geometry]
print(f'before: {len(df)} \n after: {len(df_nonull)}')

before: 2355 
 after: 2351


In [29]:
# select obs points that intersect these HUCs (points of interest--poi)
# Merge the selected HUC polygons into one geometry, then test every point against it.
aoi_union = huc_aoi.unary_union
inside_aoi = df_nonull.intersects(aoi_union)
poi = df_nonull[inside_aoi]
print(f'{len(poi)} observation points')

930 observation points


### 2.2. Display GEE Map

In [80]:
# create GEE feature collections
poi_fc = geemap.gdf_to_ee(poi)
landcover = ee.Image('USGS/NLCD_RELEASES/2020_REL/NALCMS')
# Style passed to Map.addLayer for the observation points.
pointvis = dict(
    color='e01b24',
    lineWidth=1,
    fillColor='f6615188',
    lineType='solid',
    pointSize=50,
    pointShape='triangle_down',
)

In [81]:
# visualize watersheds and poi
Map = geemap.Map(center=(65, -153), zoom=4, basemap='HYBRID')
# Layers are added bottom-up: land cover, watershed outlines, then the points.
watershed_vis = {'color': '00000000', 'width': 0.1}
Map.addLayer(landcover, {}, 'landcover')
Map.addLayer(admin_fcol, watershed_vis, 'watersheds')
Map.addLayer(poi_fc, pointvis, 'observation_points')

In [82]:
# Bare expression: displays the map widget as the cell output.
Map

Map(center=[65, -153], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=HBox(children=(T…

In [45]:
# Save the interactive map as a standalone HTML page.
import os
download_dir = '/mnt/poseidon/remotesensing/arctic/data/training/results/01'
# NOTE: despite its name, png_file holds the path of an .html file.
png_file = os.path.join(download_dir, 'alaska_observation_points.html')
Map.to_html(filename=png_file)

## 3. Load and preprocess Sentinel data

### 3.1. Load S2-SR and S1-SAR

In [33]:
# # load sentinel data -> try out SR
# s2_sr = (ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
#       .filterDate(START_DATE, END_DATE)
#       .filterBounds(poi_fc)
#       #.select(['B4', 'B3', 'B2'])
#      )
# s1_grd = (ee.ImageCollection("COPERNICUS/S1_GRD")
#           .filterDate(START_DATE, END_DATE)
#           .filterBounds(poi_fc)
#          )

### 3.2. Create S2-SR cloud filter

In [34]:
def get_s2_sr_cld_col(aoi, start_date, end_date):
    """Return S2 SR images for the AOI and date range, each carrying its s2cloudless image.

    The SR collection is limited to images whose CLOUDY_PIXEL_PERCENTAGE is at
    most CLOUD_FILTER. The matching cloud-probability image (same
    'system:index') is stored on each SR image under the 's2cloudless' property.
    """
    # Surface-reflectance scenes that pass the scene-level cloud filter.
    max_cloud = ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', CLOUD_FILTER)
    surface_refl = (ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED')
                    .filterBounds(aoi)
                    .filterDate(start_date, end_date)
                    .filter(max_cloud))

    # Cloud-probability scenes for the same area and period.
    cloud_prob = (ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY')
                  .filterBounds(aoi)
                  .filterDate(start_date, end_date))

    # Pair the two collections on their shared 'system:index'.
    same_index = ee.Filter.equals(leftField='system:index',
                                  rightField='system:index')
    joined = ee.Join.saveFirst('s2cloudless').apply(
        primary=surface_refl,
        secondary=cloud_prob,
        condition=same_index)
    return ee.ImageCollection(joined)

def add_cloud_bands(img):
    """Append the s2cloudless 'probability' band and a binary 'clouds' band to img."""
    # The cloud-probability image was attached under 's2cloudless' by the join.
    s2cloudless_img = ee.Image(img.get('s2cloudless'))
    probability = s2cloudless_img.select('probability')

    # 1 where the probability exceeds CLD_PRB_THRESH, else 0.
    cloud_flag = probability.gt(CLD_PRB_THRESH).rename('clouds')

    extra_bands = ee.Image([probability, cloud_flag])
    return img.addBands(extra_bands)


def add_shadow_bands(img):
    """Append 'dark_pixels', 'cloud_transform' and 'shadows' bands to img.

    Expects img to already carry the 'clouds' band added by add_cloud_bands.
    """
    SR_BAND_SCALE = 1e4

    # Dark NIR pixels that are not SCL class 6 (water) are shadow candidates.
    not_water = img.select('SCL').neq(6)
    nir_is_dark = img.select('B8').lt(NIR_DRK_THRESH*SR_BAND_SCALE)
    dark_pixels = nir_is_dark.multiply(not_water).rename('dark_pixels')

    # Direction in which cloud shadows are projected (assumes UTM projection).
    solar_azimuth = ee.Number(img.get('MEAN_SOLAR_AZIMUTH_ANGLE'))
    shadow_azimuth = ee.Number(90).subtract(solar_azimuth)

    # Project the cloud mask along that direction for CLD_PRJ_DIST*10 units.
    projected = img.select('clouds').directionalDistanceTransform(
        shadow_azimuth, CLD_PRJ_DIST*10)
    cld_proj = (projected
                .reproject(crs=img.select(0).projection(), scale=100)
                .select('distance')
                .mask()
                .rename('cloud_transform'))

    # A shadow is a dark pixel that lies inside the projected cloud footprint.
    shadows = cld_proj.multiply(dark_pixels).rename('shadows')

    return img.addBands(ee.Image([dark_pixels, cld_proj, shadows]))


def add_cld_shdw_mask(img):
    """Return img with cloud bands, shadow bands and a final 'cloudmask' band added."""
    # Cloud component bands first; the shadow step reads the 'clouds' band.
    with_bands = add_shadow_bands(add_cloud_bands(img))

    # 1 where a pixel is cloud or shadow, else 0.
    clouds = with_bands.select('clouds')
    shadows = with_bands.select('shadows')
    cloud_or_shadow = clouds.add(shadows).gt(0)

    # Remove small patches (focalMin) and dilate what remains by BUFFER (focalMax).
    # 20 m scale is for speed, and assumes clouds don't require 10 m precision.
    cleaned = cloud_or_shadow.focalMin(2).focalMax(BUFFER*2/20)
    cloudmask = (cleaned
                 .reproject(crs=img.select([0]).projection(), scale=20)
                 .rename('cloudmask'))

    return with_bands.addBands(cloudmask)


def apply_cld_shdw_mask(img):
    """Mask cloud/shadow pixels of img and keep only the bands listed in BANDS."""
    # Invert 'cloudmask' so that clear pixels are 1 and cloud/shadow pixels are 0.
    clear_pixels = img.select('cloudmask').Not()

    masked = img.updateMask(clear_pixels)
    return masked.select(BANDS)

In [35]:
# Build the joined collection, add the mask bands, then apply the mask.
s2_sr_cld_col = get_s2_sr_cld_col(poi_fc, START_DATE, END_DATE)
s2_sr_with_mask = s2_sr_cld_col.map(add_cld_shdw_mask)
s2_sr = s2_sr_with_mask.map(apply_cld_shdw_mask)

### 3.3. Aggregate images
Eventually I will extract time series to help ID the PFTs

In [36]:
# aggregate separately to avoid export errors thru GEE
# Per-pixel maximum over the masked collection (output bands appear as B*_max in the sampled table).
s2_sr_max = s2_sr.reduce(ee.Reducer.max())
# Per-pixel median over the same collection.
s2_sr_med = s2_sr.reduce(ee.Reducer.median())

### 3.4. Sample raster at observation points of interest

In [37]:
# sample sentinel 2 imagery using our observation points
def sample_raster(image, fcollection, scale=10, projection='EPSG:4326', geometries=False):
    """Sample image at every feature of fcollection and return the resulting FeatureCollection.

    scale, projection and geometries are forwarded unchanged to
    image.sampleRegions; geometries=True keeps the geometry in the table.
    """
    return image.sampleRegions(
        collection=fcollection,
        scale=scale,
        projection=projection,
        geometries=geometries,
    )

In [38]:
# get bands at each point
# Sample both composites at the observation points with sample_raster's defaults.
max_samples = sample_raster(s2_sr_max, poi_fc)
med_samples = sample_raster(s2_sr_med, poi_fc)

### 3.5. Convert to dataframe and export

In [39]:
# geemap function to convert fc to df. VERY slow??
# def fc_to_df(fcollection, idx):
#     df = geemap.ee_to_pandas(fcollection, sort_columns=True) # Max 5,000 rows
#     df.set_index(idx, inplace=True)
#     return df

In [40]:
def fc_to_df(fc, idx_col):
    """Convert a FeatureCollection into a pandas DataFrame indexed by idx_col.

    fc.getInfo() is expected to return a dict with a 'features' list; each
    feature's 'properties' dict becomes one row.
    """
    records = [feature['properties'] for feature in fc.getInfo()['features']]
    return pd.DataFrame(records).set_index(idx_col)

In [41]:
# Pull the sampled tables to the client as DataFrames indexed by uid.
max_df = fc_to_df(max_samples, 'uid')
med_df = fc_to_df(med_samples, 'uid')

In [42]:
# Bare expression: displays the sampled max-composite table.
max_df

Unnamed: 0_level_0,B11_max,B12_max,B1_max,B2_max,B3_max,B4_max,B5_max,B6_max,B7_max,B8A_max,B8_max,B9_max
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Atqasuk_Environmental_Data_0,2679,1891,943,681,803,1090,1527,2234,2553,2822,2740,3404
Atqasuk_Environmental_Data_1,2814,2030,7295,7168,6548,6364,7398,6478,6150,5485,5592,6366
Atqasuk_Environmental_Data_2,2818,1826,600,779,941,1050,1469,2210,2480,2711,2636,3501
Atqasuk_Environmental_Data_3,2233,1663,2464,713,892,1254,1387,1787,2024,2312,2262,4370
Atqasuk_Environmental_Data_4,2928,2386,645,588,736,955,1275,1920,2130,2450,2098,2453
...,...,...,...,...,...,...,...,...,...,...,...,...
arctic_network_environmental_data_235,3054,1846,735,893,976,1150,1587,2297,2704,3022,2956,3097
arctic_network_environmental_data_236,2805,1736,900,908,1132,1098,1605,2663,3071,3381,3450,3098
arctic_network_environmental_data_237,1390,1070,921,1100,1056,904,1182,1413,1530,1425,2002,1711
arctic_network_environmental_data_376,2494,1991,5762,6196,6328,6528,6700,6570,6616,6563,6600,6633


In [43]:
# Write both sampled tables to CSV; index=True keeps the uid index as a column.
max_df.to_csv('/mnt/poseidon/remotesensing/arctic/data/training/huc190604_s2sr_summer_max_2019.csv', index=True)
med_df.to_csv('/mnt/poseidon/remotesensing/arctic/data/training/huc190604_s2sr_summer_med_2019.csv', index=True)

In [43]:
import regex as re
# For every uid keep its first run of non-digit characters, with surrounding
# underscores removed. A name that itself contains a digit is cut at that digit.
locs = np.array([
    re.findall(r'\D+', uid)[0].strip('_')
    for uid in max_df.index.tolist()
])

In [44]:
# List the distinct survey prefixes present among the sampled points.
np.unique(locs)

array(['Atqasuk_Environmental_Data',
       'Barrow_NGEE_Arctic_Environmental_Data',
       'Barrow_Tundra_IBP_Environmental_Data_Repeat_Plots',
       'Barter_Barrow_Environmental_Data',
       'Frost_Boils_Environmental_Data',
       'Happy_Valley_Environmental_Data',
       'Imnavait_Creek_Environmental_Data',
       'North_Slope_Veg_Plots_Environmental_Data',
       'Poplar_Veg_Plots_Environmental', 'Prudhoe_Bay_Environmental_Data',
       'Toolik_Lake_Environmental_Data',
       'arctic_network_environmental_data', 'atlas-',
       'flux_tower_zona_enivronmental_data', 'oumalik_environmental_data',
       'spine_rd_prudhoe_bay_veg_plots_environmental',
       'tundra_fire_veg_plots_environmental_data'], dtype='<U49')

## 4. Clean up PFT PCover dataset

### 4.1. Select PFT PCover data for POI

In [172]:
# select our points of interest
# Left-join the full attribute table onto the sampled points by uid, then drop
# the columns that hold no value for any of those points.
attributes_by_uid = full_data.set_index('uid')
obs_data = med_df.join(attributes_by_uid).dropna(axis=1, how='all') # remove all null cols
original_cols = obs_data.columns.tolist()

### 4.2 Extract all column names

In [173]:
import distance
from sklearn.cluster import AffinityPropagation

In [174]:
def levenshtein(texts):
    '''
    Pairwise negative Levenshtein distance between all texts.
    - Returned as a float matrix; distances are multiplied by -1 so that
      larger values mean "more similar".
    '''
    texts = np.asarray(texts)
    rows = []
    for w2 in texts:
        rows.append([distance.levenshtein(list(w1), list(w2)) for w1 in texts])
    negated = -1 * np.array(rows)
    return negated.astype(float)

In [175]:
def text_clustering(texts, similarity=levenshtein, word_level=False):
    '''Cluster texts with affinity propagation on a precomputed similarity matrix.

    texts      -- iterable of strings
    similarity -- callable taking the texts and returning a square similarity
                  matrix (defaults to the negative Levenshtein distance)
    word_level -- if True, each text is split on whitespace first so the
                  similarity is computed over word lists

    Returns (fitted AffinityPropagation, similarity matrix).
    '''
    if word_level:
        texts = [t.split() for t in texts]

    # Use the caller-supplied similarity function; previously the argument was
    # ignored and levenshtein was always called.
    _similarity = similarity(texts)
    _affprop = AffinityPropagation(affinity="precomputed", damping=0.9, verbose=True,
        random_state=0, max_iter=1_000, convergence_iter=10)
    _affprop.fit(_similarity)

    return _affprop, _similarity


def print_clusters(affprop, texts):
    '''Print every cluster's exemplar and unique members.

    Returns the string 'Only few clusters - Stopped' (printing only the two
    summary lines) when fewer than two clusters exist; otherwise returns None.
    '''
    texts = np.asarray(texts)
    labels = affprop.labels_
    cluster_ids = np.unique(labels)
    n_clusters = cluster_ids.shape[0]

    print(f'\n~ Number of texts:: {texts.shape[0]}')
    print(f'~ Number of clusters:: {n_clusters}')
    if n_clusters < 2:
        return 'Only few clusters - Stopped'

    for cluster_id in cluster_ids:
        exemplar = texts[affprop.cluster_centers_indices_[cluster_id]]
        members = np.unique(texts[labels == cluster_id])
        members_str = '";\n  "'.join(members)
        print(f'\n# Cluster ({cluster_id}) with ({len(members)}) elements')
        print(f'Exemplar:: {exemplar}')
        print(f'\nOthers::\n  "{members_str}"')

In [176]:
# texts = original_cols
# affprop, _ = text_clustering(texts, similarity=levenshtein)
# print_clusters(affprop, texts)

### 4.3. Extract PFT ancillary column names

In [260]:
# select which columns contain size and date information
# A column counts as a size column if its name contains any of these fragments.
size_keys = ('releve_area', 'plot_size', 'plot_radius', 'releve_measurement')
size_cols = [col for col in obs_data.columns
             if any(key in col for key in size_keys)]
date_cols = [col for col in obs_data.columns if 'date' in col]

In [261]:
# select size and date column data
misc_cols = size_cols + date_cols
wanted_cols = obs_data.columns.intersection(misc_cols)
# Re-use the uid index of obs_data for the selected sub-table.
misc_data = obs_data[wanted_cols].set_index(obs_data.index)

#### 4.3.1. Clean PFT ancillary column names

In [262]:
# create new ancillary columns
# Every date-like column is renamed 'date', every releve/plot column 'plot_size';
# anything else keeps its name. 'date' is tested first.
new_misc_cols = [
    'date' if 'date' in name
    else 'plot_size' if ('releve' in name or 'plot' in name)
    else name
    for name in misc_data.columns.tolist()
]

In [263]:
# create clean ancillary column dataframe
# Same values and index as misc_data, relabelled with the (possibly repeated)
# cleaned column names.
misc_values = np.array(misc_data.to_numpy())
new_misc = pd.DataFrame(misc_values,
                        columns=new_misc_cols,
                        index=misc_data.index)

In [264]:
# extract year from date column
# Collapse same-named columns to the first non-null value in each row.
# Grouping the transpose by its index gives the same result as
# groupby(columns, axis=1) without relying on the axis argument, which newer
# pandas releases deprecate.
new_misc = new_misc.T.groupby(level=0).first().T
# The year is taken as the first four characters of the date's string form.
new_misc['year'] = new_misc['date'].astype(str).str[:4]

#### 4.3.2. Standardize plot size values

In [265]:
# fix plot size for each survey
# 'id' is the leading run of non-digit characters of each uid (the survey prefix).
new_misc['id'] = new_misc.index.astype(str).str.extract(r"(\D+)", expand=False)
# NOTE: reduced_df is a GroupBy object keyed by that prefix, not a DataFrame.
reduced_df = new_misc.groupby('id')

In [266]:
# defs to fix certain data values
from functools import reduce
def str_to_m2(string):
    """Turn a free-text plot size such as '10 x 10 m' into an area.

    Every stand-alone number in the text is multiplied together. Numbers glued
    to letters (e.g. the '2' in 'm2') are ignored because of the word
    boundaries in the pattern. Decimal numbers are read as one value; before,
    '2.5' was split into 2 and 5 and a float such as 100.0 became 100 * 0.

    Returns an int when all numbers are integers, a float when any number has
    a decimal part, and NaN when the text contains no number.
    """
    tokens = re.findall(r'\b\d+(?:\.\d+)?\b', str(string))
    if not tokens:
        return np.nan
    area = 1
    for token in tokens:
        area *= float(token) if '.' in token else int(token)
    return area

def code_to_m2(num):
    """Translate a plot-size code into an area: code 1 -> 10, code 2 -> 1.

    Any other code, and any value that cannot be converted to an integer
    (NaN, None, non-numeric text), yields NaN instead of raising, so a single
    missing code does not abort a whole-column apply.
    """
    try:
        code = int(num)
    except (TypeError, ValueError, OverflowError):
        return np.nan
    if code == 1:
        return 10
    if code == 2:
        return 1
    return np.nan

In [267]:
# fix data values for certain surveys
# Surveys are addressed by their group key (the non-digit uid prefix stored in
# 'id') instead of by position in the group list, so a survey entering or
# leaving the AOI cannot shift a fix onto the wrong group. A key that is not
# present raises KeyError from get_group rather than silently patching another
# survey.
# Each entry maps column -> fix; a callable is applied to the existing column
# values, anything else is assigned as a constant.
survey_fixes = {
    # barter
    'Barter_Barrow_Environmental_Data_': {'plot_size': str_to_m2},
    # happy valley
    'Happy_Valley_Environmental_Data_': {'plot_size': str_to_m2},
    # north slope: according to ancillary
    'North_Slope_Veg_Plots_Environmental_Data_': {'plot_size': 100},
    # poplar veg: according to ancillary
    'Poplar_Veg_Plots_Environmental_': {'plot_size': 100},
    # prudhoe
    'Prudhoe_Bay_Environmental_Data_': {'plot_size': code_to_m2, 'year': str(1975)},
    # toolik: no plot size data
    'Toolik_Lake_Environmental_Data_': {'plot_size': np.nan},
    # flux: according to ancillary
    'flux_tower_zona_enivronmental_data_': {'plot_size': 1},
    # spine rd: according to ancillary, first 100 are 1m2, final 100 are 5m2;
    # this aoi only has first 29 points, so 1m2 it is.
    'spine_rd_prudhoe_bay_veg_plots_environmental_': {'plot_size': 1},
    # tundra: some -1 present for some reason
    'tundra_fire_veg_plots_environmental_data_': {'plot_size': 1},
}

fixed_grps = []
for survey_id, column_fixes in survey_fixes.items():
    # Work on a copy so the assignments never write into a view of new_misc.
    grp = reduced_df.get_group(survey_id).copy()
    for column, fix in column_fixes.items():
        grp[column] = grp[column].apply(fix) if callable(fix) else fix
    print(survey_id)
    fixed_grps.append(grp)

# Surveys without a fix are carried over unchanged, in group-key order.
for survey_id in list(reduced_df.groups):
    if survey_id not in survey_fixes:
        fixed_grps.append(reduced_df.get_group(survey_id))

Barter_Barrow_Environmental_Data_
Happy_Valley_Environmental_Data_
North_Slope_Veg_Plots_Environmental_Data_
Poplar_Veg_Plots_Environmental_
Prudhoe_Bay_Environmental_Data_
Toolik_Lake_Environmental_Data_
flux_tower_zona_enivronmental_data_
spine_rd_prudhoe_bay_veg_plots_environmental_
tundra_fire_veg_plots_environmental_data_


In [268]:
# finish up
# Stack the per-survey groups, drop the helper columns and fix the dtypes.
ancillary = (
    pd.concat(fixed_grps)
    .drop(columns=['date', 'id'])
    .astype({'plot_size': 'float32', 'year': 'int32'})
    .fillna(np.nan)
)

In [285]:
# add geometry
# Index-based join with poi (uid-indexed); column names that already exist in
# ancillary get the '_other' suffix on the poi side.
ancillary = ancillary.join(poi, rsuffix='_other')

In [286]:
# Write the ancillary table to CSV; index=True keeps the uid index as a column.
ancillary.to_csv('/mnt/poseidon/remotesensing/arctic/data/training/huc190604_ancillary_01.csv', index=True)

### 4.4. Extract PFT PCover column names
I feel pretty safe after looking at this that jitu got the bulk of the "cover" columns correct. Only one I questioned was "sand." Is that a cover?

In [181]:
# quickly export the original cover data
# Every column whose name contains the (lower-case) substring 'cover'.
cover_cols = list(filter(lambda col: 'cover' in col, obs_data.columns))

# select cover columns from data
selected_cover = obs_data.columns.intersection(cover_cols)
cover_data = obs_data[selected_cover].set_index(obs_data.index)

#### 4.4.1. Clean PFT PCover column names
NOTE: some columns are "moss_and_lichen" or "forb_and_graminoid" or something

In [52]:
# do some name cleanup
def _clean_cover_name(name):
    """Lower-case name, strip spaces, turn '.' into '_' and move 'cover' to the front."""
    name = name.lower().replace(' ', '').replace('.', '_')
    if 'cover' in name:
        # Remove the substring wherever it sits, trim the underscores left at
        # either end, then put a single 'cover_' prefix in front.
        name = 'cover_' + name.replace('cover', '').strip('_')
    return name

clean_cover_cols = [_clean_cover_name(col) for col in cover_cols]

In [53]:
# apply cleaned up column names
# Same values and index as cover_data, relabelled with the cleaned names.
cover_values = np.array(cover_data.to_numpy())
cleaned_data = pd.DataFrame(cover_values,
                            columns=clean_cover_cols,
                            index=cover_data.index)

In [54]:
# remove microsite
cleaned_data = cleaned_data.drop(columns='cover_of_microsite')

#### 4.4.2 Aggregate PFT PCover column names

In [55]:
# Inspect the cleaned column labels before mapping them to PFT names.
cleaned_data.columns

Index(['cover_shrub_layer', 'cover_graminoids', 'cover_forbs',
       'cover_mosses_&_liverworts', 'cover_lichen_layer',
       'cover_species_scale', 'cover_of_trees', 'cover_of_tall_shrub',
       'cover_of_low_shrub', 'cover_of_dwarf_shrub',
       ...
       'cover_pleurocarpous_bryophytes', 'cover_acrocarpous_bryophytes',
       'cover_liverworts', 'cover_surface_fragment', 'cover_needleleaf_trees',
       'cover_broadleaf_trees', 'cover_dwarf_broadleaf',
       'cover_dwarf_needleleaf', 'cover_standing_dead_trees', 'cover_tussock'],
      dtype='object', length=135)

In [56]:
# see how many values are in each column
# Columns whose name matches the regex 'shrub'; count() reports non-null cells per column.
shrubs = cleaned_data.filter(regex='shrub')
shrubs.count()

cover_shrub_layer                  64
cover_of_tall_shrub                48
cover_of_low_shrub                 48
cover_of_dwarf_shrub               48
cover_of_prostrate_dwarf_shrub     48
cover_tall_shrubs                 183
cover_low_shrubs                  212
cover_erect_dwarf-shrubs           33
cover_prostrate_dwarf-shrubs       33
cover_erect_dwarf_shrub           117
cover_prostrate_dwarf_shrub       117
cover_evergreen_shrub             144
cover_deciduous_shrub             144
cover_dwarf_shrub                 219
cover_low_shrub                   164
cover_deciduous_shrubs              6
cover_evergreen_shrubs              6
cover_tall_shrub                   81
cover_low_shrub                   155
cover_dwarf_shrub                 146
cover_prostrate_dwarf_shrub       148
cover_dwarf_shrubs                  9
cover_tall_shrub                  140
cover_erect_dwarf_shrubs          114
cover_prostrate_dwarf_shrubs      114
dtype: int64

In [123]:
# create new potential column names
# Ordered (keywords, PFT label) rules: the first rule with any keyword found in
# the column name wins, so the order below matters (e.g. 'dead' is tested
# before 'tree', 'lichen' before 'moss', 'evergreen' before 'shrub').
pft_rules = [
    (('dead', 'litter'), 'dead_vegetation'),
    (('graminoid', 'tussock'), 'graminoids'),
    (('lichen', 'algae'), 'lichen'),
    (('bryophyte', 'moss', 'liverwort'), 'bryophytes'),
    (('forb', 'horsetail', 'seedless'), 'forbs'),
    (('tree',), 'trees'),
    (('evergreen',), 'evergreen_shrubs'),
    (('shrub', 'dwarf'), 'deciduous_shrubs'),
    (('water',), 'water'),
    (('rock', 'marl', 'frost', 'crust'), 'rocks'),
    (('soil',), 'bare_soil'),
]

new_cols = []
for ptype in cleaned_data.columns.tolist():
    # Names matching no rule are kept unchanged (extra categories).
    label = next(
        (pft for keywords, pft in pft_rules
         if any(keyword in ptype for keyword in keywords)),
        ptype,
    )
    new_cols.append(label)

In [124]:
# Same values and index as cleaned_data, relabelled with the PFT names
# (names repeat where several source columns map to one PFT).
pft_values = np.array(cleaned_data.to_numpy())
new_cover = pd.DataFrame(pft_values,
                         columns=new_cols,
                         index=cleaned_data.index)

In [125]:
# Inspect the relabelled (PFT) column names; duplicates are expected at this point.
new_cover.columns

Index(['deciduous_shrubs', 'graminoids', 'forbs', 'bryophytes', 'lichen',
       'cover_species_scale', 'trees', 'deciduous_shrubs', 'deciduous_shrubs',
       'deciduous_shrubs',
       ...
       'bryophytes', 'bryophytes', 'bryophytes', 'cover_surface_fragment',
       'trees', 'trees', 'deciduous_shrubs', 'deciduous_shrubs',
       'dead_vegetation', 'graminoids'],
      dtype='object', length=135)

### 4.5. Fix possible PFT PCover value issues
At a glance, some cover values look like fractions or dates. These have to be converted or set to NaN before same-named columns can be merged.

In [137]:
def isfloat(x):
    """Return True if float(x) succeeds, False if it raises TypeError or ValueError."""
    try:
        float(x)
    except (TypeError, ValueError):
        return False
    return True

def isint(x):
    """Return True if int(x) succeeds, False if it raises TypeError or ValueError."""
    try:
        int(x)
    except (TypeError, ValueError):
        return False
    return True

In [140]:
def determine_val(x):
    """Coerce one raw cover value to float, mapping unusable values to NaN.

    Values that float() accepts are returned as floats. Values that only
    int() accepts are converted via int first; the earlier version called
    float(x) again in that branch, which could only raise. Anything else
    (e.g. '+', 'r', '<1', None) becomes NaN, and so does the -9 placeholder
    found in some datasets.
    """
    try:
        value = float(x)
    except (TypeError, ValueError):
        try:
            value = float(int(x))
        except (TypeError, ValueError):
            #print(x) #if you want to see weird stuff
            return np.nan
    if value == -9: # get rid of pesky -9s in some datasets
        return np.nan
    return value

In [141]:
# Standardize data type and set +/r/<1 to nan (later 0).
# Element-wise conversion done column by column with Series.map, which works
# on both old and new pandas; DataFrame.applymap is deprecated in recent
# releases. Repeated column names are preserved.
new_cover = new_cover.apply(lambda column: column.map(determine_val))

In [142]:
# Bare expression: displays the numeric cover table (column names still repeat here).
new_cover

Unnamed: 0_level_0,deciduous_shrubs,graminoids,forbs,bryophytes,lichen,cover_species_scale,trees,deciduous_shrubs,deciduous_shrubs,deciduous_shrubs,...,bryophytes,bryophytes,bryophytes,cover_surface_fragment,trees,trees,deciduous_shrubs,deciduous_shrubs,dead_vegetation,graminoids
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Atqasuk_Environmental_Data_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Atqasuk_Environmental_Data_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Atqasuk_Environmental_Data_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Atqasuk_Environmental_Data_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Atqasuk_Environmental_Data_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
arctic_network_environmental_data_235,0.0,10.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arctic_network_environmental_data_236,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1
arctic_network_environmental_data_237,0.0,1.0,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
arctic_network_environmental_data_376,0.0,5.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4.6. Create final PFT PCover dataset

In [143]:
# define new DataFrame that sums/merges columns with same names together
# Missing values count as 0 cover so they do not affect the sums.
new_cover = new_cover.fillna(0)
# Grouping the transpose by its index sums same-named columns without using
# groupby(axis=1), which newer pandas releases deprecate.
pft_cover = new_cover.T.groupby(level=0).sum().T

In [144]:
# Inspect the merged column names (one column per PFT or leftover category).
pft_cover.columns

Index(['bare_soil', 'bryophytes', 'cover_scale', 'cover_species_scale',
       'cover_surface_fragment', 'dead_vegetation', 'deciduous_shrubs',
       'evergreen_shrubs', 'forbs', 'graminoids', 'lichen', 'rocks', 'trees',
       'water'],
      dtype='object')

In [145]:
# drop remaining random cover columns
leftover_cols = ['cover_scale', 'cover_species_scale', 'cover_surface_fragment', 'dead_vegetation']
pft_cover = pft_cover.drop(columns=leftover_cols)

In [146]:
# Row-wise sum of all remaining cover columns.
pft_cover['total_cover'] = pft_cover.sum(axis=1)

In [147]:
# issue with yr 2000 atqasuk data. all nan in DAAC and archive.
# drop these: keep only rows that have at least one non-zero value.
has_any_cover = (pft_cover != 0).any(axis=1)
pft_cover = pft_cover[has_any_cover]

In [148]:
# Bare expression: displays the final PFT cover table.
pft_cover

Unnamed: 0_level_0,bare_soil,bryophytes,deciduous_shrubs,evergreen_shrubs,forbs,graminoids,lichen,rocks,trees,water,total_cover
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Atqasuk_Environmental_Data_31,0.0,34.8,31.5,0.0,3.0,8.5,0.25,0.0,0.0,0.0,78.05
Atqasuk_Environmental_Data_32,0.0,17.4,0.0,0.0,27.4,41.1,1.67,0.0,0.0,0.0,87.57
Atqasuk_Environmental_Data_33,0.0,17.9,43.6,0.0,1.0,5.0,20.90,0.0,0.0,0.0,88.40
Atqasuk_Environmental_Data_34,0.0,76.3,31.1,0.0,9.2,13.9,5.20,0.0,0.0,0.0,135.70
Atqasuk_Environmental_Data_35,0.0,21.4,0.0,0.0,5.7,30.0,0.00,0.0,0.0,0.0,57.10
...,...,...,...,...,...,...,...,...,...,...,...
arctic_network_environmental_data_235,5.0,30.0,15.0,0.0,0.1,10.0,0.10,0.0,0.0,1.0,61.20
arctic_network_environmental_data_236,0.0,40.0,20.0,0.0,0.0,15.1,0.00,0.0,0.0,3.0,78.10
arctic_network_environmental_data_237,0.0,0.0,0.0,0.0,0.1,1.0,0.00,0.0,0.0,100.0,101.10
arctic_network_environmental_data_376,38.0,3.0,0.0,0.0,4.0,5.0,50.00,0.0,0.0,0.0,100.00


In [149]:
# Write the final PFT cover table to CSV; index=True keeps the uid index as a column.
pft_cover.to_csv('/mnt/poseidon/remotesensing/arctic/data/training/huc190604_pft_cover_test_01.csv', index=True)