In [1]:
import datetime
import getpass
import pathlib

import dcachefs
import matplotlib.pyplot as plt
import pandas as pd

# AHN3 processing: extracting computing time and intermediate product sizes

In this notebook we extract information such as computing times and storage space requirements for all the steps carried out to process the AHN3 dataset. The input LAZ files, the intermediate data products, the final rasterized features as well as the log files containing information about each step's computing time are all stored on the [SURF dCache storage](http://doc.grid.surfsara.nl/en/stable/Pages/Service/system_specifications/dcache_specs.html). The storage is accessed via a macaroon (bearer-token authentication) - see [here](http://doc.grid.surfsara.nl/en/latest/Pages/Advanced/storage_clients/webdav.html#sharing-data-with-macaroons) for more information.

In order to run this notebook the following packages are required (uncomment and run the following cell in order to install the packages):

In [2]:
# %pip install pandas matplotlib dcachefs

In [3]:
# input cell: connection settings and paths used by the rest of the notebook
API_URL = "https://dcacheview.grid.surfsara.nl:22880/api/v1"  # passed as `api_url` to dcachefs.dCacheFileSystem
WEBDAV_URL = "https://webdav.grid.surfsara.nl:2880"  # passed as `webdav_url` to dcachefs.dCacheFileSystem
TOKEN_FILEPATH = "./macaroon.dat"  # plain-text file from which the access token (macaroon) is read
DCACHE_ROOT_PATH = "/pnfs/grid.sara.nl/data/projects.nl/eecolidar"  # common prefix of all dCache paths listed below

In [4]:
# load the token from its plain-text file, dropping surrounding whitespace
token = pathlib.Path(TOKEN_FILEPATH).read_text().strip()

In [5]:
# setup filesystem object: every directory listing (fs.ls) and log read (fs.cat)
# in this notebook goes through this instance
fs = dcachefs.dCacheFileSystem(
    api_url=API_URL, 
    webdav_url=WEBDAV_URL,
    token=token  # token string read from TOKEN_FILEPATH
)

In [6]:
# validate connection: the assertion fails when the listing of the root path is
# empty (a failing fs.ls call would presumably raise before the assert is
# evaluated - TODO confirm against dcachefs)
assert fs.ls(DCACHE_ROOT_PATH, detail=False)

In [7]:
def read_log_file(filepath):
    """
    Fetch a log file from dCache and return its content as text.

    The object returned by ``fs.cat(filepath)`` is decoded with its
    ``decode()`` method using the default codec.
    """
    raw = fs.cat(filepath)
    return raw.decode()
    
    
def extract_time(text):
    """
    Extract execution time from the text of log file.

    The first two whitespace-separated tokens of the first and of the last
    line are joined and truncated to 19 characters ("YYYY-MM-DD HH:MM:SS"),
    then parsed as datetimes. The difference between the two is returned.

    :param text: full content of the log file
    :return: elapsed time between first and last line, in whole seconds (int)
    :raises IndexError: if `text` contains no lines
    :raises ValueError: if the first or last line does not start with an
        ISO-formatted date and time
    """
    lines = text.splitlines()

    # timestamps are cut to second resolution (sub-second part is dropped)
    start_stamp = " ".join(lines[0].split()[:2])[:19]
    start_time = datetime.datetime.fromisoformat(start_stamp)

    end_stamp = " ".join(lines[-1].split()[:2])[:19]
    end_time = datetime.datetime.fromisoformat(end_stamp)

    # total_seconds() accounts for the `days` component of the timedelta;
    # the `.seconds` attribute alone would wrap around for runs >= 24 hours
    return int((end_time - start_time).total_seconds())


def extract_times_per_feature(text):
    """
    Extract execution time from a log file for each of the 
    features.

    A line is considered a timing line when its 8th whitespace-separated
    token is "Extracting" and its last token is "seconds". The second-last
    token is parsed as the time, and the tokens from the 10th up to (but
    excluding) the third-last form the feature label, from which double
    quotes, single quotes and square brackets are removed.

    Lines with fewer than 8 tokens (e.g. blank lines) are skipped.

    :param text: full content of the log file
    :return: dictionary mapping feature label -> time in seconds (float)
    :raises ValueError: if the second-last token of a timing line is not a
        number
    """
    timing = {}
    for line in text.splitlines():
        tokens = line.split()
        # too short to have a token at position 7: cannot be a timing line
        if len(tokens) < 8:
            continue
        if tokens[7] == "Extracting" and tokens[-1] == "seconds":
            seconds = float(tokens[-2])
            label = " ".join(tokens[9:-3])
            # strip list/quote decoration, e.g. "['a', 'b']" -> a, b
            for char in ("\"", "[", "]", "'"):
                label = label.replace(char, "")
            timing[label] = seconds
    return timing


def extract_times_from_log_files(log_files, feature_breakdown=False):
    """
    Extract execution time from a list of log files. 
    Optionally extract partial timings for the computation of features

    Each log file is read with `read_log_file`; its total time is computed
    with `extract_time` and, if requested, the per-feature timings with
    `extract_times_per_feature`.

    :param log_files: iterable of dCache paths of the log files
    :param feature_breakdown: if True, add one column per feature label
    :return: pandas DataFrame with timings in seconds. The index (named
        "id") is the stem of each log file path; the "total" column holds
        the overall time. For an empty `log_files` an empty DataFrame with
        the "total" column and the "id" index is returned.
    """
    records = []
    for log_file in log_files:
        txt = read_log_file(log_file)
        record = {"path": log_file, "total": extract_time(txt)}
        if feature_breakdown:
            record.update(extract_times_per_feature(txt))
        records.append(record)

    # without records there is no "path" column to derive the index from
    if not records:
        return pd.DataFrame(columns=["total"], index=pd.Index([], name="id"))

    time_df = pd.DataFrame(records)
    # index rows by the file name without directory and extension
    idx = time_df["path"].apply(lambda x: pathlib.Path(x).stem).rename("id")
    return time_df.set_index(idx).drop(columns="path")

## Raw AHN3 LAZ files 

Extract size of the raw AHN3 files.

In [8]:
ahn3_path = f"{DCACHE_ROOT_PATH}/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323"

In [9]:
files = fs.ls(ahn3_path)

In [10]:
# keep only the listing entries whose name ends in ".laz" (case-insensitive)
ahn3_files = [
    entry
    for entry in files
    if entry["name"].lower().endswith(".laz")
]

In [11]:
# table of the LAZ entries, indexed by file stem
ahn3_size_df = pd.DataFrame(ahn3_files)
# divide sizes by 2**30 ("GB" in this notebook; assumes `size` is in bytes - TODO confirm)
ahn3_size_df = ahn3_size_df.assign(size=ahn3_size_df["size"] / 2**30)
idx = ahn3_size_df["name"].apply(lambda x: pathlib.Path(x).stem).rename("id")
ahn3_size_df = ahn3_size_df.set_index(idx)

In [12]:
ahn3_size_df["size"]

id
C_38EZ2    1.685300
C_26DZ1    1.284083
C_26HN1    2.150178
C_32GZ2    1.832610
C_26DZ2    2.480466
             ...   
C_16AZ1    0.894607
C_16AZ2    1.154814
C_44CN1    1.595681
C_44CN2    1.182035
C_35AZ1    0.334504
Name: size, Length: 1367, dtype: float64

In [13]:
# AHN3 files - file size statistics (in GB)
ahn3_size_df["size"].describe()

count    1367.000000
mean        1.747396
std         0.927175
min         0.000273
25%         1.215486
50%         1.721688
75%         2.193313
max         5.995163
Name: size, dtype: float64

In [14]:
# save to CSV file
ahn3_size_df["size"].sort_index().to_csv("size-GB_ahn3-files.csv")

## Retiling 

In [15]:
retiling_path = f"{DCACHE_ROOT_PATH}/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_retiled"

In [16]:
files = fs.ls(retiling_path)

In [17]:
# collect the paths of all entries whose name ends in ".log" (case-insensitive)
retiling_log_files = [
    entry["name"]
    for entry in files
    if entry["name"].lower().endswith(".log")
]
len(retiling_log_files)

1367

In [18]:
retiling_time_df = extract_times_from_log_files(retiling_log_files)
retiling_time_df

Unnamed: 0_level_0,total
id,Unnamed: 1_level_1
C_69BN2,1262
C_67DN1,41
C_35AN2,389
C_08DZ2,276
C_01DZ2,453
...,...
C_51FZ2,1013
C_17DZ1,856
C_08CN2,698
C_27AN1,905


In [19]:
# retiling - execution time statistics (in seconds)
retiling_time_df.describe()

Unnamed: 0,total
count,1367.0
mean,837.79444
std,593.002734
min,2.0
25%,557.5
50%,762.0
75%,992.0
max,5940.0


In [20]:
# save to CSV file
retiling_time_df.sort_index().to_csv("time-seconds_retiling.csv")

In [21]:
# count the entries of type "directory" in the listing (tiles after retiling)
tile_paths = [
    entry["name"]
    for entry in files
    if entry["type"] == "directory"
]
len(tile_paths)

37457

## Normalization

In [22]:
normalization_path = f"{DCACHE_ROOT_PATH}/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_normalized"

In [23]:
files = fs.ls(normalization_path)

In [24]:
# collect the paths of all entries whose name ends in ".log" (case-insensitive)
normalization_log_files = [
    entry["name"]
    for entry in files
    if entry["name"].lower().endswith(".log")
]
len(normalization_log_files)

37457

In [25]:
normalization_time_df = extract_times_from_log_files(normalization_log_files)
normalization_time_df

Unnamed: 0_level_0,total
id,Unnamed: 1_level_1
tile_126_156,37
tile_126_157,11
tile_127_152,56
tile_127_153,21
tile_127_154,140
...,...
tile_390_366,135
tile_390_367,79
tile_390_368,46
tile_390_369,27


In [26]:
# normalization - execution time statistics (in seconds)
normalization_time_df.describe()

Unnamed: 0,total
count,37457.0
mean,222.997704
std,73.40803
min,9.0
25%,185.0
50%,214.0
75%,254.0
max,1099.0


In [27]:
# save to CSV file
normalization_time_df.sort_index().to_csv("time-seconds_normalization.csv")

In [28]:
# table of the entries whose name ends in ".laz" (case-insensitive), indexed by file stem
normalization_size_df = pd.DataFrame(
    [entry for entry in files if entry["name"].lower().endswith(".laz")]
)
# divide sizes by 2**30 ("GB" in this notebook; assumes `size` is in bytes - TODO confirm)
normalization_size_df = normalization_size_df.assign(
    size=normalization_size_df["size"] / 2**30
)
idx = normalization_size_df["name"].apply(lambda x: pathlib.Path(x).stem).rename("id")
normalization_size_df = normalization_size_df.set_index(idx)

In [29]:
normalization_size_df["size"]

id
tile_126_156    0.018798
tile_126_157    0.000278
tile_127_152    0.013544
tile_127_153    0.005760
tile_127_154    0.086497
                  ...   
tile_390_366    0.034329
tile_390_367    0.025124
tile_390_368    0.004907
tile_390_369    0.000009
tile_391_363    0.002674
Name: size, Length: 37457, dtype: float64

In [30]:
# normalization - file size statistics (in GB)
normalization_size_df["size"].describe()

count    3.745700e+04
mean     1.508583e-01
std      7.533979e-02
min      6.072223e-07
25%      1.090739e-01
50%      1.344187e-01
75%      1.784685e-01
max      1.162908e+00
Name: size, dtype: float64

In [31]:
# save to CSV file
normalization_size_df["size"].sort_index().to_csv("size-GB_normalization-output.csv")

## Feature extraction - all points

In [32]:
features_all_path = f"{DCACHE_ROOT_PATH}/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_targets_all"

In [33]:
files = fs.ls(features_all_path)

In [34]:
# collect the paths of all entries whose name ends in ".log" (case-insensitive)
features_all_log_files = [
    entry["name"]
    for entry in files
    if entry["name"].lower().endswith(".log")
]
len(features_all_log_files)

37457

In [35]:
features_all_time_df = extract_times_from_log_files(features_all_log_files, feature_breakdown=True)

In [36]:
features_all_time_df

Unnamed: 0_level_0,total,point_density,pulse_penetration_ratio,density_absolute_mean_normalized_height
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tile_344_350,251,1.34,41.75,41.00
tile_240_321,227,1.32,32.80,33.54
tile_340_359,221,1.19,29.92,30.27
tile_219_245,314,1.80,47.82,49.61
tile_159_196,76,0.20,3.16,3.40
...,...,...,...,...
tile_277_217,245,1.23,38.92,39.90
tile_296_282,222,1.46,33.77,32.20
tile_274_218,179,0.97,23.41,24.44
tile_294_283,227,1.46,33.51,32.41


In [37]:
# feature extraction for all points - execution time statistics (in seconds)
features_all_time_df.describe()

Unnamed: 0,total,point_density,pulse_penetration_ratio,density_absolute_mean_normalized_height
count,37457.0,37457.0,37457.0,37457.0
mean,276.504365,1.510156,39.166373,40.163299
std,98.427831,0.681952,17.429325,18.202233
min,48.0,0.06,0.01,0.03
25%,217.0,1.16,29.91,30.36
50%,256.0,1.38,36.25,36.96
75%,325.0,1.75,46.02,47.27
max,1431.0,12.43,287.67,300.41


In [38]:
# save to CSV file
features_all_time_df.sort_index().to_csv("time-seconds_features_all-points.csv")

## Feature extraction - vegetation points

In [39]:
features_veg_path = f"{DCACHE_ROOT_PATH}/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_targets_veg"

In [40]:
files = fs.ls(features_veg_path)

In [41]:
# collect the paths of all entries whose name ends in ".log" (case-insensitive)
features_veg_log_files = [
    entry["name"]
    for entry in files
    if entry["name"].lower().endswith(".log")
]
len(features_veg_log_files)

37457

In [43]:
features_veg_time_df = extract_times_from_log_files(features_veg_log_files, feature_breakdown=True)

In [44]:
features_veg_time_df

Unnamed: 0_level_0,total,band_ratio_6<normalized_height,band_ratio_3<normalized_height,band_ratio_5<normalized_height<6,band_ratio_4<normalized_height<5,band_ratio_3<normalized_height<4,band_ratio_2<normalized_height<3,band_ratio_1<normalized_height<2,band_ratio_normalized_height<1,density_absolute_mean_normalized_height,entropy_normalized_height,"mean_normalized_height, std_normalized_height, coeff_var_normalized_height",kurto_normalized_height,skew_normalized_height,var_normalized_height,perc_95_normalized_height,perc_75_normalized_height,perc_50_normalized_height,perc_25_normalized_height
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
tile_380_318,128,1.73,1.31,0.85,0.85,0.84,0.86,0.86,1.38,1.73,0.82,0.22,0.76,1.11,0.16,0.21,0.22,0.21,0.20
tile_306_344,132,2.29,1.65,1.09,0.96,0.93,0.99,0.95,1.42,3.56,2.44,0.63,2.42,3.23,0.42,0.47,0.48,0.45,0.48
tile_327_368,135,1.41,1.18,0.88,0.86,0.86,0.86,0.86,1.18,5.12,1.60,0.48,1.59,2.12,0.34,0.41,0.39,0.38,0.39
tile_312_260,140,1.79,1.20,0.88,0.87,0.87,0.87,0.86,1.31,3.72,1.65,0.44,1.42,1.98,0.30,0.38,0.38,0.37,0.38
tile_289_270,307,4.61,2.83,2.47,2.32,2.34,2.32,2.64,3.02,32.83,4.54,1.54,3.50,4.52,1.36,2.08,2.15,2.18,2.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tile_294_281,283,4.22,2.97,2.26,2.28,2.25,2.29,2.25,2.85,34.80,4.57,1.70,3.42,4.32,1.46,2.13,2.08,2.09,2.06
tile_381_371,130,0.28,0.29,0.18,0.18,0.18,0.18,0.18,0.28,0.36,0.91,0.16,0.71,1.07,0.10,0.11,0.11,0.11,0.12
tile_382_372,75,0.03,0.03,0.03,0.02,0.02,0.02,0.02,0.02,0.07,0.11,0.09,0.14,0.22,0.02,0.06,0.06,0.06,0.06
tile_383_370,118,2.12,1.19,0.79,0.86,0.79,0.79,0.78,1.27,1.07,1.02,0.19,0.77,1.18,0.13,0.16,0.16,0.16,0.17


In [45]:
# feature extraction for vegetation points, part 1 - execution time statistics (in seconds)
features_veg_time_df.describe()

Unnamed: 0,total,band_ratio_6<normalized_height,band_ratio_3<normalized_height,band_ratio_5<normalized_height<6,band_ratio_4<normalized_height<5,band_ratio_3<normalized_height<4,band_ratio_2<normalized_height<3,band_ratio_1<normalized_height<2,band_ratio_normalized_height<1,density_absolute_mean_normalized_height,entropy_normalized_height,"mean_normalized_height, std_normalized_height, coeff_var_normalized_height",kurto_normalized_height,skew_normalized_height,var_normalized_height,perc_95_normalized_height,perc_75_normalized_height,perc_50_normalized_height,perc_25_normalized_height
count,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0
mean,169.801399,2.37552,1.716682,1.265703,1.258591,1.257629,1.257005,1.256564,1.734441,11.220281,2.534326,0.793532,2.064432,2.826204,0.600087,0.808412,0.802635,0.792852,0.802347
std,69.736211,1.479243,0.931172,0.740935,0.739832,0.738794,0.739076,0.738696,0.95572,13.214067,1.464235,0.64282,1.091089,1.37912,0.508892,0.760886,0.758925,0.755342,0.760354
min,56.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.01,0.0,0.0,0.0,0.04,0.04,0.04,0.04
25%,127.0,1.43,1.14,0.8,0.79,0.79,0.79,0.79,1.15,2.68,1.52,0.38,1.32,1.88,0.27,0.31,0.31,0.3,0.31
50%,151.0,2.06,1.54,1.11,1.11,1.1,1.1,1.1,1.55,6.78,2.29,0.61,1.93,2.73,0.45,0.57,0.56,0.55,0.56
75%,192.0,3.0,2.12,1.58,1.57,1.57,1.57,1.57,2.15,14.42,3.35,1.01,2.69,3.69,0.77,1.02,1.01,1.0,1.01
max,1220.0,26.06,13.55,11.13,10.6,10.58,10.07,10.55,13.48,186.29,12.91,8.74,9.39,10.65,6.75,11.22,11.19,11.07,14.86


In [46]:
# save to CSV file
features_veg_time_df.sort_index().to_csv("time-seconds_features_veg-points-1.csv")

## Feature extraction - vegetation points - additional features

In [47]:
features_veg_2_path = f"{DCACHE_ROOT_PATH}/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_targets_veg_additional-features"

In [48]:
files = fs.ls(features_veg_2_path)

In [49]:
# collect the paths of all entries whose name ends in ".log" (case-insensitive)
features_veg_2_log_files = [
    entry["name"]
    for entry in files
    if entry["name"].lower().endswith(".log")
]
len(features_veg_2_log_files)

37457

In [50]:
features_veg_2_time_df = extract_times_from_log_files(features_veg_2_log_files, feature_breakdown=True)

In [51]:
features_veg_2_time_df

Unnamed: 0_level_0,total,"eigenv_1, eigenv_2, eigenv_3, normal_vector_1, normal_vector_2, normal_vector_3, slope",band_ratio_20<normalized_height,band_ratio_5<normalized_height<20,band_ratio_normalized_height<5,median_normalized_height,"max_normalized_height, min_normalized_height, range_normalized_height",sigma_z,point_density
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
tile_260_386,215,7.83,2.70,1.92,2.65,1.34,0.78,7.19,0.73
tile_266_198,272,7.83,2.73,1.89,2.45,1.77,1.12,10.94,1.67
tile_266_181,212,3.69,1.37,1.02,1.52,1.00,0.50,3.38,0.40
tile_270_270,179,3.18,1.19,0.75,1.14,0.20,0.07,2.01,0.10
tile_270_269,179,2.21,0.81,0.52,0.86,0.17,0.47,2.02,0.09
...,...,...,...,...,...,...,...,...,...
tile_388_367,325,5.32,1.58,1.03,1.50,0.12,0.05,1.84,0.10
tile_385_330,285,0.97,0.02,0.01,0.01,0.01,0.01,2.49,0.10
tile_384_371,285,1.28,0.48,0.30,0.41,0.30,0.11,2.02,0.08
tile_385_331,307,2.07,0.71,0.43,0.68,0.06,0.03,1.90,0.07


In [52]:
# feature extraction for vegetation points, part 2 - execution time statistics (in seconds)
features_veg_2_time_df.describe()

Unnamed: 0,total,"eigenv_1, eigenv_2, eigenv_3, normal_vector_1, normal_vector_2, normal_vector_3, slope",band_ratio_20<normalized_height,band_ratio_5<normalized_height<20,band_ratio_normalized_height<5,median_normalized_height,"max_normalized_height, min_normalized_height, range_normalized_height",sigma_z,point_density
count,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0,37457.0
mean,278.468697,5.398221,1.856082,1.32551,1.811634,0.920778,0.554372,4.222949,0.456549
std,1916.537248,3.38262,1.061543,0.819839,1.040261,0.706052,0.541779,2.657795,0.468378
min,13.0,0.05,0.0,0.0,0.0,0.0,0.01,1.61,0.05
25%,163.0,3.38,1.22,0.82,1.18,0.47,0.2,2.57,0.16
50%,230.0,4.59,1.63,1.14,1.6,0.73,0.38,3.36,0.3
75%,306.0,6.48,2.24,1.62,2.22,1.16,0.7,4.84,0.56
max,83123.0,47.9,14.51,9.96,14.89,7.82,7.89,36.32,7.31


In [53]:
# save to CSV file
features_veg_2_time_df.sort_index().to_csv("time-seconds_features_veg-points-2.csv")

## Geotiff creation - all points

In [54]:
geotiff_all_path = f"{DCACHE_ROOT_PATH}/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_geotiff_all"

In [55]:
files = fs.ls(geotiff_all_path)

In [56]:
# collect the paths of all entries whose name ends in ".log" (case-insensitive)
geotiff_all_log_files = [
    entry["name"]
    for entry in files
    if entry["name"].lower().endswith(".log")
]
len(geotiff_all_log_files)

3

In [57]:
geotiff_all_time_df = extract_times_from_log_files(geotiff_all_log_files)
geotiff_all_time_df

Unnamed: 0_level_0,total
id,Unnamed: 1_level_1
point_density,20205
density_absolute_mean_normalized_height,20313
pulse_penetration_ratio,21106


In [58]:
# geotiff creation for all-point based features - execution time statistics (in seconds)
geotiff_all_time_df.describe()

Unnamed: 0,total
count,3.0
mean,20541.333333
std,491.988143
min,20205.0
25%,20259.0
50%,20313.0
75%,20709.5
max,21106.0


In [59]:
# save to CSV file
geotiff_all_time_df.sort_index().to_csv("time-seconds_geotiff_all-points.csv")

In [60]:
# table of the entries whose name ends in ".tif" (case-insensitive), indexed by file stem
geotiff_all_size_df = pd.DataFrame(
    [entry for entry in files if entry["name"].lower().endswith(".tif")]
)
# divide sizes by 2**30 ("GB" in this notebook; assumes `size` is in bytes - TODO confirm)
geotiff_all_size_df = geotiff_all_size_df.assign(
    size=geotiff_all_size_df["size"] / 2**30
)
idx = geotiff_all_size_df["name"].apply(lambda x: pathlib.Path(x).stem).rename("id")
geotiff_all_size_df = geotiff_all_size_df.set_index(idx)

In [61]:
geotiff_all_size_df["size"]

id
ahn3_feat_10m_1m_all_TILE_000_BAND_point_density                              0.887047
ahn3_feat_10m_1m_all_TILE_000_BAND_density_absolute_mean_normalized_height    0.822255
ahn3_feat_10m_1m_all_TILE_000_BAND_pulse_penetration_ratio                    0.974467
Name: size, dtype: float64

In [62]:
# geotiff creation for all-point based features - file size statistics (in GB)
geotiff_all_size_df["size"].describe()

count    3.000000
mean     0.894590
std      0.076386
min      0.822255
25%      0.854651
50%      0.887047
75%      0.930757
max      0.974467
Name: size, dtype: float64

In [63]:
# save to CSV file
geotiff_all_size_df["size"].sort_index().to_csv("size-GB_geotiff-all-output.csv")

## Geotiff creation - vegetation points

In [64]:
geotiff_veg_path = f"{DCACHE_ROOT_PATH}/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323_geotiff_veg/"

In [65]:
files = fs.ls(geotiff_veg_path)

In [66]:
# collect the paths of all entries whose name ends in ".log" (case-insensitive)
geotiff_veg_log_files = [
    entry["name"]
    for entry in files
    if entry["name"].lower().endswith(".log")
]
len(geotiff_veg_log_files)

33

In [67]:
geotiff_veg_time_df = extract_times_from_log_files(geotiff_veg_log_files)
geotiff_veg_time_df

Unnamed: 0_level_0,total
id,Unnamed: 1_level_1
band_ratio_normalized_height%3C5,20412
band_ratio_1%3Cnormalized_height%3C2,20156
band_ratio_6%3Cnormalized_height,20925
skew_normalized_height,20360
std_normalized_height,20327
entropy_normalized_height,20211
mean_normalized_height,20627
perc_95_normalized_height,20368
band_ratio_3%3Cnormalized_height,20122
band_ratio_4%3Cnormalized_height%3C5,20378


In [68]:
# geotiff creation for vegetation-based features - execution time statistics (in seconds)
geotiff_veg_time_df.describe()

Unnamed: 0,total
count,33.0
mean,20683.575758
std,473.917255
min,19800.0
25%,20360.0
50%,20591.0
75%,20980.0
max,21978.0


In [69]:
# save to CSV file
geotiff_veg_time_df.sort_index().to_csv("time-seconds_geotiff_veg-points.csv")

In [70]:
# table of the entries whose name ends in ".tif" (case-insensitive), indexed by file stem
geotiff_veg_size_df = pd.DataFrame(
    [entry for entry in files if entry["name"].lower().endswith(".tif")]
)
# divide sizes by 2**30 ("GB" in this notebook; assumes `size` is in bytes - TODO confirm)
geotiff_veg_size_df = geotiff_veg_size_df.assign(
    size=geotiff_veg_size_df["size"] / 2**30
)
idx = geotiff_veg_size_df["name"].apply(lambda x: pathlib.Path(x).stem).rename("id")
geotiff_veg_size_df = geotiff_veg_size_df.set_index(idx)

In [71]:
geotiff_veg_size_df["size"]

id
ahn3_feat_10m_1m_veg_TILE_000_BAND_mean_normalized_height                     1.007954
ahn3_feat_10m_1m_veg_TILE_000_BAND_band_ratio_1%3Cnormalized_height%3C2       0.652252
ahn3_feat_10m_1m_veg_TILE_000_BAND_band_ratio_6%3Cnormalized_height           0.470933
ahn3_feat_10m_1m_veg_TILE_000_BAND_skew_normalized_height                     0.994300
ahn3_feat_10m_1m_veg_TILE_000_BAND_std_normalized_height                      0.995969
ahn3_feat_10m_1m_veg_TILE_000_BAND_entropy_normalized_height                  0.718389
ahn3_feat_10m_1m_veg_TILE_000_BAND_perc_95_normalized_height                  0.916978
ahn3_feat_10m_1m_veg_TILE_000_BAND_band_ratio_3%3Cnormalized_height           0.539743
ahn3_feat_10m_1m_veg_TILE_000_BAND_density_absolute_mean_normalized_height    0.719107
ahn3_feat_10m_1m_veg_TILE_000_BAND_band_ratio_4%3Cnormalized_height%3C5       0.496363
ahn3_feat_10m_1m_veg_TILE_000_BAND_coeff_var_normalized_height                0.956170
ahn3_feat_10m_1m_veg_TILE_000_BAND_band_

In [72]:
# geotiff creation for vegetation-based features - file size statistics (in GB)
geotiff_veg_size_df["size"].describe()

count    33.000000
mean      0.775163
std       0.236534
min       0.208977
25%       0.549778
50%       0.753215
75%       0.995969
max       1.101807
Name: size, dtype: float64

In [73]:
# save to CSV file
geotiff_veg_size_df["size"].sort_index().to_csv("size-GB_geotiff-veg-output.csv")