In [None]:
import fsspec
import pandas as pd

# Root URL of the data-lake container. It carries no trailing slash, so every
# path derived from it has to supply its own "/" separator.
root_dir = 'abfss://lakebeehaven@beehaven.dfs.core.windows.net'
# Folder that holds the parquet files to be combined. Plain concatenation
# would fuse the folder name into the host ("...windows.netgold/processing/"),
# so the separator is inserted explicitly.
source = root_dir.rstrip("/") + "/gold/processing/"

# Organize silver data by location: one accumulator per location, each
# starting out as an empty DataFrame.
silver_dfs = {site: pd.DataFrame() for site in ("schwartau", "wurzburg")}
# Empty list to hold merged DataFrames.
gold_dfs = list()

# List the entries of the source folder, then reduce every listed path to the
# text after its last "/" (the bare file name).
fs = fsspec.filesystem("abfs")
file_list_longform = fs.ls(source)
file_list = []
for long_name in file_list_longform:
    file_list.append(long_name.rsplit("/", maxsplit=1)[1])

for file in file_list:
    # Only entries that are not dot-prefixed ("hidden") are processed.
    if not file.startswith("."):
        print(f"processing {file}")
        frame = pd.read_parquet(source+file, filesystem=fs)
        # The text before the first "_" in the file name selects the location.
        site = file.split("_")[0]

        # Temperature and humidity come from hive data and from the weather
        # station: two-column frames containing one of them get a "_hive" suffix.
        # NOTE(review): the names are assigned by position, which presumes the
        # timestamp column comes first in these files - confirm.
        if frame.shape[1] == 2:
            for measurement in ("temperature", "humidity"):
                if measurement in frame.columns:
                    frame.columns = ["timestamp", f"{measurement}_hive"]
                    break

        # The first frame for a location seeds its accumulator; later frames
        # are outer-joined onto it by timestamp.
        so_far = silver_dfs[site]
        if so_far.empty:
            silver_dfs[site] = frame.copy()
        else:
            silver_dfs[site] = so_far.merge(frame, on="timestamp", how="outer")

for loc, frame in silver_dfs.items():
    # Locations whose accumulator is still empty are left untouched.
    if frame.empty:
        continue
    # Tag every row with its location and store the tag as a categorical.
    frame["location"] = loc
    frame["location"] = frame["location"].astype("category")
    # Replace the entry with a version ordered by timestamp.
    silver_dfs[loc] = frame.sort_values(by="timestamp")

# Standard column order of the gold table; selecting with this list raises a
# KeyError if any of the columns is missing from the combined data.
gold_columns = [
    'timestamp', 'location', 'flow_out', 'flow_in',
    'temperature_hive', 'humidity_hive', 'weight', 'precipitation', 'pressure_msl',
    'sunshine', 'temperature', 'wind_direction', 'wind_speed',
    'cloud_cover', 'dew_point', 'relative_humidity', 'wind_gust_direction',
    'wind_gust_speed', 'solar', 'precipitation_source_distance',
    'pressure_msl_source_distance', 'sunshine_source_distance',
    'temperature_source_distance', 'wind_direction_source_distance',
    'wind_speed_source_distance', 'cloud_cover_source_distance',
    'dew_point_source_distance', 'relative_humidity_source_distance',
    'visibility_source_distance', 'wind_gust_direction_source_distance',
    'wind_gust_speed_source_distance', 'solar_source_distance',
]

gold = (
    # combine silver DataFrames
    pd.concat((silver_dfs["schwartau"], silver_dfs["wurzburg"]), ignore_index=True)
    # set standard column order
    .loc[:, gold_columns]
    # Each per-location frame carries its own single-category "location"
    # column; concatenating categoricals whose categories differ does not
    # keep the categorical dtype, so the cast is applied on the combined frame.
    .astype({"location": "category"})
)
gold

In [None]:
# Print the DataFrame summary for the combined table as a sanity check before
# it is written out.
gold.info()

In [None]:
# write to file
# root_dir carries no trailing slash, so the separator is inserted explicitly;
# plain concatenation would fuse the folder name into the host
# ("...windows.netgold/").
sink = root_dir.rstrip("/") + "/gold/"

# The file name embeds the current local time so that each run writes a new file.
write_name = sink + f"hivedata__{pd.Timestamp.now().strftime('%Y-%m-%dT%Hh%Mm%Ss')}.parquet"
with fs.open(write_name, "wb") as f:
    # index=False: the row index is not stored in the parquet file.
    gold.to_parquet(f, index=False)


In [None]:
# Stop the notebook session once the output has been written.
# NOTE(review): mssparkutils is not imported in this notebook; presumably it is
# injected by the hosting Spark runtime - confirm before running elsewhere.
mssparkutils.session.stop()