In [1]:
import sys
sys.path.append('/home/potzschf/repos/')
from helperToolz.helpsters import *
from helperToolz.evapo import *
from collections import defaultdict
import re
import itertools

# Machine switch: when True, paths get the 'Aldhani/eoagritwin/' prefix;
# otherwise no prefix is inserted.
workhorse = True

# Path fragment that later cells insert right after '/data/'.
origin = 'Aldhani/eoagritwin/' if workhorse else ''

In [None]:
# Get the raw extract files and tabulate, per year, which S2 indices and S3
# metrics are available (parsed from the file paths).
# The directory is built from `origin` like every other path in this notebook.
# getFilelist is a project helper; presumably it returns the '.parquet' paths
# below the given directory -- TODO confirm.
training_files = getFilelist(f'/data/{origin}et/Training_ML/training_data/raw_extracts/', '.parquet')

# Expected naming: '/<sensor>_<year>[_<index>]', e.g. '/S2_2017_EVI'.
#   (S2|S3)   -> sensor
#   (\d{4})   -> four-digit year
#   (?:_([A-Za-z]{3,10}))? -> optional '_<index>' suffix. '(?:' only opens a
#       non-capturing group; the trailing '?' makes the suffix optional, so a
#       name without it still matches and yields index = None.
name_pattern = re.compile(r'/(S2|S3)_(\d{4})(?:_([A-Za-z]{3,10}))?')

indices_by_year = defaultdict(set)

for file in training_files:
    match = name_pattern.search(file)
    if match:
        sensor, year, index = match.groups()
        if index:  # files without an index/metric suffix are not listed
            indices_by_year[year].add(f'{sensor}:{index}')

# Fail with a readable message instead of the KeyError('Year') that
# set_index would raise on an empty table.
if not indices_by_year:
    raise FileNotFoundError(
        'No files named like <S2|S3>_<year>_<index> were found in the raw_extracts directory'
    )

# Get all unique '<sensor>:<index>' labels across years
all_indices = sorted({index for indices in indices_by_year.values() for index in indices})

# Prepare a table: rows = years, columns = labels
table_data = []
for year in sorted(indices_by_year):
    row = {index: '✔' if index in indices_by_year[year] else 'X' for index in all_indices}
    row['Year'] = year
    table_data.append(row)

# Create and show DataFrame (later cells read the column labels of `df`)
df = pd.DataFrame(table_data)
df = df.set_index('Year')
print('\nDatasets available for training\n')
print(df)



Datasets available for training

     S2:EVI S2:NDM S2:NDV S2:TCB S2:TCG S2:TCW S3:mean S3:median
Year                                                            
2017      ✔      ✔      ✔      ✔      ✔      ✔       ✔         ✔
2018      ✔      ✔      ✔      ✔      ✔      ✔       ✔         ✔
2019      ✔      ✔      ✔      ✔      ✔      ✔       ✔         ✔
2020      ✔      ✔      ✔      ✔      ✔      ✔       ✔         ✔
2021      ✔      ✔      ✔      ✔      ✔      ✔       ✔         ✔
2022      ✔      ✔      ✔      ✔      ✔      ✔       ✔         ✔
2023      ✔      ✔      ✔      ✔      ✔      ✔       ✔         ✔
2024      ✔      ✔      ✔      ✔      ✔      ✔       ✔         ✔


In [None]:
# bring S2 together and in shape
# For every S2 index in the availability table, read all matching yearly
# extracts, tag them with their year and stack them into one long frame.
# index_conti ends up with one frame per index, in table column order.
index_conti = []

# Select the S2 columns by their 'S2:' label prefix instead of by position, so
# the result does not depend on how many S3 columns exist or where they sort.
s2_indices = [col.split(':')[-1] for col in df.columns if col.startswith('S2:')]

for index in s2_indices:
    print(index)
    # Same naming scheme the availability table was built from:
    # '/S2_<year>_<index>' with no further letter after the index. Matching
    # this instead of a bare substring keeps directory names and files of
    # other sensors out, and yields the year from the same match.
    s2_pattern = re.compile(rf'/S2_(\d{{4}})_{re.escape(index)}(?![A-Za-z])')
    temp_conti = []
    for file in training_files:
        hit = s2_pattern.search(file)
        if hit is None:
            continue
        print(file)
        dat = pd.read_parquet(file)
        dat = dat.assign(year=hit.group(1))  # year stays a string, as parsed
        # -9999 is presumably the nodata value of the extracts -- TODO confirm
        dat['S2'] = dat['S2'].replace(-9999, np.nan)
        # keep the join keys plus the value column, named after the index;
        # the extract's own 'index' column is not carried over
        dat = dat[['year', 'doy', 'row', 'col', 'S2']].rename(columns={'S2': index})
        temp_conti.append(dat)
    index_conti.append(pd.concat(temp_conti, ignore_index=True))

In [None]:
# Inner-join the per-index S2 frames one after another on row, col, doy and
# year, drop every row that still contains a missing value, and store the result.
merge_keys = ['row', 'col', 'doy', 'year']

S2_block = index_conti[0]
for i, next_frame in enumerate(index_conti[1:], start=1):
    print(i)
    S2_block = S2_block.merge(next_frame, on=merge_keys, how='inner')

S2_block = S2_block.dropna()
S2_block.to_parquet(f'/data/{origin}et/Training_ML/training_data/combined_extracts/S2_powerblock.parquet', index=False)

In [None]:
# bring S3 together and in shape
# For every S3 metric in the availability table, read all matching yearly
# extracts, tag them with their year and stack them into one long frame.
# index_conti is rebuilt here and then holds one frame per S3 metric.
index_conti = []

# Select the S3 columns by their 'S3:' label prefix instead of by position.
s3_metrics = [col.split(':')[-1] for col in df.columns if col.startswith('S3:')]

for index in s3_metrics:
    print(index)
    # Same naming scheme the availability table was built from:
    # '/S3_<year>_<metric>' with no further letter after the metric. This
    # avoids matching the metric name anywhere else in the path and yields
    # the year from the same match.
    s3_pattern = re.compile(rf'/S3_(\d{{4}})_{re.escape(index)}(?![A-Za-z])')
    temp_conti = []
    for file in training_files:
        hit = s3_pattern.search(file)
        if hit is None:
            continue
        print(file)
        dat = pd.read_parquet(file)
        dat = dat.assign(year=hit.group(1))  # year stays a string, as parsed
        # keep the join keys plus the value column, renamed 'S3_<metric>' -> '<metric>'
        dat = dat[['year', 'doy', 'row', 'col', f'S3_{index}']].rename(columns={f'S3_{index}': index})
        temp_conti.append(dat)
    index_conti.append(pd.concat(temp_conti, ignore_index=True))

In [None]:
# Inner-join the first two frames in index_conti on row, col, doy and year,
# drop rows containing missing values, and store the result.
first_metric, second_metric = index_conti[0], index_conti[1]
S3_block = first_metric.merge(second_metric, on=['row', 'col', 'doy', 'year'], how='inner').dropna()
S3_block.to_parquet(f'/data/{origin}et/Training_ML/training_data/combined_extracts/S3_powerblock.parquet', index=False)


In [None]:
# Reload both combined blocks from disk, so the cells below can run without
# rebuilding them in this session.
s2_path = f'/data/{origin}et/Training_ML/training_data/combined_extracts/S2_powerblock.parquet'
s3_path = f'/data/{origin}et/Training_ML/training_data/combined_extracts/S3_powerblock.parquet'

S2_block = pd.read_parquet(s2_path)
S3_block = pd.read_parquet(s3_path)

In [None]:
# Inner-join the S3 and S2 blocks on row, col, doy and year (S3 columns come
# first) and store the combined table.
block = S3_block.merge(S2_block, on=['row', 'col', 'doy', 'year'], how='inner')
block.to_parquet(
    f'/data/{origin}et/Training_ML/training_data/combined_extracts/Powerblock.parquet',
    index=False,
)

In [14]:
# export little files
# Derive a calendar date from year + day-of-year (doy 1 maps to 1 January),
# then the month, which the export loop uses to split the block.
year_start = pd.to_datetime(block['year'], format='%Y')
day_offset = pd.to_timedelta(block['doy'] - 1, unit='D')
block['date'] = year_start + day_offset
block['month'] = block['date'].dt.month

In [16]:
# Write one parquet file per (year, month); files already on disk are left untouched.
for year in block['year'].unique():
    print(year)
    year_rows = block[block['year'] == year]
    # only loop over months that actually occur in this year (a year may be incomplete)
    for month in year_rows['month'].unique():
        out_file = f'/data/{origin}et/Training_ML/training_data/combined_extracts_monthly/{year}_{month}.parquet'
        if os.path.isfile(out_file):
            continue
        # the helper columns used for splitting are not written out
        subset = year_rows[year_rows['month'] == month].drop(['date', 'month'], axis=1)
        subset.to_parquet(out_file)

2017
2018
2019
2020
2021
2022
2023
2024
