# CV Fold Preparation

This notebook prepares independent cross-validation (CV) folds.



# Config

In [2]:
COLAB = False

if COLAB : 
  configSetup = {
      'COLAB'           : 'True',
      'PATH_ROOT_DRIVE' : '/content/drive/MyDrive/Projects/Forecast',
      'PATH_ROOT_LOCAL' : '/content/session',
      'PATH_SUNDL'      : '/content/sundl',
      'PATH_PROJECT'    : '/content/sundl/notebooks/flare_limits_pcnn'
  }
  !git clone https://github.com/gfrancisco20/sundl.git
  import sys
  import re
  sys.path.append(configSetup['PATH_SUNDL'])
  sys.path.append(configSetup['PATH_PROJECT'])
  configFile = f'{configSetup["PATH_PROJECT"]}/config.py'
  with open(configFile, 'r') as file:
    content = file.read()
  for constant in configSetup.keys():
    content = re.sub(re.compile(f'{constant} = .*'), f'{constant} = \'{configSetup[constant]}\'', content)
  with open(configFile, 'w') as file:
    file.write(content)
   
from config import *
from sundl.utils.colab import mountDrive
if COLAB:
  # mouting drive content in session on colab
  mountDrive()

# Libraries

In [5]:
from pathlib import Path
import datetime
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import dill as pickle

from sundl.utils.data import read_Dataframe_With_Dates

# Folds building

## Raw Balanced Folds

In [8]:
%%time
from sundl.utils.flare.thresholds import mpfTresh, totehTresh
from sundl.utils.flare.windows import windowHistoryFromFlList
from sundl.utils.flare import flux2cls
from sundl.cv.temporal import buildChunks, instantiateFoldsSequentially

labelCols     = 'sw_v'                   # column name used to compute labels
testSplitDate = datetime.datetime(2019,1,1) # splitting date between CV and operational tes
buffer        = pd.DateOffset(days=4)      # size of discarded buffer between temporal chunks (ensure folds indepedancy)
chunk_width   = pd.DateOffset(days=20)      # size of temporal chunks
bufferTest    = pd.DateOffset(days=27)
bufferCme     = pd.DateOffset(days=4)  
n_fold        = 5                           # number of folds for CV
windows_avg_h = [1,6,12]

force_rebuild = True

minDate = pd.to_datetime('2010-05-13 00:00:00')
maxDate = pd.to_datetime('2023-04-19 00:00:00')
# if dataset unzipped, from 
# from sundl.utils.data import loadMinMaxDates
# minDate, maxDate = loadMinMaxDates(PATH_IMAGES)

timeserie_sw_folds = {}
dfTest = {}

for window_h in windows_avg_h:

  chunkPath = F_PATH_CHUNKS_SW(chunk_width, buffer)
  foldsPath = F_PATH_FOLDS_SW(chunk_width, buffer, testSplitDate, window_h)
  testPath = F_PATH_TEST_SW(bufferTest, testSplitDate, window_h)
  
  swPath = F_PATH_SW_TS(window_h)

  print(minDate, maxDate)
  if foldsPath.exists() and not force_rebuild:
    chunks = read_Dataframe_With_Dates(chunkPath,['start','end'])
    dfTest[window_h] = pd.read_csv(testPath)
    with open(foldsPath, 'rb') as f1:
      timeserie_sw_folds[window_h] = pickle.load(f1)
  else:
    
    if swPath.exists():
      timeserie_sw = read_Dataframe_With_Dates(swPath,  minDate = minDate, maxDate = None)
    else:
      timeserie_sw = read_Dataframe_With_Dates(F_PATH_SW_TS(1),  minDate = minDate, maxDate = None)
      timeserie_sw = timeserie_sw[['sw_v']].rolling(f'{window_h}H', closed = 'right').mean()[int(window_h/2):]
    
    
    if foldsPath.exists():
      chunks = read_Dataframe_With_Dates(chunkPath,['start','end'])
    else:
      chunks = buildChunks(timeserie_sw,
                          chunk_width = chunk_width,
                          buffer      = buffer)
      
      chunks.to_csv(chunkPath)
      
    # Instantiate balanced folds 
    excludeDateRanges = [(date-bufferCme, date+bufferCme) for date in read_Dataframe_With_Dates(PATH_CMES).index]
    timeserie_sw_folds[window_h] , dfTest[window_h] = instantiateFoldsSequentially(
        chunks    = chunks,
        timeserie = timeserie_sw,
        n_folds = n_fold,
        testType  = 'temporal', #param [ 'folds', 'temporal' ] {type:"string"}
        testDate  = datetime.datetime(2019,1,1,0,0,0), # for temporal test split only,
        excludeDateRanges = excludeDateRanges
        )
    
    with open(foldsPath, 'wb') as f1:
      pickle.dump(timeserie_sw_folds[window_h] , f1)
    dfTest[window_h].to_csv(testPath)
    


            

2010-05-13 00:00:00 2023-04-19 00:00:00
2010-05-13 00:00:00 2023-04-19 00:00:00
2010-05-13 00:00:00 2023-04-19 00:00:00
CPU times: user 25.5 s, sys: 536 ms, total: 26 s
Wall time: 27.5 s


In [14]:
# Test frame stored under key 1 of dfTest (1 is the first averaging window of windows_avg_h).
# NOTE(review): this is an alias, not a copy -- columns added to dfTimeseries also appear in dfTest[1].
dfTimeseries = dfTest[1]
dfTimeseries

Unnamed: 0_level_0,Unnamed: 0,sw_v,sigma
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-28 00:00:00,97055,445.0,7.0
2019-01-28 02:00:00,97057,433.0,9.0
2019-01-28 04:00:00,97059,438.0,7.0
2019-01-28 06:00:00,97061,427.0,8.0
2019-01-28 08:00:00,97063,434.0,9.0
...,...,...,...
2023-01-11 14:00:00,131725,371.0,9.0
2023-01-11 16:00:00,131727,373.0,13.0
2023-01-11 18:00:00,131729,379.0,8.0
2023-01-11 20:00:00,131731,366.0,4.0


In [28]:
labelCol = 'sigma'
offLabel = 4   # label horizon, in hours
# Explicit timedelta instead of the 'H' string alias, which recent pandas versions deprecate
labelHorizon = pd.Timedelta(hours = offLabel)
# label_<offLabel> at time t is the value of `labelCol` found `offLabel` hours after t:
# the right-closed rolling window ends on the current row, its last element is taken,
# and the result is shifted back in time by the horizon before index-aligned assignment.
# raw = True hands the lambda a NumPy array, so x[-1] is plain positional indexing
# (on a Series, integer keys used as positions are deprecated).
# The last int(offLabel/2) rows of the shifted series are dropped
# (presumably tied to the 2-hour spacing visible in the displayed rows -- TODO confirm).
dfTimeseries[f'label_{offLabel}'] = dfTimeseries[labelCol].rolling(
            window = labelHorizon,
            closed = 'right', # min_periods = int(input_lag)
            ).apply(lambda x: x[-1], raw = True).shift(freq = -labelHorizon)[:-int(offLabel/2)] 
dfTimeseries#[~dfTimeseries['label_4'].isna()]

  ).apply(lambda x: x[-1]).shift(freq=f'-{offLabel}H')[:-int(offLabel/2)]


Unnamed: 0_level_0,Unnamed: 0,sw_v,sigma,label_4
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-28 00:00:00,97055,445.0,7.0,7.0
2019-01-28 02:00:00,97057,433.0,9.0,8.0
2019-01-28 04:00:00,97059,438.0,7.0,9.0
2019-01-28 06:00:00,97061,427.0,8.0,15.0
2019-01-28 08:00:00,97063,434.0,9.0,4.0
...,...,...,...,...
2023-01-11 14:00:00,131725,371.0,9.0,8.0
2023-01-11 16:00:00,131727,373.0,13.0,
2023-01-11 18:00:00,131729,379.0,8.0,
2023-01-11 20:00:00,131731,366.0,4.0,


### Plot

In [11]:
import numpy as np
window_h = 1
folds = timeserie_sw_folds[window_h]
# One row per fold, pre-filled with float zeros (the counts are therefore stored as floats)
foldsSize = pd.DataFrame(np.zeros((len(folds), 2)), columns=['count_train', 'count_val'])
# `train` and `val` intentionally stay module-level loop variables: they remain available after the loop
for k, (train, val) in enumerate(folds):
  foldsSize.at[k, 'count_train'] = len(train)
  foldsSize.at[k, 'count_val'] = len(val)
foldsSize.index.name = 'foldID'
foldsSize

Unnamed: 0_level_0,count_train,count_val
foldID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,20781.0,5386.0
1,21139.0,5028.0
2,20968.0,5199.0
3,21023.0,5144.0
4,20757.0,5410.0


In [14]:
# `train` is the loop variable left over from the fold-size loop, i.e. the training split of the last fold iterated.
train

Unnamed: 0_level_0,Unnamed: 0,sw_v,sigma
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-05-13 00:00:00,20687,376.0,7.0
2010-05-13 02:00:00,20689,389.0,8.0
2010-05-13 04:00:00,20691,447.0,6.0
2010-05-13 06:00:00,20693,452.0,10.0
2010-05-13 08:00:00,20695,488.0,9.0
...,...,...,...
2018-11-09 06:00:00,95141,429.0,5.0
2018-11-09 08:00:00,95143,429.0,10.0
2018-11-09 10:00:00,95145,461.0,8.0
2018-11-09 12:00:00,95147,467.0,3.0
