# Import libraries

In [2]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import sys
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import plotly.colors as pc
from plotly.subplots import make_subplots
import matplotlib.dates as mdates
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split, TensorDataset
import torch.nn.functional as F
import torch.nn.init as init

import scipy
import pickle
import matplotlib.cbook as cbook
import random
from os.path import dirname, join as pjoin
import copy
# Project-local helpers. Presumably the source of cloudcondition_splitter,
# which the split cell calls -- TODO confirm against cloudsplitter.py.
# (The previous `import cloudsplitter import *` was not valid Python.)
from cloudsplitter import *

# def report_savefig(plotname, resultfolderpath = './', formattype = 'pdf', pad_margin = 0.01):
#     fig.savefig(resultfolderpath + plotname + f'.{formattype}',format=formattype, 
#                 bbox_inches='tight',pad_inches=pad_margin,transparent=True)

In [12]:
# Read the 15-minute measurement table (indexed by its 'Datetime' column)
# and the per-site metadata table.
data = pd.read_csv(
    'GHI_CI_NCEP_Iclr_15min_DeDe2023_newiclr.csv',
    index_col='Datetime',
    parse_dates=['Datetime'],
)
metadata = pd.read_csv('DeDe_48sites_metadata.csv')
# data.dropna(inplace=True)

# Negative values of 'I' are replaced by zero.
data.loc[data['I'].lt(0), 'I'] = 0

# Hour of day taken from the timestamp index.
data['HI'] = data.index.hour

# Drop the unused raw / alternative cloud-index columns, then give the
# remaining cloud-index and NWP columns their short names.
data = (
    data
    .drop(columns=['rawI', 'CI_CM', 'CI_CM_interpolated', 'CI_RGB'])
    .rename(columns={'CI_RGB_interpolated': 'CI_R',
                     'Short-wave irradiation': 'Inwp'})
)

# Rescale CI_R by 255 (presumably mapping 8-bit values onto [0, 1] -- confirm).
data['CI_R'] = data['CI_R'].div(255)

# The weather columns are not used downstream in this notebook section.
data = data.drop(columns=[
    'Temperature', 'Relative Humidity', 'Pressure', 'Wind speed',
    'Wind direction', 'Rainfall', 'Snowfall', 'Snow depth',
])
data.head()

Unnamed: 0_level_0,Site_id,I,CI_R,Inwp,Iclr,HI
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01,1,0.0,,,0.0,0
2023-01-01,3,0.0,,,0.0,0
2023-01-01,4,0.0,,,0.0,0
2023-01-01,5,0.0,,,0.0,0
2023-01-01,6,0.0,,,0.0,0


In [13]:
# Build the modelling table `df_select`: for every site, attach lagged copies
# of 'I' and 'CI_R' plus step-ahead copies of the future regressors and of the
# target 'I', then stack all sites and sort by timestamp and site.
site_ids = sorted(data['Site_id'].unique())
columns_to_ahead = ['Iclr', 'Inwp', 'HI']

numautolag = 4      # number of 15-minute lags of the target 'I'
numcilag = 4        # number of 15-minute lags of 'CI_R'
numstepahead = 4    # number of 15-minute steps ahead (regressors and target)

# '15min' is the supported spelling of the minute offset; the former '15T'
# alias is deprecated since pandas 2.2.
step_freq = '15min'

# Collect the per-site frames and concatenate once at the end: concatenating
# inside the loop re-copies everything accumulated so far on each iteration,
# and starting from an empty DataFrame triggers a pandas FutureWarning.
site_frames = []
for site_id in site_ids:
    # Filter data for the current site
    df_site = data[data['Site_id'] == site_id].copy()

    # shift(..., freq=...) moves the timestamps instead of the row positions;
    # assigning the result back aligns on the index, so a missing timestamp
    # yields NaN rather than silently borrowing a neighbouring row.

    # Add autolag
    for lag in range(1, numautolag + 1):
        df_site[f'I_lag{lag * 15}'] = df_site['I'].shift(periods=lag, freq=step_freq)

    # Add exogenous lag
    for lag in range(1, numcilag + 1):
        df_site[f'CI_R_lag{lag * 15}'] = df_site['CI_R'].shift(periods=lag, freq=step_freq)

    # K-step ahead of future regressors
    for step in range(1, numstepahead + 1):
        for colahead in columns_to_ahead:
            df_site[f'{colahead}_ahead{step * 15}'] = (
                df_site[colahead].shift(periods=-step, freq=step_freq)
            )

    # K-step ahead of target
    for step in range(1, numstepahead + 1):
        df_site[f'I_ahead{step * 15}'] = df_site['I'].shift(periods=-step, freq=step_freq)

    site_frames.append(df_site)

df_select = pd.concat(site_frames)
df_select = df_select.sort_values(by=['Datetime', 'Site_id'])
# df_select = df_withlag.between_time('07:00','17:00')
df_select.head()

Unnamed: 0_level_0,Site_id,I,CI_R,Inwp,Iclr,HI,I_lag15,I_lag30,I_lag45,I_lag60,...,Iclr_ahead45,Inwp_ahead45,HI_ahead45,Iclr_ahead60,Inwp_ahead60,HI_ahead60,I_ahead15,I_ahead30,I_ahead45,I_ahead60
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-01,1,0.0,,,0.0,0,,,,,...,0.0,,0.0,0.0,,1.0,0.0,0.0,0.0,0.0
2023-01-01,3,0.0,,,0.0,0,,,,,...,0.0,,0.0,0.0,,1.0,0.0,0.0,0.0,0.0
2023-01-01,4,0.0,,,0.0,0,,,,,...,0.0,,0.0,0.0,,1.0,0.0,0.0,0.0,0.0
2023-01-01,5,0.0,,,0.0,0,,,,,...,0.0,,0.0,0.0,,1.0,0.0,0.0,0.0,0.0
2023-01-01,6,0.0,,,0.0,0,,,,,...,0.0,,0.0,0.0,,1.0,0.0,0.0,0.0,0.0


## Extract Central region

In [14]:
# Map every region code found in the metadata to the list of its site ids.
site_region = {}
for region in metadata['Region'].unique():
    in_region = metadata['Region'] == region
    site_region[region] = metadata.loc[in_region, 'Site_id'].tolist()

# Site 22 is treated as abnormal and taken out of region 'N'.
site_region['N'].remove(22)

# Keep only the rows whose site belongs to region 'C'.
df_central = df_select.loc[df_select['Site_id'].isin(site_region['C'])]

# Split data by smoothness

In [15]:
# Thresholds forwarded to cloudcondition_splitter; their precise meaning is
# defined by that helper, which is not visible in this notebook.
# clear_threshold_k = 0.85
partly_threshold_k = 0.75
countconcave_threshold = 10

min_threshold_k = 0.75
# NOTE(review): the original remark says "Must be >= 2 to cover Site 36",
# yet the value in use is 0 -- confirm which one is intended.
count_lowk_threshold = 0 # Must be >= 2 to cover Site 36

# Split the central-region table into train / validation / test parts.
splitter_options = dict(
    valratio=0.1,
    testratio=0.1,
    method='smoothness',
    returncond=True,
    countconcave_threshold=countconcave_threshold,
    partly_threshold_k=partly_threshold_k,
    min_threshold_k=min_threshold_k,
    count_lowk_threshold=count_lowk_threshold,
)
df_train, df_val, df_test = cloudcondition_splitter(df_central, **splitter_options)


# Report, for each split, the share of rows carrying each sky-condition label.
cond_list = ['clearsky', 'partlycloudy', 'cloudy']
df_check_list = [df_train, df_val, df_test]
data_name = ['Train', 'Val', 'Test']
for split_label, split_df in zip(data_name, df_check_list):
    print(f'----- {split_label} -----')
    total_rows = len(split_df)
    for cond in cond_list:
        matching_rows = len(split_df[split_df['skycondition'] == cond])
        skypercent = matching_rows*100/total_rows
        print(f'{cond} portion: {skypercent:.2f}%')

# variables_to_save = {'train':df_train, 'val':df_val, 'test':df_test}
# # Save
# dict_path = os.path.join(datafolderpath, 'data_central_completeddatetime_equallag_smoothsplit.pkl')
# with open(dict_path, 'wb') as pickle_file:
#     pickle.dump(variables_to_save, pickle_file)


NameError: name 'cloudcondition_splitter' is not defined