In [18]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf 

from freeman.tensor import MakeWindow
from freeman.tensor import dataset_spec, check_window_data

In [2]:
# Absolute path of the test CSV, built under the current user's home directory.
_PATH_DATA = os.path.join(
    os.path.expanduser('~'),                              # home directory
    'projects', 'ai-projects', '61-WIND-TURBIN', 'data',  # project data folder
    'climate_test.csv',                                   # file to load
)

# Load the CSV into a DataFrame with pandas' default parsing options.
df_data = pd.read_csv(filepath_or_buffer=_PATH_DATA)

In [3]:
# Print the column dtypes and non-null counts of the freshly loaded frame.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7013 entries, 0 to 7012
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   p (mbar)         6940 non-null   float64
 1   T (degC)         6940 non-null   float64
 2   Tpot (K)         6940 non-null   float64
 3   Tdew (degC)      6940 non-null   float64
 4   rh (%)           6940 non-null   float64
 5   VPmax (mbar)     6940 non-null   float64
 6   VPact (mbar)     6940 non-null   float64
 7   VPdef (mbar)     6940 non-null   float64
 8   sh (g/kg)        6940 non-null   float64
 9   H2OC (mmol/mol)  6940 non-null   float64
 10  rho (g/m**3)     6940 non-null   float64
 11  Vwv_X            6940 non-null   float64
 12  Vwv_Y            6940 non-null   float64
 13  Vmwv_X           6940 non-null   float64
 14  Vmwv_Y           6940 non-null   float64
 15  day_sin          7013 non-null   float64
 16  day_cos          7013 non-null   float64
 17  year_sin      

In [4]:
# Replace missing values with 0.0. The result is rebound instead of using
# inplace=True so the cell produces a new frame rather than mutating the one
# that earlier cells already displayed.
df_data = df_data.fillna(0.0)

# Map every column name to its positional index.
column_indices = {name: pos for pos, name in enumerate(df_data.columns)}

# Look up the positional index of the target column.
target_col = 'T (degC)'
target_idx = column_indices[target_col]

In [5]:
# Count missing values per column (the recorded output shows zero for every column).
df_data.isna().sum()

p (mbar)           0
T (degC)           0
Tpot (K)           0
Tdew (degC)        0
rh (%)             0
VPmax (mbar)       0
VPact (mbar)       0
VPdef (mbar)       0
sh (g/kg)          0
H2OC (mmol/mol)    0
rho (g/m**3)       0
Vwv_X              0
Vwv_Y              0
Vmwv_X             0
Vmwv_Y             0
day_sin            0
day_cos            0
year_sin           0
year_cos           0
dtype: int64

In [6]:
# Windowing parameters passed to every MakeWindow call in this notebook.
# In the recorded output, full batches have feature shape (32, 24, n_columns).
WINDOW_SIZE = 24
BATCH_SIZE = 32

In [7]:
# NumPy view of the full frame (all columns, including the target).
# DataFrame.to_numpy() is the accessor pandas recommends over .values.
nd_data = df_data.to_numpy()

In [8]:
# Build the windowed dataset directly from the DataFrame, selecting the label
# by column name. Failures are printed rather than raised: a ValueError is
# labelled "Program Error", any other exception "System Error".
try:
    df_win_data = MakeWindow(df_data, WINDOW_SIZE, BATCH_SIZE, label_col=target_col)
except ValueError as value_err:
    print(f'Program Error:\n{value_err}')
except Exception as other_err:
    print(f'System Error:\n{other_err}')

2023-09-05 19:04:02.851135: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-05 19:04:02.854992: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-09-05 19:04:02.855140: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Batch dataset shape: features(32, 24, 19), label(32,)
Last batch dataset shape: features(13, 24, 19), label(13,)
Batch dataset size: 219
Total dataset row size: 6989 = 32 * (219 - 1) + 13


In [9]:
# Build the windowed dataset from the raw NumPy array, selecting the label by
# positional index and asking MakeWindow to drop the label column
# (the recorded output shows 18 feature columns instead of 19).
# Failures are printed rather than raised: a ValueError is labelled
# "Program Error", any other exception "System Error".
try:
    nd_win_data = MakeWindow(
        nd_data, WINDOW_SIZE, BATCH_SIZE, label_idx=target_idx, drop_col_label=True
    )
except ValueError as value_err:
    print(f'Program Error:\n{value_err}')
except Exception as other_err:
    print(f'System Error:\n{other_err}')

Batch dataset shape: features(32, 24, 18), label(32,)
Last batch dataset shape: features(13, 24, 18), label(13,)
Batch dataset size: 219
Total dataset row size: 6989 = 32 * (219 - 1) + 13


In [10]:
# Work on a copy so that removing the target column later leaves df_data intact.
df_X = df_data.copy()

In [11]:
# Split features and label without mutating anything in place.
# The previous df_X.pop(target_col) removed the column from df_X as a side
# effect, so running this cell a second time raised KeyError. Deriving both
# pieces from df_data makes the cell idempotent.
df_y = df_data[target_col].copy()
df_X = df_data.drop(columns=[target_col])

In [12]:
# Build the windowed dataset from a pre-split (features, labels) tuple with
# last_idx_label=True (the recorded output shows one more row than the other
# variants: 6990 vs 6989).
# Failures are printed rather than raised: a ValueError is labelled
# "Program Error", any other exception "System Error".
try:
    tuple_win_data = MakeWindow(
        (df_X.values, df_y), WINDOW_SIZE, BATCH_SIZE, last_idx_label=True
    )
except ValueError as value_err:
    print(f'Program Error:\n{value_err}')
except Exception as other_err:
    print(f'System Error:\n{other_err}')

Batch dataset shape: features(32, 24, 18), label(32,)
Last batch dataset shape: features(14, 24, 18), label(14,)
Batch dataset size: 219
Total dataset row size: 6990 = 32 * (219 - 1) + 14


In [13]:
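# Reference copy of check_window_data, kept commented out; the calls in this
# notebook use the version imported from freeman.tensor.
# NOTE(review): the mismatch tests below use np.all(a != b), which only fires
# when every element differs; np.any(a != b) would also catch partial
# mismatches. Confirm the imported implementation does not share this.
# Parameter legend:
# wd: window data
# sd: source data
# ws: window size
# bs: batch size
# tidx: target column index
# dcol: drop target column (bool)
# lidx: get target in window last row (bool)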
# def check_window_data(wd, sd, ws, bs, tidx, dcol, lidx):
#     is_match = True
#     idx = 0
#     X_data = np.delete(sd, tidx, axis=1) if dcol else sd 
#     y_data = sd[ws-1 if lidx else ws:, tidx]
    
#     for x, y in wd.take(-1):
#         idxx1 = 0
#         for xe in x:
#             idxx2 = 0
#             for xee in xe.numpy():
#                 if np.all(X_data[idx*bs+idxx1+idxx2] != xee):
#                     print(f'Not match X({idx}, {idxx1}, {idxx2}): \n{X_data[idx*bs+idxx1+idxx2]}\n{xee}')
#                     is_match = False
#                 idxx2 += 1
#             idxx1 += 1
#         idxy = 0
#         for ye in y:
#             if np.all(y_data[idx*bs+idxy]!=ye):
#                 print(f'Not match y({idx}, {idxy}): {y_data[idx*bs+idxy]}, {ye}')    
#                 is_match = False
#             idxy += 1   
#         idx += 1     
#         if not is_match:
#             print('Not Match')
#             break
        
#     if is_match:
#         print('Match')

In [14]:
# Peek at the first batch of the DataFrame-built windows.
# The loop previously fetched a batch and printed only an empty line; show the
# feature and label shapes so the cell actually reports something.
df_win_data_data = df_win_data.win_data
for X, y in df_win_data_data.take(1):
    print(f'X: {X.shape}, y: {y.shape}')




In [15]:
# Compare the DataFrame-built windows against the raw source array.
# The two trailing positional flags are False, False; in the commented-out
# reference signature in this notebook those positions are dcol (drop target
# column) and lidx (label from the window's last row) — presumably the same in
# the imported version; confirm against freeman.tensor.
check_window_data(
    df_win_data.win_data, nd_data, WINDOW_SIZE, BATCH_SIZE, target_idx, False, False
)

Match


In [16]:
# Compare the ndarray-built windows (created with drop_col_label=True) against
# the raw source array. The trailing flags are True, False; in the
# commented-out reference signature those positions are dcol and lidx —
# presumably the same in the imported version; confirm against freeman.tensor.
check_window_data(
    nd_win_data.win_data, nd_data, WINDOW_SIZE, BATCH_SIZE, target_idx, True, False
)

Match


In [19]:
# Compare the tuple-built windows (created with last_idx_label=True) against
# the raw source array. The trailing flags are True, True; in the
# commented-out reference signature those positions are dcol and lidx —
# presumably the same in the imported version; confirm against freeman.tensor.
check_window_data(
    tuple_win_data.win_data, nd_data, WINDOW_SIZE, BATCH_SIZE, target_idx, True, True
)

Match


In [20]:
# Same (features, labels) tuple input as tuple_win_data, but with shuffle=True.
# NOTE(review): no seed is passed here; whether MakeWindow accepts one is not
# visible from this notebook — confirm if reproducible ordering is needed.
# Failures are printed rather than raised: a ValueError is labelled
# "Program Error", any other exception "System Error".
try:
    tuple_win_data2 = MakeWindow(
        (df_X.values, df_y), WINDOW_SIZE, BATCH_SIZE, last_idx_label=True, shuffle=True
    )
except ValueError as value_err:
    print(f'Program Error:\n{value_err}')
except Exception as other_err:
    print(f'System Error:\n{other_err}')

Batch dataset shape: features(32, 24, 18), label(32,)
Last batch dataset shape: features(14, 24, 18), label(14,)
Batch dataset size: 219
Total dataset row size: 6990 = 32 * (219 - 1) + 14


In [21]:
# Run the same check on the dataset built with shuffle=True. The source array
# is still in its original order, so the comparison reports mismatches.
check_window_data(
    tuple_win_data2.win_data, nd_data, WINDOW_SIZE, BATCH_SIZE, target_idx, True, True
)
# A mismatch is expected here.

Not match X(0, 0, 0): 
[ 1.7248562  -0.83529805 -1.02871984 -0.55946369 -0.75250665 -1.02361973
 -0.30639778 -1.03900741 -1.04037568  1.11983281  0.29779412  0.44131904
  0.31486773  0.47851644 -1.22491234  0.70702783  1.28921524  0.44389967]
[ 0.1054771  -0.32102723 -0.2817172   0.0561462  -0.48659263 -0.46865034
 -0.36798168 -0.4709005  -0.46986205  0.30828444  0.35707618 -1.18849002
  0.37381487 -1.58511657  0.99977896 -1.00008532  1.32848104  0.29064089]
Not match X(0, 0, 1): 
[ 1.66254146 -0.93226173 -0.98355557 -0.19192028 -0.81306993 -0.99511916
 -0.42852179 -1.01032144 -1.01183045  1.20203039  0.23979068  0.36806708
  0.24283854  0.40190114 -1.00017288  0.99992215  1.28952052  0.44293828]
[ 0.10004969 -0.26250477 -0.31612807 -0.17946633 -0.4423098  -0.49873428
 -0.27125668 -0.50145729 -0.49997141  0.25435501  0.63613683 -1.24550629
  0.73166601 -1.43453563  0.7068928  -1.22483103  1.32867472  0.28965183]
Not match X(0, 0, 2): 
[ 1.64103282 -1.0273129  -1.04616424 -0.06636826 -0