In [6]:
!pip install tkan -qq

In [7]:
!pip install bayesian-optimization -qq

In [10]:
!pip uninstall -y scipy
!pip install --no-cache-dir scipy

Found existing installation: scipy 1.15.2
Uninstalling scipy-1.15.2:
  Successfully uninstalled scipy-1.15.2
Collecting scipy
  Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Downloading scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.7/37.7 MB[0m [31m317.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: scipy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.6 which is incompatible.
gensim 4.3.3 requires scipy<1.14.0,>=1.7.0, but you have scipy 1.15.3 which is incompatible.
ydata-profili

# **LOAD AND CREATE DATA**

In [None]:
from bayes_opt import BayesianOptimization

import numpy as np
import pandas as pd
import pickle
import re
import time
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_squared_error

# Assume TKAN is your layer class
# from your_tkan_module import TKAN

# Evaluation functions
def root_mean_squared_error(y_true, y_pred):
    """Return the root of the mean squared error between ``y_true`` and ``y_pred``.

    The squared-error average is delegated to the imported
    ``mean_squared_error``; this wrapper only takes its square root.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def mean_absolute_error(y_true, y_pred):
    """Return the mean of the element-wise absolute differences.

    Both arguments must support ``y_true - y_pred`` (e.g. NumPy arrays of
    broadcast-compatible shapes); the mean is taken over all elements.
    """
    abs_errors = np.abs(y_true - y_pred)
    return np.mean(abs_errors)

def mean_absolute_percentage_error(y_true, y_pred):
    """Return the MAPE in percent, ignoring positions where ``y_true`` is zero.

    Entries with a zero true value are excluded to avoid division by zero.
    If every true value is zero, ``np.inf`` is returned instead.
    """
    keep = y_true != 0
    if not np.any(keep):
        return np.inf

    actual = y_true[keep]
    relative_error = (actual - y_pred[keep]) / actual
    return 100 * np.mean(np.abs(relative_error))


In [None]:
file_path = "/kaggle/input/doman-sg/CLN_SG_V2.xlsx"

# Load the raw sheet and clean a copy of it.
df = pd.read_excel(file_path)
df_clean = df.copy()

# Outlier handling (3-sigma rule): in every numeric column, values further
# than three standard deviations from the column mean are replaced by NaN.
for col in df_clean.select_dtypes(include='number').columns:
    mean = df_clean[col].mean()
    std = df_clean[col].std()
    lower = mean - 3 * std
    upper = mean + 3 * std
    df_clean[col] = np.where((df_clean[col] < lower) | (df_clean[col] > upper), np.nan, df_clean[col])

# Fill the gaps by linear interpolation along the rows (index axis).
df_clean = df_clean.interpolate(method='linear')

# Fill whatever is still missing with the next valid value, then the previous one.
# `.bfill()` / `.ffill()` replace the deprecated `fillna(method=...)` spelling.
df_clean = df_clean.bfill().ffill()

df = df_clean
print(f"Nan in data: {df.isnull().sum()}\n")

# 1. Pearson correlation of every numeric column with the target Man_song_saigon.
correlations = df.corr(numeric_only=True)['Man_song_saigon'].drop('Man_song_saigon')

# 2. Keep the features whose correlation with the target is above 0.5
#    (strongly negative correlations are not selected by this filter).
selected_features = correlations[correlations > 0.5].index.tolist()

# 3. Append the target column and the date column (Ngay).
selected_features += ['Man_song_saigon', 'Ngay']

# 4. Frame restricted to the selected columns. The list always contains at
#    least the two columns appended above, so no empty-list fallback is needed.
#    NOTE(review): df_selected is not used by the cells visible here; the
#    modelling columns are hard-coded in `chosen_col` below.
df_selected = df[selected_features]

# `df` and `df_clean` are the same object, so both get the date index here.
df.set_index('Ngay', inplace=True)

chosen_col = ['Man_song_saigon', 'Dodan_vao_nha_may', 'pH_Song_SG']
df = df[chosen_col]

# Positional index of the target column within the modelling frame.
column_index = df.columns.get_loc('Man_song_saigon')
print(f"Cột 'Man_song_saigon' là cột số: {column_index}")






In [None]:
def check_nan(array, array_name):
    """Print how many NaN values ``array`` contains.

    Parameters
    ----------
    array : array-like accepted by ``np.isnan``
        Numeric NumPy array or all-numeric DataFrame to inspect.
    array_name : str
        Label used in the printed message.

    Returns
    -------
    None
        The result is only reported on stdout.
    """
    # np.asarray keeps the counting uniform for ndarray and DataFrame inputs.
    nan_mask = np.asarray(np.isnan(array))
    nan_count = int(nan_mask.sum())
    if nan_count:
        print(f"Found {nan_count} nan in {array_name}")
    else:
        print(f"No NaN in {array_name}")

In [None]:
# create data
import os
from sklearn.preprocessing import StandardScaler
import joblib  # Thêm joblib để lưu scaler

# Hàm tạo chuỗi từ dữ liệu
def create_sequences(data, target_col, window_size, forecast_horizon):
    """Slice a DataFrame into sliding input windows and their forecast targets.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are consecutive time steps, columns are features.
    target_col : int
        Positional index of the column to forecast.
    window_size : int
        Number of consecutive rows in each input window.
    forecast_horizon : int
        Number of rows immediately after the window used as the target.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, window_size, n_features)
    y : numpy.ndarray, shape (n_samples, forecast_horizon)
        ``n_samples = len(data) - window_size - forecast_horizon + 1``. When
        the frame is too short for a single sample, correctly shaped empty
        arrays are returned.
    """
    # Convert to NumPy once instead of slicing the DataFrame on every iteration.
    values = data.to_numpy()
    target = data.iloc[:, target_col].to_numpy()

    n_samples = len(values) - window_size - forecast_horizon + 1
    if n_samples <= 0:
        empty_X = np.empty((0, window_size, values.shape[1]), dtype=values.dtype)
        empty_y = np.empty((0, forecast_horizon), dtype=target.dtype)
        return empty_X, empty_y

    X = np.array([values[start : start + window_size] for start in range(n_samples)])
    y = np.array([
        target[start + window_size : start + window_size + forecast_horizon]
        for start in range(n_samples)
    ])
    return X, y

def _scale_windows(scaler, windows, fit=False):
    """Standardise a stack of windows feature-wise and keep its shape.

    The array is flattened to 2-D (one row per time step, one column per
    feature) for the scaler and reshaped back afterwards. With ``fit=True``
    the scaler is fitted on ``windows`` first; otherwise it is only applied.
    """
    flat = windows.reshape(-1, windows.shape[-1])
    scaled = scaler.fit_transform(flat) if fit else scaler.transform(flat)
    return scaled.reshape(windows.shape)


# Maps "ws{window}_fh{horizon}_var{n_features}" -> scaled splits and fitted scalers.
output = {}

# Experiment grid.
n_aheads = [1, 3, 7]
window_sizes = [7, 15, 30]
# NOTE(review): `vars` shadows the builtin of the same name; kept because other
# cells may refer to it.
vars = [['Man_song_saigon'], ['Man_song_saigon', 'Dodan_vao_nha_may', 'pH_Song_SG']]
check_nan(df, "df")
for window_size in window_sizes:
    for n_ahead in n_aheads:
        for var in vars:
            forecast_horizon = n_ahead

            # Build the sliding windows; the target is always the first listed column.
            X_all, y_all = create_sequences(df[var], target_col=0, window_size=window_size, forecast_horizon=forecast_horizon)

            # Chronological split: 60% train, 20% validation, 20% test.
            n = len(X_all)
            train_end = int(n * 0.6)
            val_end = int(n * 0.8)

            X_train, y_train = X_all[:train_end], y_all[:train_end]
            X_val, y_val = X_all[train_end:val_end], y_all[train_end:val_end]
            X_test, y_test = X_all[val_end:], y_all[val_end:]

            # Scalers are fitted on the training split only and then applied
            # unchanged to the validation and test splits.
            scaler_X = StandardScaler()
            scaler_y = StandardScaler()

            X_train_scaled = _scale_windows(scaler_X, X_train, fit=True)
            X_val_scaled = _scale_windows(scaler_X, X_val)
            X_test_scaled = _scale_windows(scaler_X, X_test)

            y_train_scaled = scaler_y.fit_transform(y_train)
            y_val_scaled = scaler_y.transform(y_val)
            y_test_scaled = scaler_y.transform(y_test)

            # The key encodes the configuration. It uses only the number of
            # variables, so two variable sets of equal length would collide.
            file_name = f"ws{window_size}_fh{n_ahead}_var{len(var)}"
            output[file_name] = {
                "X_train": X_train_scaled,
                "y_train": y_train_scaled,
                "X_test": X_test_scaled,
                "y_test": y_test_scaled,
                "X_val": X_val_scaled,
                "y_val": y_val_scaled,
                "scaler_X": scaler_X,
                "scaler_y": scaler_y,
            }

# Persist every prepared dataset together with its scalers.
joblib.dump(output, '/kaggle/working/data.pkl')

In [None]:
import os
import joblib
# Reload the prepared datasets written by the data-preparation cell and show
# which configurations are available and what container type was restored.
with open("/kaggle/working/data.pkl", "rb") as pkl_file:
    output = joblib.load(pkl_file)
for summary in (output.keys(), type(output)):
    print(summary)


# **Bayesian Optimization**

In [None]:
from tkan import TKAN
from tqdm import tqdm

# Early stopping callback
def callbacks():
    """Return a fresh list of Keras callbacks for one training run.

    The list holds a single EarlyStopping callback that watches ``val_loss``
    with a patience of 20 epochs and restores the best weights when it stops.
    """
    early_stop = EarlyStopping(
        monitor='val_loss',
        patience=20,
        restore_best_weights=True,
    )
    return [early_stop]

# Training parameters shared by every model fit in the optimisation below.
BATCH_SIZE = 128  # mini-batch size passed to model.fit
N_MAX_EPOCHS = 50  # upper bound on epochs; early stopping may end a run sooner
model_id = 'TKAN'  # used as the name of the Keras model
n_aheads = [1, 3, 7]  # forecast horizons; same values as in the data-preparation cell

# Hàm mục tiêu cho Bayesian Optimization
def objective_function(units, sub_kan_input_dim, sub_kan_output_dim, learning_rate, n_ahead, data,
                       eval_split="val"):
    """Train one TKAN model and return its negative MAE for Bayesian optimisation.

    Parameters
    ----------
    units, sub_kan_input_dim, sub_kan_output_dim : float
        Continuous suggestions from the optimiser; each is rounded to the
        nearest integer before being passed to the TKAN layer.
    learning_rate : float
        Learning rate of the Adam optimiser.
    n_ahead : int or float
        Forecast horizon; sets the width of the final Dense layer.
    data : str
        Key of the dataset to use in the module-level ``output`` dictionary.
    eval_split : {"val", "test"}, default "val"
        Split on which the returned score is computed. The default scores on
        the validation split so that hyperparameters are not selected on the
        test data; pass ``"test"`` only for a final report.

    Returns
    -------
    float
        ``-MAE`` on the chosen split, computed after undoing the target
        scaling. Negated because the optimiser maximises its objective.

    Raises
    ------
    ValueError
        If ``eval_split`` is neither ``"val"`` nor ``"test"``.
    """
    if eval_split not in ("val", "test"):
        raise ValueError(f"eval_split must be 'val' or 'test', got {eval_split!r}")

    # This function is called many times in a row; drop the Keras state left by
    # earlier models so memory does not keep growing across evaluations.
    from tensorflow.keras import backend as keras_backend
    keras_backend.clear_session()

    # The optimiser proposes floats; the layer sizes must be integers.
    units = int(round(units))
    sub_kan_input_dim = int(round(sub_kan_input_dim))
    sub_kan_output_dim = int(round(sub_kan_output_dim))
    n_ahead = int(round(n_ahead))

    # Look up the prepared, already scaled splits.
    data_dict = output[data]
    X_train_scaled = data_dict['X_train']
    y_train_scaled = data_dict['y_train']
    X_val_scaled = data_dict['X_val']
    y_val_scaled = data_dict['y_val']
    X_eval_scaled = data_dict[f'X_{eval_split}']
    y_eval_scaled = data_dict[f'y_{eval_split}']
    scaler_y = data_dict['scaler_y']

    # TKAN recurrent layer followed by a linear head with one output per forecast step.
    model = Sequential([
        Input(shape=X_train_scaled.shape[1:]),
        TKAN(units=units,
             sub_kan_input_dim=sub_kan_input_dim,
             sub_kan_output_dim=sub_kan_output_dim,
             return_sequences=False,
             activation='tanh',
             recurrent_activation='sigmoid'),
        Dense(units=n_ahead, activation='linear')
    ], name=model_id)

    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='mae', jit_compile=False)

    # Samples are kept in chronological order (shuffle=False); early stopping
    # monitors the validation loss.
    model.fit(
        X_train_scaled, y_train_scaled,
        validation_data=(X_val_scaled, y_val_scaled),
        batch_size=BATCH_SIZE,
        epochs=N_MAX_EPOCHS,
        callbacks=callbacks(),
        shuffle=False,
        verbose=0
    )

    # Score in the original target units by inverting the target scaler.
    preds_scaled = model.predict(X_eval_scaled, verbose=0)
    preds = scaler_y.inverse_transform(preds_scaled)
    y_eval_orig = scaler_y.inverse_transform(y_eval_scaled)

    mae = mean_absolute_error(y_true=y_eval_orig, y_pred=preds)

    # The optimiser maximises, so return the negated error.
    return -mae


In [None]:
# OPTIMIZE

def optimize_dataset(output, fh_dataset=3):
    """Run Bayesian hyperparameter optimisation for every dataset of one horizon.

    Parameters
    ----------
    output : dict
        Mapping whose keys follow ``ws{window}_fh{horizon}_var{n_vars}``.
        Only the keys are used here to decide which datasets to run;
        ``objective_function`` looks the arrays up by key in the module-level
        ``output`` dictionary.
    fh_dataset : int, default 3
        Forecast horizon to optimise; keys with another ``fh`` value are skipped.

    Returns
    -------
    list of dict
        One row per optimised dataset with the keys ``dataset``, ``n_ahead``,
        ``ws``, ``n_var``, ``mae``, ``optim_time`` (seconds) and ``best_params``.
    """
    name_pattern = re.compile(r'ws(\d+)_fh(\d+)_var(\d+)')
    results_rows = []
    data_group = {}

    # Keep only the datasets whose forecast horizon matches, parsing each key once.
    for name in output:
        match = name_pattern.search(name)
        if not match:
            print(f"Không khớp định dạng: {name}")
            continue
        ws, n_ahead, n_var = (int(group) for group in match.groups())
        if n_ahead == fh_dataset:
            data_group[name] = (ws, n_ahead, n_var)

    for data, (ws, n_ahead, n_var) in tqdm(data_group.items(), desc="DATA: "):
        # Search space. Every bound is sampled on a linear scale.
        pbounds = {
            'units': (8, 128),
            'sub_kan_input_dim': (8, 64),
            'sub_kan_output_dim': (8, 64),
            # NOTE(review): linear, not logarithmic, sampling of the learning rate.
            'learning_rate': (1e-4, 1e-1),
            'n_ahead': (n_ahead, n_ahead),  # degenerate bounds keep n_ahead fixed
        }

        def _target(units, sub_kan_input_dim, sub_kan_output_dim, learning_rate, n_ahead):
            # Bind the current dataset key; the optimiser supplies the rest.
            return objective_function(
                units, sub_kan_input_dim, sub_kan_output_dim, learning_rate, n_ahead, data
            )

        optimizer = BayesianOptimization(
            f=_target,
            pbounds=pbounds,
            random_state=24,
        )

        start_optim = time.time()
        # 20 random initial points followed by 20 guided iterations.
        optimizer.maximize(init_points=20, n_iter=20)
        optim_time = time.time() - start_optim

        # The objective is the negated MAE, so flip the sign back.
        best_params = optimizer.max['params']
        best_mae = -optimizer.max['target']
        print(f"Best parameters for {data}: {best_params}")
        print(f"Best MAE: {best_mae:.4f}")

        results_rows.append({
            'dataset': data,
            'n_ahead': n_ahead,
            'ws': ws,
            'n_var': n_var,
            'mae': best_mae,
            'optim_time': optim_time,
            'best_params': best_params
        })

    return results_rows

# NOTE(review): the disabled export below refers to `result_rows` and
# `tkan_models`, neither of which is defined at module level in the cells
# visible here; both must be provided before it is re-enabled.
# # Save results to CSV
# results_df = pd.DataFrame(result_rows)
# results_df.to_csv('tkan_results_optimized.csv', index=False)
# print("TKAN results saved to 'tkan_results_optimized.csv'")

# with open('tkan_models_optimized.pkl', 'wb') as f:
#     pickle.dump(tkan_models, f)

# Run the optimisation for the datasets with a forecast horizon of 7.
optimize_dataset(output, fh_dataset = 7)