In [None]:
import pandas as pd
import numpy as np
import os, yaml, wandb, pickle, optuna, gc

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import TensorDataset, DataLoader, Sampler

from tqdm import tqdm
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Load the dataset once into the module-level `df`.
# `objective` reads this global to count the 's:'-prefixed columns and the distinct
# values of 'a:action'; `train` passes it on to `make_transition`.
df = pd.read_parquet('stru_AE.parquet')

In [None]:
def make_transition(data):
    """Build a one-step transition dataset ``(s, a, r, s2, t)`` from trajectory rows.

    Rows are grouped by the ``traj`` column and kept in their original order.
    For consecutive rows ``i`` and ``i + 1`` of one trajectory a transition is
    emitted with state/action taken from row ``i`` and reward/next-state taken
    from row ``i + 1``. Only the last transition of each trajectory is flagged
    terminal (``t == 1``); trajectories with fewer than two rows contain no
    transition and are skipped.

    Args:
        data: Path to a parquet file, or an already loaded ``pandas.DataFrame``.
            Must contain a ``traj`` column plus columns prefixed with ``s:``
            (state), ``a:`` (action) and ``r:`` (reward).

    Returns:
        ``TensorDataset(s, a, r, s2, t)`` where ``s``/``s2`` are float32 of shape
        ``(N, n_state_cols)``, ``a`` is int64 of shape ``(N, n_action_cols)``,
        ``r`` is float32 of shape ``(N, n_reward_cols)`` and ``t`` is float32 of
        shape ``(N,)``.
    """
    # NOTE(review): `train` calls this as make_transition(df, alpha, GCO, AESEVER);
    # the extra arguments are not part of this signature and their intended effect
    # is not visible in this notebook — reconcile the two before running.
    frame = data if isinstance(data, pd.DataFrame) else pd.read_parquet(data)
    s_col = [c for c in frame if c[:2] == 's:']
    a_col = [c for c in frame if c[:2] == 'a:']
    r_col = [c for c in frame if c[:2] == 'r:']

    s, a, r, s2, t = [], [], [], [], []

    # sort=False keeps trajectories in order of first appearance, matching the
    # order produced by iterating over `frame.traj.unique()`.
    for _, df_traj in frame.groupby('traj', sort=False):
        n_transitions = len(df_traj) - 1
        if n_transitions < 1:
            # A single row has no successor, hence no transition.
            continue

        states = df_traj[s_col].to_numpy(dtype=np.float32)
        actions = df_traj[a_col].to_numpy(dtype=np.int64)
        rewards = df_traj[r_col].to_numpy(dtype=np.float32)

        s.append(states[:-1])
        a.append(actions[:-1])
        r.append(rewards[1:])   # reward is read from the row that was reached
        s2.append(states[1:])

        # Exactly one terminal flag per trajectory: the final transition.
        done = np.zeros(n_transitions, dtype=np.float32)
        done[-1] = 1.0
        t.append(done)

    def _cat(chunks, empty_shape, dtype):
        """Concatenate per-trajectory chunks, or return an empty tensor of matching rank."""
        arr = np.concatenate(chunks, axis=0) if chunks else np.empty(empty_shape, dtype=dtype)
        return torch.from_numpy(arr)

    s = _cat(s, (0, len(s_col)), np.float32)
    a = _cat(a, (0, len(a_col)), np.int64)
    r = _cat(r, (0, len(r_col)), np.float32)
    s2 = _cat(s2, (0, len(s_col)), np.float32)
    t = _cat(t, (0,), np.float32)

    return TensorDataset(s, a, r, s2, t)

In [None]:
def make_transition_test(data,batch_size=256,window_size=None):
    """Build an unshuffled ``DataLoader`` of per-step ``(s, a, r)`` rows for evaluation.

    For every trajectory (rows grouped by ``traj``, original order kept) the
    first ``len(trajectory) - window_size + 1`` rows are emitted; trajectories
    for which that count is not positive contribute nothing.

    Args:
        data: Path to a parquet file, or an already loaded ``pandas.DataFrame``
            with a ``traj`` column and ``s:`` / ``a:`` / ``r:`` prefixed columns.
        batch_size: Batch size of the returned loader.
        window_size: Number of trailing rows budgeted per trajectory. When
            ``None`` the module-level ``rolling_size`` is used, as before.

    Returns:
        ``DataLoader`` over ``TensorDataset(s, a, r)`` with ``shuffle=False``;
        ``s`` and ``r`` are float32, ``a`` is int64.
    """
    frame = data if isinstance(data, pd.DataFrame) else pd.read_parquet(data)
    if window_size is None:
        # NOTE(review): `rolling_size` is not defined anywhere in the visible
        # notebook; define it before calling or pass `window_size` explicitly.
        window_size = rolling_size

    s_col = [c for c in frame if c[:2] == 's:']
    a_col = [c for c in frame if c[:2] == 'a:']
    r_col = [c for c in frame if c[:2] == 'r:']

    s, a, r = [], [], []

    for _, df_traj in tqdm(frame.groupby('traj', sort=False)):
        n_keep = len(df_traj) - window_size + 1
        if n_keep <= 0:
            continue
        # States go into `s` (they were previously appended to `a` by mistake,
        # leaving `s` empty and corrupting the action tensor).
        s.append(df_traj[s_col].to_numpy(dtype=np.float32)[:n_keep])
        a.append(df_traj[a_col].to_numpy(dtype=np.int64)[:n_keep])
        r.append(df_traj[r_col].to_numpy(dtype=np.float32)[:n_keep])

    def _cat(chunks, width, dtype):
        """Concatenate per-trajectory chunks, or return an empty 2-D tensor."""
        arr = np.concatenate(chunks, axis=0) if chunks else np.empty((0, width), dtype=dtype)
        return torch.from_numpy(arr)

    s = _cat(s, len(s_col), np.float32)
    a = _cat(a, len(a_col), np.int64)
    r = _cat(r, len(r_col), np.float32)

    return DataLoader(TensorDataset(s, a, r), batch_size, shuffle=False)

In [None]:
class Qnetwork(nn.Module):
    """Fully connected Q-value network with ReLU between hidden layers.

    Args:
        input_size: Number of input features.
        hidden_sizes: Non-empty sequence with the width of each hidden layer.
        output_size: Number of outputs (one Q-value per action).
    """

    def __init__(self, input_size, hidden_sizes, output_size):
        super().__init__()
        self.hidden_sizes = hidden_sizes

        first_width = self.hidden_sizes[0]
        last_width = self.hidden_sizes[-1]

        # Input layer, then one Linear per consecutive pair of hidden widths,
        # then the output head — created in that order.
        stack = [nn.Linear(input_size, first_width)]
        for idx in range(1, len(self.hidden_sizes)):
            fan_in = self.hidden_sizes[idx - 1]
            fan_out = self.hidden_sizes[idx]
            stack.append(nn.Linear(fan_in, fan_out))
        stack.append(nn.Linear(last_width, output_size))

        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """Return the raw (un-activated) output of the final layer for input ``x``."""
        *hidden_layers, head = self.layers
        # ReLU after every layer except the output head.
        for hidden in hidden_layers:
            x = F.relu(hidden(x))
        return head(x)

In [None]:
def _double_q_loss(network, target_network, batch, gamma, loss_fn):
    """Double-Q TD loss for one ``(s, a, r, s2, t)`` batch (tensors moved to ``device``)."""
    s, a, r, s2, t = (x.to(device) for x in batch)

    q_pred = network(s).gather(1, a).squeeze(-1)

    with torch.no_grad():
        # Online network picks the next action, target network evaluates it.
        next_action = network(s2).argmax(dim=1, keepdim=True)
        q2_max = target_network(s2).gather(1, next_action).squeeze(1)

        # `r` arrives as (N, 1) when there is a single reward column; without
        # dropping that axis, (N, 1) + (N,) silently broadcasts to (N, N).
        reward = torch.clamp(r.squeeze(-1), max=0.0, min=-1.0)
        bellman_target = reward + gamma * torch.clamp(q2_max, max=0.0, min=-1.0) * (1 - t)

    return loss_fn(q_pred, bellman_target)


def train(batch_size,lr,lr_decay,lr_step,loss_type,gamma,alpha,epochs,state_dim,hidden_size,num_actions,update_freq,GCO,AESEVER):
    """Train a Q-network with double-Q targets and return the last computed AUROC.

    Reads the module-level ``df``, ``device`` and ``val_transition`` and logs one
    record per epoch to the active wandb run.

    Args:
        batch_size: Mini-batch size of the training loader.
        lr: Adam learning rate.
        lr_decay: Multiplicative factor passed to ``ExponentialLR``.
        lr_step: The scheduler is stepped on epochs where ``epoch % lr_step == 0``.
        loss_type: One of ``"l1"``, ``"smooth_l1"`` or ``"mse"``.
        gamma: Discount factor argument (currently overridden to 1.0, see note below).
        alpha, GCO, AESEVER: Forwarded unchanged to ``make_transition``; their
            meaning is not visible in this notebook.
        epochs: Maximum number of epochs.
        state_dim: Network input width.
        hidden_size: Sequence of hidden layer widths for ``Qnetwork``.
        num_actions: Network output width.
        update_freq: Number of optimizer steps between target-network syncs
            (the counter restarts every epoch).

    Returns:
        AUROC of ``max_a Q(s, a)`` against the rewards in ``val_transition`` from
        the last completed epoch (``nan`` if no epoch ran).

    Raises:
        ValueError: If ``loss_type`` is not a supported name.
    """
    loss_fns = {"l1": F.l1_loss, "smooth_l1": F.smooth_l1_loss, "mse": F.mse_loss}
    if loss_type not in loss_fns:
        raise ValueError(f"unsupported loss_type {loss_type!r}; expected one of {sorted(loss_fns)}")
    loss_fn = loss_fns[loss_type]

    network = Qnetwork(state_dim,hidden_size,num_actions).to(device)
    target_network = Qnetwork(state_dim,hidden_size,num_actions).to(device)
    # NOTE(review): this discards the `gamma` argument, so the value sampled by
    # the hyperparameter search has no effect. Kept as-is because the [-1, 0]
    # clamping below looks designed around an undiscounted target — confirm.
    gamma = 1.0
    patience = 20
    counters = 0
    best_loss = 1e6
    auroc = float("nan")

    optimizer = optim.Adam(network.parameters(), lr=lr)
    scheduler = ExponentialLR(optimizer, gamma=lr_decay)

    # NOTE(review): the `make_transition` defined in this notebook takes a single
    # `data` argument; this four-argument call does not match it — reconcile.
    train_data = make_transition(df,alpha,GCO,AESEVER)

    train_loader = DataLoader(train_data,batch_size=batch_size,shuffle=True)
    # NOTE(review): the "validation" loss is measured on the training transitions.
    val_loader = DataLoader(train_data,batch_size=256,shuffle=False)

    for epoch in tqdm(range(epochs)):
        train_loss = 0
        update_counter = 0
        for batch in train_loader:
            loss = _double_q_loss(network, target_network, batch, gamma, loss_fn)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

            update_counter += 1
            if update_counter == update_freq:
                target_network.load_state_dict(network.state_dict())
                update_counter = 0

        with torch.no_grad():
            val_loss = 0
            for batch in val_loader:
                val_loss += _double_q_loss(network, target_network, batch, gamma, loss_fn).item()

            q_value = []
            reward  = []

            # NOTE(review): `val_transition` is a module-level name not defined in
            # the visible notebook; it must yield (s, a, r) batches — presumably
            # the loader returned by `make_transition_test`.
            for s,a,r in val_transition:
                q_value.append(network(s.to(device)).cpu().numpy())
                reward.append(r.cpu().numpy())

            q_value = np.concatenate(q_value,axis=0)
            q_max = q_value.max(axis=1)
            # roc_auc_score needs one flat label vector, not a list of batches.
            reward = np.concatenate(reward,axis=0).ravel()
            auroc = roc_auc_score(reward,q_max)

        wandb.log({"Iter:": epoch, "train:":train_loss, "val:":val_loss,"counters":counters,"AUROC":auroc})

        if epoch % lr_step == 0:
            scheduler.step()

        # Early-stopping bookkeeping runs on every epoch, including LR-decay
        # epochs (previously skipped, so the best loss could go unrecorded).
        if val_loss < best_loss:
            best_loss = val_loss
            counters = 0
        else:
            counters += 1
            if counters > patience and epoch >= 99:
                break

    return auroc

In [None]:
def objective(trial):
    """Optuna objective: sample one hyperparameter set, train, and return the AUROC.

    Reads the module-level ``df`` to derive the state width (number of
    ``s:``-prefixed columns) and the number of distinct ``a:action`` values,
    opens a wandb run named after the trial, and delegates to ``train``.

    Args:
        trial: The ``optuna.trial.Trial`` supplying the suggestions.

    Returns:
        The value returned by ``train`` (AUROC), which the study maximizes.
    """
    batch_size = trial.suggest_categorical("batch_size",[16,32,64,128,256])

    lr = trial.suggest_categorical("learning_rate",[1e-6,5e-6,1e-5,5e-5,1e-4,5e-4, 1e-3, 5e-3, 1e-2])
    lr_decay = trial.suggest_categorical("lr_decay",[0.75,0.8,0.85,0.9,0.95,1])
    lr_step = trial.suggest_categorical("lr_step",[2,5,10,20,30,50,100,200,250,500,1000])

    loss = trial.suggest_categorical("loss",['smooth_l1','mse'])

    update_freq = trial.suggest_categorical("update_freq",[2,4,8,16,32])

    # Optuna categorical choices must be None/bool/int/float/str; list-valued
    # choices trigger a warning and cannot be stored persistently. Sample a
    # string key and map it back to the layer widths.
    hidden_choices = {
        "32": [32],
        "64": [64],
        "32-32": [32,32],
        "32-64": [32,64],
        "64-32": [64,32],
        "64-64": [64,64],
    }
    hidden_size = hidden_choices[trial.suggest_categorical("hidden_size",list(hidden_choices))]

    gamma = trial.suggest_categorical("gamma",[0.8,0.85,0.9,0.95,0.97,0.98,0.99,0.999])
    alpha = trial.suggest_categorical("alpha",[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0])

    GCO = trial.suggest_categorical("GCO",[2,3,4,5])
    AESEVER = trial.suggest_categorical("AESEVER",[1,2,3])

    # Upper bound only; `train` stops earlier via its patience rule.
    epochs = 100000

    state_dim = len([x for x in df if x[:2]=='s:'])
    num_actions = len(df['a:action'].unique())

    wandb.init(
        entity='dahs-hb',project='stru_ae_exp', name=f'trial-{trial.number}', reinit=True,
        config={
        "batch_size":batch_size,
        "learning_rate":lr,
        "lr_decay":lr_decay,
        "lr_step":lr_step,
        "loss":loss,
        "gamma":gamma,
        "alpha":alpha,
        "GCO":GCO,
        "AESEVER":AESEVER,
        "hidden_size":hidden_size,
        "update_freq":update_freq
    })

    try:
        auroc = train(batch_size,lr,lr_decay,lr_step,loss,gamma,alpha,epochs,state_dim,hidden_size,num_actions,update_freq,GCO,AESEVER)
    finally:
        # Close the run even when training raises, so a failed trial does not
        # leave its wandb run open.
        wandb.finish()

    return auroc

In [None]:
# Run the hyperparameter search: 100 trials maximizing the AUROC returned by `objective`.
# `device` is read as a global by `train`.
device = 'cpu'
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
# The study maximizes AUROC, so the best value is an AUROC, not a loss.
best_auroc = study.best_value
best_loss = best_auroc  # legacy name kept so any later cell referring to it still works

print("Best Hyperparameters:", best_params)
print("Best AUROC:", best_auroc)