## 0. 학습 세팅

### 1) 메모리 정리

In [1]:
import gc
import torch

# Run Python's garbage collector first, then ask PyTorch to release the CUDA
# memory it is holding in its cache.
gc.collect()
torch.cuda.empty_cache()

KeyboardInterrupt: 

### 2) 수정된 코드 자동 업데이트

In [None]:
%load_ext autoreload
%autoreload 2
import foolbox as fb

## 1. Load library

In [None]:
# 라이브러리 호출
import os
import time
from collections import Counter, defaultdict

import argparse
import easydict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torchmetrics.functional.classification import accuracy
from torch.utils.tensorboard import SummaryWriter

from src.engines import train, evaluate, epoch_time
from src.utils import load_checkpoint, save_checkpoint

## 2. Variable Declaration

In [None]:
# Outside Jupyter (script usage) -- kept commented out for reference.
# parser = argparse.ArgumentParser()
# parser.add_argument("--title", type=str, default="baseline")
# parser.add_argument("--device", type=str, default="cuda")
# parser.add_argument("--root", type=str, default="data")
# parser.add_argument("--batch_size", type=int, default=64)
# parser.add_argument("--num_workers", type=int, default=2)
# parser.add_argument("--epochs", type=int, default=100)
# parser.add_argument("--lr", type=float, default=0.001)
# parser.add_argument("--logs", type=str, default='logs')
# parser.add_argument("--checkpoints", type=str, default='checkpoints')
# parser.add_argument("--resume", type=bool, default=False)
# args = parser.parse_args()

# Inside Jupyter: all settings live in one EasyDict so they can be read as
# attributes (args.lr, args.device, ...).
# Entries marked "!!!" were flagged by the original author.
# "epochs", "lr" and "output_size" below are placeholders: they are overwritten
# right after the dict with the regression_* or classification_* values.
args = easydict.EasyDict(
    {
        "title": "VEEM_RNN_Reg",
        "learn_type": "regression",
        "device": "cuda",
        "batch_size": 32,  # !!!
        "num_workers": 2,
        "epochs": 50,  # !!!
        "lr": 0.01,  # !!!
        "logs": "logs",
        "checkpoints": "checkpoints",
        "resume": False,
        "test_ratio": 0.25,
        "input_size": 11,
        "hidden_size": 2,
        "num_layers": 1,
        "output_size": 5,
        "regression_output_size": 5,
        "classification_output_size": 2,
        "regression_lr": 0.01,
        "classification_lr": 3e-4,
        "regression_epochs": 500,
        "classification_epochs": 500,
    }
)

# Copy the task-specific values onto the generic keys read by the rest of the
# notebook. Any learn_type other than "regression" is treated as classification.
if args.learn_type != "regression":
    args.output_size = args.classification_output_size
    args.lr = args.classification_lr
    args.epochs = args.classification_epochs
else:
    args.output_size = args.regression_output_size
    args.lr = args.regression_lr
    args.epochs = args.regression_epochs

# Debug aid (disabled):
# print(args.output_size)

## 3. Model Define

### 1) 모델 정의

In [None]:
class RNN(nn.Module):
    """LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Number of values produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: inputs and outputs are [batch, sequence, feature].
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Run the LSTM over ``x`` and project its last time step.

        Args:
            x: Tensor of shape [batch_size, sequence_length, input_size].

        Returns:
            Tensor of shape [batch_size, output_size].
        """
        # Zero initial hidden/cell states, shape [num_layers, batch_size, hidden_size].
        # They are created on the input's own device and dtype so the module
        # does not depend on a notebook-level `args.device` and keeps working
        # when the model and data are moved to another device.
        # (A bidirectional LSTM would need 2 * num_layers in the first dimension.)
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
        c0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)

        # out: [batch_size, sequence_length, hidden_size]; final states are unused.
        out, _ = self.lstm(x, (h0, c0))

        # Keep only the last time step: [batch_size, hidden_size] -> [batch_size, output_size].
        return self.fc(out[:, -1, :])

### 2) 모델 + 옵티마이저 + 손실함수 + 스케쥴러 + 메트릭 함수 정의

In [None]:
# Model, moved to the configured device.
model = RNN(
    args.input_size,
    args.hidden_size,
    args.num_layers,
    args.output_size,
)
model = model.to(args.device)

# Adam optimizer over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# Cosine annealing of the learning rate over T_max scheduler steps.
# NOTE(review): the factor 37 looks like the number of scheduler steps per
# epoch -- confirm against how train() calls scheduler.step().
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=args.epochs * 37)

# Loss and metric: cross-entropy + accuracy for classification, otherwise
# mean squared error with no metric.
if args.learn_type != "classification":
    loss_fn, metric_fn = nn.MSELoss(), None
else:
    loss_fn, metric_fn = nn.CrossEntropyLoss(), accuracy

### 3) logger 정의

In [None]:
# TensorBoard writers: one run directory for training, one for testing,
# both named after the experiment title.
train_log_dir = f'{args.logs}/train/{args.title}'
test_log_dir = f'{args.logs}/test/{args.title}'
train_logger = SummaryWriter(train_log_dir)
test_logger = SummaryWriter(test_log_dir)

## 4. Data Preprocessing

## - Person, SNSB

### 1) 데이터셋 변환

### 2) 데이터셋 불러오기

In [None]:
# Read the subject-information table and the SNSB score table from CSV,
# then print the column dtypes of each.
Person_path = f'C:\\Users\\Bang\\JupyterProjects\\VEEM_Project\\data\\Person_SNSB\\VEEM 대상자 정보.csv'
SNSB_path = f'C:\\Users\\Bang\\JupyterProjects\\VEEM_Project\\data\\Person_SNSB\\VEEM SNSB 데이터.csv'

Person_dataset = pd.read_csv(Person_path)
SNSB_dataset = pd.read_csv(SNSB_path)

for banner, frame in (
    ("ㅡㅡㅡㅡㅡ[Person_dataset.dtypes]ㅡㅡㅡㅡㅡ", Person_dataset),
    ("ㅡㅡㅡㅡㅡ[SNSB_dataset.dtypes]ㅡㅡㅡㅡㅡ", SNSB_dataset),
):
    print(banner)
    print(frame.dtypes)

### 3) 데이터 자료형 변환

In [None]:
# Cast the listed columns of each table to float, then print the dtypes again.
for column in ('나이', '교육연한'):
    Person_dataset[column] = Person_dataset[column].astype(float)

for column in ('DST_F+B', 'S-K-BNT', 'SVLT_delayedrecall', 'K-TMT-E_B'):
    SNSB_dataset[column] = SNSB_dataset[column].astype(float)

for banner, frame in (
    ("ㅡㅡㅡㅡㅡ[Person_dataset.dtypes]ㅡㅡㅡㅡㅡ", Person_dataset),
    ("ㅡㅡㅡㅡㅡ[SNSB_dataset.dtypes]ㅡㅡㅡㅡㅡ", SNSB_dataset),
):
    print(banner)
    print(frame.dtypes)

### 4) 인덱스 지정

In [None]:
# Use the '번호' column as the row index of both tables (modified in place).
for frame in (Person_dataset, SNSB_dataset):
    frame.set_index('번호', inplace=True)

In [None]:
# Print a banner; the bare expression on the last line is what the notebook
# renders as the cell output.
print("ㅡㅡㅡㅡㅡ[Person_dataset]ㅡㅡㅡㅡㅡ")
Person_dataset

In [None]:
# Print a banner; the bare expression on the last line is what the notebook
# renders as the cell output.
print("ㅡㅡㅡㅡㅡ[SNSB_dataset]ㅡㅡㅡㅡㅡ")
SNSB_dataset

### 5) 필요한 피쳐 추출

In [None]:
# Slice the tables by column position.
# SNSB: columns 2..6, keeping only rows without missing values.
# NOTE(review): dropping rows here changes row positions; code that later
# indexes the SNSB tensor by position should be checked against this.
SNSB_all_dataset = SNSB_dataset.iloc[:, 2:7].dropna(axis=0)

# Person: column 1 holds the label used below ('집단'); columns 2.. are the rest.
Person_dementia_dataset = Person_dataset.iloc[:, 1:2]
Person_all_dataset = Person_dataset.iloc[:, 2:]

In [None]:
# Print a banner; the bare expression on the last line is what the notebook
# renders as the cell output.
print("ㅡㅡㅡㅡㅡ[SNSB_all_dataset]ㅡㅡㅡㅡㅡ")
SNSB_all_dataset

In [None]:
# Print a banner; the bare expression on the last line is what the notebook
# renders as the cell output.
print("ㅡㅡㅡㅡㅡ[Person_all_dataset]ㅡㅡㅡㅡㅡ")
Person_all_dataset

In [None]:
# Print a banner; the bare expression on the last line is what the notebook
# renders as the cell output.
print("ㅡㅡㅡㅡㅡ[Person_dementia_dataset]ㅡㅡㅡㅡㅡ")
Person_dementia_dataset

### 6) 데이터 변환

#### (1) Person_dementia_dataset : pandas -> list -> 문자열 임베딩 -> torch 변환

In [None]:
# Debug aid (disabled):
# for idx in Person_dementia_dataset.index:
#     print(Person_dementia_dataset.loc[idx,'집단'])

# Step 1: group names as a plain Python list.
Person_dementia_dataset_list = Person_dementia_dataset['집단'].values.tolist()
print("Person_dementia_dataset_list : ", Person_dementia_dataset_list)

# Step 2: map each group name to its integer code.
# A name missing from the table raises KeyError.
embedding_table = {'HC': 0, 'MCI': 1}
Person_dementia_dataset_embedding_list = [
    embedding_table[group] for group in Person_dementia_dataset_list
]
print("Person_dementia_dataset_embedding_list : ", Person_dementia_dataset_embedding_list)

# Step 3: integer codes as a tensor (torch.Tensor(...) yields float32).
# NOTE(review): nn.CrossEntropyLoss normally expects integer class indices --
# confirm that the training code converts these labels.
Person_dementia_dataset_torch = torch.Tensor(Person_dementia_dataset_embedding_list)
print("Person_dementia_dataset_torch : ", Person_dementia_dataset_torch)

#### (2) SNSB_all_dataset : pandas -> numpy -> torch 변환

In [None]:
# DataFrame -> NumPy array -> float32 tensor.
SNSB_all_dataset_np = SNSB_all_dataset.to_numpy()
SNSB_all_dataset_torch = torch.from_numpy(SNSB_all_dataset_np).to(torch.float32)

# Debug aid (disabled):
# print("ㅡㅡㅡㅡㅡ[SNSB_all_dataset_torch.shape]ㅡㅡㅡㅡㅡ")
# print(SNSB_all_dataset_torch.shape)
#
# print("ㅡㅡㅡㅡㅡ[SNSB_all_dataset_torch]ㅡㅡㅡㅡㅡ")
# print(SNSB_all_dataset_torch)

In [None]:
# Person_all_dataset_np = Person_all_dataset.to_numpy()
# Person_all_dataset_torch = torch.from_numpy(Person_all_dataset_np).float()

# print("ㅡㅡㅡㅡㅡ[Person_all_dataset_torch.shape]ㅡㅡㅡㅡㅡ")
# print(Person_all_dataset_torch.shape)

# print("ㅡㅡㅡㅡㅡ[Person_all_dataset_torch]ㅡㅡㅡㅡㅡ")
# print(Person_all_dataset_torch)

## - eyerpt, rpt

### 1) 데이터셋 변환

### 2) 데이터셋 불러오기

In [None]:
# Output containers, one per split and file kind. Each maps a key to a list
# of per-file entries and is filled by the preprocessing function.
train_eyerpt_all_dataset = defaultdict(list)
test_eyerpt_all_dataset = defaultdict(list)
train_rpt_all_dataset = defaultdict(list)
test_rpt_all_dataset = defaultdict(list)

# Directory holding both the eyerpt and the rpt files.
eyerpt_rpt_path = f'C:\\Users\\Bang\\JupyterProjects\\VEEM_Project\\data\\rpt\\'

# Every file name in that directory, in sorted order.
eyerpt_rpt_files_name = sorted(os.listdir(eyerpt_rpt_path))

print("eyerpt_rpt_files_name : ", eyerpt_rpt_files_name)

# Files whose name contains "eye" are eyerpt files; all others are rpt files.
eyerpt_files_name = [fname for fname in eyerpt_rpt_files_name if "eye" in fname]
rpt_files_name = [fname for fname in eyerpt_rpt_files_name if "eye" not in fname]

print("eyerpt_files_name : ", eyerpt_files_name)
print("rpt_files_name : ", rpt_files_name)

### 3) train test split

In [None]:
# Split each sorted file list by ratio: the first part is the training set and
# the trailing int(count * test_ratio) files are the test set (no shuffling).

# --- eyerpt files ---
all_eyerpt_count = len(eyerpt_files_name)
test_eyerpt_count = int(all_eyerpt_count * args.test_ratio)
train_eyerpt_count = all_eyerpt_count - test_eyerpt_count

train_eyerpt_files_name, test_eyerpt_files_name = (
    eyerpt_files_name[:train_eyerpt_count],
    eyerpt_files_name[train_eyerpt_count:],
)
# Debug aid (disabled):
# print("train_eyerpt_files_name : ", train_eyerpt_files_name)
# print("len(train_eyerpt_files_name) : ", len(train_eyerpt_files_name))
# print("test_eyerpt_files_name : ", test_eyerpt_files_name)
# print("len(test_eyerpt_files_name) : ", len(test_eyerpt_files_name))

# --- rpt files ---
all_rpt_count = len(rpt_files_name)
test_rpt_count = int(all_rpt_count * args.test_ratio)
train_rpt_count = all_rpt_count - test_rpt_count

train_rpt_files_name, test_rpt_files_name = (
    rpt_files_name[:train_rpt_count],
    rpt_files_name[train_rpt_count:],
)
# Debug aid (disabled):
# print("train_rpt_files_name : ", train_rpt_files_name)
# print("len(train_rpt_files_name) : ", len(train_rpt_files_name))
# print("test_rpt_files_name : ", test_rpt_files_name)
# print("len(test_rpt_files_name) : ", len(test_rpt_files_name))

### 4) 전처리 함수 정의

In [None]:
# Preprocessing of the eyerpt / rpt CSV files of one split.
def eyerpt_rpt_preprocessing(name):
    """Load, clean and tensorize every CSV file of one split.

    ``name`` selects two notebook-level globals: ``{name}_files_name`` (the
    list of file names to read) and ``{name}_all_dataset`` (the
    ``defaultdict(list)`` that receives the results). For each file the entry
    ``[tensor, sequence_length, label]`` is appended to
    ``all_dataset[file_name[0:2]]``. The container is modified in place and
    nothing is returned.

    Also reads the globals ``eyerpt_rpt_path``, ``args.learn_type`` and,
    depending on the learning type, ``SNSB_all_dataset_torch`` or
    ``Person_dementia_dataset_torch``.

    Args:
        name: Split identifier, e.g. "train_eyerpt", "test_eyerpt",
            "train_rpt" or "test_rpt".

    Raises:
        ValueError: If the globals belonging to ``name`` do not exist.
    """
    print("name : ", name)

    # (1) Look the split's containers up by name. A plain dictionary lookup
    # replaces the former eval()/exec() calls, which would have executed
    # arbitrary text passed in as `name`.
    namespace = globals()
    try:
        files_name = namespace[f"{name}_files_name"]
        all_dataset = namespace[f"{name}_all_dataset"]
    except KeyError as err:
        raise ValueError(f"unknown dataset name: {name!r}") from err

    # Loop invariants: file kind and learning type do not change per file.
    is_eyerpt = name in ("train_eyerpt", "test_eyerpt")
    is_regression = args.learn_type == "regression"
    if is_eyerpt:
        # Column names are kept exactly as in the original code
        # (presumably they match the CSV headers, including 'Obeject_name').
        dropped_columns = ['time', 'beforeOBJ', 'presentOBJ', 'Obeject_name']
        panel_column = 'Panel_num'
    else:
        dropped_columns = ['total_task_time_s', 'hand_x_rotation_deg', 'hand_y_rotation_deg', 'hand_z_rotation_deg']
        panel_column = 'panel_num'

    for i, file_name in enumerate(files_name):
        # (2) Read the file and use its 'time stamp' column as the index.
        dataset = pd.read_csv(eyerpt_rpt_path + file_name)
        dataset.set_index('time stamp', inplace=True)

        # (3) Drop the unused columns, then every row with a missing value.
        df = dataset.drop(dropped_columns, axis=1).dropna(axis=0)

        # (4) Keep only rows whose panel number is below 8; the original
        # comment describes the remaining rows as data recorded after the
        # experiment ended.
        df_drop8 = df[df[panel_column] < 8]

        # (5) DataFrame -> NumPy -> tensor. Only rpt data is cast to float32.
        # NOTE(review): eyerpt tensors keep the NumPy dtype -- confirm whether
        # they should be cast to float32 as well before being fed to a model.
        df_drop8_torch = torch.from_numpy(df_drop8.to_numpy())
        if not is_eyerpt:
            df_drop8_torch = df_drop8_torch.float()

        # (6) Sequence length = number of remaining rows.
        df_sequence_length = len(df_drop8_torch)

        # (7) Label for this file, shaped to match the model output.
        # NOTE(review): `i` is the file's position inside this split's list,
        # so a test split starts again at label row 0 -- confirm that label
        # rows really line up with the files of each split.
        if is_regression:
            label = SNSB_all_dataset_torch[i, :]
            label = label.reshape(-1, len(label))  # [n] -> [1, n]
        else:
            label = Person_dementia_dataset_torch[i].reshape(1)

        # (8) Group the entry under the first two characters of the file name.
        all_dataset[file_name[0:2]].append([df_drop8_torch, df_sequence_length, label])

    # (9) `all_dataset` is the global container itself, so it is already
    # updated; report how many keys it holds.
    print("len(all_dataset) : ", len(all_dataset))

### 5) 전처리

In [None]:
# Preprocess the four splits in order; each call fills the matching
# <split>_all_dataset container.
for split_name in ("train_eyerpt", "test_eyerpt", "train_rpt", "test_rpt"):
    eyerpt_rpt_preprocessing(split_name)

# Number of keys stored per container.
print("len(train_eyerpt_all_dataset) : ", len(train_eyerpt_all_dataset))
print("len(test_eyerpt_all_dataset) : ", len(test_eyerpt_all_dataset))
print("len(train_rpt_all_dataset) : ", len(train_rpt_all_dataset))
print("len(test_rpt_all_dataset) : ", len(test_rpt_all_dataset))

### 3) 데이터 자료형 변환

rpt_dataset['panel_num'] = rpt_dataset['panel_num'].astype(float)
rpt_dataset['error'] = rpt_dataset['error'].astype(float)

print("ㅡㅡㅡㅡㅡ[eyerpt_dataset.dtypes]ㅡㅡㅡㅡㅡ")
print(eyerpt_dataset.dtypes)
print("ㅡㅡㅡㅡㅡ[rpt_dataset.dtypes]ㅡㅡㅡㅡㅡ")
print(rpt_dataset.dtypes)

## 5. Model Train

### 1) Load model epoch

In [None]:
# Start from epoch 0 unless resuming, in which case load_checkpoint supplies
# the epoch to continue from.
start_epoch = (
    load_checkpoint(args.checkpoints, args.title, model, optimizer)
    if args.resume
    else 0
)

### 2) Train model

In [None]:
# Regression runs log and print the loss only; other runs also report the metric.
is_regression = args.learn_type == "regression"

for epoch in range(start_epoch, args.epochs):
    epoch_number = epoch + 1

    # Wall-clock timing of this epoch.
    start_time = time.monotonic()

    # Train for one epoch on the rpt training data.
    train_summary = train(
        train_rpt_all_dataset.items(),
        args.learn_type,
        args.input_size,
        model,
        optimizer,
        scheduler,
        loss_fn,
        metric_fn,
        args.device,
    )

    # TensorBoard logging.
    train_logger.add_scalar('Loss', train_summary['loss'], epoch_number)
    if not is_regression:
        train_logger.add_scalar('Accuracy', train_summary['metric'], epoch_number)

    # Checkpoint after every epoch.
    save_checkpoint(args.checkpoints, args.title, model, optimizer, epoch_number)

    end_time = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Console log.
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    if not is_regression:
        print(f'\t Train Loss: {train_summary["loss"]:.3f} | Train Acc: {train_summary["metric"]:.2f}%')
    else:
        print(f'\t Train Loss: {train_summary["loss"]:.3f}')
    print(f'\t scheduled_lr : {scheduler.get_last_lr()[0]}')

# Save the final weights next to the notebook.
torch.save(model.state_dict(), f"{args.title}.ckpt")

## 6. Model Test

In [None]:
# Load the weights saved at the end of training. map_location places the
# tensors on the configured device regardless of where they were saved from.
model.load_state_dict(torch.load(f"{args.title}.ckpt", map_location=args.device))

# Measure performance on the rpt test data.
test_summary = evaluate(test_rpt_all_dataset.items(), args.learn_type, args.input_size, model, loss_fn, metric_fn, args.device)

# Log under the final epoch number. args.epochs is the value `epoch + 1` has
# after a complete training run, but it does not depend on the training loop's
# leaked loop variable, so this cell also works when that loop was not run in
# the current session.
final_step = args.epochs
test_logger.add_scalar('Loss', test_summary['loss'], final_step)

if args.learn_type == "regression":
    # Regression reports the loss only.
    print(f'\t Test Loss: {test_summary["loss"]:.3f}')
else:
    test_logger.add_scalar('Accuracy', test_summary['metric'], final_step)
    print(f'\t Test Loss: {test_summary["loss"]:.3f} | Test Acc: {test_summary["metric"]:.2f}%')