In [1]:
# Mount Google Drive (Colab only); every data/checkpoint path below lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
##### pytorch #####
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


##### 시각화 #####
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns 

##### 기본 모듈 #####
import pandas as pd
import numpy as np
import os
import random
import json
import math
import easydict
from pprint import pprint
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

##### 디버깅 #####
import pdb

##### cuda #####
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # use the GPU when one is available, otherwise fall back to the CPU

##### 경고무시 #####
import warnings
warnings.filterwarnings(action='ignore')

In [4]:
# Debugging aid: ask CUDA to launch kernels synchronously.
# NOTE(review): presumably enabled while chasing the device-side assert mentioned after collate_fn; consider removing for normal runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [5]:
import sentencepiece as spm

# Load the SentencePiece model that converts raw text into token ids for the whole notebook.
vocab_file = "/content/drive/MyDrive/2.Study/GPT/Data/kowiki.model"
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)

True

In [6]:
# Hyper-parameters and file paths shared by the dataset, the model and the training loop.
config = easydict.EasyDict({
    "n_dec_vocab": len(vocab),  # vocabulary size, taken from the loaded SentencePiece model
    "n_dec_seq": 256,  # Decoder builds its position table with n_dec_seq + 1 rows (row 0 is used for padding)
    "n_layer": 6,  # number of stacked DecoderLayer modules
    "d_hidn": 256,  # hidden / embedding width
    "i_pad": 0,  # token id treated as padding when building masks and positions
    "d_ff": 1024,  # NOTE(review): not read by any visible code; PoswiseFeedForwardNet uses d_hidn * 4 instead
    "n_head": 4,  # attention heads per layer
    "d_head": 64,  # width of each attention head
    "dropout": 0.1,  # dropout probability used throughout the model
    'n_output' : 5,  # classifier output size. NOTE(review): the dataset cell reports labels {0, 1} only
    "layer_norm_epsilon": 1e-12,  # eps passed to nn.LayerNorm
    'batch_size' : 128,  # batch size for both DataLoaders
    'max_len' : 256,  # NOTE(review): not read by any visible code
    'save_pretrain' : "/content/drive/MyDrive/2.Study/GPT/Data/save_gpt_pretrain.pth",  # pretrained GPT checkpoint loaded before fine-tuning
    'save_base' : "/content/drive/MyDrive/2.Study/GPT/Data/save_base.pth",  # best checkpoint of the run trained from scratch
    'save_fine' : "/content/drive/MyDrive/2.Study/GPT/Data/save_fine.pth"  # best checkpoint of the fine-tuned run
                })

In [7]:
# Load the labelled corpus, drop incomplete rows and shuffle it.
data = pd.read_csv('/content/drive/MyDrive/2.Study/GPT/Data/final_emotional_dataset.csv', index_col = 0)
# data = pd.read_csv('/content/drive/MyDrive/1. AIFFEL/AIFFELTON/data/preprocessing_emotional_review.csv', index_col = 0)
data = data.dropna()
# NOTE(review): sample() is called without random_state, so the shuffle (and the train/test split below) changes on every run.
data = data.sample(frac = 1).reset_index(drop = True)
# Show which label values are present.
data['label'].unique()

array([0, 1])

In [8]:
# 80/20 split by position; rows were already shuffled in the previous cell.
bound = int(len(data) * 0.8)
train_data = data[ : bound ]
test_data = data[ bound : ]

# Reset both indexes to 0..n-1 (Custom_dataset indexes rows by integer label).
train_data = train_data.reset_index(drop = True)
test_data = test_data.reset_index(drop = True)

print(len(train_data))
print(len(test_data))

82037
20510


In [9]:
# Print the index of the first row whose text tokenizes to 300 or more ids, then stop.
for i in range(len(data)):
  if len(vocab.encode_as_ids(data['content'][i])) >= 300:
    print(i)
    break

105


---
# Dataset & DataLoader

---

In [10]:
class Custom_dataset(Dataset):
  """Map-style dataset yielding (label, token ids) pairs for sequence classification.

  Every text in ``data['content']`` is tokenized with ``vocab``, truncated to
  ``max_tokens`` ids and wrapped in the ``[BOS]`` / ``[EOS]`` special tokens, so
  a stored sequence holds at most ``max_tokens + 2`` ids.

  Args:
    vocab: tokenizer exposing ``encode_as_ids(text)`` and ``piece_to_id(piece)``.
    data: DataFrame with a ``content`` column (text) and a ``label`` column.
      Rows are read by position, so the index does not have to be 0..n-1.
    max_tokens: maximum number of text tokens kept per sample (before the two
      special tokens are added). Defaults to 250, the value previously
      hard-coded here.
  """

  def __init__(self, vocab, data, max_tokens=250):
    self.vocab = vocab
    self.label = []
    self.sentence = []

    # The special-token ids never change, so look them up once instead of per row.
    bos_id = vocab.piece_to_id('[BOS]')
    eos_id = vocab.piece_to_id('[EOS]')

    # Iterate positionally: label-based access (data['content'][i]) breaks on any
    # frame whose index is not exactly 0..n-1 (e.g. a slice that was not reset).
    for text, label in zip(data['content'].to_numpy(), data['label'].to_numpy()):
      token_ids = vocab.encode_as_ids(text)[:max_tokens]
      self.sentence.append([bos_id] + token_ids + [eos_id])
      self.label.append(label)

  def __len__(self):
    """Return the number of samples."""
    return len(self.label)

  def __getitem__(self, idx):
    """Return ``(label_tensor, token_id_tensor)`` for sample ``idx``."""
    return (torch.tensor(self.label[idx]), torch.tensor(self.sentence[idx]))


def collate_fn(inputs):
  """Collate ``(label, token_ids)`` samples into a padded batch.

  Args:
    inputs: sequence of ``(label_tensor, token_id_tensor)`` pairs.

  Returns:
    A two-element list ``[labels, dec_inputs]``: the stacked labels and the
    token-id sequences right-padded with 0 to the longest sequence in the
    batch (batch dimension first).
  """
  label_tensors, id_tensors = zip(*inputs)

  # Right-pad every sequence with 0 up to the longest one in this batch.
  padded_ids = torch.nn.utils.rnn.pad_sequence(id_tensors, batch_first=True, padding_value=0)

  return [torch.stack(label_tensors, dim=0), padded_ids]

# Open question: is "RuntimeError: CUDA error: device-side assert triggered" caused by sequences longer than the maximum length of 256?

In [11]:
# Tokenize both splits once and wrap them in DataLoaders; only the training loader shuffles.
train_dataset = Custom_dataset(vocab, train_data)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = Custom_dataset(vocab, test_data)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=config.batch_size, shuffle=False, collate_fn=collate_fn)

In [12]:
# class MovieDataSet(torch.utils.data.Dataset):
#     def __init__(self, vocab, infile):
#         self.vocab = vocab
#         self.labels = []
#         self.sentences = []

#         line_cnt = 0
#         with open(infile, "r") as f:
#             for line in f:
#                 line_cnt += 1

#         with open(infile, "r") as f:
#             for i, line in enumerate(tqdm(f, total=line_cnt, desc=f"Loading {infile}", unit=" lines")):
#                 data = json.loads(line)
#                 self.labels.append(data["label"])
#                 self.sentences.append([vocab.piece_to_id("[BOS]")] + [vocab.piece_to_id(p) for p in data["doc"]] + [vocab.piece_to_id("[EOS]")])
    
#     def __len__(self):
#         assert len(self.labels) == len(self.sentences)
#         return len(self.labels)
    
#     def __getitem__(self, item):
#         return (torch.tensor(self.labels[item]),
#                 torch.tensor(self.sentences[item]))
        
# def movie_collate_fn(inputs):
#     labels, dec_inputs = list(zip(*inputs))

#     dec_inputs = torch.nn.utils.rnn.pad_sequence(dec_inputs, batch_first=True, padding_value=0)

#     batch = [
#         torch.stack(labels, dim=0),
#         dec_inputs,
#     ]
#     return batch        

In [13]:
# batch_size = 128
# train_dataset = MovieDataSet(vocab, "/content/drive/MyDrive/2.Study/GPT/Data/ratings_train.json")
# train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=movie_collate_fn)
# test_dataset = MovieDataSet(vocab, "/content/drive/MyDrive/2.Study/GPT/Data/ratings_test.json")
# test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=movie_collate_fn)

---
# Model
---

In [14]:
class MultiHeadAttention(nn.Module):
    """Multi-head attention block.

    Projects the inputs to ``n_head`` heads of width ``d_head``, runs scaled
    dot-product attention per head, concatenates the heads and projects the
    result back to ``d_hidn``.
    """

    def __init__(self, config):
        super().__init__()

        self.config = config
        # Width of all heads laid side by side.
        all_heads_dim = self.config.n_head * self.config.d_head

        # One projection each for queries, keys and values: d_hidn -> n_head * d_head.
        self.W_Q = nn.Linear(self.config.d_hidn, all_heads_dim)
        self.W_K = nn.Linear(self.config.d_hidn, all_heads_dim)
        self.W_V = nn.Linear(self.config.d_hidn, all_heads_dim)

        self.scaled_dot_attn = ScaledDotProductAttention(self.config)
        # Output projection: n_head * d_head -> d_hidn.
        self.linear = nn.Linear(all_heads_dim, self.config.d_hidn)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, Q, K, V, attn_mask):
        """Attend from ``Q`` to ``K``/``V``.

        Args:
            Q, K, V: tensors of shape (bs, seq_len, d_hidn).
            attn_mask: (bs, q_seq_len, k_seq_len) mask; it is copied once per
                head and handed to the attention module.

        Returns:
            ``(output, attn_prob)`` with shapes (bs, q_seq_len, d_hidn) and
            (bs, n_head, q_seq_len, k_seq_len).
        """
        batch_size = Q.size(0)
        n_head, d_head = self.config.n_head, self.config.d_head

        def split_heads(projected):
            # (bs, seq_len, n_head * d_head) -> (bs, n_head, seq_len, d_head)
            return projected.view(batch_size, -1, n_head, d_head).transpose(1, 2)

        q_s = split_heads(self.W_Q(Q))
        k_s = split_heads(self.W_K(K))
        v_s = split_heads(self.W_V(V))

        # Give every head its own copy of the mask: (bs, n_head, q_seq_len, k_seq_len).
        head_mask = attn_mask.unsqueeze(1).repeat(1, n_head, 1, 1)

        # context: (bs, n_head, q_seq_len, d_head); attn_prob: (bs, n_head, q_seq_len, k_seq_len)
        context, attn_prob = self.scaled_dot_attn(q_s, k_s, v_s, head_mask)

        # Concatenate the heads again: (bs, q_seq_len, n_head * d_head).
        merged = context.transpose(1, 2).contiguous().view(batch_size, -1, n_head * d_head)

        # Project back to the model width: (bs, q_seq_len, d_hidn).
        output = self.dropout(self.linear(merged))

        return output, attn_prob

In [15]:
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention: softmax(Q K^T / sqrt(d_head)) V with masking."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.dropout)
        # 1 / sqrt(d_head): scales the raw dot products before the softmax.
        self.scale = 1 / (self.config.d_head ** 0.5)

    def forward(self, Q, K, V, attn_mask):
        """Compute attention.

        Args:
            Q, K, V: per-head tensors of shape (bs, n_head, seq_len, d_head).
            attn_mask: boolean tensor (bs, n_head, n_q_seq, n_k_seq); positions
                set to True receive a score of -1e9 before the softmax.

        Returns:
            ``(context, attn_prob)`` with shapes (bs, n_head, n_q_seq, d_head)
            and (bs, n_head, n_q_seq, n_k_seq).
        """
        # Scaled similarity of every query with every key: (bs, n_head, n_q_seq, n_k_seq).
        scores = torch.matmul(Q, K.transpose(-1, -2)) * self.scale
        scores = scores.masked_fill(attn_mask, -1e9)

        # Normalise over the key axis, then apply dropout to the attention weights.
        attn_prob = self.dropout(torch.softmax(scores, dim=-1))

        # Weighted sum of the values: (bs, n_head, n_q_seq, d_head).
        context = torch.matmul(attn_prob, V)

        return context, attn_prob

In [16]:
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward block built from two kernel-size-1 convolutions.

    The inner width is ``4 * d_hidn``; ``config.d_ff`` is not consulted here.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        model_dim = self.config.d_hidn
        inner_dim = model_dim * 4

        self.conv1 = nn.Conv1d(in_channels=model_dim, out_channels=inner_dim, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=inner_dim, out_channels=model_dim, kernel_size=1)
        self.active = F.gelu  # non-linearity between the two projections
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, inputs):
        """Map (bs, n_seq, d_hidn) to (bs, n_seq, d_hidn)."""
        # Conv1d expects channels before length: (bs, d_hidn, n_seq).
        channels_first = inputs.transpose(1, 2)

        expanded = self.active(self.conv1(channels_first))  # (bs, 4 * d_hidn, n_seq)

        # Project back and restore the (bs, n_seq, d_hidn) layout.
        projected = self.conv2(expanded).transpose(1, 2)
        return self.dropout(projected)

In [17]:
# Build the sinusoidal position-encoding table.

def get_sinusoid_encoding_table(n_seq, d_hidn):
    """Return the (n_seq, d_hidn) sinusoidal position-encoding table.

    Entry ``[pos, i]`` is computed from the angle
    ``pos / 10000 ** (2 * (i // 2) / d_hidn)``; even columns hold its sine and
    odd columns its cosine, so each position gets a distinct periodic pattern.

    Args:
        n_seq: number of positions (rows). ``0`` yields an empty (0, d_hidn)
            table instead of failing.
        d_hidn: embedding width (columns).

    Returns:
        ``numpy.ndarray`` of dtype float64 and shape (n_seq, d_hidn).
    """
    # Per-column divisor 10000 ** (2 * (i // 2) / d_hidn); columns 2k and 2k+1 share one.
    dims = np.arange(d_hidn)
    divisors = np.power(10000, 2 * (dims // 2) / d_hidn)

    # Broadcast positions (rows) against divisors (columns) to get every angle at once.
    sinusoid_table = np.arange(n_seq)[:, None] / divisors[None, :]

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # even index sin
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # odd index cos
    return sinusoid_table

In [18]:
# Build the mask that marks padded key positions.
def get_attn_pad_mask(seq_q, seq_k, i_pad):
    """Return a boolean mask that is True wherever the key token is padding.

    Args:
        seq_q: (bs, len_q) token ids of the queries; only its length is used.
        seq_k: (bs, len_k) token ids of the keys.
        i_pad: token id that denotes padding.

    Returns:
        Boolean tensor of shape (bs, len_q, len_k); row ``q`` of every sample
        repeats the key padding pattern.
    """
    _, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()

    # True at padded key positions: (bs, len_k).
    key_is_pad = seq_k.data == i_pad

    # Repeat that pattern for every query position without copying memory.
    return key_is_pad.unsqueeze(1).expand(batch_size, len_q, len_k)

In [19]:
# Causal (look-ahead) mask for decoder self-attention.
def get_attn_decoder_mask(seq):
    """Return the strictly upper-triangular mask for a (bs, n_seq) input.

    The result has shape (bs, n_seq, n_seq) and the dtype of ``seq``. Entries
    above the main diagonal (key position after query position) are 1; the
    diagonal and everything below it are 0.
    """
    n_seq = seq.size(1)

    # All-ones cube with one (n_seq, n_seq) square per sample.
    all_ones = torch.ones_like(seq)[..., None].expand(seq.size(0), n_seq, n_seq)

    # Keep only the part strictly above the diagonal, i.e. the future positions.
    return torch.triu(all_ones, diagonal=1)

In [20]:
class DecoderLayer(nn.Module):
  """One decoder block: masked self-attention then a feed-forward net.

  Each sub-layer is wrapped in a residual connection followed by LayerNorm.
  """

  def __init__(self, config):
    super().__init__()
    self.config = config
    norm_eps = self.config.layer_norm_epsilon

    self.self_attn = MultiHeadAttention(self.config)
    # LayerNorm normalises over the last (d_hidn) dimension.
    self.layer_norm1 = nn.LayerNorm(self.config.d_hidn, eps=norm_eps)

    self.pos_ffn = PoswiseFeedForwardNet(self.config)
    self.layer_norm3 = nn.LayerNorm(self.config.d_hidn, eps=norm_eps)

  def forward(self, dec_inputs, self_attn_mask):
    """Run the block.

    Args:
      dec_inputs: (bs, n_dec_seq, d_hidn) hidden states.
      self_attn_mask: (bs, n_dec_seq, n_dec_seq) mask passed to self-attention.

    Returns:
      ``(outputs, self_attn_prob)`` with shapes (bs, n_dec_seq, d_hidn) and
      (bs, n_head, n_dec_seq, n_dec_seq).
    """
    # Self-attention: queries, keys and values are all the block input.
    attn_out, self_attn_prob = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)
    attended = self.layer_norm1(dec_inputs + attn_out)

    # Position-wise feed-forward with its own residual + norm.
    ffn_out = self.pos_ffn(attended)
    outputs = self.layer_norm3(attended + ffn_out)

    return outputs, self_attn_prob

In [21]:
class Decoder(nn.Module):
  """Stack of decoder layers on top of token and sinusoidal position embeddings."""

  def __init__(self, config):
    super().__init__()
    self.config = config

    self.dec_emb = nn.Embedding(self.config.n_dec_vocab, self.config.d_hidn)

    # Frozen sinusoidal table with n_dec_seq + 1 rows; row 0 is the one padding looks up.
    position_table = get_sinusoid_encoding_table(self.config.n_dec_seq + 1, self.config.d_hidn)
    self.pos_emb = nn.Embedding.from_pretrained(torch.FloatTensor(position_table), freeze=True)

    # One DecoderLayer per configured layer.
    self.layers = nn.ModuleList([DecoderLayer(self.config) for _ in range(self.config.n_layer)])

  def forward(self, dec_inputs):
    """Encode a batch of token ids.

    Args:
      dec_inputs: (bs, n_dec_seq) integer token ids, padded with ``config.i_pad``.

    Returns:
      ``(dec_outputs, self_attn_probs)``: the final hidden states
      (bs, n_dec_seq, d_hidn) and a list with one attention tensor
      (bs, n_head, n_dec_seq, n_dec_seq) per layer.
    """
    batch_size, seq_len = dec_inputs.size(0), dec_inputs.size(1)

    # Positions count from 1; padded tokens get position 0.
    is_pad = dec_inputs.eq(self.config.i_pad)
    positions = (torch.arange(seq_len, device=dec_inputs.device, dtype=dec_inputs.dtype) + 1).expand(batch_size, seq_len).masked_fill(is_pad, 0)

    # Token embedding plus position embedding: (bs, n_dec_seq, d_hidn).
    dec_outputs = self.dec_emb(dec_inputs) + self.pos_emb(positions)

    # A key is hidden when it is padding OR lies after the query position.
    pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, self.config.i_pad)  # (bs, n_dec_seq, n_dec_seq)
    causal_mask = get_attn_decoder_mask(dec_inputs)                          # (bs, n_dec_seq, n_dec_seq)
    dec_self_attn_mask = (pad_mask + causal_mask) > 0                        # (bs, n_dec_seq, n_dec_seq)

    self_attn_probs = []
    for decoder_layer in self.layers:
        dec_outputs, layer_attn_prob = decoder_layer(dec_outputs, dec_self_attn_mask)
        self_attn_probs.append(layer_attn_prob)

    return dec_outputs, self_attn_probs

In [22]:
class GPT(nn.Module):
    """Decoder-only transformer body with checkpoint save/load helpers."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.decoder = Decoder(self.config)

    def forward(self, dec_inputs):
        """Run the decoder on (bs, n_seq) token ids.

        Returns:
            ``(dec_outputs, dec_self_attn_probs)``: hidden states of shape
            (bs, n_seq, d_hidn) and the per-layer attention tensors
            [(bs, n_head, n_dec_seq, n_dec_seq)].
        """
        dec_outputs, dec_self_attn_probs = self.decoder(dec_inputs)

        return dec_outputs, dec_self_attn_probs

    def save(self, epoch, loss, path):
        """Write ``epoch``, ``loss`` and this module's ``state_dict`` to ``path``."""
        torch.save({
            "epoch": epoch,
            "loss": loss,
            "state_dict": self.state_dict()
        }, path)

    def load(self, path):
        """Restore the weights stored under ``"state_dict"`` in the checkpoint at ``path``.

        The checkpoint is first mapped to CPU so that a file written on a GPU
        machine also loads on a CPU-only runtime; ``load_state_dict`` then
        copies the values into this module's parameters on whatever device
        they currently live.

        Returns:
            The result of ``load_state_dict`` (missing / unexpected keys).
        """
        # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
        checkpoint = torch.load(path, map_location="cpu")

        return self.load_state_dict(checkpoint["state_dict"])

---
# Fine-Tuning Code
---

In [23]:
class Classification_model(nn.Module):
    """GPT body with a language-model head and a classification head."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.gpt = GPT(self.config)

        # Language-model head: hidden state -> vocabulary logits.
        self.projection_lm = nn.Linear(self.config.d_hidn, self.config.n_dec_vocab, bias=False)

        # Weight tying: the LM head shares its matrix with the decoder's token
        # embedding, so input and output word representations stay the same
        # tensor and no separate output matrix has to be learned.
        self.projection_lm.weight = self.gpt.decoder.dec_emb.weight

        # Classification head: hidden state -> n_output class logits.
        self.projection_cls = nn.Linear(self.config.d_hidn, self.config.n_output, bias=False)

    def forward(self, dec_inputs):
        """Compute LM and classification logits for (bs, n_dec_seq) token ids.

        Returns:
            ``(logits_lm, logits_cls, dec_self_attn_probs)`` with shapes
            (bs, n_dec_seq - 1, n_dec_vocab), (bs, n_output) and
            [(bs, n_head, n_dec_seq, n_dec_seq)].
        """
        # (bs, n_dec_seq, d_hidn), [(bs, n_head, n_dec_seq, n_dec_seq)]
        hidden_states, dec_self_attn_probs = self.gpt(dec_inputs)

        # LM logits for every position except the last: (bs, n_dec_seq - 1, n_dec_vocab).
        logits_lm = self.projection_lm(hidden_states)[:, :-1, :].contiguous()

        # The classifier reads the hidden state at the final position of the batch.
        # NOTE(review): for sequences shorter than the batch maximum that position is padding - confirm this is intended.
        last_hidden = hidden_states[:, -1].contiguous()  # (bs, d_hidn)
        logits_cls = self.projection_cls(last_hidden)    # (bs, n_output)

        return logits_lm, logits_cls, dec_self_attn_probs

---
# Train & Evaluate
---

In [24]:
def train_epoch(config, epoch, model, criterion_cls, optimizer, train_loader):
  """Train ``model`` for one pass over ``train_loader``.

  Only the classification logits (second element of the model output) are
  used for the loss.

  Args:
    config: accepted for signature symmetry; not read here.
    epoch: epoch number shown in the progress-bar label.
    model: module returning a tuple whose element 1 holds the class logits.
    criterion_cls: loss called as ``criterion_cls(logits, labels)``.
    optimizer: optimizer stepped once per batch.
    train_loader: iterable of ``[labels, dec_inputs]`` batches.

  Returns:
    Mean of the per-batch loss values.
  """
  batch_losses = []
  model.train()

  with tqdm(total=len(train_loader), desc=f"Train({epoch})") as pbar:
    for batch in train_loader:
      labels = batch[0].to(device)
      dec_inputs = batch[1].to(device)

      optimizer.zero_grad()
      logits_cls = model(dec_inputs)[1]

      loss = criterion_cls(logits_cls, labels)

      # Record the scalar loss before the backward pass.
      loss_val = loss.item()
      batch_losses.append(loss_val)

      loss.backward()
      optimizer.step()

      # Show the current loss and the running mean.
      pbar.update(1)
      pbar.set_postfix_str(f"Loss: {loss_val:.3f} ({np.mean(batch_losses):.3f})")
  return np.mean(batch_losses)

In [25]:
def eval_epoch(config, model, data_loader):
  """Return the classification accuracy of ``model`` over ``data_loader``.

  Args:
    config: accepted for signature symmetry; not read here.
    model: module returning a tuple whose element 1 holds the class logits.
    data_loader: iterable of ``[labels, dec_inputs]`` batches.

  Returns:
    Fraction of samples whose arg-max class equals the label, or 0 when the
    loader yields no samples.
  """
  model.eval()

  n_seen_total = 0
  n_correct_total = 0
  accuracy = 0
  # Evaluation needs no gradients; no_grad avoids building the autograd graph.
  with torch.no_grad(), tqdm(total=len(data_loader), desc="Valid") as pbar:
    for value in data_loader:
      labels, dec_inputs = map(lambda v: v.to(device), value)

      outputs = model(dec_inputs) # [logits_lm, logits_cls, dec_self_attn_probs]
      logits_cls = outputs[1]
      _, indices = logits_cls.max(1)

      # Keep running counts instead of re-summing a growing list on every batch.
      match = torch.eq(indices, labels)
      n_correct_total += match.sum().item()
      n_seen_total += match.numel()
      accuracy = n_correct_total / n_seen_total if 0 < n_seen_total else 0

      pbar.update(1)
      pbar.set_postfix_str(f"Acc: {accuracy:.3f}")
  return accuracy

In [26]:
learning_rate = 5e-5  # learning rate handed to Adam inside train()
n_epoch = 10  # number of epochs train() runs

In [27]:
def train(model, value: str):
  """Train ``model`` for ``n_epoch`` epochs and checkpoint the best epoch.

  After every epoch the model is scored on ``test_loader``; whenever the score
  beats the best one so far the ``state_dict`` is written to
  ``config.save_base`` (``value == 'base'``) or ``config.save_fine``
  (``value == 'fine'``). Any other ``value`` trains without saving.

  NOTE(review): the best epoch is selected on ``test_loader``, so the reported
  score is not an unbiased test estimate.

  Args:
    model: classifier to optimise in place.
    value: ``'base'`` or ``'fine'``; selects the checkpoint path.

  Returns:
    ``(losses, scores)``: per-epoch mean training loss and evaluation accuracy.
  """
  criterion_cls = torch.nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  losses, scores = [], []
  best_epoch = best_loss = best_score = 0
  for epoch in range(n_epoch):
      epoch_loss = train_epoch(config, epoch+1, model, criterion_cls, optimizer, train_loader)
      epoch_score = eval_epoch(config, model, test_loader)

      losses.append(epoch_loss)
      scores.append(epoch_score)

      # Nothing to record unless this epoch strictly improves on the best score.
      if not (best_score < epoch_score):
          continue

      best_epoch, best_loss, best_score = epoch, epoch_loss, epoch_score
      if value == 'base':
        torch.save(model.state_dict(), config.save_base)
      elif value == 'fine':
        torch.save(model.state_dict(), config.save_fine)

  # best_epoch is the zero-based loop index of the best epoch.
  print(f">>>> epoch={best_epoch}, loss={best_loss:.5f}, socre={best_score:.5f}")
  return losses, scores

In [28]:
# Baseline run: train the classifier from random initialisation; the best epoch is saved to config.save_base.
model = Classification_model(config).to(device)

loss_none, score_none = train(model, 'base')

Train(0):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(1):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(2):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(3):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(4):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(5):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(6):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(7):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(8):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(9):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

>>>> epoch=8, loss=0.14006, socre=0.91565


In [29]:
# Fine-tuning run: build a fresh classifier, load the pretrained GPT weights into its body, then train.
model = Classification_model(config).to(device)

# Only the GPT sub-module is restored; the classification head starts from random initialisation.
model.gpt.load(config.save_pretrain)

loss_20, score_20 = train(model, 'fine')

Train(0):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(1):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(2):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(3):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(4):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(5):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(6):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(7):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(8):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

Train(9):   0%|          | 0/641 [00:00<?, ?it/s]

Valid:   0%|          | 0/161 [00:00<?, ?it/s]

>>>> epoch=7, loss=0.12303, socre=0.92921


In [30]:
# Sanity check: push a single training batch through a freshly initialised model on the CPU.
# NOTE: this rebinds `model`, replacing the fine-tuned model from the previous cell.
model = Classification_model(config)

for i, value in enumerate(train_loader):
  labels, dec_inputs = value[0], value[1]
  outputs = model(dec_inputs)
  break

In [31]:
# Inspect the shapes of the three model outputs and of the labels.
print(outputs[0].shape)     # LM logits
print(outputs[1].shape)     # classification logits
print(outputs[2][5].shape)  # attention probabilities of the layer at index 5
print(labels.shape)

torch.Size([128, 251, 8007])
torch.Size([128, 5])
torch.Size([128, 4, 252, 252])
torch.Size([128])


In [32]:
# Cross-entropy of the untrained model's classification logits on that one batch.
criterion_cls = torch.nn.CrossEntropyLoss()
criterion_cls(outputs[1], labels)

tensor(1.6684, grad_fn=<NllLossBackward0>)