In [1]:
# pytorch 프레임워크 라이브러리
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# 자연어 처리 라이브러리
from sklearn.model_selection import train_test_split
from transformers import AdamW, BertModel
from transformers.optimization import get_cosine_schedule_with_warmup
import gluonnlp as nlp

# 기타
import numpy as np
import pandas as pd
from tqdm import tqdm, tqdm_notebook
from datetime import datetime, timedelta
import os
import time
import json



2024-03-31 06:13:59.120035: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-03-31 06:14:04.142016: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-03-31 06:14:04.142039: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-03-31 06:14:13.054764: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-

In [12]:
class BERTClassifier(nn.Module):
    """Classification head on top of a BERT-style encoder.

    The encoder is called with ``input_ids``, ``token_type_ids`` and
    ``attention_mask`` and must return a 2-tuple; the second element is fed
    (after optional dropout) into a single linear layer.
    NOTE(review): the second element is presumably the pooled ``[CLS]``
    representation of width ``hidden_size`` -- confirm against the encoder.

    Args:
        bert: Encoder module/callable as described above.
        hidden_size: Width of the encoder output fed to the linear layer.
        num_classes: Number of output logits.
        dr_rate: Dropout probability applied before the linear layer; a falsy
            value (``None`` or ``0``) disables dropout.
        params: Unused; kept for interface compatibility.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=0,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.num_classes = num_classes
        self.classifier = nn.Linear(hidden_size , self.num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Build a float mask with ones on the first ``valid_length[i]`` positions of row ``i``.

        Args:
            token_ids: 2-D tensor ``(batch, seq_len)``; only its shape and
                device are used.
            valid_length: One length per row (tensor or sequence of ints).

        Returns:
            Float tensor of shape ``(batch, seq_len)`` on ``token_ids.device``.
        """
        # Compare a position index against each row's length instead of
        # filling the mask row by row in a Python loop.
        positions = torch.arange(token_ids.size(1), device=token_ids.device)
        lengths = torch.as_tensor(valid_length, device=token_ids.device).view(-1, 1)
        return (positions.unsqueeze(0) < lengths).float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classification logits for a batch.

        Args:
            token_ids: Token id tensor passed to the encoder as ``input_ids``.
            valid_length: Per-row number of non-padding tokens.
            segment_ids: Segment ids, cast to ``long`` and passed as
                ``token_type_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the classifier.
        # Previously ``out`` was only bound inside the dropout branch, so a
        # model built with the default ``dr_rate=None`` crashed here.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

# Model setup helper
def setup_model(model_path, vocab_file, device, num_classes):
    """Create the classifier and its SentencePiece tokenizer.

    Loads the encoder and vocabulary through ``get_model_and_vocab``, builds a
    ``BERTSPTokenizer`` (case preserved, ``lower=False``) from ``vocab_file``
    and wraps the encoder in a ``BERTClassifier`` with a dropout rate of 0.5,
    moved to ``device``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    encoder, vocabulary = get_model_and_vocab(model_path, vocab_file)
    sp_tokenizer = nlp.data.BERTSPTokenizer(vocab_file, vocabulary, lower=False)
    classifier = BERTClassifier(encoder, num_classes=num_classes, dr_rate=0.5)
    classifier = classifier.to(device)
    return classifier, sp_tokenizer

def get_model_and_vocab(model_path, vocab_file, ctx="cpu"):
    """Load a pretrained ``BertModel`` and the matching SentencePiece vocabulary.

    Args:
        model_path: Location handed to ``BertModel.from_pretrained``.
        vocab_file: SentencePiece model file used to build the vocabulary.
        ctx: Device string the encoder is moved to (defaults to ``"cpu"``).

    Returns:
        tuple: ``(bert_model, vocab)``; the model is returned in eval mode.
    """
    # ``return_dict=False`` is requested at load time; the classifier in this
    # file unpacks the encoder result as a 2-tuple.
    encoder = BertModel.from_pretrained(model_path, return_dict=False)
    encoder.to(torch.device(ctx))
    encoder.eval()

    vocabulary = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file, padding_token="[PAD]")
    return encoder, vocabulary

class BERTDataset(Dataset):
    """Map-style dataset holding pre-transformed sentences and integer labels.

    Every record of ``dataset`` is indexed with ``sent_idx`` for its text and
    ``label_idx`` for its label. Texts are run through
    ``nlp.data.BERTSentenceTransform`` once, at construction time.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len,
                 pad, pair):
        to_bert_inputs = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)

        # First pass: transform every sentence.
        self.sentences = []
        for record in dataset:
            self.sentences.append(to_bert_inputs([record[sent_idx]]))

        # Second pass: collect the labels as 32-bit integers.
        self.labels = []
        for record in dataset:
            self.labels.append(np.int32(record[label_idx]))

    def __getitem__(self, i):
        """Return the transformed fields of item ``i`` with its label appended."""
        encoded = self.sentences[i]
        target = self.labels[i]
        return encoded + (target,)

    def __len__(self):
        """Number of items, taken from the label list."""
        return len(self.labels)
    
# Data preparation helper
def prepare_data(train_df, test_df, tokenizer, max_len, batch_size,
                 num_workers=5, shuffle_train=True):
    """Turn the train/test frames into ``DataLoader`` objects.

    Both frames must provide a ``document`` column (text) and a ``label``
    column. Each frame is wrapped in a ``BERTDataset`` (padding enabled,
    single-sentence mode) and then in a ``DataLoader``.

    Args:
        train_df: Training frame.
        test_df: Evaluation frame.
        tokenizer: Tokenizer passed to ``BERTDataset``.
        max_len: Maximum sequence length passed to ``BERTDataset``.
        batch_size: Batch size for both loaders.
        num_workers: Worker processes per loader (previously fixed at 5).
        shuffle_train: Reshuffle the training set every epoch. The evaluation
            loader is never shuffled.

    Returns:
        tuple: ``(train_dataloader, test_dataloader)``.
    """
    def _to_records(df):
        # Column-wise iteration avoids building a Series per row as
        # ``iterrows()`` does.
        return [[document, label] for document, label in zip(df['document'], df['label'])]

    data_train = BERTDataset(_to_records(train_df), 0, 1, tokenizer, max_len, True, False)
    data_test = BERTDataset(_to_records(test_df), 0, 1, tokenizer, max_len, True, False)

    train_dataloader = torch.utils.data.DataLoader(
        data_train, batch_size=batch_size, num_workers=num_workers, shuffle=shuffle_train)
    test_dataloader = torch.utils.data.DataLoader(
        data_test, batch_size=batch_size, num_workers=num_workers, shuffle=False)

    return train_dataloader, test_dataloader

def calc_accuracy(X,Y):
    """Fraction of rows of ``X`` whose arg-max along dim 1 equals ``Y``.

    Args:
        X: 2-D score tensor, one row per sample.
        Y: 1-D tensor of target class indices.

    Returns:
        NumPy scalar: number of matches divided by the number of rows.
    """
    predicted = torch.max(X, 1).indices
    n_samples = predicted.size()[0]
    n_correct = (predicted == Y).sum()
    return n_correct.data.cpu().numpy() / n_samples

def logging(log_path, txt):
    """Append ``txt`` plus a trailing newline to the text file at ``log_path``.

    The file is created if it does not exist. It is always written as UTF-8
    so that non-ASCII messages do not depend on the platform's default
    encoding.

    Args:
        log_path: Path of the log file.
        txt: Value to write; formatted with ``str()`` semantics.
    """
    with open(log_path, 'a', encoding='utf-8') as log_file:
        log_file.write(f"{txt}\n")
        





In [13]:
# Training function
def train_model(model, train_dataloader, test_dataloader, device, optimizer, scheduler, loss_fn, num_epochs, max_grad_norm, log_interval, model_save_dir, patience=5):
    """Train ``model`` with per-epoch evaluation, checkpointing and early stopping.

    Each epoch runs one optimisation pass over ``train_dataloader`` (gradient
    clipping, optimizer step and scheduler step per batch) followed by an
    evaluation pass over ``test_dataloader`` under ``torch.no_grad()``.
    Progress is printed and appended to ``train_log.txt`` in
    ``model_save_dir``.

    Args:
        model: Module called as ``model(token_ids, valid_length, segment_ids)``.
        train_dataloader: Iterable of ``(token_ids, valid_length, segment_ids, label)`` batches.
        test_dataloader: Iterable with the same batch layout, used for evaluation.
        device: Device the token ids, segment ids and labels are moved to.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        loss_fn: Callable ``loss_fn(logits, label)`` returning a scalar tensor.
        num_epochs: Maximum number of epochs.
        max_grad_norm: Clipping threshold for the gradient norm.
        log_interval: Log the running training accuracy every this many batches.
        model_save_dir: Existing directory for the log file and the checkpoint.
        patience: Number of epochs without improvement after which training
            stops early.

    Returns:
        tuple: ``(stop_e, best_test_acc, best_loss, log_path)`` where
        ``stop_e`` is the number of epochs actually run.

    Raises:
        ValueError: If either dataloader yields no batches.
    """
    # ``tqdm.tqdm_notebook`` is deprecated (tqdm itself warns about it);
    # ``tqdm.auto`` picks the notebook widget when available and falls back to
    # the console bar otherwise.
    from tqdm.auto import tqdm as progress_bar

    best_test_acc = 0.0
    best_loss = float('inf')
    no_improvement_count = 0

    stop_e = 0
    log_path = os.path.join(model_save_dir, "train_log.txt")
    model_path = os.path.join(model_save_dir, 'movie-review-entiment-analysis-model.pt')

    for e in range(num_epochs):
        stop_e += 1
        start_time = time.time()

        # ---- training pass ----
        model.train()
        train_acc = 0.0
        train_batches = 0
        for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(progress_bar(train_dataloader)):
            optimizer.zero_grad()
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            label = label.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            loss = loss_fn(out, label)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            train_acc += calc_accuracy(out, label)
            train_batches = batch_id + 1

            if batch_id % log_interval == 0:
                batch_loss = loss.item()
                running_acc = train_acc / train_batches
                print("Epoch : {}, Batch ID : {}, Loss {}, Train acc {}".format(e + 1, train_batches, batch_loss, running_acc))
                logging(log_path, f"Epoch : {e + 1}, Batch ID : {train_batches}, Loss : {batch_loss}, Train acc : {running_acc}")

        if train_batches == 0:
            raise ValueError("train_dataloader produced no batches")

        print(f"Epoch : {e + 1}, Train Accuracy : {train_acc / train_batches}")
        logging(log_path, f"Epoch : {e + 1}, Train Accuracy : {train_acc / train_batches}")

        # ---- evaluation pass ----
        model.eval()
        test_acc = 0.0
        total_loss = 0.0
        test_batches = 0
        with torch.no_grad():
            for token_ids, valid_length, segment_ids, label in progress_bar(test_dataloader):
                token_ids = token_ids.long().to(device)
                segment_ids = segment_ids.long().to(device)
                label = label.long().to(device)
                out = model(token_ids, valid_length, segment_ids)
                loss = loss_fn(out, label)
                test_acc += calc_accuracy(out, label)
                total_loss += loss.item()
                test_batches += 1

        # Count evaluation batches explicitly: reusing the training loop's
        # index would silently divide by the wrong number for an empty loader.
        if test_batches == 0:
            raise ValueError("test_dataloader produced no batches")

        current_test_acc = test_acc / test_batches
        current_loss = total_loss / test_batches

        # Elapsed wall time for the epoch (training + evaluation), without
        # the fractional seconds.
        short = str(timedelta(seconds=time.time() - start_time)).split(".")[0]
        print("="*100)
        print(f"Epoch : {e + 1} Test Accuracy {current_test_acc}, Loss : {current_loss}")
        print(f"Epoch : {e + 1}, Train time {short}")
        print("="*100)
        logging(log_path, f"Epoch : {e + 1} Test Accuracy {current_test_acc}, Loss : {current_loss}")
        logging(log_path, f"Epoch : {e + 1}, Train time {short}")

        # Model saving and early stopping.
        # An epoch counts as an improvement only when BOTH the evaluation
        # accuracy rises and the evaluation loss falls relative to the last
        # saved checkpoint; every other epoch counts toward ``patience``.
        if current_test_acc > best_test_acc and current_loss < best_loss:
            best_test_acc = current_test_acc
            best_loss = current_loss
            no_improvement_count = 0
            logging(log_path, f"Epoch : {e + 1}, Accuracy : {best_test_acc:.2f}, Loss : {best_loss:.2f}")

            # Save the model
            torch.save(model.state_dict(), model_path)
            print(f'Model saved: {model_path}')
        else:
            no_improvement_count += 1

        if no_improvement_count >= patience:
            print("Early stopping triggered. Stopping training.")
            logging(log_path, "Early stopping triggered. Stopping training.")
            break

    return stop_e , best_test_acc, best_loss, log_path

In [14]:
# Main entry point
def main(train_df, test_df,
         max_len=256,
         batch_size=32,
         warmup_ratio=0.1,
         num_epochs=50,
         max_grad_norm=1,
         log_interval=100,
         learning_rate=5e-5,
         num_classes=2,
         base_model_path='../kobert-base-v1',
         vocab_file='../kobert-base-v1/kobert_news_wiki_ko_cased-1087f8699e.spiece',
         model_save_root='./model-save'):
    """Fine-tune the classifier on ``train_df`` and evaluate it on ``test_df``.

    Builds the model and tokenizer, the data loaders, an ``AdamW`` optimizer
    (no weight decay on bias / LayerNorm weights) and a cosine schedule with
    warm-up. It then runs ``train_model`` and prints the total wall time.
    Checkpoints and logs go to a new ``<model_save_root>/<YYYY-mm-dd_HH-MM>``
    directory.

    Args:
        train_df: Training frame (needs ``document`` and ``label`` columns).
        test_df: Evaluation frame with the same columns.
        max_len: Maximum token sequence length.
        batch_size: Batch size for both loaders.
        warmup_ratio: Fraction of all scheduled steps used for warm-up.
        num_epochs: Maximum number of epochs (also sizes the LR schedule).
        max_grad_norm: Gradient-norm clipping threshold.
        log_interval: Batches between training progress messages.
        learning_rate: Optimizer learning rate.
        num_classes: Number of target classes.
        base_model_path: Directory of the pretrained encoder.
        vocab_file: SentencePiece model file for the tokenizer/vocabulary.
        model_save_root: Parent directory for the timestamped run folder.

    Returns:
        None
    """
    # Load the model and prepare the data
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model, tokenizer = setup_model(base_model_path, vocab_file, device, num_classes)

    train_dataloader, test_dataloader = prepare_data(train_df, test_df, tokenizer, max_len, batch_size)

    # Optimizer and scheduler: parameters whose name contains one of the
    # ``no_decay`` markers are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    # NOTE(review): this is the ``AdamW`` imported from ``transformers``;
    # newer releases deprecate it in favour of ``torch.optim.AdamW`` --
    # confirm against the installed version.
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
    loss_fn = nn.CrossEntropyLoss()

    # The scheduler is stepped once per training batch, so the schedule
    # length is batches-per-epoch times the maximum number of epochs.
    t_total = len(train_dataloader) * num_epochs
    warmup_step = int(t_total * warmup_ratio)
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=warmup_step, num_training_steps=t_total)

    # Create the timestamped directory for the checkpoint and log
    folder_name = datetime.now().strftime("%Y-%m-%d_%H-%M")
    model_save_dir = os.path.join(model_save_root, folder_name)
    os.makedirs(model_save_dir, exist_ok=True)

    # Train the model
    start_time = time.time()
    train_model(model,
                train_dataloader,
                test_dataloader,
                device,
                optimizer,
                scheduler,
                loss_fn,
                num_epochs,
                max_grad_norm,
                log_interval,
                model_save_dir)
    total_time = time.time()-start_time
    times = str(timedelta(seconds=total_time))

    # Print the total training time (fractional seconds dropped)
    short = times.split(".")[0]
    print("="*100)
    print(f"Train time : {short}")
    print("="*100)

In [15]:
# Read both rating files and keep only the leading 1/30 of each frame's rows
# before handing them to main().
train_df = pd.read_table('./ratings_train.txt')
train_df = train_df.head(len(train_df) // 30)

# NOTE: despite its name, ``train_test`` holds the rows read from the test file.
train_test = pd.read_table('./ratings_test.txt')
train_test = train_test.head(len(train_test) // 30)

main(train_df, train_test)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(train_dataloader)):


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch : 1, Batch ID : 1, Loss 0.7229323387145996, Train acc 0.46875
Epoch : 1, Batch ID : 101, Loss 0.657133162021637, Train acc 0.5095915841584159
Epoch : 1, Train Accuracy : 0.5258757961783439


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids, label) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/53 [00:00<?, ?it/s]

Epoch : 1 Test Accuracy 0.6285377358490566, Loss : 0.6636752609936696
Epoch : 1, Train time 0:03:30
Model saved: ./model-save/2024-03-31_06-17/movie-review-entiment-analysis-model.pt


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch : 2, Batch ID : 1, Loss 0.680321216583252, Train acc 0.65625
Epoch : 2, Batch ID : 101, Loss 0.3569379150867462, Train acc 0.6921410891089109
Epoch : 2, Train Accuracy : 0.7298964968152867


  0%|          | 0/53 [00:00<?, ?it/s]

Epoch : 2 Test Accuracy 0.8007075471698113, Loss : 0.4315129023677898
Epoch : 2, Train time 0:03:37
Model saved: ./model-save/2024-03-31_06-17/movie-review-entiment-analysis-model.pt


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch : 3, Batch ID : 1, Loss 0.7076925039291382, Train acc 0.59375
Epoch : 3, Batch ID : 101, Loss 0.2127956748008728, Train acc 0.838180693069307
Epoch : 3, Train Accuracy : 0.8493232484076433


  0%|          | 0/53 [00:00<?, ?it/s]

Epoch : 3 Test Accuracy 0.8136792452830188, Loss : 0.44933189325175193
Epoch : 3, Train time 0:03:37


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch : 4, Batch ID : 1, Loss 0.6557847261428833, Train acc 0.75
Epoch : 4, Batch ID : 101, Loss 0.26018625497817993, Train acc 0.9053217821782178
Epoch : 4, Train Accuracy : 0.8992834394904459


  0%|          | 0/53 [00:00<?, ?it/s]

Epoch : 4 Test Accuracy 0.8213443396226415, Loss : 0.48590494973479575
Epoch : 4, Train time 0:03:37


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch : 5, Batch ID : 1, Loss 0.49477463960647583, Train acc 0.78125
Epoch : 5, Batch ID : 101, Loss 0.16677634418010712, Train acc 0.9204826732673267
Epoch : 5, Train Accuracy : 0.9243630573248408


  0%|          | 0/53 [00:00<?, ?it/s]

Epoch : 5 Test Accuracy 0.8573113207547169, Loss : 0.48543697532336666
Epoch : 5, Train time 0:03:37


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch : 6, Batch ID : 1, Loss 0.48518186807632446, Train acc 0.875
Epoch : 6, Batch ID : 101, Loss 0.6758376955986023, Train acc 0.9554455445544554
Epoch : 6, Train Accuracy : 0.9482484076433121


  0%|          | 0/53 [00:00<?, ?it/s]

Epoch : 6 Test Accuracy 0.8378537735849056, Loss : 0.5812875145237963
Epoch : 6, Train time 0:03:37


  0%|          | 0/157 [00:00<?, ?it/s]

Epoch : 7, Batch ID : 1, Loss 0.2902156412601471, Train acc 0.90625
Epoch : 7, Batch ID : 101, Loss 0.2531392574310303, Train acc 0.9384282178217822
Epoch : 7, Train Accuracy : 0.9462579617834395


  0%|          | 0/53 [00:00<?, ?it/s]

Epoch : 7 Test Accuracy 0.8360849056603774, Loss : 0.6193571731026443
Epoch : 7, Train time 0:03:37
Early stopping triggered. Stopping training.
Train time : 0:25:22


UnboundLocalError: local variable 'desc' referenced before assignment

In [11]:
train_df

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1
...,...,...,...
4995,6723963,내 셀카가 더 잼있다,0
4996,3683108,도랏나 ㅡㅡ,0
4997,6406669,황우슬혜랑 차인표가 주인공이냐 이 말같지도 않은 시트콤이? 맨날 내용이 산으로가ㅋ,0
4998,6878666,"이혜영 강남에서,10여년전에 한번 봤는데,007처럼,은색 재규어 몰고 다니더라.방송...",0
