In [1]:
import os
import sys
import glob
import pickle
import argparse
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler

from transformers import AutoTokenizer
from transformers import GPT2TokenizerFast
from transformers import BertTokenizerFast
from transformers import get_cosine_schedule_with_warmup
from transformers import get_linear_schedule_with_warmup

from dataset import *
# from learning import *
from model import *
from utils import *

import warnings
warnings.filterwarnings(action='ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def setup(seed):
    # random.seed(SEED) #  Python의 random 라이브러리가 제공하는 랜덤 연산이 항상 동일한 결과를 출력하게끔
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

setup(17)

sentence_ps = "moving_average"
window_size = 3

# Define project
project_name = f'NIA_119-GPT_MULTI_1'
model_name = 'only_Text_GPT_MULTI_1'
model_link = "kykim/gpt3-kor-small_based_on_gpt2" # 'skt/kogpt2-base-v2' #'beomi/kcbert-base'

# args
rank = 'cuda:0'
epochs = 10
batch_size = 12
lr = 1e-5

class_num = 3
speaker_num = 4
max_length = 768
padding = 'max_length'
save_term = 710

# dataset
train_path = os.path.join('..', 'NIA_text_dataset', 'train_json_audio_data_decoder_time.csv')
valid_path = os.path.join('..', 'NIA_text_dataset', 'valid_json_audio_data_decoder_time.csv')
test_path = os.path.join('..', 'NIA_text_dataset', 'test_json_audio_data_decoder_time.csv')
# test_path = os.path.join('..', 'NIA_text_dataset', 'test_json_audio_data_decoder_time_arrest_cut.csv')
# train_path = os.path.join('..', 'NIA_text_dataset', 'toy_data_json_audio_data_decoder_time.csv')
# valid_path = os.path.join('..', 'NIA_text_dataset', 'toy_data_json_audio_data_decoder_time.csv')
# test_path = os.path.join('..', 'NIA_text_dataset', 'toy_data_json_audio_data_decoder_time.csv')

save_path = os.path.join('models', 'only_Text_GPT_MULTI_1_e10_bs12')  
ckpt_path = os.path.join(save_path, 'checkpoint_8_710.tar')

train_data = pd.read_csv(train_path)
valid_data = pd.read_csv(valid_path)
test_data = pd.read_csv(test_path)
valid_file_ids = valid_data.id
test_file_ids = test_data.id

## your Data Pre-Processing
print('init Data >>>')
print('\tinit train data :', train_data.shape)
print('\tinit valid data :', valid_data.shape)
print('\tinit test data :', test_data.shape)

# train_data = train_data.dropna(axis=0)
train_data = train_data.reset_index(drop=True)
# valid_data = valid_data.dropna(axis=0)
valid_data = valid_data.reset_index(drop=True)
# test_data = test_data.dropna(axis=0)
test_data = test_data.reset_index(drop=True)

print('\ttrain data :', train_data.shape)
print('\tvalid data :', valid_data.shape)
print('\ttest data :', test_data.shape)

## Create Dataset and DataLoader
# tokenizer = GPT2TokenizerFast.from_pretrained(model_link,bos_token='<s>', eos_token='</s>', 
#                                               unk_token='<unk>',pad_token='<pad>', mask_token='<mask>')
tokenizer = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
special_tokens_dict = {'additional_special_tokens': [f'[SPK{n}]' for n in range(speaker_num)]}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
pad_token_id = tokenizer.pad_token_id
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id

train_dataset = MyDataset(train_data, 
                            tokenizer, 
                            max_length=max_length, 
                            padding=padding,
                            speaker_num=speaker_num,
                            class_num=class_num)
valid_dataset = MyDataset(valid_data,
                            tokenizer,
                            max_length=max_length,
                            padding=padding,
                            speaker_num=speaker_num,
                            class_num=class_num)
test_dataset = MyDataset(test_data,
                            tokenizer,
                            max_length=max_length,
                            padding=padding,
                            speaker_num=speaker_num,
                            class_num=class_num)

test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=4)

## label_frequency
train_label_frequency = (train_data.label1 == 1).sum() / len(train_data)
valid_label_frequency = (valid_data.label1 == 1).sum() / len(valid_data)
test_label_frequency = (test_data.label1 == 1).sum() / len(test_data)
print("Label frequency of Train Data: {:6f}".format(train_label_frequency))
print("Label frequency of Valid Data: {:6f}".format(valid_label_frequency))
print("Label frequency of Test Data: {:6f}".format(test_label_frequency))

# modeling
model = GPT_Baseline.from_pretrained(model_link, class_num=class_num,
                                        pad_token_id=pad_token_id, cls_token_id=cls_token_id, sep_token_id=sep_token_id)
model.resize_token_embeddings(len(tokenizer))
model = model.to(rank)

# optimizer = optim.AdamW([{'params': model.module.electra.parameters(),'lr': electra_lr},
#                          {'params': model.module.classifier.parameters(),'lr': cls_lr}],
#                         eps=1e-8)
optimizer = optim.AdamW(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

# iter_len = len(train_loader)
# num_training_steps = iter_len * epochs
# num_warmup_steps = int(0.15 * num_training_steps)
# scheduler = get_linear_schedule_with_warmup(optimizer,
#                                             num_warmup_steps=num_warmup_steps,
#                                             num_training_steps=num_training_steps)

print(f"{ckpt_path} >>> ")
file_name = os.path.basename(ckpt_path).split('.')[0]
model = GPT_Baseline.from_pretrained(model_link, class_num=class_num, pad_token_id=pad_token_id,
                                        cls_token_id=cls_token_id, sep_token_id=sep_token_id,
                                        sentence_ps=sentence_ps, window_size=window_size)
model.resize_token_embeddings(len(tokenizer))
ckpt = torch.load(ckpt_path, map_location=rank)
model.load_state_dict(ckpt['model_state_dict']); model.to(rank)
model.sentence_ps = sentence_ps
model.window_size = window_size

init Data >>>
	init train data : (101742, 476)
	init valid data : (25436, 476)
	init test data : (31795, 476)
	train data : (101742, 476)
	valid data : (25436, 476)
	test data : (31795, 476)
Label frequency of Train Data: 0.016650
Label frequency of Valid Data: 0.016630
Label frequency of Test Data: 0.016638


Some weights of GPT_Baseline were not initialized from the model checkpoint at kykim/gpt3-kor-small_based_on_gpt2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'speaker_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


models/only_Text_GPT_MULTI_1_e10_bs12/checkpoint_8_710.tar >>> 


Some weights of GPT_Baseline were not initialized from the model checkpoint at kykim/gpt3-kor-small_based_on_gpt2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'speaker_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
def inference(model, rank, criterion, data_loader, label_frequency, test_file_ids, pad_token_id=0):
    assert rank == 0 or rank == 'cuda:0'

    model.eval()
    sum_loss = sum_acc = 0
    bs = data_loader.batch_size
    
    predicted = torch.tensor([])
    labels = torch.tensor([])

    file_ids_list = []
    input_ids_list = torch.tensor([])
    spk_type_ids_list = torch.tensor([])
    predicted_token_logits = torch.tensor([])
    label_per_token_logits = []

    with torch.no_grad():
        pbar = tqdm(data_loader, file=sys.stdout)
        for batch_idx, ((input_ids, att_mask, type_ids, spk_type_ids), target) in enumerate(pbar):
            input_ids, att_mask = input_ids.to(rank), att_mask.to(rank)
            type_ids, spk_type_ids = type_ids.to(rank), type_ids.to(rank)
            target = target.to(rank)
            mb_len = len(target)

            output, logit, seq_logits = model(input_ids=input_ids, attention_mask=att_mask,
                                              token_type_ids=type_ids, speaker_type_ids=spk_type_ids,
                                              is_inference=True)
            
            sequence_lengths = torch.eq(input_ids, pad_token_id).int().argmax(-1) - 1
            sequence_lengths = sequence_lengths % input_ids.shape[-1]
            sequence_lengths = sequence_lengths.to(rank)

            loss = criterion(logit, target)
            # loss = get_loss(rank, input_ids, output, target, criterion)
            acc = calc_acc(logit, target)

            sum_loss += loss.item()
            sum_acc += acc

            loss = sum_loss / (batch_idx + 1)
            acc = sum_acc / (batch_idx * bs + mb_len)
            pbar.set_postfix(loss='{:.8f}, acc={:.4f}'.format(loss, acc))
            
            output_pred = logit.detach().cpu()
            true_label = target.detach().cpu()
            predicted = torch.concat([predicted, output_pred], dim=0)
            labels = torch.concat([labels, true_label], dim=0)

            input_ids = input_ids.detach().cpu()
            spk_type_ids = spk_type_ids.detach().cpu()
            for idx, (file_id, seq_len, seq_logit) in enumerate(zip(test_file_ids, sequence_lengths, seq_logits)):
                seq_length = seq_logit.size(0)
                print('\n', file_id, seq_logit.shape, seq_length)
                
                file_ids = [file_id] * seq_length
                file_ids_list += file_ids

                label = [target[idx].item()] * seq_length
                label_per_token_logits += label
                # input_id = input_ids[idx][:seq_len]; input_ids_list = torch.concat([input_ids_list, input_id], dim=0)
                # spk_type_id = spk_type_ids[idx][:seq_len]; spk_type_ids_list = torch.concat([spk_type_ids_list, spk_type_id], dim=0)
                
                # logits = output[idx, :seq_len]; logits=logits.detach().cpu()
                # seq_logits_concated = torch.concat(seq_logit, dim=0).detach().cpu()
                predicted_token_logits = torch.concat([predicted_token_logits, seq_logit.detach().cpu()], dim=0)

            test_file_ids = test_file_ids[bs:]
            break

        pbar.close()

    total_loss = sum_loss / (batch_idx + 1)
    total_acc = sum_acc / (batch_idx * bs + mb_len)
    
    # predicted_probas = torch.sigmoid(predicted)[:, 1]
    predicted_probas = torch.softmax(predicted, dim=-1)[:, 1]
    predicted_labels = torch.where(predicted_probas >= label_frequency , 1, 0)
    labels_ = torch.where(labels == 1, 1, 0)
    
    predicted_probas = predicted_probas.numpy()
    predicted_labels = predicted_labels.numpy()
    labels_ = labels_.numpy()

    file_ids_list = np.array(file_ids_list)
    # input_ids_list = input_ids_list.numpy().astype(np.int32)
    # spk_type_ids_list = spk_type_ids_list.numpy().astype(np.int32)

    predicted_token_logits = torch.softmax(predicted_token_logits, dim=-1)
    
    predicted_token_proba_0 = predicted_token_logits[:, 0]
    predicted_token_proba_0 = np.round(predicted_token_proba_0.numpy(), 8)

    predicted_token_proba_1 = predicted_token_logits[:, 1]
    predicted_token_proba_1 = np.round(predicted_token_proba_1.numpy(), 8)

    predicted_token_proba_2 = predicted_token_logits[:, 2]
    predicted_token_proba_2 = np.round(predicted_token_proba_2.numpy(), 8)

    # predicted_token_logits = torch.where(predicted_token_logits >= label_frequency , 1, 0)
    # predicted_token_logits = np.round(predicted_token_logits.numpy(), 8)
    label_per_token_logits = np.array(label_per_token_logits).astype(np.int32)

    return (predicted_probas, labels_), (file_ids_list, predicted_token_proba_0, predicted_token_proba_1, predicted_token_proba_2, label_per_token_logits)

In [4]:
((predicted_probas, labels_), 
    (file_ids_list, predicted_token_proba_0, 
    predicted_token_proba_1, predicted_token_proba_2, 
    label_per_token_logits)) = inference(model, rank, criterion, test_loader, train_label_frequency,
                                        test_file_ids=test_file_ids, pad_token_id=pad_token_id)

prediction_result = pd.DataFrame({'id':test_file_ids,'predicted_probas':predicted_probas, 'labels':labels_})
prediction_result.to_csv(os.path.join(save_path, f'inference_logit_{file_name}_{sentence_ps}_{window_size}.csv'), index=False)
result_df = calc_metric(predicted_probas, labels_)
result_df.to_csv(os.path.join(save_path, f'inference_thresholding_{file_name}_{sentence_ps}_{window_size}.csv'), index=False)

file_result = pd.DataFrame({'id':file_ids_list, 'predicted_token_proba_0':predicted_token_proba_0,
                            'predicted_token_proba_1':predicted_token_proba_1, 
                            'predicted_token_proba_2':predicted_token_proba_2,
                            'label':label_per_token_logits})
file_result.to_csv(os.path.join(save_path, f'inference_file_{file_name}_{sentence_ps}_{window_size}.csv'), index=False)

  0%|          | 0/2650 [00:01<?, ?it/s, loss=0.14451057, acc=0.9167]
 64f6c516461a963f30a3c498 torch.Size([19, 3]) 19

 651e4f08d163a62ed3b43b70 torch.Size([17, 3]) 17

 651e508f2acb389901367aec torch.Size([31, 3]) 31

 64ec361829e61fb51457b173 torch.Size([21, 3]) 21

 64dd752b1ef84058319a7f1f torch.Size([32, 3]) 32

 651e4e712e98ee7120968af5 torch.Size([48, 3]) 48

 651e4dfb2795332cde28f278 torch.Size([55, 3]) 55

 64f6c2ad7d6c8ed09eec8105 torch.Size([21, 3]) 21

 651e4a6ede6495f4e9d36b34 torch.Size([46, 3]) 46

 651e4a3786dc055ca8bf4607 torch.Size([29, 3]) 29

 651e4a6ede6495f4e9d36cb4 torch.Size([22, 3]) 22

 651e4e89429d02dab45a2dd9 torch.Size([45, 3]) 45
  0%|          | 0/2650 [00:01<?, ?it/s, loss=0.14451057, acc=0.9167]


ValueError: array length 12 does not match index length 31795

In [10]:
((predicted_probas, labels_), 
    (file_ids_list, predicted_token_proba_0, 
    predicted_token_proba_1, predicted_token_proba_2, 
    label_per_token_logits)) = inference(model, rank, criterion, test_loader, train_label_frequency,
                                        test_file_ids=test_file_ids, pad_token_id=pad_token_id)

prediction_result = pd.DataFrame({'id':test_file_ids,'predicted_probas':predicted_probas, 'labels':labels_})
prediction_result.to_csv(os.path.join(save_path, f'inference_logit_{file_name}_{sentence_ps}_{window_size}.csv'), index=False)
result_df = calc_metric(predicted_probas, labels_)
result_df.to_csv(os.path.join(save_path, f'inference_thresholding_{file_name}_{sentence_ps}_{window_size}.csv'), index=False)

file_result = pd.DataFrame({'id':file_ids_list, 'predicted_token_proba_0':predicted_token_proba_0,
                            'predicted_token_proba_1':predicted_token_proba_1, 
                            'predicted_token_proba_2':predicted_token_proba_2,
                            'label':label_per_token_logits})
file_result.to_csv(os.path.join(save_path, f'inference_file_{file_name}_{sentence_ps}_{window_size}.csv'), index=False)

  0%|          | 0/2650 [00:00<?, ?it/s, loss=0.14451341, acc=0.9167]
 64f6c516461a963f30a3c498 torch.Size([36, 3]) 36

 651e4f08d163a62ed3b43b70 torch.Size([32, 3]) 32

 651e508f2acb389901367aec torch.Size([60, 3]) 60

 64ec361829e61fb51457b173 torch.Size([40, 3]) 40

 64dd752b1ef84058319a7f1f torch.Size([62, 3]) 62

 651e4e712e98ee7120968af5 torch.Size([94, 3]) 94

 651e4dfb2795332cde28f278 torch.Size([108, 3]) 108

 64f6c2ad7d6c8ed09eec8105 torch.Size([40, 3]) 40

 651e4a6ede6495f4e9d36b34 torch.Size([90, 3]) 90

 651e4a3786dc055ca8bf4607 torch.Size([56, 3]) 56

 651e4a6ede6495f4e9d36cb4 torch.Size([42, 3]) 42

 651e4e89429d02dab45a2dd9 torch.Size([88, 3]) 88
  0%|          | 0/2650 [00:00<?, ?it/s, loss=0.14451341, acc=0.9167]


ValueError: array length 12 does not match index length 31795