In [1]:
import torch
import pandas as pd
from baseline_embedding import get_dataset
from torch.utils.data import Dataset, DataLoader
import os
import time
from keras.preprocessing.sequence import pad_sequences
import torch.nn as nn
import numpy as np
import gc
from sklearn.metrics import cohen_kappa_score, accuracy_score

In [2]:
from config import config

args = config['aihub_v1']

In [3]:
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)

In [4]:
def get_embedded_essay(essays, is_train=True):        
    # embedded_essay_raw = pd.read_csv(os.path.join(args['emb_file_path'], f"{args['train_dataset_path'].split('/')[1]}_{'train' if is_train else 'valid'}_{'notlabeled' if args['is_topic_label'] == False else 'labeled'}.csv"), encoding='cp949')
    embedded_essay_raw = pd.read_csv(os.path.join(args['emb_file_path'], f"emb_feat_{'train' if is_train else 'valid'}_{'notlabeled' if args['is_topic_label'] == False else 'labeled'}.csv"), encoding='cp949')
    print(embedded_essay_raw.shape)
    embedded_essay = []
    tmp_ix = 0
    for ix, essay_raw in enumerate(essays):
        tmp_len = len(essay_raw)
        essay = embedded_essay_raw[tmp_ix:tmp_ix + tmp_len]
        embedded_essay.append(essay)
        tmp_ix += tmp_len
    return embedded_essay

In [22]:
def compute_metrics(y_sent_pred, y_test):
    metrics = {}
    all_kappas = []
    for i in range(len(args['rubric'])):
        metrics[args['rubric'][i]] = {}
        y_pred = y_sent_pred[:, i]
        y_true = y_test[:, i]
        accuracy = accuracy_score(y_true, y_pred)
        kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
        metrics[args['rubric'][i]]['accuracy'] = accuracy
        metrics[args['rubric'][i]]['kappa'] = kappa
        all_kappas.append(kappa)

    metrics['overall'] = {}
    overall_accuracy = accuracy_score(y_test.flatten(), y_sent_pred.flatten())
    overall_kappa = np.mean(all_kappas)
    metrics['overall']['accuracy'] = overall_accuracy
    metrics['overall']['kappa'] = overall_kappa
    return metrics

In [6]:
class GRUScoreModule(nn.Module):
    def __init__(self,output_dim,hidden_dim, dropout=0.5):
        super(GRUScoreModule, self).__init__()
        self.gru = nn.GRU(768,hidden_dim, dropout=dropout, batch_first=True, bidirectional=True)        
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim*2, output_dim)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x, _ = self.gru(x)
        
        x = x[:, -1, :]  # Use the output of the last time step
        x = self.dropout(x)
        x = self.fc(x)
        x = self.sigmoid(x)
        return x

In [7]:
class EssayDataset(Dataset):
    def __init__(self, embedded_essays, labels):
        self.embedded_essays = embedded_essays
        self.embedded_essays = torch.tensor(pad_sequences(embedded_essays, maxlen=128, padding='pre', dtype='float32'), dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.embedded_essays[idx], self.labels[idx]    

In [8]:
train_essay, valid_essay, train_y, valid_y = get_dataset()
train_embedded_essay = get_embedded_essay(train_essay, is_train=True)
valid_embedded_essay = get_embedded_essay(valid_essay, is_train=False)

(485014, 768)
(68609, 768)


In [9]:
train_dataset = EssayDataset(train_embedded_essay, train_y)
valid_dataset = EssayDataset(valid_embedded_essay, valid_y) 

In [10]:
batch_size = 512
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

In [11]:
import torch.optim as optim
n_outputs = len(args['rubric'])

dropout = 0.5
learning_rate = 0.001
n_epochs = 100

model = GRUScoreModule(output_dim=n_outputs,hidden_dim=128, dropout=dropout).cuda()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [12]:
patience = 10
train_loss_list = []
val_loss_list = []
best_val_loss = float('inf')
early_stopping_counter = 0
prev_time = time.time()
set_seed(42)
for epoch in range(n_epochs):
    model.train()
    train_loss = 0
    for inputs,labels in train_loader:
        inputs ,labels = inputs.cuda(), labels.cuda()
        optimizer.zero_grad()
        outputs = model(inputs)        
        loss = criterion(outputs, labels)
            
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    model.eval()
    all_outputs = []
    val_loss = 0
    with torch.no_grad():
        for inputs, labels in valid_loader:
            inputs, labels = inputs.cuda(),labels.cuda()
            outputs = model(inputs)            
            loss = criterion(outputs, labels)
            all_outputs.extend(outputs.cpu().numpy())
            val_loss += loss.item()

    train_loss /= len(train_loader)
    val_loss /= len(valid_loader)
    train_loss_list.append(train_loss)
    val_loss_list.append(val_loss)
    print(f'Epoch {epoch+1}/{n_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Time Elapsed: {time.time() - prev_time:.4f}')
    prev_time = time.time()
    
    if val_loss < best_val_loss:
        best_outputs = np.array(all_outputs)
        if not os.path.exists('./model'):
            os.makedirs('./model')
        torch.save(model.state_dict(), './model/kobert_model.pth')
        best_val_loss = val_loss
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= patience:
            print("Early stopping")
            break

Epoch 1/100, Train Loss: 0.0568, Val Loss: 0.0456, Time Elapsed: 6.8013
Epoch 2/100, Train Loss: 0.0284, Val Loss: 0.0394, Time Elapsed: 5.4532
Epoch 3/100, Train Loss: 0.0241, Val Loss: 0.0369, Time Elapsed: 5.2994
Epoch 4/100, Train Loss: 0.0217, Val Loss: 0.0356, Time Elapsed: 5.5189
Epoch 5/100, Train Loss: 0.0205, Val Loss: 0.0346, Time Elapsed: 5.4959
Epoch 6/100, Train Loss: 0.0198, Val Loss: 0.0339, Time Elapsed: 5.3946
Epoch 7/100, Train Loss: 0.0190, Val Loss: 0.0351, Time Elapsed: 5.4450
Epoch 8/100, Train Loss: 0.0184, Val Loss: 0.0338, Time Elapsed: 5.5432
Epoch 9/100, Train Loss: 0.0180, Val Loss: 0.0316, Time Elapsed: 5.5274
Epoch 10/100, Train Loss: 0.0176, Val Loss: 0.0333, Time Elapsed: 5.4077
Epoch 11/100, Train Loss: 0.0172, Val Loss: 0.0331, Time Elapsed: 5.8063
Epoch 12/100, Train Loss: 0.0168, Val Loss: 0.0310, Time Elapsed: 6.6663
Epoch 13/100, Train Loss: 0.0166, Val Loss: 0.0317, Time Elapsed: 7.8648
Epoch 14/100, Train Loss: 0.0164, Val Loss: 0.0301, Time Ela

In [None]:
y_pred = best_outputs
y_test = np.array(valid_y)
y_test = np.rint(y_test*len(args['num_range'])).astype(int)
y_pred = np.rint(y_pred*len(args['num_range'])).astype(int)
y_pred = np.clip(y_pred, min(y_test), max(y_test))

In [23]:

metrics = compute_metrics(y_pred,y_test)
metrics

{'exp1': {'accuracy': 0.6044700304774805, 'kappa': 0.28124239377744853},
 'exp2': {'accuracy': 0.6410430071114122, 'kappa': 0.33612429122508114},
 'exp3': {'accuracy': 0.7126650863528615, 'kappa': 0.8688147490311131},
 'org1': {'accuracy': 0.536065018625127, 'kappa': 0.3406097226796083},
 'org2': {'accuracy': 0.75330172705723, 'kappa': 0.8982884062761548},
 'org3': {'accuracy': 0.6796478157805621, 'kappa': 0.8575876170944874},
 'org4': {'accuracy': 0.7260413139180495, 'kappa': 0.659550517719069},
 'con1': {'accuracy': 0.6366407043684389, 'kappa': 0.355000063521442},
 'con2': {'accuracy': 0.6134439552996952, 'kappa': 0.09285019234206038},
 'con3': {'accuracy': 0.7571960717913986, 'kappa': 0.9027817107688751},
 'con4': {'accuracy': 0.5951574669827294, 'kappa': 0.4233534435385514},
 'overall': {'accuracy': 0.6596065634331805, 'kappa': 0.5469275552703536}}

In [21]:
y_test.flatten()

array([3, 3, 2, ..., 2, 3, 3])