In [79]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import re

In [2]:
# Load the training split and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='train.csv')
data.head(n=5)

Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


In [101]:
# Hard-coded special-token ids. NOTE(review): 101 and 0 look like the [CLS]
# and [PAD] ids of the bert-base-uncased vocabulary -- confirm they match the
# tokenizer actually in use. Neither constant is referenced in the cells
# visible here.
CLS_TOKEN = 101
PAD_TOKEN = 0

class EnglishDataset(Dataset):
    """Essay-scoring dataset: tokenized essay text plus its analytic scores.

    Each item is a pair of dicts:
    ``({'input_ids': LongTensor, 'attention_mask': LongTensor},
    {'target': FloatTensor})``.

    Columns are selected by name rather than by position, so a saved index
    column (``Unnamed: 0``) does not shift the selection, and all six score
    columns are returned to match the six-output regression head.

    Args:
        file_name: Path of a CSV file containing a ``full_text`` column and,
            for labelled data, every column listed in ``TARGET_COLUMNS``.
        tokenizer: Either a Hugging Face checkpoint name or an already
            constructed tokenizer object. Token ids are vocabulary-specific,
            so this must be the tokenizer of the encoder being trained; the
            default matches the ``microsoft/deberta-v3-base`` encoder.
            Passing an instance lets several datasets share one tokenizer.
        max_length: Every essay is padded / truncated to this many tokens.

    Raises:
        KeyError: if the CSV has no ``full_text`` column.
        ValueError: if only some of the target columns are present.
    """

    TEXT_COLUMN = "full_text"
    TARGET_COLUMNS = (
        "cohesion", "syntax", "vocabulary",
        "phraseology", "grammar", "conventions",
    )

    def __init__(self, file_name, tokenizer="microsoft/deberta-v3-base", max_length=512):
        text_df = pd.read_csv(file_name)

        present = [col for col in self.TARGET_COLUMNS if col in text_df.columns]
        if present and len(present) != len(self.TARGET_COLUMNS):
            missing = sorted(set(self.TARGET_COLUMNS) - set(present))
            raise ValueError(f"{file_name}: missing target columns {missing}")

        # Resolve the tokenizer once; reloading it per call is wasted work.
        if isinstance(tokenizer, str):
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
        self.tokenizer = tokenizer
        self.max_length = max_length

        # A file without score columns (e.g. an unlabelled split) yields a
        # zero-width array, so each item then carries an empty target tensor.
        self.y = text_df[present].to_numpy(dtype="float32")

        # Missing essays become empty strings instead of crashing the
        # string preprocessing on a float NaN.
        raw_text = text_df[self.TEXT_COLUMN].fillna("").astype(str).tolist()
        self.n_samples = len(raw_text)

        self.x = self.preprocessing(raw_text)
        self.x, self.attention_mask = self.tokenize_text(self.x)

    def __getitem__(self, index):
        """Return ``(model_inputs, labels)`` for the essay at ``index``."""
        model_inputs = {
            'input_ids': self.x[index],
            'attention_mask': self.attention_mask[index],
        }
        labels = {'target': torch.tensor(self.y[index], dtype=torch.float32)}
        return model_inputs, labels

    def __len__(self):
        """Number of essays in the dataset."""
        return self.n_samples

    def preprocessing(self, text):
        """Replace newlines with spaces in every essay.

        Nothing else is normalized: incorrect grammar has to stay in the
        text, so preprocessing deliberately stops here.

        Args:
            text: Iterable of essay strings.

        Returns:
            List of cleaned strings, in the same order.
        """
        return [content.replace("\n", " ") for content in text]

    def tokenize_text(self, text):
        """Tokenize essays to fixed-length id and attention-mask tensors.

        Args:
            text: Iterable of essay strings.

        Returns:
            ``(input_ids, attention_mask)``: two lists holding one 1-D
            ``torch.long`` tensor of length ``self.max_length`` per essay.
        """
        texts = list(text)
        if not texts:
            return [], []

        # One batched call instead of one call per essay.
        encoded = self.tokenizer(
            texts,
            add_special_tokens=True,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
        )
        tokenized_text = [
            torch.tensor(ids, dtype=torch.long) for ids in encoded["input_ids"]
        ]
        attention_mask = [
            torch.tensor(mask, dtype=torch.long) for mask in encoded["attention_mask"]
        ]
        return tokenized_text, attention_mask

In [102]:
# Build both splits up front; each EnglishDataset tokenizes its whole CSV
# at construction time.
train_dataset = EnglishDataset("train.csv")
test_dataset = EnglishDataset("test.csv")

# Only the training batches are shuffled. The evaluation loader keeps the
# file order so predictions stay aligned with the rows of test.csv.
dataloader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [103]:
# Sanity check: show the keys of the model-input dict of the first sample.
first_inputs = train_dataset[0][0]
first_inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [110]:
class DeBERTaClass(torch.nn.Module):
    """DeBERTa-v3 encoder with a linear head that regresses six scores.

    The encoder's token representations are mean-pooled over the real
    (non-padding) tokens, so the head produces one 6-vector per essay
    instead of one per token.
    """

    def __init__(self):
        super().__init__()
        self.deberta = AutoModel.from_pretrained("microsoft/deberta-v3-base")
        # Take the width from the checkpoint config rather than a literal 768.
        self.linear = torch.nn.Linear(self.deberta.config.hidden_size, 6)

    def forward(self, inputs):
        """Predict the six scores for a batch of tokenized essays.

        Args:
            inputs: Dict with ``'input_ids'`` and ``'attention_mask'``
                tensors, both of shape ``(batch, seq_len)``.

        Returns:
            Tensor of shape ``(batch, 6)``.
        """
        attention_mask = inputs['attention_mask']
        deberta_output = self.deberta(
            input_ids=inputs['input_ids'],
            attention_mask=attention_mask,
            return_dict=True,
        )
        hidden = deberta_output['last_hidden_state']  # (batch, seq_len, hidden)

        # Masked mean pooling: padding positions contribute nothing, and the
        # clamp guards against division by zero for an all-padding row.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1.0)

        return self.linear(pooled)

In [111]:
# Train on a GPU when one is available; the model and every batch must live
# on the same device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeBERTaClass().to(device)

#hyperparams
epochs = 10
# 1e-3 is far above the range normally used to fine-tune a pretrained
# transformer (roughly 1e-5 to 5e-5) and tends to destroy the pretrained
# weights.
lr = 2e-5
log_every = 50  # print the running loss every this many steps

# loss, optimizer
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#training
n_total_steps = len(dataloader)
model.train()  # enable dropout etc. for training
for epoch in range(epochs):
    for i, (inputs, targets) in enumerate(dataloader):
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        # The dataset wraps the labels in a dict; the loss needs the tensor.
        labels = targets['target'].to(device)

        #forward
        outputs = model(inputs)
        # MSELoss silently broadcasts mismatched shapes, so fail loudly.
        if outputs.shape != labels.shape:
            raise ValueError(
                f"prediction shape {tuple(outputs.shape)} does not match "
                f"target shape {tuple(labels.shape)}"
            )
        loss = loss_fn(outputs, labels)

        #backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Log periodically and at the end of every epoch, instead of at one
        # hard-coded step index.
        if (i + 1) % log_every == 0 or (i + 1) == n_total_steps:
            print(f"epoch {epoch+1} / {epochs}, step {i+1} / {n_total_steps}, loss = {loss.item():.4f}")

# Save next to the notebook; the filesystem root is normally not writable.
torch.save(model.state_dict(), "train_save.pt")

MemoryError: 