reference : https://www.kaggle.com/code/uygarkk/youtube-debertav3-autocorrect/notebook

imports

In [33]:
import pandas as pd
import transformers
from transformers import DebertaV2TokenizerFast, DebertaV2ForSequenceClassification
import torch
from torch import optim
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn import MSELoss
import numpy as np
import random
import timeit
from tqdm import tqdm
import autocorrect

Hyperparameter Definitions

In [35]:
RANDOM_SEED = 42
MODEL_PATH = "microsoft/deberta-v3-base"
MAX_LENGTH = 512
BATCH_SIZE = 8
LEARNING_RATE = 2e-5
EPOCHS = 2

random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = "cuda" if torch.cuda.is_available() else "cpu"
transformers.utils.logging.set_verbosity_error()

Data

In [36]:
train_summary_df = pd.read_csv("./CommonLit_data/summaries_train.csv")
train_prompt_df = pd.read_csv("./CommonLit_data/prompts_train.csv")
test_summary_df = pd.read_csv("./CommonLit_data/summaries_test.csv")
test_prompt_df = pd.read_csv("./CommonLit_data/prompts_test.csv")
submission_df = pd.read_csv("./CommonLit_data/sample_submission.csv")

Correct Spelling Mistakes & check correct numbers

In [38]:
spell = autocorrect.Speller(lang='en', fast=True)

# 원래 요약문 texts
original_texts = train_summary_df["text"].tolist()

# spell 적용
corrected_texts = train_summary_df["text"].apply(lambda x: spell(x)).tolist()

# 철자가 수정된 단어 및 그 개수를 저장할 딕셔너리
corrections = {}

# 원본 텍스트와 교정된 텍스트를 비교
for original, corrected in zip(original_texts, corrected_texts):
    original_words = original.split()
    corrected_words = corrected.split()
    
    # 단어별로 비교
    for orig_word, corr_word in zip(original_words, corrected_words):
        if orig_word != corr_word:
            key = (orig_word, corr_word)
            if key not in corrections:
                corrections[key] = 1
            else:
                corrections[key] += 1

# 결과를 수정 횟수에 따라 내림차순으로 정렬
sorted_corrections = sorted(corrections.items(), key=lambda x: x[1], reverse=True)
"""
딕셔너리에 items() 메서드를 사용하면, 딕셔너리의 항목들을
(key, value) 형태로 가져올 수 있다.

key=lambda x: x[1]는 정렬 기준을 결정하는 부분이다. x는 (key, value)형태인데,
x[1]은 value를 기준으로 정렬하겠다는 뜻이다. 즉, 오타 수정한 숫자만큼 정렬
근데 reverse=True이므로 내림차순으로 정렬하겠다는 것.
sorted() 하면 반환값은 항상 리스트로 나오게 된다.

이 리스트는 [((orig, corr), count), ((orig, corr), count), ...] 형태
왜냐? 위에서  key = (orig_word, corr_word) 즉, key를 튜플형태로 해놨기 떄문
"""


# 결과 출력
for (orig, corr), count in sorted_corrections:
    print(f"'{orig}' was corrected to '{corr}' {count} times.")

'borax' was corrected to 'bora' 567 times.
'pharaohs' was corrected to 'pharaoh' 548 times.
'goverment' was corrected to 'government' 386 times.
'dosed' was corrected to 'dose' 293 times.
'hams' was corrected to 'has' 242 times.
'moldy' was corrected to 'mold' 232 times.
'hoppers,' was corrected to 'hopper,' 209 times.
'Pharaohs' was corrected to 'Pharaoh' 179 times.
'toiled' was corrected to 'tailed' 163 times.
'pharaohs,' was corrected to 'pharaoh,' 155 times.
'Osiris,' was corrected to 'Siris,' 146 times.
'hoppers' was corrected to 'hopper' 91 times.
'belived' was corrected to 'believed' 86 times.
'pharoh' was corrected to 'pharaoh' 83 times.
'pickling' was corrected to 'picking' 80 times.
'tradgedy' was corrected to 'tragedy' 78 times.
'Craftsmen' was corrected to 'Craftsman' 77 times.
'pharaohs.' was corrected to 'pharaoh.' 74 times.
'goverment.' was corrected to 'government.' 72 times.
'quelled' was corrected to 'fuelled' 70 times.
'pharoah' was corrected to 'pharaoh' 68 times.
'

In [39]:
train_summary_df["text"] = train_summary_df["text"].apply(lambda x: spell(x))
test_summary_df["text"] = test_summary_df["text"].apply(lambda x: spell(x))
"""
.apply() 메서드는 데이터프레임의 특정 열의 각 값에 () 안의 함수를 적용한다.
그니까 위의 두 코드는 'text'열에 있는 모든 텍스트에 맞춤법 교정을 적용하고, 
그 결과를 원래의 'text'열에 다시 저장하는 것이다
"""

"\n.apply() 메서드는 데이터프레임의 특정 열의 각 값에 () 안의 함수를 적용한다.\n그니까 위의 두 코드는 'text'열에 있는 모든 텍스트에 맞춤법 교정을 적용하고, \n그 결과를 원래의 'text'열에 다시 저장하는 것이다\n"

Model and Tokenizer Definition

In [40]:
tokenizer = DebertaV2TokenizerFast.from_pretrained(MODEL_PATH)
model = DebertaV2ForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=2).to(device)
"""
토크나이저와 모델 정의
"""



'\n토크나이저와 모델 정의\n'

merge Datasets

In [41]:
train_df = train_prompt_df.merge(train_summary_df, on="prompt_id")
"""
프롬프트 df와 요약문 df를 합친다. prompt_id을 기준으로.
"""
train_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886


In [42]:
train_df["inputs"] = train_df["prompt_question"] + " " + train_df["prompt_title"] + " " + tokenizer.sep_token + " " + train_df["text"]

"""
train_df에 'inputs'라는 새로운 column을 만든다. 여기에 뭘 집어넣냐?
'prompt_question' + "prompt_title" + [SEP] +  "text"

프롬프트의 text는 넣지 않는다.
"""
train_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,content,wording,inputs
0,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00791789cc1f,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,Summarize at least 3 elements of an ideal trag...
1,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0086ef22de8f,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,Summarize at least 3 elements of an ideal trag...
2,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,0094589c7a22,Aristotle states that an ideal tragedy should ...,-0.387791,-0.584181,Summarize at least 3 elements of an ideal trag...
3,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00cd5736026a,One element of an Ideal tragedy is having a co...,0.088882,-0.59471,Summarize at least 3 elements of an ideal trag...
4,39c16e,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...,00d98b8ff756,The 3 ideal of tragedy is how complex you need...,-0.687288,-0.460886,Summarize at least 3 elements of an ideal trag...


In [50]:
test_df = test_prompt_df.merge(test_summary_df, on="prompt_id")
test_df["inputs"] = test_df["prompt_question"] + " " + test_df["prompt_title"] + tokenizer.sep_token + test_df["text"]
"""
test_df에도 위의 train때와 마찬가지로 적용
"""
test_df.head()

Unnamed: 0,prompt_id,prompt_question,prompt_title,prompt_text,student_id,text,inputs
0,abc123,Summarize...,Example Title 1,Heading\nText...,000000ffffff,Example text 1,Summarize... Example Title 1[SEP]Example text 1
1,abc123,Summarize...,Example Title 1,Heading\nText...,222222cccccc,Example text 3,Summarize... Example Title 1[SEP]Example text 3
2,def789,Summarize...,Example Title 2,Heading\nText...,111111eeeeee,Example text 2,Summarize... Example Title 2[SEP]Example text 2
3,def789,Summarize...,Example Title 2,Heading\nText...,333333dddddd,Example text 4,Summarize... Example Title 2[SEP]Example text 4


In [48]:
max_length = train_df['inputs'].apply(lambda x: len(x)).max()
"""
'inputs' 열에서 문자의 길이를 측정해서 최대값을 뽑아본다
len(문자열)은 모든 문자의 수를 나타냄
-> len("I am a boy")는 10이다.
"""
print(max_length)


4152


In [49]:
s = train_df['inputs'].str.len()
"""
'inputs'열의 각 문자열에 대한 길이를 계산하고
describe() 메서드를 이용해서 여러가지 통계를 뽑아본다.
"""
s.describe()


count    7165.000000
mean      563.672994
std       318.629232
min       220.000000
25%       356.000000
50%       470.000000
75%       668.000000
max      4152.000000
Name: inputs, dtype: float64

Dataset and DataLoader implementation

In [51]:
class SummaryTrainDataset(Dataset):
    def __init__(self, inputs, content, wording, tokenizer):
        self.scores = torch.tensor([list(x) for x in zip(content, wording)])
        self.encodings = tokenizer(inputs, padding=True, truncation=True, max_length=MAX_LENGTH)
        """
        inputs : 토큰화할 텍스트 데이터
        content, wording : 리스트
        content와 wording 리스트를 zip으로 묶어서 각 튜플을 리스트로 변환하고
        이를 Pytorch tensor로 변환한 것이 self.scores
        
        inputs을 토큰화한 것이 self.encodings (딕셔너리)
        input_ids나 attention_mask 등의 key 값들이 있다.
        value는 리스트임
        """
        
    def __len__(self):
        """
        Dataset의 길이를 반환
        """
        return len(self.scores)
    
    def __getitem__(self , idx):
        """
        특정 인덱스의 데이터를 반환
        self.encodings 딕셔너리에서 각 리스트의 idx 번째 값을
        텐서로 변환하고 새로운 딕셔너리를 만든다.
        
        이걸 왜 하냐? DataLoader를 통해서 모델에 배치 단위로 제공된다.
        
        그리고 out_dic에 새로운 열 'scores'를 추가하고 
        ids번째 score 쌍을 넣는다.
        """
        out_dic = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    
        out_dic["scores"] = self.scores[idx]
        return out_dic
    
class SummarySubmitDataset(Dataset):
    """
    예는 train이 아니라서 content나 score이 없음
    """
    def __init__(self, inputs, ids, tokenizer):
        self.ids = ids
        self.encodings = tokenizer(inputs, padding=True, truncation=True, max_length=MAX_LENGTH)

    def __len__(self):
        return len(self.ids)

    def __getitem__(self , idx):
        out_dic = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        out_dic["ids"] = self.ids[idx]
        return out_dic

In [52]:
dataset = SummaryTrainDataset(train_df["inputs"].to_list(), train_df["content"].to_list(), train_df["wording"].to_list(), tokenizer)

print(len(dataset))
print(dataset[0])


7165
{'input_ids': tensor([     1, 105982,    288,    668,    404,   2019,    265,    299,   1949,
          8948,    261,    283,   1897,    293,  26446,    260,    589,  56195,
             2,    376,   3036,    265,    299,   1949,   8948,    269,    272,
           278,    403,    282,   6128,    277,    266,   1739,    741,    260,
          1811,   3036,    265,    299,   1949,   8948,    269,    272,    278,
           403,    364,    286,    311,    872,    889,    260,    279,    437,
          3036,    265,    299,   1949,   8948,    269,    272,    278,    403,
           286,    266,   1664,   3676,   4278,    263,    299,   3680,  21419,
           270,    462,    397,    263,    966,    260,      2,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      

In [53]:
test_dataset = SummarySubmitDataset(test_df["inputs"].to_list(), test_df["student_id"].to_list(), tokenizer)
print(len(test_dataset))
print(test_dataset[0])

4
{'input_ids': tensor([     1, 105982,    260,    260,    260,  11134,   7181,    376,      2,
         11134,   1529,    376,      2]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'ids': '000000ffffff'}


In [55]:
# 난수 생성하고 0.9, 0.1 비율로 train, val dataset 분할
generator = torch.Generator().manual_seed(RANDOM_SEED)
train_dataset, val_dataset = random_split(dataset, [0.9, 0.1], generator=generator)

In [56]:
"""
train, validation, test DataLoader를 만든다.
"""

train_dataloader = DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

val_dataloader = DataLoader(dataset=val_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True)

test_dataloader = DataLoader(dataset=test_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=False)

Training Loop

In [None]:
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = MSELoss(reduction="mean")

start = timeit.default_timer() 
for epoch in tqdm(range(EPOCHS), position=0, leave=True):
    model.train()
    train_running_loss = 0 
    for idx, sample in enumerate(tqdm(train_dataloader, position=0, leave=True)):
        input_ids = sample["input_ids"].to(device)
        attention_mask = sample["attention_mask"].to(device)
        targets = sample["scores"].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(targets, outputs["logits"])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_running_loss += loss.item()
    train_loss = train_running_loss / (idx + 1)

    model.eval()
    val_running_loss = 0 
    preds = []
    golds = []
    with torch.no_grad():
        for idx, sample in enumerate(tqdm(val_dataloader, position=0, leave=True)):
            input_ids = sample["input_ids"].to(device)
            attention_mask = sample["attention_mask"].to(device)
            targets = sample["scores"].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            
            loss = criterion(targets, outputs[0])
            
            val_running_loss += loss.item()
    val_loss = val_running_loss / (idx + 1)

    print("-"*30)
    print(f"Train Loss EPOCH {epoch+1}: {train_loss:.4f}")
    print(f"Valid Loss EPOCH {epoch+1}: {val_loss:.4f}")
    print("-"*30)
stop = timeit.default_timer()
print(f"Training Time: {stop-start:.2f}s")

In [None]:
torch.cuda.empty_cache()

In [None]:
contents = []
wordings = []
ids = []
model.eval()
with torch.no_grad():
    for idx, sample in enumerate(tqdm(test_dataloader, position=0, leave=True)):
        input_ids = sample["input_ids"].to(device)
        attention_mask = sample["attention_mask"].to(device)
        ids.extend(sample["ids"])
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)["logits"]
        
        contents.extend([float(i) for i in outputs[:,0]])
        wordings.extend([float(i) for i in outputs[:,1]])

In [None]:
submission_df = pd.DataFrame(list(zip(ids, contents, wordings)),
               columns =["student_id", "content", "wording"])
submission_df.to_csv("submission.csv", index=False)
submission_df.head()

In [None]:
# del model
# torch.cuda.init()

# torch.cuda.empty_cache()

# import gc
# gc.collect()