# Library

In [1]:
import os
import re
import warnings

import numpy as np
import pandas as pd


warnings.filterwarnings("ignore")

import tokenizers

# from transformers import RobertaTokenizer, RobertaForMaskedLM, LineByLineTextDataset, DataCollatorForLanguageModeling,Trainer, TrainingArguments
import torch
import transformers
from sklearn.model_selection import GroupKFold, StratifiedKFold
from tqdm import tqdm
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    DebertaV2ForMaskedLM,
    DebertaV2Tokenizer,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)

2024-04-25 12:49:33.937065: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-25 12:49:33.937171: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-25 12:49:34.072023: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Tokenizer

In [2]:
# ----------------------------------------------------
# Tokenizer
# ----------------------------------------------------
# Load the tokenizer files from the attached Kaggle dataset and register
# "[BR]" as an additional special token. The last expression is the return
# value of `add_special_tokens`, so the cell displays it.
tokenizer_dir = "/kaggle/input/lal-deberta-base-v018/"
br_token_spec = {"additional_special_tokens": ["[BR]"]}

tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
tokenizer.add_special_tokens(br_token_spec)

1

# Data Loading

In [3]:
# Read the training CSV, keep only the essay identifier and essay body, and
# rename them to the short column names ("id", "text") used further down.
train = (
    pd.read_csv("/kaggle/input/lal-train-data2/train.csv")
    .loc[:, ["essay_id", "full_text"]]
    .rename(columns={"essay_id": "id", "full_text": "text"})
)

In [4]:
import codecs
from typing import Dict, List, Tuple

from text_unidecode import unidecode


def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler that falls back to UTF-8 for unencodable text.

    Args:
        error: The encoding error raised by the codec; its ``object``,
            ``start`` and ``end`` attributes locate the offending slice.

    Returns:
        A ``(replacement, resume_position)`` pair: the offending slice encoded
        as UTF-8, and ``error.end`` as the index where encoding continues.
    """
    offending_slice = error.object[error.start : error.end]
    replacement = offending_slice.encode("utf-8")
    return replacement, error.end


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler that falls back to cp1252 for undecodable bytes.

    Args:
        error: The decoding error raised by the codec; its ``object``,
            ``start`` and ``end`` attributes locate the offending bytes.

    Returns:
        A ``(replacement, resume_position)`` pair: the offending bytes decoded
        as cp1252, and ``error.end`` as the index where decoding continues.
    """
    offending_bytes = error.object[error.start : error.end]
    replacement = offending_bytes.decode("cp1252")
    return replacement, error.end


# Expose both fallback handlers to `str.encode` / `bytes.decode` so they can be
# selected by name through the `errors=` argument.
_error_handlers = {
    "replace_encoding_with_utf8": replace_encoding_with_utf8,
    "replace_decoding_with_cp1252": replace_decoding_with_cp1252,
}
for _handler_name, _handler in _error_handlers.items():
    codecs.register_error(_handler_name, _handler)


def resolve_encodings_and_normalize(text: str) -> str:
    """Repair mixed-encoding text and normalize it with ``unidecode``.

    The text is round-tripped through bytes twice. Bytes that are not valid
    UTF-8 are decoded as cp1252, and characters that cp1252 cannot encode are
    emitted as UTF-8, using the error handlers registered under the names
    ``replace_decoding_with_cp1252`` and ``replace_encoding_with_utf8``.

    Args:
        text: The raw string to clean.

    Returns:
        The repaired string after being passed through ``unidecode``.
    """
    # Step 1: text -> bytes, escaping anything outside Latin-1.
    escaped_bytes = text.encode("raw_unicode_escape")
    # Step 2: read those bytes as UTF-8, falling back to cp1252 per bad byte.
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    # Step 3: back to bytes via cp1252, falling back to UTF-8 per bad character.
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    # Step 4: final UTF-8 read with the same cp1252 fallback.
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)

In [5]:
# Clean every essay, then replace each newline with the literal "[BR]" marker
# so that an essay occupies a single line of text.
cleaned_text = train["text"].map(resolve_encodings_and_normalize)
train["text"] = cleaned_text.str.replace("\n", "[BR]", regex=False)

# CV Split

In [6]:
# Split the essays into 20 folds grouped by essay id; fold 0 becomes the
# validation text and the remaining 19 folds become the training text.
gkf = GroupKFold(n_splits=20)

# `split` yields *positional* row indices, so the fold ids are collected in a
# positional array. Writing them through label-based `.loc` would only be
# correct while `train` has a default RangeIndex.
fold_ids = np.full(len(train), -1, dtype=int)
for fold, (_, val_idx) in enumerate(gkf.split(train, groups=train["id"])):
    fold_ids[val_idx] = fold
train["fold"] = fold_ids

# One essay per line: newlines inside an essay were replaced by "[BR]" earlier.
is_valid = train["fold"] == 0
train_text = "\n".join(train.loc[~is_valid, "text"].tolist())
val_text = "\n".join(train.loc[is_valid, "text"].tolist())

# LineByLineTextDataset reads these files back as UTF-8, so write them with the
# same explicit encoding instead of the platform default.
with open("train_text.txt", "w", encoding="utf-8") as f:
    f.write(train_text)
with open("val_text.txt", "w", encoding="utf-8") as f:
    f.write(val_text)

# Train

In [7]:
# Start from the public DeBERTa-v3 base checkpoint with a masked-LM head.
model_name = "microsoft/deberta-v3-base"
model = DebertaV2ForMaskedLM.from_pretrained(model_name)

# The tokenizer was extended with "[BR]" after being loaded. If it now holds
# more ids than the embedding table has rows, looking up the new id would index
# out of range, so grow the embeddings to match. This is a no-op when the table
# is already large enough.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Tokenize the training text file line by line, with a block size of 512.
train_file = "train_text.txt"
train_block_size = 512
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer, file_path=train_file, block_size=train_block_size
)

In [9]:
# Tokenize the validation text file line by line, with a block size of 512.
valid_file = "val_text.txt"
valid_block_size = 512
valid_dataset = LineByLineTextDataset(
    tokenizer=tokenizer, file_path=valid_file, block_size=valid_block_size
)

In [10]:
# Collator for masked-language-model training with a masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# Evaluate every 7228 steps and checkpoint on every second evaluation, so that
# save_steps stays a whole multiple of eval_steps.
eval_every_n_steps = 7228
save_every_n_steps = 2 * eval_every_n_steps  # 14456

training_args = TrainingArguments(
    # --- output location -------------------------------------------------
    output_dir="./deberta_v3_base_chk",
    overwrite_output_dir=True,
    # --- training schedule -----------------------------------------------
    num_train_epochs=8,
    per_device_train_batch_size=4,
    # --- evaluation ------------------------------------------------------
    evaluation_strategy="steps",
    eval_steps=eval_every_n_steps,
    prediction_loss_only=True,
    # --- checkpointing ---------------------------------------------------
    save_strategy="steps",
    save_steps=save_every_n_steps,
    save_total_limit=0,
    # --- best-model selection (lower eval loss is better) ----------------
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    # --- logging ---------------------------------------------------------
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
)

In [11]:
# Run the MLM training, then export the resulting model weights.
trainer.train()

final_model_dir = "./deberta_v3_base"
trainer.save_model(final_model_dir)
# The Trainer was not given the tokenizer, so save it explicitly: the exported
# directory then also carries the vocabulary that includes the added "[BR]" token.
tokenizer.save_pretrained(final_model_dir)

Step,Training Loss,Validation Loss
7228,2.0609,1.899971
14456,1.7981,1.656094
21684,1.6823,1.575009
28912,1.5695,1.492748
36140,1.5312,1.444767
43368,1.4683,1.420768
50596,1.4346,1.376557
57824,1.4064,1.348831


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].
