Reference: https://www.kaggle.com/code/tsunotsuno/debertav3-baseline-content-and-wording-models

In [66]:
import numpy as np
import pandas as pd
import warnings
import logging
import os
import shutil
import json
import transformers
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from datasets import Dataset, load_dataset, load_from_disk
from transformers import TrainingArguments, Trainer
from datasets import load_metric, disable_progress_bar
from sklearn.metrics import mean_squared_error
import torch
from sklearn.model_selection import KFold, GroupKFold

# Quiet the notebook: silence Python warnings, suppress log records up to and
# including ERROR, and switch off the `datasets` progress bars.
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "TF_CPP_MIN_LOG_LEVEL": '3',
})
disable_progress_bar()

In [67]:
# set random seed
def seed_everything(seed:int):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # `random` is not imported at module level, so the imports stay local.
    import random, os
    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode autotunes kernels per input shape and may pick different
    # algorithms between runs, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(seed=42)

In [68]:
class CFG:
    """Hyper-parameters shared by every cell of the notebook."""

    # --- backbone ---------------------------------------------------------
    model_name: str = "microsoft/deberta-v3-base"
    max_length: int = 512
    hidden_dropout_prob: float = 0.0           # alternative noted by the author: 0.005
    attention_probs_dropout_prob: float = 0.0  # alternative noted by the author: 0.005

    # --- optimisation -----------------------------------------------------
    learning_rate: float = 1.5e-5
    weight_decay: float = 0.02
    num_train_epochs: int = 3
    batch_size: int = 8

    # --- cross-validation / checkpointing ----------------------------------
    n_splits: int = 4
    random_seed: int = 42
    save_steps: int = 100

In [69]:
DATA_DIR = "./CommonLit_data/"

# Raw tables read from DATA_DIR (the trailing slash makes plain concatenation safe).
prompts_train = pd.read_csv(f"{DATA_DIR}prompts_train.csv")
prompts_test = pd.read_csv(f"{DATA_DIR}prompts_test.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}summaries_test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

# Attach the prompt columns to every summary row; a left join keeps all summaries.
train = pd.merge(summaries_train, prompts_train, how="left", on="prompt_id")
test = pd.merge(summaries_test, prompts_test, how="left", on="prompt_id")

# For a quick smoke test, restrict to a small subset:
# train = train.head(100)
print(train.shape)
train.head(10)

(7165, 8)


Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...
5,0071d51dab6d,ebad26,They would use chemicals and substances to cha...,0.205683,0.380538,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
6,0072b649a88c,3b9047,The Egyptian society is really different from ...,0.205683,0.380538,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...
7,00746c7c79c3,ebad26,"Many times the factories would, according to t...",-0.878889,-0.96633,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an..."
8,00791789cc1f,39c16e,1 element of an ideal tragedy is that it shoul...,-0.210614,-0.471415,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...
9,0086ef22de8f,39c16e,The three elements of an ideal tragedy are: H...,-0.970237,-0.417058,Summarize at least 3 elements of an ideal trag...,On Tragedy,Chapter 13 \r\nAs the sequel to what has alrea...


In [70]:
# Group by prompt so that all summaries of one prompt land in the same fold.
gkf = GroupKFold(n_splits=CFG.n_splits)

# `split` yields *positional* row indices, so they are written into a NumPy
# buffer instead of going through label-based `.loc`. Starting from an integer
# buffer also keeps the fold ids as ints rather than NaN-initialised floats.
fold_ids = np.full(len(train), -1, dtype=int)
for i, (_, val_index) in enumerate(gkf.split(train, groups=train["prompt_id"])):
    fold_ids[val_index] = i

# Every row must have been placed in exactly one validation fold.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
train["fold"] = fold_ids

train.head()

Unnamed: 0,student_id,prompt_id,text,content,wording,prompt_question,prompt_title,prompt_text,fold
0,000e8c3c7ddb,814d6b,The third wave was an experimentto see how peo...,0.205683,0.380538,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,3.0
1,0020ae56ffbf,ebad26,They would rub it up with soda to make the sme...,-0.548304,0.506755,Summarize the various ways the factory would u...,Excerpt from The Jungle,"With one member trimming beef in a cannery, an...",2.0
2,004e978e639e,3b9047,"In Egypt, there were many occupations and soci...",3.128928,4.231226,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,1.0
3,005ab0199905,3b9047,The highest class was Pharaohs these people we...,-0.210614,-0.471415,"In complete sentences, summarize the structure...",Egyptian Social Structure,Egyptian society was structured like a pyramid...,1.0
4,0070c9e7af47,814d6b,The Third Wave developed rapidly because the ...,3.272894,3.219757,Summarize how the Third Wave developed over su...,The Third Wave,Background \r\nThe Third Wave experiment took ...,3.0


In [71]:
def compute_metrics(eval_pred):
    """Return the RMSE between predictions and labels for the HF ``Trainer``.

    Args:
        eval_pred: A ``(predictions, labels)`` pair.

    Returns:
        dict with a single key ``"rmse"``.
    """
    predictions, labels = eval_pred
    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6.
    # Taking the root of the per-output MSE and averaging reproduces the value
    # it returned, without relying on the removed argument.
    per_output_mse = mean_squared_error(labels, predictions, multioutput="raw_values")
    rmse = np.sqrt(per_output_mse).mean()
    return {"rmse" : rmse}

def compute_mcrmse(eval_pred):
    """Compute the mean column-wise RMSE (MCRMSE) over two target columns.

    Args:
        eval_pred: A ``(preds, labels)`` pair of equally shaped 2-D array-likes,
            one row per sample and one column per target.

    Returns:
        dict with the RMSE of column 0 (``"content_rmse"``), the RMSE of
        column 1 (``"wording_rmse"``) and their mean (``"mcrmse"``).
        NOTE(review): the key names assume column 0 is content and column 1 is
        wording; confirm against the caller's column order.
    """
    preds, labels = eval_pred
    # Accept plain nested lists as well as arrays.
    preds = np.asarray(preds)
    labels = np.asarray(labels)

    # Fixed typo: the original called the non-existent `np.srqt`.
    col_rmse = np.sqrt(np.mean((preds - labels) ** 2, axis=0))
    mcrmse = np.mean(col_rmse)

    return {
        "content_rmse": col_rmse[0],
        "wording_rmse": col_rmse[1],
        "mcrmse": mcrmse
    }

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average the content RMSE and the wording RMSE into a single score.

    Args:
        content_true: Ground-truth content values.
        content_pred: Predicted content values.
        wording_true: Ground-truth wording values.
        wording_pred: Predicted wording values.

    Returns:
        The mean of the two per-target RMSE values.
    """
    target_pairs = (
        (content_true, content_pred),
        (wording_true, wording_pred),
    )
    # RMSE is the square root of the MSE of each target.
    rmse_per_target = [
        mean_squared_error(y_true, y_pred) ** 0.5
        for y_true, y_pred in target_pairs
    ]
    return sum(rmse_per_target) / 2
    

In [72]:
class ContentScoreRegressor:
    """Fine-tune and run a single-output regression model for one target column.

    The instance wraps a pretrained tokenizer and a model config adjusted for
    regression with one label. ``train`` fine-tunes with the Hugging Face
    ``Trainer`` and stores the result in ``model_dir``; ``predict`` reloads the
    model from ``model_dir`` and returns its raw predictions.
    """

    def __init__(self, model_name: str,
                 model_dir: str,
                 target: str,
                 hidden_dropout_prob: float,
                 attention_probs_dropout_prob: float,
                 max_length: int,):
        """Load the tokenizer and config of ``model_name`` and store settings.

        Args:
            model_name: Identifier passed to the ``from_pretrained`` loaders.
            model_dir: Directory the fine-tuned model is written to / read from.
            target: Name of the dataframe column to regress on.
            hidden_dropout_prob: Override for the config's hidden dropout.
            attention_probs_dropout_prob: Override for the attention dropout.
            max_length: Truncation length (in tokens) used when tokenizing.
        """
        # Column bookkeeping: the summary text is the only model input.
        self.text_cols = ["text"]
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # One regression output, with the dropout rates chosen by the caller.
        config_overrides = {
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels" : 1,
            "problem_type": "regression"
        }
        self.model_config = AutoConfig.from_pretrained(model_name)
        self.model_config.update(config_overrides)

        # Re-seed so every regressor starts from the same RNG state.
        seed_everything(seed=42)

        # Pads each batch dynamically; tokenization itself does not pad.
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)

    def tokenize_function(self, examples: pd.DataFrame):
        """Tokenize one example and attach its target value as ``labels``.

        Called through ``Dataset.map(..., batched=False)``, so ``examples`` is
        a single row indexed by column name. The label is wrapped in a
        one-element list.
        """
        encoded = self.tokenize_function_test(examples)
        return {
            **encoded,
            "labels": [examples[self.target]]
        }

    def tokenize_function_test(self, examples: pd.DataFrame):
        """Tokenize the ``"text"`` field, truncated to ``max_length``, unpadded."""
        return self.tokenizer(
            examples["text"],
            padding=False,
            truncation=True,
            max_length=self.max_length,
        )

    def train(self,
              fold: int,
              train_df: pd.DataFrame,
              valid_df: pd.DataFrame,
              batch_size: int,
              learning_rate: float,
              weight_decay: float,
              save_steps: int,
              num_train_epochs: int,
              ) -> None:
        """Fine-tune on ``train_df``, evaluating on ``valid_df`` every ``save_steps``.

        The checkpoint with the lowest validation ``rmse`` is reloaded at the
        end of training and saved, together with the tokenizer, to
        ``self.model_dir``. Intermediate checkpoints go to
        ``<model_dir>/<fold>``.
        """
        # Keep only the text and the target column.
        used_columns = self.text_cols + self.target_cols
        train_df = train_df[used_columns]
        valid_df = valid_df[used_columns]

        model_content = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, config=self.model_config)

        train_tokenized_datasets = Dataset.from_pandas(
            train_df, preserve_index=False
        ).map(self.tokenize_function, batched=False)
        val_tokenized_datasets = Dataset.from_pandas(
            valid_df, preserve_index=False
        ).map(self.tokenize_function, batched=False)

        checkpoint_dir = os.path.join(self.model_dir, str(fold))

        training_args = TrainingArguments(
            output_dir=checkpoint_dir,
            load_best_model_at_end=True,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',             # no external experiment logging
            greater_is_better=False,      # lower rmse is better
            save_strategy="steps",        # checkpoint every `save_steps` steps
            evaluation_strategy="steps",  # evaluate every `eval_steps` steps
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model='rmse',
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )
        trainer.train()

        # Persist the model and tokenizer where `predict` expects them.
        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)

    def predict(self,
                test_df: pd.DataFrame,
                fold: int):
        """Return the predictions of the model in ``self.model_dir`` for ``test_df``.

        Only the text column is used. The return value is the first element of
        the ``Trainer.predict`` output.
        """
        text_only = test_df[self.text_cols]
        test_tokenized_dataset = Dataset.from_pandas(
            text_only, preserve_index=False
        ).map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(
            str(self.model_dir)
        )
        model_content.eval()

        inference_args = TrainingArguments(
            output_dir=os.path.join(self.model_dir, str(fold)),
            do_train=False,
            do_predict=True,
            per_device_eval_batch_size=4,
            dataloader_drop_last=False,
        )

        # The Trainer is used purely as an inference loop here.
        infer_content = Trainer(
            model=model_content,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            args=inference_args
        )

        return infer_content.predict(test_tokenized_dataset)[0]

In [73]:
def _model_root(model_name: str, target: str, save_each_model: bool) -> str:
    """Return the local directory under which all fold models of a run live."""
    # A hub id such as "org/name" must not be used as a local path: a directory
    # with that name would shadow the hub id in later `from_pretrained` calls.
    safe_model_name = model_name.replace('/', '_')
    if save_each_model:
        return f"{target}/{safe_model_name}"
    return safe_model_name


def _fold_model_dir(model_name: str, target: str, save_each_model: bool, fold: int) -> str:
    """Return the directory holding the fine-tuned model of a single fold."""
    return f"{_model_root(model_name, target, save_each_model)}/fold_{fold}"


def train_by_fold(
    train_df: pd.DataFrame,
    model_name: str,
    target: str,
    save_each_model: bool,
    n_splits: int,
    batch_size: int,
    learning_rate: float,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    weight_decay: float,
    num_train_epochs: int,
    save_steps: int,
    max_length: int):
    """Train one regressor per fold and store each in its own directory.

    Args:
        train_df: Training frame with a ``"fold"`` column, the text and ``target``.
        model_name: Pretrained model identifier.
        target: Column to regress on.
        save_each_model: If true, models are stored under a per-target
            directory; otherwise both targets share (and overwrite) one.
        n_splits: Number of folds to train.
        batch_size, learning_rate, weight_decay, num_train_epochs, save_steps:
            Forwarded to ``ContentScoreRegressor.train``.
        hidden_dropout_prob, attention_probs_dropout_prob, max_length:
            Forwarded to the ``ContentScoreRegressor`` constructor.
    """
    # Delete old model files. The same root is removed and recreated, so a
    # second call (e.g. for the next target) does not fail on an existing
    # directory, and the pretrained `model_name` path itself is never removed.
    model_root = _model_root(model_name, target, save_each_model)
    if os.path.exists(model_root):
        shutil.rmtree(model_root)
    os.makedirs(model_root)

    for fold in range(n_splits):
        print("fold {} :".format(fold))

        train_data = train_df[train_df["fold"] != fold]
        valid_data = train_df[train_df["fold"] == fold]

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=_fold_model_dir(model_name, target, save_each_model, fold),
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length
        )

        csr.train(
            fold=fold,
            train_df=train_data,
            valid_df=valid_data,
            batch_size=batch_size,
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            num_train_epochs=num_train_epochs,
            save_steps=save_steps,
        )


def validate(
    train_df: pd.DataFrame,
    target: str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length: int,
    n_splits: int = None) -> pd.DataFrame:
    """Write out-of-fold predictions into ``train_df["<target>_pred"]``.

    Each fold's rows are predicted by the model stored for that fold.
    ``train_df`` is modified in place and also returned.

    Args:
        n_splits: Number of folds; defaults to ``CFG.n_splits`` when omitted.
    """
    if n_splits is None:
        n_splits = CFG.n_splits

    for fold in range(n_splits):
        print(f"fold {fold} :")

        valid_data = train_df[train_df["fold"] == fold]

        # The original referenced an undefined `model_dir` here (NameError).
        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=_fold_model_dir(model_name, target, save_each_model, fold),
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        pred = csr.predict(
            test_df=valid_data,
            fold=fold
        )

        # Flatten to one value per row before assigning into a single column.
        train_df.loc[valid_data.index, f"{target}_pred"] = np.asarray(pred).reshape(-1)

    return train_df


def predict(
    test_df: pd.DataFrame,
    target:str,
    save_each_model: bool,
    model_name: str,
    hidden_dropout_prob: float,
    attention_probs_dropout_prob: float,
    max_length : int,
    n_splits: int = None
    ):
    """Predict ``target`` for ``test_df`` as the mean over all fold models.

    Adds one ``"<target>_pred_<fold>"`` column per fold plus the averaged
    ``"<target>"`` column. ``test_df`` is modified in place and returned.

    Args:
        n_splits: Number of folds; defaults to ``CFG.n_splits`` when omitted.
    """
    if n_splits is None:
        n_splits = CFG.n_splits

    fold_columns = []
    for fold in range(n_splits):
        print(f"fold {fold}:")

        csr = ContentScoreRegressor(
            model_name=model_name,
            target=target,
            model_dir=_fold_model_dir(model_name, target, save_each_model, fold),
            hidden_dropout_prob=hidden_dropout_prob,
            attention_probs_dropout_prob=attention_probs_dropout_prob,
            max_length=max_length,
        )

        pred = csr.predict(
            test_df=test_df,
            fold=fold
        )

        fold_column = f"{target}_pred_{fold}"
        test_df[fold_column] = np.asarray(pred).reshape(-1)
        fold_columns.append(fold_column)

    test_df[f"{target}"] = test_df[fold_columns].mean(axis=1)

    return test_df
    

In [74]:
# Train, validate out-of-fold and predict the test set, one target at a time.
for target in ["content", "wording"]:
    # Arguments shared by training, validation and test prediction.
    model_kwargs = dict(
        target=target,
        save_each_model=False,
        model_name=CFG.model_name,
        hidden_dropout_prob=CFG.hidden_dropout_prob,
        attention_probs_dropout_prob=CFG.attention_probs_dropout_prob,
        max_length=CFG.max_length,
    )

    train_by_fold(
        train,
        learning_rate=CFG.learning_rate,
        weight_decay=CFG.weight_decay,
        num_train_epochs=CFG.num_train_epochs,
        n_splits=CFG.n_splits,
        batch_size=CFG.batch_size,
        save_steps=CFG.save_steps,
        **model_kwargs,
    )

    train = validate(train, **model_kwargs)

    # `squared=False` is deprecated in scikit-learn 1.4 and removed in 1.6,
    # so the root is taken explicitly.
    rmse = np.sqrt(mean_squared_error(train[target], train[f"{target}_pred"]))
    print(f"cv {target} rmse: {rmse}")

    test = predict(test, **model_kwargs)

fold 0 :


  0%|          | 0/1917 [00:00<?, ?it/s]

  0%|          | 0/258 [00:00<?, ?it/s]

{'eval_loss': 0.43875548243522644, 'eval_rmse': 0.6623861789703369, 'eval_runtime': 144.995, 'eval_samples_per_second': 14.187, 'eval_steps_per_second': 1.779, 'epoch': 0.16}


  0%|          | 0/258 [00:00<?, ?it/s]

{'eval_loss': 0.21685656905174255, 'eval_rmse': 0.4656786024570465, 'eval_runtime': 143.8209, 'eval_samples_per_second': 14.303, 'eval_steps_per_second': 1.794, 'epoch': 0.31}


KeyboardInterrupt: 

In [None]:
# Overall CV score: mean of the out-of-fold content and wording RMSE.
mcrmse = compt_score(
    train["content"],
    train["content_pred"],
    train["wording"],
    train["wording_pred"],
)
print(f"cv mcrmse: {mcrmse}")

In [None]:
# Display the test frame, including the prediction columns added by `predict`.
test

In [None]:
# Display the sample submission that was read from DATA_DIR.
sample_submission

In [None]:
# Write the id and the two averaged prediction columns to submission.csv (no index column).
test[["student_id", "content", "wording"]].to_csv("submission.csv", index=False)

In [76]:
import gc

# Collect unreachable Python objects first, so the CUDA memory held by
# discarded tensors is returned to the caching allocator before it is emptied.
n_unreachable = gc.collect()

# `torch.cuda.init()` raises on machines without a usable CUDA device, so all
# CUDA calls are guarded.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.init()

# Number of unreachable objects found by the collector (shown as the cell output).
n_unreachable

7369