In [1]:
from kaggle_secrets import UserSecretsClient
import wandb
user_secrets = UserSecretsClient()
my_secret = user_secrets.get_secret("wandb_api_key") 
wandb.login(key=my_secret)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold
from transformers import AutoTokenizer,TrainingArguments, Trainer, AutoModelForSequenceClassification,DataCollatorWithPadding, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
import numpy as np
import shutil
import torch.nn as nn
import json
from torch.optim import AdamW

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
config = {
    "name":"DeBERTa-LLRD",
    "random_state":123,
    "n_folds":5,
    "model_path":"microsoft/deberta-v3-large",
    "lr":2e-5,
    "lr_decay": 0.97,
    "weight_decay":0.01,
    "n_epochs":5,
    "batch_size":18,
    "maxlen": 100,
    "strategy": "GROUPKFOLD",
    "loss_func":"MSELoss",
    
}
with open("config.json", "w") as outfile:
    json.dump(config, outfile)

In [4]:
tokenizer = AutoTokenizer.from_pretrained(config["model_path"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
train_df = pd.read_csv("/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv")
titles_df = pd.read_csv("/kaggle/input/cpc-codes/titles.csv")
train_df = train_df.merge(titles_df, left_on='context', right_on='code')
train_df['input'] = train_df["anchor"] + "[SEP]" + train_df["target"] + "[SEP]" + train_df['title']
train_df.head()

Unnamed: 0,id,anchor,target,context,score,code,title,section,class,subclass,group,main_group,input
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,abatement[SEP]abatement of pollution[SEP]FURNI...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,abatement[SEP]act of abating[SEP]FURNITURE; DO...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,abatement[SEP]active catalyst[SEP]FURNITURE; D...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,abatement[SEP]eliminating process[SEP]FURNITUR...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,abatement[SEP]forest region[SEP]FURNITURE; DOM...


In [6]:
train_df["len"] = train_df["input"].apply(lambda x: len(tokenizer(x)["input_ids"]))
train_df[["len"]].describe()

Unnamed: 0,len
count,36473.0
mean,21.188167
std,10.58025
min,7.0
25%,13.0
50%,18.0
75%,26.0
max,92.0


In [7]:
def create_folds(df, n_folds, strategy):
    df["fold"] = -1
    # just use for spliting the data
    df["bins"] = pd.cut(
        df["score"], bins=5, labels=False
    )
    if strategy == "KFOLD":
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config["random_state"])
        for i, (train , val) in enumerate(kf.split(X=df, y=df["bins"].values)):
            df.loc[val, 'fold'] = i
            
    elif strategy == "GROUPKFOLD":
        gkf = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=config["random_state"])
        for i, (train , val) in enumerate(gkf.split(X=df, y=df["bins"].values, groups=df["anchor"].values)):
            df.loc[val, 'fold'] = i
            
    df = df.drop("bins", axis=1)
    return df

train_df = create_folds(train_df, config["n_folds"], config["strategy"])

In [8]:
class TrainDataset(Dataset):
    def __init__(self, df):
        self.inputs = df['input'].values.astype(str)
        self.targets = df['target'].values.astype(str)
        self.label = df['score'].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, item):
        inputs = self.inputs[item]
        targets = self.targets[item]
        label = self.label[item]
        
        return {
        **tokenizer( 
            inputs,
            max_length=config["maxlen"],
            padding=False,
            truncation=True
        ),
        'label':label.astype(np.float32)
    }

In [9]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.reshape(len(predictions))
    return {
        'pearson': np.corrcoef(predictions, labels)[0][1]
    }


def create_optimizer(model, config):
    no_decay = ["bias", "LayerNorm.weight"]
    lr = config["lr"]
    # Linear layers
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if "classifier" in n],
            "weight_decay": 0.0,
            "lr": lr
        }
    ]
    # Other layers
    layers = [model.deberta.embeddings] + list(model.deberta.encoder.layer)
    layers.reverse()
    for layer in layers:
        optimizer_grouped_parameters += [
            {
                "params": [p for n, p in layer.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": config["weight_decay"],
                "lr": lr
            },
            {
                "params": [p for n, p in layer.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
                "lr": lr
            }
        ]
        lr *= config["lr_decay"]
        
    optimizer = AdamW(optimizer_grouped_parameters, lr=config["lr"])
#     scheduler = get_linear_schedule_with_warmup(
#         optimizer,
#         num_warmup_steps=self.hparams.warmup_steps,
#         num_training_steps=self.trainer.estimated_stepping_batches,
#     )
    return optimizer

# def create_optimizer(model):
#     no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
#     optimizer_parameters = [
#         {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
#         'lr': config["encoder_lr"], 'weight_decay': config["weight_decay"]},
#         {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
#         'lr': config["encoder_lr"], 'weight_decay': 0.0},
#         {'params': [p for n, p in model.named_parameters() if "model" not in n],
#         'lr': config["decoder_lr"], 'weight_decay': 0.0}
#     ]
#     scheduler = get_cosine_schedule_with_warmup(
#             optimizer,
#             num_training_steps=Config.epochs * len(train_dl),
#             num_warmup_steps=50)
#     return optimizer_parameters

# class CorrLoss(nn.Module):
#     """
#     use 1 - correlational coefficience between the output of the network and the target as the loss
#     input (o, t):
#         o: Variable of size (batch_size, 1) output of the network
#         t: Variable of size (batch_size, 1) target value
#     output (corr):
#         corr: Variable of size (1)
#     """
#     def __init__(self):
#         super(CorrLoss, self).__init__()

#     def forward(self, o, t):
#         # calcu z-score for o and t
#         o_m = o.mean(dim = 0)
#         o_s = o.std(dim = 0)
#         o_z = (o - o_m)/o_s

#         t_m = t.mean(dim =0)
#         t_s = t.std(dim = 0)
#         t_z = (t - t_m)/t_s

#         # calcu corr between o and t
#         tmp = o_z * t_z
#         corr = tmp.mean(dim = 0).squeeze(0)
#         return  1 - corr

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.get("labels")
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         # compute custom loss
#         if config["loss_func"] == "MSELoss":
#             loss_fct = nn.MSELoss()
#         elif config["loss_func"] == "CorrLoss":
#             loss_fct = CorrLoss()
        
#         loss = loss_fct(logits, labels.unsqueeze(-1))
#         return (loss, outputs) if return_outputs else loss
        

In [10]:
oof_df = pd.DataFrame()
for fold in range(config["n_folds"]):
    wandb.init(project="USPPPM", name=f"{config['name']}_fold{fold}")
    wandb.log(config)
    
    tr_data = train_df[train_df['fold']!=fold].reset_index(drop=True)
    va_data = train_df[train_df['fold']==fold].reset_index(drop=True)
    tr_dataset = TrainDataset(tr_data)
    va_dataset = TrainDataset(va_data)
    
    args = TrainingArguments(
        output_dir=f"/tmp/uspppm",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=config["batch_size"],
        per_device_eval_batch_size=config["batch_size"],
        num_train_epochs=config["n_epochs"],
        metric_for_best_model="pearson",
        load_best_model_at_end=True,
        learning_rate=config["lr"],
        weight_decay=config["weight_decay"],
    )
    
    model = AutoModelForSequenceClassification.from_pretrained(config["model_path"], num_labels=1)
    optimizer = create_optimizer(model, config)
    trainer = Trainer(
        model,
        args,
        train_dataset=tr_dataset,
        eval_dataset=va_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        data_collator=data_collator,
        optimizers=(optimizer,None)
    )
    
    trainer.train()
    shutil.rmtree(f"/tmp/uspppm")
    trainer.save_model(f"uspppm_{fold}")
    
    outputs = trainer.predict(va_dataset)
    predictions = outputs.predictions.reshape(-1)
    va_data['preds'] = predictions
    oof_df = pd.concat([oof_df, va_data])
    
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mnorrawee[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.15.8 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.15.5
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20230816_120247-2t1och1i[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mDeBERTa-LLRD_fold0[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/norrawee/USPPPM[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/norrawee/USPPPM/runs/2t1och1i[0m


Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

Epoch,Training Loss,Validation Loss,Pearson
1,0.0238,0.024867,0.813518
2,0.0165,0.023575,0.81611
3,0.011,0.02158,0.824111
4,0.0087,0.022879,0.820755
5,0.0063,0.022547,0.820092


[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                     batch_size ▁
[34m[1mwandb[0m:                      eval/loss █▅▁▄▃
[34m[1mwandb[0m:                   eval/pearson ▁▃█▆▅
[34m[1mwandb[0m:                   eval/runtime ▅▂█▃▁
[34m[1mwandb[0m:        eval/samples_per_second ▄▇▁▆█
[34m[1mwandb[0m:          eval/steps_per_second ▄▇▁▆█
[34m[1mwandb[0m:                             lr ▁
[34m[1mwandb[0m:                       lr_decay ▁
[34m[1mwandb[0m:                         maxlen ▁
[34m[1mwandb[0m:                       n_epochs ▁
[34m[1mwandb[0m:                        n_folds ▁
[34m[1mwandb[0m:                   random_state ▁
[34m[1mwandb[0m:                    train/epoch ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇████
[34m[1mwandb[0m:              train/global_step ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇████
[34m[1mwandb[0m:            train/learning_rate ██▇▇▆▆▅▅▄▄▃▃▂▂

Epoch,Training Loss,Validation Loss,Pearson
1,0.0248,0.025415,0.821652
2,0.0171,0.023477,0.823877
3,0.012,0.022589,0.827923
4,0.0085,0.023366,0.828014
5,0.0067,0.02295,0.827412


[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                     batch_size ▁
[34m[1mwandb[0m:                      eval/loss █▃▁▃▂
[34m[1mwandb[0m:                   eval/pearson ▁▃██▇
[34m[1mwandb[0m:                   eval/runtime ▃▁█▁▇
[34m[1mwandb[0m:        eval/samples_per_second ▆█▁█▂
[34m[1mwandb[0m:          eval/steps_per_second ▆█▁█▂
[34m[1mwandb[0m:                             lr ▁
[34m[1mwandb[0m:                       lr_decay ▁
[34m[1mwandb[0m:                         maxlen ▁
[34m[1mwandb[0m:                       n_epochs ▁
[34m[1mwandb[0m:                        n_folds ▁
[34m[1mwandb[0m:                   random_state ▁
[34m[1mwandb[0m:                    train/epoch ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
[34m[1mwandb[0m:              train/global_step ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
[34m[1mwandb[0m:            train/learning_rate ██▇▇▆▆▅▅▄▄▃▃▂▂

Epoch,Training Loss,Validation Loss,Pearson
1,0.0254,0.021464,0.831439
2,0.0171,0.022316,0.832357
3,0.0117,0.020734,0.841406
4,0.0088,0.021134,0.839186
5,0.0069,0.021884,0.838485


[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                     batch_size ▁
[34m[1mwandb[0m:                      eval/loss ▄█▁▃▆
[34m[1mwandb[0m:                   eval/pearson ▁▂█▆▆
[34m[1mwandb[0m:                   eval/runtime ▃▆▂▁█
[34m[1mwandb[0m:        eval/samples_per_second ▆▃▇█▁
[34m[1mwandb[0m:          eval/steps_per_second ▆▃▇█▁
[34m[1mwandb[0m:                             lr ▁
[34m[1mwandb[0m:                       lr_decay ▁
[34m[1mwandb[0m:                         maxlen ▁
[34m[1mwandb[0m:                       n_epochs ▁
[34m[1mwandb[0m:                        n_folds ▁
[34m[1mwandb[0m:                   random_state ▁
[34m[1mwandb[0m:                    train/epoch ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇████
[34m[1mwandb[0m:              train/global_step ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇████
[34m[1mwandb[0m:            train/learning_rate ██▇▇▆▆▅▅▄▄▃▃▂▂

Epoch,Training Loss,Validation Loss,Pearson
1,0.0244,0.02541,0.805036
2,0.0166,0.02365,0.817107
3,0.0114,0.023908,0.816376
4,0.0082,0.023151,0.821295
5,0.0063,0.02362,0.819938


[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                     batch_size ▁
[34m[1mwandb[0m:                      eval/loss █▃▃▁▂
[34m[1mwandb[0m:                   eval/pearson ▁▆▆█▇
[34m[1mwandb[0m:                   eval/runtime ▄▁▄█▇
[34m[1mwandb[0m:        eval/samples_per_second ▅█▅▁▂
[34m[1mwandb[0m:          eval/steps_per_second ▅█▅▁▂
[34m[1mwandb[0m:                             lr ▁
[34m[1mwandb[0m:                       lr_decay ▁
[34m[1mwandb[0m:                         maxlen ▁
[34m[1mwandb[0m:                       n_epochs ▁
[34m[1mwandb[0m:                        n_folds ▁
[34m[1mwandb[0m:                   random_state ▁
[34m[1mwandb[0m:                    train/epoch ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
[34m[1mwandb[0m:              train/global_step ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇▇███
[34m[1mwandb[0m:            train/learning_rate ██▇▇▆▆▅▅▄▄▃▃▂▂

Epoch,Training Loss,Validation Loss,Pearson
1,0.0265,0.024744,0.809842
2,0.0176,0.022269,0.82178
3,0.012,0.023721,0.825416
4,0.0089,0.022676,0.824459
5,0.0068,0.022371,0.825089


[34m[1mwandb[0m: Waiting for W&B process to finish... [32m(success).[0m
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:                     batch_size ▁
[34m[1mwandb[0m:                      eval/loss █▁▅▂▁
[34m[1mwandb[0m:                   eval/pearson ▁▆███
[34m[1mwandb[0m:                   eval/runtime ▃▁▅█▁
[34m[1mwandb[0m:        eval/samples_per_second ▆█▄▁█
[34m[1mwandb[0m:          eval/steps_per_second ▆█▄▁█
[34m[1mwandb[0m:                             lr ▁
[34m[1mwandb[0m:                       lr_decay ▁
[34m[1mwandb[0m:                         maxlen ▁
[34m[1mwandb[0m:                       n_epochs ▁
[34m[1mwandb[0m:                        n_folds ▁
[34m[1mwandb[0m:                   random_state ▁
[34m[1mwandb[0m:                    train/epoch ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇▇███
[34m[1mwandb[0m:              train/global_step ▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇▇███
[34m[1mwandb[0m:            train/learning_rate ██▇▇▆▆▅▅▄▄▃▃▂▂

In [11]:
predictions = oof_df['preds'].values
label = oof_df['score'].values
eval_pred = predictions, label
compute_metrics(eval_pred)

{'pearson': 0.8255098776481588}

In [12]:
oof_df.to_csv('oof_df.csv')