<a href="https://www.kaggle.com/code/hetarthchopra/bert-from-huggingface?scriptVersionId=112895882" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

This notebook is heavily inspired by https://www.kaggle.com/code/ksork6s4/uspppm-bert-for-patents-baseline-train and by Jeremy Howard's https://www.kaggle.com/code/jhoward/getting-started-with-nlp-for-absolute-beginners/notebook. I am using it purely for my own learning of Hugging Face.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in the order os.walk discovers them.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deberta-v3-large/deberta-v3-large/spm.model
/kaggle/input/deberta-v3-large/deberta-v3-large/config.json
/kaggle/input/deberta-v3-large/deberta-v3-large/README.md
/kaggle/input/deberta-v3-large/deberta-v3-large/tokenizer_config.json
/kaggle/input/deberta-v3-large/deberta-v3-large/pytorch_model.bin
/kaggle/input/cpc-codes/titles.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/sample_submission.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/train.csv
/kaggle/input/us-patent-phrase-to-phrase-matching/test.csv


In [2]:
import os
import pandas as pd
import numpy as np 
from sklearn.model_selection import KFold, StratifiedKFold
import shutil

from torch.utils.data import DataLoader, Dataset
import datasets, transformers
from transformers import TrainingArguments, Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [3]:
class CFG:
    """Notebook-wide settings: file locations and fine-tuning hyperparameters."""

    # --- locations -------------------------------------------------------
    # Pretrained checkpoint directory (tokenizer and model are loaded from it).
    model_path = '../input/deberta-v3-large/deberta-v3-large/'
    # Directory holding the competition's train.csv / test.csv.
    input_path = '../input/us-patent-phrase-to-phrase-matching/'

    # --- optimisation ----------------------------------------------------
    batch_size = 8
    epochs = 5
    learning_rate = 2e-5
    weight_decay = 0.01

    # --- cross-validation ------------------------------------------------
    num_fold = 2


# Environment switch intended to turn off the Weights & Biases logging integration.
os.environ["WANDB_DISABLED"] = "true"

## PreProcessing

In [4]:
# CPC code lookup table; its `code` column is matched against each row's `context`.
titles = pd.read_csv('../input/cpc-codes/titles.csv')
merge_keys = dict(left_on='context', right_on='code')

# Read each split and attach the lookup columns (title, section, ...) to every row.
train_df = pd.read_csv(CFG.input_path + "train.csv").merge(titles, **merge_keys)
test_df = pd.read_csv(CFG.input_path + "test.csv").merge(titles, **merge_keys)

In [5]:
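# Optional debug switch: uncomment to keep only the first 100 training rows for a quick end-to-end run.
# train_df = train_df.head(100)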

In [6]:
def create_folds(data, num_splits):
    """Assign a stratified cross-validation fold id to every row.

    The continuous ``score`` column is cut into 5 equal-width bins, and those
    bin ids (not the raw scores) are used as the stratification target.

    Args:
        data: DataFrame with a numeric ``score`` column. Any index is accepted
            and the frame passed in is left untouched.
        num_splits: Number of folds to create.

    Returns:
        A copy of ``data`` with an integer ``fold`` column; each row's value is
        the index of the fold in which that row belongs to the validation part.
    """
    # Work on a copy so the caller's frame does not silently gain helper columns.
    data = data.copy()

    # Bin the targets so that a class-based stratifier can be applied to them.
    bins = pd.cut(data["score"], bins=5, labels=False).to_numpy()

    kf = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=42)

    # `split` yields *positional* row indices, so the fold ids are collected in
    # a plain array instead of being written through label-based `.loc`, which
    # is only correct when the index happens to be 0..n-1.
    fold_ids = np.full(len(data), -1, dtype=np.int64)
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins)):
        fold_ids[valid_idx] = fold

    data["fold"] = fold_ids
    return data

In [7]:
# First text segment fed to the model: the CPC title of the row's context,
# a '[SEP]' marker, then the anchor phrase.
train_df['input'] = train_df['title'] + '[SEP]' + train_df['anchor']
train_df = create_folds(train_df, CFG.num_fold)
# The test inputs must use the test rows' own anchors; taking them from
# train_df would pair each test title with an unrelated, index-aligned
# training anchor (or NaN where the indices do not overlap).
test_df['input'] = test_df['title'] + '[SEP]' + test_df['anchor']



In [8]:
train_df
# How the columns map onto what the model is given:
#   label      = score
#   sentence 1 = input  (context title + '[SEP]' + anchor)
#   sentence 2 = target

Unnamed: 0,id,anchor,target,context,score,code,title,section,class,subclass,group,main_group,input,fold
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,1
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,1
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,1
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,a28320e15e1aa1de,cervical support,comfort support,A47,0.50,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,1
96,999d1bb85a8c63c7,cervical support,comfort to comfort,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,1
97,c3f9606db5901c42,cervical support,comfort when comfort,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,1
98,8a1215a697f793f6,cervical support,contouring,A47,0.25,A47,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,A,47.0,,,,FURNITURE; DOMESTIC ARTICLES OR APPLIANCES; CO...,0


## Tokenizer

In [9]:
# Load the tokenizer stored with the pretrained checkpoint at CFG.model_path.
# The dataset classes defined later in the notebook call this module-level
# `tokenizer` for every item they return.
tokenizer = AutoTokenizer.from_pretrained(CFG.model_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Dataset

In [10]:
class TrainDataset(Dataset):
    """Dataset of tokenized (input, target) text pairs with a float32 label.

    Reads the ``input``, ``target`` and ``score`` columns of the frame it is
    given and uses the module-level ``tokenizer`` whenever an item is fetched.
    """

    def __init__(self, df):
        # Both text columns are stored as string arrays; scores are kept as-is.
        self.inputs, self.targets = (
            df[column].values.astype(str) for column in ('input', 'target')
        )
        self.label = df['score'].values

    def __len__(self):
        # One example per row of the source frame.
        return len(self.inputs)

    def __getitem__(self, item):
        # Look up the text pair and score at position `item`.
        text, pair, score = self.inputs[item], self.targets[item], self.label[item]
        # Encode the pair, then attach the score as a float32 'label' entry.
        sample = dict(tokenizer(text, pair))
        sample['label'] = score.astype(np.float32)
        return sample

In [11]:
class InferDataset(Dataset):
    """Label-free dataset of tokenized (input, target) text pairs for prediction.

    Reads the ``input`` and ``target`` columns of the frame it is given and uses
    the module-level ``tokenizer`` whenever an item is fetched.
    """

    def __init__(self, df):
        # Both text columns are stored as string arrays.
        self.inputs, self.targets = (
            df[column].values.astype(str) for column in ('input', 'target')
        )

    def __len__(self):
        # One example per row of the source frame.
        return len(self.inputs)

    def __getitem__(self, item):
        # Encode the text pair at position `item` and hand it back as a plain dict.
        text, pair = self.inputs[item], self.targets[item]
        return dict(tokenizer(text, pair))
        

## Training 

In [12]:
def compute_metrics(eval_pred):
    """Return the Pearson correlation between predictions and labels.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` is reshaped
            to one value per example before the correlation is computed.

    Returns:
        ``{'pearson': r}`` where ``r`` is the off-diagonal entry of the 2x2
        correlation matrix of predictions and labels.
    """
    raw_preds, labels = eval_pred
    flat_preds = raw_preds.reshape(len(raw_preds))
    correlation = np.corrcoef(flat_preds, labels)
    # This 'pearson' key is the metric name referenced when selecting the best model.
    return {'pearson': correlation[0][1]}

In [13]:
# Out-of-fold (validation) predictions are accumulated in this frame.
oof_df = pd.DataFrame()
# Only a single fold is trained here; re-enable the loop (and indent the rest
# of the cell) to train every fold in range(CFG.num_fold).
# for fold in range(CFG.num_fold):
fold=1
# Rows tagged with `fold` are held out for validation; all other rows train the model.
training_data = train_df[train_df['fold']!=fold].reset_index(drop=True)
validation_data = train_df[train_df['fold']==fold].reset_index(drop=True)

tr_data = TrainDataset(training_data)
va_data = TrainDataset(validation_data)

# Training configuration: evaluate and checkpoint once per epoch so that the
# epoch with the best `pearson` value (the key returned by compute_metrics)
# can be reloaded when training finishes.
args = TrainingArguments(
    output_dir="/tmp/uspppm",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate = CFG.learning_rate,
    per_device_train_batch_size=CFG.batch_size,
    per_device_eval_batch_size=CFG.batch_size,
    num_train_epochs=CFG.epochs,
    warmup_ratio=0.1,
    weight_decay=CFG.weight_decay,
    metric_for_best_model="pearson",
    load_best_model_at_end=True,
    fp16=True,
    logging_steps=10,
    # Disable external logging integrations explicitly; transformers warns that
    # doing this through the W&B environment switch is deprecated.
    report_to="none",
)

# A single output unit turns the sequence-classification head into a regressor.
model = AutoModelForSequenceClassification.from_pretrained(CFG.model_path,num_labels=1)

trainer=Trainer(
    model,
    args,
    train_dataset = tr_data,
    eval_dataset = va_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics)

trainer.train()
# The per-epoch checkpoints are no longer needed: the model to keep is already
# in memory and is written to the working directory right after.
shutil.rmtree('/tmp/uspppm')
trainer.save_model(f'uspppm_{fold}')

# Predict on the held-out rows and store the flattened scores next to them.
outputs = trainer.predict(va_data)
predictions = outputs.predictions.reshape(-1)
validation_data['preds'] = predictions
oof_df = pd.concat([oof_df, validation_data])

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Some weights of the model checkpoint at ../input/deberta-v3-large/deberta-v3-large/ were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifer.weight', 'mask_predictions.classifer.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.063185,0.000671
2,0.100800,0.063137,0.114615


***** Running Evaluation *****
  Num examples = 50
  Batch size = 8
Saving model checkpoint to /tmp/uspppm/checkpoint-7
Configuration saved in /tmp/uspppm/checkpoint-7/config.json
Model weights saved in /tmp/uspppm/checkpoint-7/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-7/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-7/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 50
  Batch size = 8
Saving model checkpoint to /tmp/uspppm/checkpoint-14
Configuration saved in /tmp/uspppm/checkpoint-14/config.json
Model weights saved in /tmp/uspppm/checkpoint-14/pytorch_model.bin
tokenizer config file saved in /tmp/uspppm/checkpoint-14/tokenizer_config.json
Special tokens file saved in /tmp/uspppm/checkpoint-14/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /tmp/uspppm/checkpoint-14 (score: 0.11461533418901215).
Saving model checkpoint t

In [14]:
# Score the collected out-of-fold predictions against the true scores with
# the same metric function that was used during training.
predictions, label = (oof_df[column].values for column in ('preds', 'score'))
eval_pred = (predictions, label)
print(compute_metrics(eval_pred))

{'pearson': 0.11461533418901215}


In [16]:
# Show what is in the current working directory (the saved model folder should be listed).
os.listdir('.')

['.virtual_documents', 'uspppm_1', '__notebook_source__.ipynb']

In [17]:
# One array of test predictions per fold is collected here and averaged below.
predictions = []

# Only the model saved for fold 1 is used; the loop over all folds is left disabled.
# for fold in range(CFG.num_fold):
fold=1
te_dataset = InferDataset(test_df)
# Reload the fine-tuned weights written to the `uspppm_{fold}` directory.
model = AutoModelForSequenceClassification.from_pretrained(f'uspppm_{fold}', num_labels=1)
# No TrainingArguments are passed here; this Trainer is only used to call predict().
trainer = Trainer(model,tokenizer=tokenizer)

outputs = trainer.predict(te_dataset)
# Flatten the raw model output into a 1-D array of scores.
prediction = outputs.predictions.reshape(-1)
predictions.append(prediction)

# Element-wise mean over the collected folds (with one fold this just unwraps the list).
predictions = np.mean(predictions, axis=0)
# Pair every test id with its predicted score; converted to pandas in a later cell.
submission = datasets.Dataset.from_dict({
    'id': test_df['id'],
    'score': predictions,
})

loading configuration file uspppm_1/config.json
Model config DebertaV2Config {
  "_name_or_path": "uspppm_1",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vo

loading configuration file uspppm_1/config.json
Model config DebertaV2Config {
  "_name_or_path": "uspppm_1",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.20.1",
  "type_vocab_size": 0,
  "vo

In [18]:
# Switch from the Hugging Face dataset to a pandas DataFrame and store the
# score column as Python-float (float64) values.
submission = submission.to_pandas()
submission['score'] = submission['score'].astype(float)

In [19]:
def round_off(entry):
    """Snap ``entry`` to the closest of the allowed score levels.

    The candidates are checked in the order 0.0, 0.5, 0.25, 0.75, 1 and a later
    candidate only wins when it is strictly closer, so an exact tie resolves to
    the candidate listed first.

    Args:
        entry: Numeric value to snap.

    Returns:
        The chosen level, returned exactly as listed (``1`` is an int, the
        others are floats).
    """
    levels = (0.0, 0.5, 0.25, 0.75, 1)
    closest = levels[0]
    smallest_gap = abs(closest - entry)
    for level in levels[1:]:
        gap = abs(level - entry)
        if gap < smallest_gap:
            closest, smallest_gap = level, gap
    return closest

In [20]:
# Snap every predicted score to its nearest allowed level.
submission['score'] = submission['score'].apply(round_off)

In [21]:
# Write the submission file to the working directory, without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

In [22]:
# Display the final submission frame as a visual sanity check.
submission

Unnamed: 0,id,score
0,4112d61851461f60,0.5
1,5203a36c501f1b7c,0.5
2,7aa5908a77a7ec24,0.5
3,09e418c93a776564,0.5
4,36baf228038e314b,0.5
5,b892011ab2e2cabc,0.5
6,1f37ead645e7f0c8,0.5
7,71a5b6ad068d531f,0.5
8,16ae4b99d3601e60,0.5
9,474c874d0c07bd21,0.5


In [None]:
# test_df = test_df.reset_index(drop=True)
# te_data = InferDataset(test_df)
# outputs = trainer.predict(te_data)

# predictions=outputs.predictions.reshape(-1)
# submission = datasets.Dataset.from_dict({
#     'id': test_df['id'],
#     'score': predictions,
# })

In [None]:
# submission.to_pandas()
# submission.to_csv('submission.csv' , index=False)