 In this challenge [U.S. Patent Phrase to Phrase Matching](https://www.kaggle.com/competitions/us-patent-phrase-to-phrase-matching/), we are tasked with comparing two words or short phrases, and scoring them based on whether they're similar or not, based on which patent class they were used in. With a score of 1 it is considered that the two inputs have identical meaning, and 0 means they have totally different meaning.

### Setup Kaggle and Download dataset

In [None]:
!pip install kaggle
!pip install accelerate -U # requires restart

In [2]:
creds = '{"username":"davbetm","key":"ABC"}'

# for working with paths in Python, I recommend using `pathlib.Path`
from pathlib import Path

cred_path = Path("~/.kaggle/kaggle.json").expanduser()
if not cred_path.exists():
    cred_path.parent.mkdir(exist_ok=True)
    cred_path.write_text(creds)
    cred_path.chmod(0o600)

In [3]:
# Download the dataset
path = Path('us-patent-phrase-to-phrase-matching')

if not path.exists():
    import zipfile, kaggle
    kaggle.api.competition_download_cli(str(path))
    zipfile.ZipFile(f"{path}.zip").extractall(path)

Downloading us-patent-phrase-to-phrase-matching.zip to /content


100%|██████████| 682k/682k [00:00<00:00, 1.49MB/s]







In [4]:
!ls $path

sample_submission.csv  test.csv  train.csv


### Pre-processing

In [5]:
import pandas as pd


df = pd.read_csv(path / "train.csv")
df

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00
...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50
36471,756ec035e694722b,wood article,wooden material,B44,0.75


In [6]:
"""
- anchor: a patent's phrase
- target: another patent's phrase
- context: patent's context
- score: The scores are in the 0-1 range with increments of 0.25
    - 1.0 - Very close match.
    - 0.75 - Close synonym
    - 0.5 - Synonyms which don’t have the same meaning (same function, same properties)
    - 0.25 - Somewhat related. It also includes antonyms.
    - 0.0 Unrelated.
"""


df.describe(include="object")

Unnamed: 0,id,anchor,target,context
count,36473,36473,36473,36473
unique,36473,733,29340,106
top,37d61fd2272659b1,component composite coating,composition,H01
freq,1,152,24,2186


In [7]:
"""We could represent the input to the model as something like

"TEXT1: abatement; TEXT2: eliminating process".

We'll need to add the context to this too.
"""

df["input"] = "TEXT1: " + df.context + "; TEXT2: " + df.target + "; ANC1: " + df.anchor
df

Unnamed: 0,id,anchor,target,context,score,input
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.50,TEXT1: A47; TEXT2: abatement of pollution; ANC...
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75,TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2,36d72442aefd8232,abatement,active catalyst,A47,0.25,TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.50,TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.00,TEXT1: A47; TEXT2: forest region; ANC1: abatement
...,...,...,...,...,...,...
36468,8e1386cbefd7f245,wood article,wooden article,B44,1.00,TEXT1: B44; TEXT2: wooden article; ANC1: wood ...
36469,42d9e032d1cd3242,wood article,wooden box,B44,0.50,TEXT1: B44; TEXT2: wooden box; ANC1: wood article
36470,208654ccb9e14fa3,wood article,wooden handle,B44,0.50,TEXT1: B44; TEXT2: wooden handle; ANC1: wood a...
36471,756ec035e694722b,wood article,wooden material,B44,0.75,TEXT1: B44; TEXT2: wooden material; ANC1: wood...


In [8]:
# Transformers always assumes that your labels has the column name labels,
# but in our dataset it's currently 'score'
df_preprocessed = df.rename(columns={"score": "labels"}, inplace=False)

### Tokenization

In [9]:
!pip install -q datasets transformers[sentencepiece]

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.6/485.6 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m108.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [10]:
from datasets import Dataset, DatasetDict

dataset = Dataset.from_pandas(df_preprocessed)
type(dataset), dataset

(datasets.arrow_dataset.Dataset,
 Dataset({
     features: ['id', 'anchor', 'target', 'context', 'labels', 'input'],
     num_rows: 36473
 }))

A deep learning model expects numbers as inputs, not English sentences! So we need to do two things:

- Tokenization: Split each text up into words (or actually, as we'll see, into tokens).

- Numericalization: Convert each word (or token) into a number.

In [11]:
model_nm = "microsoft/deberta-v3-small"

# AutoTokenizer will create a tokenizer appropriate for a given model:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokz = AutoTokenizer.from_pretrained(model_nm)

Downloading (…)okenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
# Here's an example of how the tokenizer splits a text into "tokens"

tokz.tokenize("G'day folks, I'm David from GColab")

['▁G',
 "'",
 'day',
 '▁folks',
 ',',
 '▁I',
 "'",
 'm',
 '▁David',
 '▁from',
 '▁G',
 'Cola',
 'b']

Here's an example of how the tokenizer splits a text into "tokens".

In [13]:
def tok_func(tokenizer, x):
    """Simple function which tokenizes our inputs."""
    return tokenizer(x["input"])

In [14]:
from functools import partial

fn = partial(tok_func, tokz)

# To run this quickly in parallel on every row in our dataset, use map:
tok_ds = dataset.map(fn, batched=True)

type(tok_ds)

Map:   0%|          | 0/36473 [00:00<?, ? examples/s]

datasets.arrow_dataset.Dataset

In [15]:
# This adds a new item to our dataset called input_ids. This means for Numericalization

first_row = tok_ds[0]

first_row["input"], first_row["input_ids"]

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])

In [16]:
# There's a list called vocab in the tokenizer which contains a unique integer
# for every possible token string.
tokz.vocab["▁of"]

265

### Training, validation and test sets

In [17]:
test_df = pd.read_csv(path / "test.csv")
test_df.describe()

Unnamed: 0,id,anchor,target,context
count,36,36,36,36
unique,36,34,36,29
top,4112d61851461f60,el display,inorganic photoconductor drum,G02
freq,1,2,1,3


Transformers uses a `DatasetDict` for holding your training and validation sets. To create one that contains 25% of our data for the validation set, and 75% for the training set, use `train_test_split`.

In [18]:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [19]:
test_df["input"] = (
    "TEXT1: " + test_df.context + "; TEXT2: " + test_df.target + "; ANC1: " + test_df.anchor
)
test_ds = Dataset.from_pandas(test_df).map(fn, batched=True)
test_ds

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36
})

### Metrics and correlation

When we're training a model, there will be one or more metrics that we're interested in maximising or minimising. These are the measurements that should, hopefully, represent how well our model will works for us.

On Kaggle submissions are evaluated on the Pearson correlation coefficient between the predicted and actual similarity scores. This coefficient is usually abbreviated using the single letter r.

It is the most widely used measure of the degree of relationship between two variables.

`r` can vary between -1, which means perfect inverse correlation, and +1, which means perfect positive correlation.

In [20]:
import numpy as np

def corr(x, y):
    return np.corrcoef(x, y)[0][1]

Transformers expects metrics to be returned as a dict, since that way the trainer knows what label to use, so let's create a function to do that:

In [21]:
def corr_d(eval_pred):
    return {"pearson": corr(*eval_pred)}


### Training

In [22]:
from transformers import TrainingArguments, Trainer

BATCH_SIZE = 128
EPOCHS = 4
LR = 8e-5

In [23]:
# Transformers uses the TrainingArguments class to set up arguments

args = TrainingArguments(
    "outputs",
    learning_rate=LR,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    fp16=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    report_to="none"
)

In [24]:
# We can now create our model, and Trainer, which is a class which combines the
# data and model together

model = AutoModelForSequenceClassification.from_pretrained(
    model_nm, num_labels=1
)

trainer = Trainer(
    model,
    args,
    train_dataset=dds["train"],
    eval_dataset=dds["test"], # actually it's the validation set
    tokenizer=tokz,
    compute_metrics=corr_d
)

Downloading pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

In [25]:
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.027999,0.792947
2,No log,0.025091,0.819949
3,0.031600,0.022455,0.831139
4,0.031600,0.022421,0.832225


TrainOutput(global_step=856, training_loss=0.02401382956549386, metrics={'train_runtime': 215.8714, 'train_samples_per_second': 506.857, 'train_steps_per_second': 3.965, 'total_flos': 716605488222960.0, 'train_loss': 0.02401382956549386, 'epoch': 4.0})

In [26]:
preds = trainer.predict(test_ds).predictions.astype(float)
preds

array([[ 4.90722656e-01],
       [ 7.00195312e-01],
       [ 6.18652344e-01],
       [ 3.49121094e-01],
       [-2.42614746e-02],
       [ 5.18554688e-01],
       [ 5.24414062e-01],
       [-3.68041992e-02],
       [ 2.55615234e-01],
       [ 1.12402344e+00],
       [ 2.22045898e-01],
       [ 2.86865234e-01],
       [ 7.62695312e-01],
       [ 8.70117188e-01],
       [ 7.41210938e-01],
       [ 4.87060547e-01],
       [ 2.71484375e-01],
       [ 8.11576843e-04],
       [ 6.22558594e-01],
       [ 3.05419922e-01],
       [ 3.77441406e-01],
       [ 2.24609375e-01],
       [ 9.33837891e-02],
       [ 2.52441406e-01],
       [ 5.95214844e-01],
       [-2.02178955e-02],
       [-2.40631104e-02],
       [-1.69372559e-02],
       [-3.12500000e-02],
       [ 7.13378906e-01],
       [ 3.73535156e-01],
       [ 8.59375000e-02],
       [ 7.05078125e-01],
       [ 4.82666016e-01],
       [ 4.63378906e-01],
       [ 1.93237305e-01]])

Let's fix those out-of-bounds predictions:

In [31]:
preds = np.clip(preds, 0, 1)
preds

array([[4.90722656e-01],
       [7.00195312e-01],
       [6.18652344e-01],
       [3.49121094e-01],
       [0.00000000e+00],
       [5.18554688e-01],
       [5.24414062e-01],
       [0.00000000e+00],
       [2.55615234e-01],
       [1.00000000e+00],
       [2.22045898e-01],
       [2.86865234e-01],
       [7.62695312e-01],
       [8.70117188e-01],
       [7.41210938e-01],
       [4.87060547e-01],
       [2.71484375e-01],
       [8.11576843e-04],
       [6.22558594e-01],
       [3.05419922e-01],
       [3.77441406e-01],
       [2.24609375e-01],
       [9.33837891e-02],
       [2.52441406e-01],
       [5.95214844e-01],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [0.00000000e+00],
       [7.13378906e-01],
       [3.73535156e-01],
       [8.59375000e-02],
       [7.05078125e-01],
       [4.82666016e-01],
       [4.63378906e-01],
       [1.93237305e-01]])

### Submission

In [32]:
import datasets

submission = datasets.Dataset.from_dict({
    "id": test_ds["id"],
    "score": preds
})

submission_filename = "submission.csv"

submission.to_csv(submission_filename, index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1030

In [41]:
!pip install -U -q kaggle

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone


In [42]:
!kaggle competitions submit -c us-patent-phrase-to-phrase-matching -f submission.csv -m 'transformers'


100% 1.01k/1.01k [00:00<00:00, 1.54kB/s]
400 - Bad Request
