# "NLP Transformers model"

> "Basic NLP model using huggingface's Transformers"

- toc: true
- branch: master
- comments : False
- author : Eric Vincent
- categories : [fastpages, jupyter]

# This notebook was run on a Paperspace GPU instance.

In [1]:
#hide
import os
# Truthy only when the KAGGLE_KERNEL_RUN_TYPE environment variable is set to a
# non-empty value (presumably by Kaggle's own kernels -- verify); otherwise the
# empty-string default makes this falsy.
iskaggle = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')

In [2]:
#hide
! pip install kaggle


Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.0/59.0 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73031 sha256=ed5e01cf7bae3af90f8673ad3f99441f804ac6b06118afa8a1a140d171040260
  Stored in directory: /root/.cache/pip/wheels/ac/b2/c3/fa4706d469b5879105991d1c8be9a3c2ef329ba9fe2ce5085e
Successfully built kaggle
Installing collected packages: kaggle
Successfully installed kaggle-1.5.12
[0m

In [3]:
#hide
# SECURITY: never commit a Kaggle API key to a notebook. The credentials are
# read from the environment instead; any key that was previously published in
# this cell must be treated as compromised and revoked from the Kaggle account.
import json
import os
import warnings


def kaggle_creds_json(env=None):
    """Build the text of a ``kaggle.json`` file from environment variables.

    Args:
        env: Mapping to read ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` from.
            Defaults to ``os.environ``.

    Returns:
        A compact JSON string with the keys ``username`` and ``key``. Missing
        variables yield empty strings (and a warning) rather than an exception,
        so the notebook still runs where a credential file already exists.
    """
    env = os.environ if env is None else env
    username = env.get('KAGGLE_USERNAME', '')
    key = env.get('KAGGLE_KEY', '')
    if not (username and key):
        warnings.warn(
            'KAGGLE_USERNAME / KAGGLE_KEY are not set; '
            'the generated Kaggle credentials are empty.'
        )
    return json.dumps({'username': username, 'key': key}, separators=(',', ':'))


creds = kaggle_creds_json()

In [4]:
# for working with paths in Python, I recommend using `pathlib.Path`
from pathlib import Path

# Write the Kaggle API credentials once; an existing file is never overwritten.
cred_path = Path('~/.kaggle/kaggle.json').expanduser()
if not cred_path.exists():
    # parents=True so a missing intermediate directory does not abort the cell.
    cred_path.parent.mkdir(parents=True, exist_ok=True)
    # Create the file owner-only *before* the secret is written, so it is never
    # readable by other users, not even briefly.
    cred_path.touch(mode=0o600)
    cred_path.write_text(creds)
    # Enforce the final mode explicitly regardless of how the file was created.
    cred_path.chmod(0o600)

In [5]:
# Relative directory for the competition data; its name is also passed to the
# Kaggle download call as the competition identifier.
path = Path('us-patent-phrase-to-phrase-matching')

In [7]:
# Download and unpack the competition archive, but only outside Kaggle and only
# when the data directory is not already present (keeps re-runs cheap).
if not iskaggle and not path.exists():
    import zipfile,kaggle
    # The directory name is used as the competition name; the archive is saved
    # as '<name>.zip' in the working directory.
    kaggle.api.competition_download_cli(str(path))
    # Context manager guarantees the archive handle is closed after extraction.
    with zipfile.ZipFile(f'{path}.zip') as archive:
        archive.extractall(path)

Downloading us-patent-phrase-to-phrase-matching.zip to /notebooks


100%|██████████| 682k/682k [00:00<00:00, 27.8MB/s]







In [8]:
# List the files in the data directory ({path} is interpolated by IPython).
!ls {path}

sample_submission.csv  test.csv  train.csv


In [9]:
import pandas as pd

# Load the training rows from the competition directory.
df = pd.read_csv(path / 'train.csv')

# Combine the three text fields into one prompt-like string per row; this is
# the only text the tokenizer will see.
df['input'] = (
    'TEXT1: ' + df['context']
    + '; TEXT2: ' + df['target']
    + '; ANC1: ' + df['anchor']
)


In [10]:
# Peek at the first few combined input strings.
df.input.head()

0    TEXT1: A47; TEXT2: abatement of pollution; ANC...
1    TEXT1: A47; TEXT2: act of abating; ANC1: abate...
2    TEXT1: A47; TEXT2: active catalyst; ANC1: abat...
3    TEXT1: A47; TEXT2: eliminating process; ANC1: ...
4    TEXT1: A47; TEXT2: forest region; ANC1: abatement
Name: input, dtype: object

In [11]:
from datasets import Dataset,DatasetDict

# Wrap the pandas frame in a Hugging Face Dataset; all columns are carried over.
ds = Dataset.from_pandas(df)

In [12]:
# Display the dataset summary (feature names and row count).
ds

Dataset({
    features: ['id', 'anchor', 'target', 'context', 'score', 'input'],
    num_rows: 36473
})

In [13]:
# Checkpoint name shared by the tokenizer here and the model loaded later, so
# both always refer to the same pretrained weights/vocabulary.
model_nm = 'microsoft/deberta-v3-small'
from transformers import AutoModelForSequenceClassification,AutoTokenizer
tokz = AutoTokenizer.from_pretrained(model_nm)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
# Sanity check: see how the tokenizer splits contractions, capitalised and
# mixed-case words into sub-word pieces.
tokz.tokenize("Testing out some lil' odd words such as see ya later, hadn't seen him, Capitalize, ChAngeItUp")

['▁Testing',
 '▁out',
 '▁some',
 '▁lil',
 "'",
 '▁odd',
 '▁words',
 '▁such',
 '▁as',
 '▁see',
 '▁ya',
 '▁later',
 ',',
 '▁hadn',
 "'",
 't',
 '▁seen',
 '▁him',
 ',',
 '▁Capital',
 'ize',
 ',',
 '▁Ch',
 'A',
 'nge',
 'It',
 'Up']

In [15]:
def tok_func(x):
    """Tokenize the ``input`` field of ``x`` with the notebook's tokenizer."""
    return tokz(x["input"])


# batched=True hands `tok_func` many rows at a time instead of one by one.
tok_ds = ds.map(tok_func, batched=True)



  0%|          | 0/37 [00:00<?, ?ba/s]

In [16]:
# Inspect the first tokenized row: the raw input string next to the integer
# token ids the tokenizer produced for it.
row = tok_ds[0]
row['input'], row['input_ids']

('TEXT1: A47; TEXT2: abatement of pollution; ANC1: abatement',
 [1,
  54453,
  435,
  294,
  336,
  5753,
  346,
  54453,
  445,
  294,
  47284,
  265,
  6435,
  346,
  23702,
  435,
  294,
  47284,
  2])

In [17]:
# The target column is renamed to 'labels' for training. The guard makes the
# cell idempotent: re-running it after the rename is a no-op instead of an
# error about a missing 'score' column.
if 'score' in tok_ds.column_names:
    tok_ds = tok_ds.rename_columns({'score':'labels'})

In [18]:
# Load the competition's test.csv; it is tokenized and used for the final
# predictions further down, not for validation.
eval_df = pd.read_csv(path/'test.csv')

import numpy as np, matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Hold out 25% of the tokenized training rows; the resulting 'test' split is
# what the Trainer evaluates on. The fixed seed keeps the split reproducible.
dds = tok_ds.train_test_split(0.25, seed=42)
dds

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


DatasetDict({
    train: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 27354
    })
    test: Dataset({
        features: ['id', 'anchor', 'target', 'context', 'labels', 'input', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9119
    })
})

In [19]:
# Build the same combined 'input' column for the test.csv rows (same template
# as the training data) and tokenize it with the same function.
eval_df['input'] = 'TEXT1: ' + eval_df.context + '; TEXT2: ' + eval_df.target + '; ANC1: ' + eval_df.anchor
eval_ds = Dataset.from_pandas(eval_df).map(tok_func, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [20]:
# Pearson correlation coefficient, computed with numpy
def corr(x, y):
    """Return the Pearson correlation between ``x`` and ``y``.

    ``np.corrcoef`` yields the full correlation matrix; the off-diagonal
    entry at ``[0][1]`` is the coefficient between the two inputs.
    """
    matrix = np.corrcoef(x, y)
    return matrix[0][1]

In [21]:
# Scatter plot of two columns, titled with their correlation
def show_corr(df, a, b):
    """Scatter-plot column ``a`` against column ``b`` of ``df``.

    The plot title shows both column names and their correlation (via
    ``corr``) rounded to two decimals.
    """
    xs = df[a]
    ys = df[b]
    plt.scatter(xs, ys, alpha=0.5, s=4)
    r = corr(xs, ys)
    plt.title(f'{a} vs {b}; r: {r:.2f}')

In [22]:
from transformers import TrainingArguments,Trainer
# Per-device training batch size (the evaluation batch size is derived from it
# when the training arguments are built).
bs = 128
# Number of passes over the training split.
epochs = 4

In [23]:
# Learning rate passed to the training arguments.
lr = 8e-5

In [24]:
# Training configuration: checkpoints are written under 'outputs', evaluation
# runs once per epoch, and the evaluation batch size is twice the training
# batch size. report_to='none' is passed so no reporting integration is used.
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none')

In [26]:
def corr_d(eval_pred):
    """Metric callback: unpack ``eval_pred`` into ``corr`` and report it as ``pearson``."""
    return dict(pearson=corr(*eval_pred))

In [27]:
# Load the pretrained checkpoint with a freshly initialised head that has a
# single output (num_labels=1), then wire model, arguments, the train/held-out
# splits, tokenizer and the Pearson metric into a Trainer.
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)
trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                  tokenizer=tokz, compute_metrics=corr_d)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

In [28]:
# Run training; the trailing semicolon suppresses the returned value's repr.
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, input, id, target, context. If anchor, input, id, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27354
  Num Epochs = 4
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 856


Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.026263,0.798741
2,No log,0.025944,0.823264
3,0.034800,0.022987,0.83324
4,0.034800,0.021924,0.833892


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, input, id, target, context. If anchor, input, id, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, input, id, target, context. If anchor, input, id, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9119
  Batch size = 256
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved 

In [29]:
# get predictions on the test set (rows from test.csv), cast to Python float
# precision. The result is a 2-D array with one column, i.e. one value per row.
preds = trainer.predict(eval_ds).predictions.astype(float)
preds

The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: anchor, id, input, target, context. If anchor, id, input, target, context are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 36
  Batch size = 256


array([[ 5.74218750e-01],
       [ 6.59179688e-01],
       [ 5.41992188e-01],
       [ 3.12255859e-01],
       [-3.18908691e-02],
       [ 5.43945312e-01],
       [ 5.07324219e-01],
       [ 7.92694092e-03],
       [ 2.51464844e-01],
       [ 1.04882812e+00],
       [ 3.00537109e-01],
       [ 2.63671875e-01],
       [ 7.12402344e-01],
       [ 8.55957031e-01],
       [ 7.36816406e-01],
       [ 4.27490234e-01],
       [ 2.95166016e-01],
       [-6.78062439e-04],
       [ 6.18164062e-01],
       [ 3.39843750e-01],
       [ 4.55566406e-01],
       [ 2.38769531e-01],
       [ 9.44213867e-02],
       [ 2.19604492e-01],
       [ 5.26855469e-01],
       [-2.81066895e-02],
       [-4.91638184e-02],
       [-2.97546387e-02],
       [-4.06188965e-02],
       [ 5.79589844e-01],
       [ 3.13232422e-01],
       [ 1.97219849e-03],
       [ 8.07617188e-01],
       [ 4.92431641e-01],
       [ 4.26513672e-01],
       [ 2.25585938e-01]])

In [30]:
# The raw outputs can fall slightly outside [0, 1] (see the negative and >1
# values above); clamp them into that range.
preds = np.clip(preds, 0, 1)
preds

array([[0.57421875],
       [0.65917969],
       [0.54199219],
       [0.31225586],
       [0.        ],
       [0.54394531],
       [0.50732422],
       [0.00792694],
       [0.25146484],
       [1.        ],
       [0.30053711],
       [0.26367188],
       [0.71240234],
       [0.85595703],
       [0.73681641],
       [0.42749023],
       [0.29516602],
       [0.        ],
       [0.61816406],
       [0.33984375],
       [0.45556641],
       [0.23876953],
       [0.09442139],
       [0.21960449],
       [0.52685547],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.57958984],
       [0.31323242],
       [0.0019722 ],
       [0.80761719],
       [0.49243164],
       [0.42651367],
       [0.22558594]])

In [31]:
#hide
import datasets
import numpy as np

# `preds` has shape (n_rows, 1). Passing it as-is makes every 'score' cell a
# one-element list, which is written to the CSV as "[0.57421875]" instead of a
# plain number. Flatten to 1-D so each row holds a single scalar score.
scores = np.asarray(preds).reshape(-1)

submission = datasets.Dataset.from_dict({
    'id': eval_ds['id'],
    'score': scores
})

submission.to_csv('submission.csv', index=False)

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

1032