In [1]:
import torch
from torch.utils.data import DataLoader
import csv
from tqdm.notebook import tqdm

!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [33]:
# Use the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cpu


In [3]:
import pandas as pd

# Location of the US patent training CSV on the local machine.
train_data_path = "/home/liefe/data/public/us-patent/train.csv"

# Read the whole file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer=train_data_path)
df.head(n=5)

Unnamed: 0,id,anchor,target,context,score
0,37d61fd2272659b1,abatement,abatement of pollution,A47,0.5
1,7b9652b17b68b7a4,abatement,act of abating,A47,0.75
2,36d72442aefd8232,abatement,active catalyst,A47,0.25
3,5296b0c19e1ce60e,abatement,eliminating process,A47,0.5
4,54c1e3b9184cb5b6,abatement,forest region,A47,0.0


In [4]:
# Number of rows in the training DataFrame.
df.shape[0]

36473

In [5]:
# Distinct context codes present in the training data.
df["context"].unique()

array(['A47', 'A61', 'A62', 'C01', 'F16', 'F24', 'F28', 'H01', 'H04',
       'B23', 'B41', 'D03', 'E03', 'C08', 'D01', 'D21', 'C07', 'A45',
       'B01', 'B08', 'G04', 'G06', 'B65', 'G16', 'G01', 'A41', 'C23',
       'F23', 'B25', 'A63', 'B28', 'B63', 'F04', 'B60', 'B32', 'C09',
       'C02', 'G03', 'C10', 'B61', 'C21', 'F42', 'A23', 'C11', 'B29',
       'F02', 'B62', 'B64', 'E21', 'B24', 'B22', 'H05', 'B27', 'E04',
       'B21', 'D06', 'C04', 'B05', 'G02', 'H03', 'C06', 'G11', 'C12',
       'E02', 'F15', 'A46', 'B66', 'G07', 'G08', 'C22', 'B44', 'A01',
       'F03', 'C25', 'F22', 'G05', 'G21', 'B07', 'F41', 'E01', 'H02',
       'C13', 'F01', 'F27', 'C14', 'A44', 'B67', 'A24', 'B02', 'E05',
       'D05', 'F25', 'A43', 'A22', 'A21', 'E06', 'F21', 'G10', 'C03',
       'B81', 'F17', 'B03', 'G09', 'D04', 'F26', 'B31'], dtype=object)

Number of domains/contexts

In [6]:
# How many distinct context codes there are.
df["context"].unique().size

106

In [7]:
# Rows per context code.
df["context"].value_counts()

H01    2186
H04    2177
G01    1812
A61    1477
F16    1091
       ... 
B03      47
F17      33
B31      24
A62      23
F26      18
Name: context, Length: 106, dtype: int64

So the classes are imbalanced, which may affect performance.


In [8]:
# Mean similarity score per context. numeric_only=True keeps the string
# columns (id, anchor, target) out of the aggregation; without it, recent
# pandas versions raise a TypeError instead of silently dropping them.
df.groupby(["context"]).mean(numeric_only=True)

Unnamed: 0_level_0,score
context,Unnamed: 1_level_1
A01,0.367739
A21,0.400568
A22,0.410714
A23,0.328431
A24,0.360000
...,...
H01,0.359332
H02,0.372201
H03,0.367794
H04,0.346922


In [9]:
# Rows per similarity score value.
df["score"].value_counts()

0.50    12300
0.25    11519
0.00     7471
0.75     4029
1.00     1154
Name: score, dtype: int64

In [10]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses


In [11]:
import numpy as np

# Build the training arrays straight from the CSV: one (anchor, target) text
# pair, one similarity score and one context (domain) code per row.
pairs = []
labels = []
domains = []

# newline="" is what the csv module asks for when it is handed a file object.
# The default dialect (quotechar '"') parses quoted fields that contain commas
# correctly; the former quotechar="\\" treated backslashes as quotes and
# double quotes as literal text. Columns are addressed by header name rather
# than by position so a reordered file cannot silently swap fields.
with open(train_data_path, newline="") as fp:
    reader = csv.DictReader(fp)
    for row in tqdm(reader):
        pairs.append((row["anchor"], row["target"]))
        labels.append(float(row["score"]))
        domains.append(row["context"])

pairs = np.array(pairs)
# float32 labels, matching the dtype the rest of the notebook inspects.
labels = np.array(labels, dtype=np.float32)
domains = np.array(domains)

# Preview the first parsed row.
pairs[0], labels[0], domains[0]

0it [00:00, ?it/s]

(array(['abatement', 'abatement of pollution'], dtype='<U98'), 0.5, 'A47')

In [12]:
# Total number of text pairs, and one integer index per pair so the splits
# below can work on indices instead of on the data itself.
n = len(pairs)
print(f"{n} sentence pairs found")
pairs_indices = np.arange(n)


36473 sentence pairs found


In [13]:
from sklearn.model_selection import train_test_split


# Hold out 10% of the rows, stratified by context code. Only the row indices
# are split; texts, scores and contexts are looked up through them afterwards.
train_indices, test_indices = train_test_split(
    pairs_indices,
    test_size=0.1,
    random_state=33,
    stratify=domains,
)

# Show the held-out indices together with their pairs, scores and contexts.
test_indices, pairs[test_indices], labels[test_indices], domains[test_indices]

(array([12826,  4440, 28484, ..., 26641, 15617, 33969]),
 array([['filled interior', 'bottom cap'],
        ['carrier transportation', 'carrier deposition'],
        ['rotary section', 'revolving section'],
        ...,
        ['pulsed plasma', 'plasma pulser circuit'],
        ['hinge mechanisms', 'closing mechanism'],
        ['tubular latch', 'door closer']], dtype='<U98'),
 array([0.25, 0.5 , 0.75, ..., 0.75, 0.25, 0.25], dtype=float32),
 array(['B29', 'G03', 'B21', ..., 'H05', 'E05', 'E05'], dtype='<U3'))

In [14]:
# Sizes of the training part and the held-out part.
tuple(map(len, (train_indices, test_indices)))

(32825, 3648)

In [15]:
# Materialise the training texts and scores from the training indices.
train_ds, train_labels = pairs[train_indices], labels[train_indices]

# Preview the first training example together with its context code.
train_ds[0], train_labels[0], domains[train_indices[0]]


(array(['locking formation', 'lock'], dtype='<U98'), 0.25, 'B24')

In [16]:
# Cut the held-out indices in half: one half becomes the validation set, the
# other the final test set. This second split is not stratified.
# NOTE(review): test_indices is overwritten here, so re-running this cell
# without first re-running the train/hold-out split halves the test set again.
val_indices, test_indices = train_test_split(
    test_indices,
    test_size=0.5,
    random_state=33,
)

val_ds, val_labels = pairs[val_indices], labels[val_indices]
test_ds, test_labels = pairs[test_indices], labels[test_indices]

# Preview the first test example together with its context code.
test_ds[0], test_labels[0], domains[test_indices[0]]

(array(['faucet assembly', 'water supply assembly'], dtype='<U98'), 0.5, 'E03')

In [17]:
# import random


# print(len(pairs))
# random.shuffle(pairs)
# train_data = pairs[:int(.9*n)]
# val_data = pairs[int(.9*n):]


In [18]:
# Wrap every training text pair and its score in an InputExample.
train_pairs = [
    InputExample(texts=text_pair, label=score)
    for text_pair, score in zip(train_ds, train_labels)
]

# Sanity check: show the last example that was built.
train_pairs[-1].texts, train_pairs[-1].label

(array(['further additional elements', 'additional manpower'], dtype='<U98'),
 0.0)

In [19]:
# Wrap every validation text pair and its score in an InputExample.
val_pairs = [
    InputExample(texts=text_pair, label=score)
    for text_pair, score in zip(val_ds, val_labels)
]

# Sanity check: show the last example that was built.
val_pairs[-1].texts, val_pairs[-1].label

(array(['instruction processing', 'device information process'],
       dtype='<U98'),
 0.5)

In [20]:
# Alternative checkpoints that were tried earlier (kept for reference):
# model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# model = SentenceTransformer("nghuyong/ernie-2.0-en")
# Checkpoint name; it is reused later for the evaluator name and for the
# paths the model and its score are saved under.
model_name = "roberta-base"
# Load the checkpoint as a SentenceTransformer model.
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/603k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Some weights of the model checkpoint at /home/liefe/.cache/torch/sentence_transformers/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
# Training objective: sentence-transformers' CosineSimilarityLoss bound to
# the model; it is handed to model.fit together with the training DataLoader.
loss = losses.CosineSimilarityLoss(model)

In [22]:
# train_ds = SentencesDataset(model, train_data)
# val_ds = SentencesDataset(model, val_data)

In [23]:
from torch.utils.data import DataLoader

# Batches of 8 training examples, reshuffled on every pass over the data.
train_dl = DataLoader(
    dataset=train_pairs,
    batch_size=8,
    shuffle=True,
)


In [24]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Evaluator built from the validation examples; it is passed to model.fit
# and is named after the checkpoint being fine-tuned.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_pairs, name=model_name)

In [25]:
# loss = losses.SoftmaxLoss(model, num_labels=5, sentence_embedding_dimension=model.get_sentence_embedding_dimension())

In [26]:
# Fine-tune for 4 epochs on the training batches with the cosine-similarity
# loss. The validation evaluator is passed with evaluation_steps=500, and
# output_path points at the current directory.
fit_options = dict(
    epochs=4,
    evaluator=evaluator,
    evaluation_steps=500,
    warmup_steps=100,
    output_path="./",
)
model.fit(train_objectives=[(train_dl, loss)], **fit_options)



Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4104 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4104 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4104 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4104 [00:00<?, ?it/s]

In [27]:
# Confirm the label array's dtype (it was created with dtype=np.float32).
labels.dtype

dtype('float32')

In [28]:
# Persist the fine-tuned model under ./models/, replacing any "/" in the
# checkpoint name so it stays a single directory name.
model_dir = "./models/" + model_name.replace("/", "-")
model.save(model_dir, model_name=model_name)

In [29]:
# Wrap every test text pair and its score in an InputExample.
test_pairs = [
    InputExample(texts=text_pair, label=score)
    for text_pair, score in zip(test_ds, test_labels)
]

# Sanity check: show the last example that was built.
test_pairs[-1].texts, test_pairs[-1].label

(array(['sprayed', 'directly'], dtype='<U98'), 0.25)

In [31]:
# Evaluator built from the test examples, registered under the name "test_eval".
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_pairs, name="test_eval")

In [34]:
# Score the fine-tuned model on the held-out TEST examples. The result is
# written to a file named "test_score...", so it must come from
# test_evaluator; the validation evaluator used here before was already
# consumed during training and would report a validation score instead.
# The variable keeps its name because the cell that writes the file reads it.
max_score = model.evaluate(test_evaluator)

In [38]:
import os

# Write the score to ./eval/test_score[_<model name>].
# The directory is created on demand so a fresh checkout does not fail with
# FileNotFoundError, and any "/" in the checkpoint name is replaced (the same
# way as for the saved model directory) so hub-style names such as "org/model"
# do not turn into a nested, non-existent path.
eval_dir = "./eval/"
os.makedirs(eval_dir, exist_ok=True)

score_suffix = "_" + model_name.replace("/", "-") if model_name else ""
with open(eval_dir + "test_score" + score_suffix, "wt") as fp:
    fp.write(str(max_score))
    