In [1]:
from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, LoggingHandler, losses, models, util
from torch.utils.data import DataLoader
from sentence_transformers.evaluation import TripletEvaluator
from datetime import datetime
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas()

import csv
import logging
import os

  from pandas import Panel


In [2]:
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

In [3]:
model_name = "/var/patentmark/transformer-training/patent-electra-v4"

In [4]:
train_batch_size = 16
output_path = "output/training-triplets-"+model_name+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
num_epochs = 1

In [5]:
word_embedding_model = models.Transformer(model_name)

In [6]:
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

In [7]:
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

2020-11-21 07:27:27 - Use pytorch device: cuda


In [9]:

def build_example(row):
    return InputExample(texts=[row['label'], row['positive'], row['negative']], label=0)

logging.info("Read Triplet train dataset")
train_examples_df = pd.read_parquet("training_triplets.parquet")
train_examples = train_examples_df.progress_apply(build_example, axis=1).values


2020-11-21 07:29:25 - Read Triplet train dataset


HBox(children=(FloatProgress(value=0.0, max=3701622.0), HTML(value='')))




In [10]:
train_dataset = SentencesDataset(train_examples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

In [11]:
logging.info("Read Triplet dev dataset")
dev_examples_df = pd.read_parquet("testing_triplets.parquet")
dev_examples = train_examples_df.sample(10000).progress_apply(build_example, axis=1).values

2020-11-21 07:30:35 - Read Triplet dev dataset


HBox(children=(FloatProgress(value=0.0, max=10000.0), HTML(value='')))




In [12]:
len(dev_examples)

10000

In [13]:
evaluator = TripletEvaluator.from_input_examples(dev_examples, name='dev')

In [14]:
warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data


In [15]:
# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=output_path)

# ##############################################################################
# #
# # Load the stored model and evaluate its performance on STS benchmark dataset
# #
# ##############################################################################

# logging.info("Read test examples")
# test_examples = []
# with open(os.path.join(dataset_path, 'test.csv'), encoding="utf-8") as fIn:
#     reader = csv.DictReader(fIn, delimiter=',', quoting=csv.QUOTE_MINIMAL)
#     for row in reader:
#         test_examples.append(InputExample(texts=[row['Sentence1'], row['Sentence2'], row['Sentence3']]))


# model = SentenceTransformer(output_path)
# test_evaluator = TripletEvaluator.from_input_examples(test_examples, name='test')
# test_evaluator(model, output_path=output_path)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=231352.0, style=ProgressStyle(description…

2020-11-21 07:38:56 - TripletEvaluator: Evaluating the model on dev dataset in epoch 0 after 1000 steps:
2020-11-21 07:40:45 - Accuracy Cosine Distance:   	51.43
2020-11-21 07:40:45 - Accuracy Manhatten Distance:	51.12
2020-11-21 07:40:45 - Accuracy Euclidean Distance:	50.70

2020-11-21 07:40:45 - Save model to output/training-triplets-/var/patentmark/transformer-training/patent-electra-v4-2020-11-21_07-27-24
2020-11-21 07:49:05 - TripletEvaluator: Evaluating the model on dev dataset in epoch 0 after 2000 steps:
2020-11-21 07:50:57 - Accuracy Cosine Distance:   	55.11
2020-11-21 07:50:57 - Accuracy Manhatten Distance:	55.74
2020-11-21 07:50:57 - Accuracy Euclidean Distance:	54.32

2020-11-21 07:50:57 - Save model to output/training-triplets-/var/patentmark/transformer-training/patent-electra-v4-2020-11-21_07-27-24
2020-11-21 07:59:20 - TripletEvaluator: Evaluating the model on dev dataset in epoch 0 after 3000 steps:
2020-11-21 08:01:11 - Accuracy Cosine Distance:   	62.97
2020-11-21 0

KeyboardInterrupt: 