In [1]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def _label_to_float(value):
    """Convert one raw ``target_same`` cell into the float label 1.0 or 0.0.

    Strings are interpreted by content rather than by truthiness, so textual
    negatives such as ``"False"`` or ``"0"`` map to 0.0 instead of 1.0.

    Raises:
        ValueError: if the cell is missing (None / NaN). NaN is truthy in
            Python, so it would otherwise be silently labelled 1.0.
    """
    if pd.isna(value):
        raise ValueError("missing value in 'target_same' column")
    if isinstance(value, str):
        is_negative = value.strip().lower() in ('', '0', '0.0', 'false', 'f', 'no', 'n')
        return 0.0 if is_negative else 1.0
    return 1.0 if value else 0.0
# end

def read_passages(path_data, test_size=0.1, random_state=None):
    """Load labelled text pairs from a CSV file and split them into train/validation.

    The CSV is read with ``pd.read_csv`` and must contain the columns
    ``log_a``, ``log_b`` and ``target_same``.

    Args:
        path_data: path (or anything ``pd.read_csv`` accepts) of the CSV file.
        test_size: forwarded to ``train_test_split``; 0.1 by default.
        random_state: forwarded to ``train_test_split``. The default ``None``
            keeps the original unseeded behaviour; pass an int to get the
            same split on every run.

    Returns:
        The four lists produced by ``train_test_split``, in this order:
        training pairs, validation pairs, training labels, validation labels.
        Each pair is a ``(log_a, log_b)`` tuple and each label is 1.0 or 0.0.

    Raises:
        ValueError: if any ``target_same`` cell is missing.
    """
    df = pd.read_csv(path_data)
    documents_a = df['log_a'].to_list()
    documents_b = df['log_b'].to_list()
    labels = [_label_to_float(raw_label) for raw_label in df['target_same'].to_list()]

    pairs = list(zip(documents_a, documents_b))
    return train_test_split(pairs, labels, test_size=test_size, random_state=random_state)
# end

In [3]:
# Name of the pretrained checkpoint handed to SentenceTransformer.
model_name = 'nli-distilroberta-base-v2'

# Number of examples per training batch.
train_batch_size = 4

# Number of passes over the training data.
num_epochs = 5

# Fraction of the pairs set aside for validation.
test_size = 0.1

In [4]:
# Path of the labelled pair CSV, named once so both reads below stay in sync.
pair_csv_path = 'data/pair_all_1.csv'

# Raw frame; not referenced by any other cell visible in this notebook, kept
# so the name remains available for ad-hoc inspection.
content = pd.read_csv(pair_csv_path)

# Pass the configured `test_size` explicitly so that changing the config value
# actually changes the split (previously the function default was used).
text_train_list, text_valid_list, labels_train, labels_valid = read_passages(
    pair_csv_path, test_size=test_size
)

In [5]:
# labels_train[:20]

In [6]:
# Wrap every text pair and its label in an InputExample, once for the
# training split and once for the validation split.
samples_train = [
    InputExample(texts=list(pair), label=label)
    for pair, label in zip(text_train_list, labels_train)
]

samples_valid = [
    InputExample(texts=list(pair), label=label)
    for pair, label in zip(text_valid_list, labels_valid)
]

In [7]:
# Instantiate the SentenceTransformer identified by `model_name`; the progress
# bars recorded below this cell show files being downloaded during this run.
model = SentenceTransformer(model_name)

Downloading:   0%|          | 0.00/736 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.71k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/679 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# Batches of training examples, reshuffled by the DataLoader.
train_dataloader = DataLoader(
    samples_train,
    batch_size=train_batch_size,
    shuffle=True,
)

# Training objective: cosine-similarity loss bound to the model.
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator built from the validation examples.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(samples_valid)

# 10% of the total number of training steps (batches per epoch * epochs), rounded up.
warmup_steps = math.ceil(
    len(train_dataloader) * num_epochs * 0.1
)

In [9]:
# Fine-tune the model on the training pairs, running the validation evaluator
# every 2000 steps and writing results under a directory named after the
# checkpoint.
# `warmup_steps` is the value computed in the setup cell (10% of all training
# steps); previously a literal 0 was passed here, which left that computation
# unused and disabled learning-rate warmup.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=2000,
    warmup_steps=warmup_steps,
    output_path='./models-{}'.format(model_name),
)



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23620 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23620 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23620 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23620 [00:00<?, ?it/s]

Iteration:   0%|          | 0/23620 [00:00<?, ?it/s]

In [10]:
# test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(samples_valid)
# test_evaluator(model, output_path='./models2')