In [1]:
pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-1.1.1.tar.gz (81 kB)
[K     |████████████████████████████████| 81 kB 713 kB/s eta 0:00:01
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25ldone
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-1.1.1-py3-none-any.whl size=123336 sha256=c49cee307264a858a8707fdfa990aa48a3c481c087f0b4ded14578df8e39051f
  Stored in directory: /root/.cache/pip/wheels/9d/f2/81/9a97074f4974b3ade9fee286b3ea9acba88e7c9282928ba187
Successfully built sentence-transformers
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-1.1.1
Note: you may need to restart the kernel to use updated packages.


In [31]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv
import pickle 
import pandas as pd
import numpy as np
from random import shuffle
import nltk
import re
import itertools
from sklearn.model_selection import KFold
from nltk.tokenize import sent_tokenize
from functools import reduce
import torch

In [3]:
# Configure root logging at INFO level, using the LoggingHandler shipped with
# sentence_transformers as the only handler.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Location of the STS benchmark archive; it is fetched only when the path
# does not exist yet.
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

sts_dataset_missing = not os.path.exists(sts_dataset_path)
if sts_dataset_missing:
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)

  0%|          | 0.00/392k [00:00<?, ?B/s]

In [4]:
# Training configuration: base checkpoint, batch size, epochs and output folder.
model_name = 'allenai/scibert_scivocab_uncased'
train_batch_size = 4
num_epochs = 4
# The checkpoint id contains a '/', which would otherwise add an unintended
# extra directory level to the output path, so it is flattened to '-' here.
model_save_path = ('output/training_arxiv_continue_training-'
                   + model_name.replace('/', '-')
                   + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))

In [5]:
#Read the arxiv corpus dataset 
class Corpus:
    """Thin wrapper around a mapping that is unpickled from disk.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only point this class at files from a trusted source.
    """

    def __init__(self, path_to_pkl):
        """Load the pickled object stored at ``path_to_pkl`` into ``self.data``."""
        self.data = None
        self.load_pickle(path_to_pkl)

    def load_pickle(self, path_to_pkl):
        """Replace ``self.data`` with the object unpickled from ``path_to_pkl``."""
        with open(path_to_pkl, 'rb') as handle:
            self.data = pickle.load(handle)

    def get_data(self):
        """Return the loaded object itself (not a copy)."""
        return self.data

    def get_keys(self):
        """Return the keys of the loaded mapping as a new list."""
        return list(self.data.keys())

    def get_values(self):
        """Return the values of the loaded mapping as a new list."""
        return [value for _, value in self.data.items()]

    def get_paper_by_id(self, id):
        """Return the entry stored under ``id``; a missing key raises as ``self.data[id]`` does."""
        return self.data[id]
    
def pre_process(paragraph):
    """Strip e-mail addresses, web addresses and the literal "et al." from a paragraph.

    Returns the cleaned string. Whitespace left behind by a removed token is
    not collapsed.
    """
    removal_patterns = (
        r'\S*@\S*\s?',  # e-mail address plus at most one trailing whitespace char
        r'http\S+',     # web address up to the next whitespace
    )
    cleaned = paragraph
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned, flags=re.MULTILINE)
    return cleaned.replace("et al.", "")


def get_sentences_from_paragraph(paragraph):
    """Split ``paragraph`` into sentences by delegating to ``nltk.sent_tokenize``."""
    return nltk.sent_tokenize(paragraph)

def flatten_list(sen_list):
    """Concatenate the items of every inner iterable into one flat list (one level deep)."""
    flat = []
    for inner in sen_list:
        flat.extend(inner)
    return flat

def create_tsv(m_list, output_path=None):
    """Pair up a flat list of sentences into a sentence-pair table.

    The first half of ``m_list`` becomes the ``sentence1`` column and the
    second half the ``sentence2`` column. Row ``i`` receives the ids
    ``"sent1<i>"`` and ``"sent2<i>"``.

    Args:
        m_list: Sliceable sequence of sentences. When its length is odd the
            last element has no partner and is dropped (the previous
            implementation raised because the two halves had different sizes).
        output_path: Optional destination. When given, the table is also
            written there as a tab-separated file without the index column.

    Returns:
        pandas.DataFrame with the columns ``id1``, ``id2``, ``sentence1`` and
        ``sentence2``. An empty input yields an empty frame with these columns.
        Earlier versions built the frame and discarded it.
    """
    column_names = ["id1", "id2", "sentence1", "sentence2"]

    # Integer division keeps both halves the same length.
    n_pairs = len(m_list) // 2
    first_half = list(m_list[:n_pairs])
    second_half = list(m_list[n_pairs:2 * n_pairs])

    row_ids = [str(i) for i in range(n_pairs)]
    df = pd.DataFrame(
        {
            "id1": ["sent1" + row_id for row_id in row_ids],
            "id2": ["sent2" + row_id for row_id in row_ids],
            "sentence1": first_half,
            "sentence2": second_half,
        },
        columns=column_names,
    )

    if output_path is not None:
        df.to_csv(output_path, sep="\t", index=False)
    return df

# Load the pickled corpus and collect its paper ids.
path_to_pkl = "../input/corpus/corpus_dict.pkl"
corpus = Corpus(path_to_pkl)
list_keys = corpus.get_keys()

# For every paper: clean each paragraph, split it into sentences and keep
# one flat sentence list per paper.
main_list = []
for k in list_keys:
    paper = corpus.get_paper_by_id(k)
    para_paper = [pre_process(paragraph) for paragraph in paper]
    sentences = list(map(get_sentences_from_paragraph, para_paper))
    merged = flatten_list(sentences)
    main_list.append(merged)

# Merge the per-paper lists into one corpus-wide list and shuffle it in place.
flattened_main_list = flatten_list(main_list)
shuffle(flattened_main_list)

print(flattened_main_list[:10])
create_tsv(flattened_main_list)

['Fig.', 'However, [8] recently showed that the alternating minimization method in fact converges to the global minima of two low-rank estimation problems: matrix sensing with RIP matrices and matrix completion.', 'Traditionally, cancer survival curves have been estimated using Kaplan-Meier methods [5].', 'This shortcoming is typical for convergence proofs that use similar types of conditions (cf.', 'Let us now focus on binary label prediction, that is Y={±1}.', 'Moreover, the algorithm runs in time O(qnk2) and succeeds with high probability when number of samples q=Ω(mlog2m).', 'We vary sparsity of S∗ and rank of L∗ for RTD with a fixed tensor size.', 'If the iteration complexity of an oblivious optimization algorithm for smooth and convex finite sum problems equipped with a first-order and a coordinate-descent oracle is of the form of the l.h.s.', 'Assuming that all counts (Nk)k=1,…,K are strictly positive, the criterion', 'thanks to the symmetry of the matrix Δ⊤Fπ∗ (cf.']


In [6]:
# Load the unlabelled sentence pairs and keep each column both as a Python
# list (sentence1 / sentence2) and as a NumPy array (s1 / s2).
path_sentence_pairs = "../input/sentence-pairs/arxiv_corpus_new.tsv"
unlabelled_df = pd.read_csv(path_sentence_pairs, quoting=csv.QUOTE_NONE, sep='\t')

sentence1, sentence2 = (
    unlabelled_df[column].tolist() for column in ('sentence1', 'sentence2')
)

s1, s2 = np.array(sentence1), np.array(sentence2)

#Splitting the unlabelled dataset into k folds for self training
def return_k_fold(un_texts, n_splits=5, random_state=42):
    """Split ``un_texts`` into ``n_splits`` shuffled folds.

    Args:
        un_texts: NumPy array of texts; it is indexed with the integer index
            arrays produced by ``KFold.split``.
        n_splits: Number of folds.
        random_state: Seed forwarded to ``KFold``. It is fixed by default so
            that separate calls on arrays of the same length produce the same
            row partition. Without a seed, the sentence1 and sentence2 folds
            were shuffled independently and element ``i`` of a fold no longer
            belonged to the same sentence pair.

    Returns:
        Object-dtype NumPy array holding one array of texts per fold.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Only the held-out index array of each split is used as the fold.
    X_fold = [un_texts[fold] for _, fold in kf.split(un_texts)]

    return np.array(X_fold, dtype="object")

# Both columns use the same default seed, so fold k / position i of
# X_fold_s1 and X_fold_s2 refer to the same row of the sentence-pair file.
X_fold_s1 = return_k_fold(s1)
X_fold_s2 = return_k_fold(s2)

In [7]:
def get_new_train_data(predictions, fold_n, folds_s1=None, folds_s2=None,
                       upper=0.90, lower=0.10):
    """Select confidently scored sentence pairs of one fold as pseudo-labelled data.

    Only the diagonal of ``predictions`` is read: entry ``[i][i]`` is taken as
    the score of pair ``i`` of the fold.

    Args:
        predictions: Square, row-iterable score matrix (e.g. the tensor
            returned by ``util.pytorch_cos_sim``); ``predictions[i][i]`` must
            be convertible with ``float()``.
        fold_n: Index of the fold the scores belong to.
        folds_s1: Per-fold first sentences; defaults to the module-level
            ``X_fold_s1``.
        folds_s2: Per-fold second sentences; defaults to the module-level
            ``X_fold_s2``.
        upper: A pair is kept when its score is strictly above this value.
        lower: A pair is kept when its score is strictly below this value.

    Returns:
        Tuple ``(sentences1, scores, sentences2)`` of NumPy arrays, aligned by
        position.
    """
    if folds_s1 is None:
        folds_s1 = X_fold_s1
    if folds_s2 is None:
        folds_s2 = X_fold_s2

    X_new_s1 = list()
    y_new = list()
    X_new_s2 = list()

    for i, row in enumerate(predictions):
        # Convert to a plain float so the label list can always be turned into
        # a NumPy array, whatever device a tensor score lives on.
        score = float(row[i])
        if score > upper or score < lower:
            X_new_s1.append(folds_s1[fold_n][i])
            X_new_s2.append(folds_s2[fold_n][i])
            y_new.append(score)

    return np.array(X_new_s1), np.array(y_new), np.array(X_new_s2)


def join_shuffle(train_samples, X_new_s1, y_new, X_new_s2):
    """Append one ``InputExample`` per (sentence1, sentence2, label) triple.

    ``train_samples`` is extended in place and the same list object is
    returned. Despite the name, nothing is shuffled here.
    """
    train_samples.extend(
        InputExample(texts=[first, second], label=score)
        for first, second, score in zip(X_new_s1, X_new_s2, y_new)
    )
    return train_samples


In [8]:
# Load a pre-trained sentence transformer model
# NOTE(review): the recorded output shows a 404 from sbert.net for this model
# name followed by further downloads, so the checkpoint is presumably resolved
# from another source — confirm which one and how pooling is set up.
model = SentenceTransformer(model_name)
# Cap the model's maximum sequence length at 128 (presumably tokens — confirm).
model.max_seq_length = 128

Exception when trying to download http://sbert.net/models/allenai/scibert_scivocab_uncased.zip. Response 404


Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [9]:
# Annotated arXiv sentence pairs used as the final test set; the
# `Classification` column provides the gold label of each pair.
test_df = pd.read_csv("../input/annotations/annotations_arxiv_merged.tsv", sep='\t',quoting=csv.QUOTE_NONE)
test_labels = test_df.Classification.values

# Built with a comprehension so the per-row variables stay local. The previous
# module-level loop reused the names `s1` / `s2` and thereby overwrote the
# sentence arrays of the same name defined in the sentence-pair cell.
test = [
    InputExample(texts=[first, second], label=gold)
    for first, second, gold in zip(
        test_df['sentence1'].tolist(),
        test_df['sentence2'].tolist(),
        test_labels.tolist(),
    )
]

In [10]:
# Read the STS benchmark rows into train / dev / test example lists
logging.info("Read STSbenchmark train dataset")

train_samples, dev_samples, test_samples = [], [], []
# Rows whose split is neither 'dev' nor 'test' fall through to the training list.
samples_by_split = {'dev': dev_samples, 'test': test_samples}

with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE):
        # Gold scores are divided by 5 to bring them into the range 0 ... 1
        score = float(row['score']) / 5.0
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=score)
        samples_by_split.get(row['split'], train_samples).append(inp_example)
            

In [11]:
# Serve the STS training examples in shuffled batches of `train_batch_size`.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
# Cosine-similarity loss bound to `model`; paired with the dataloader in model.fit().
train_loss = losses.CosineSimilarityLoss(model=model)
# Sanity check: show the type of the loader object.
print(type(train_dataloader))

<class 'torch.utils.data.dataloader.DataLoader'>


In [12]:
# Development set: Measure correlation between cosine score and gold labels
logging.info("Read STSbenchmark dev dataset")
# Evaluator built from the STS dev examples; it is passed to every model.fit() call.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

In [13]:
# Configure the training. The dev-set evaluator is passed to model.fit(), so
# evaluation is not skipped in this notebook.
# Warm-up length: 10% of the total number of training steps
# (batches per epoch * number of epochs), rounded up.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of the training steps for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

In [14]:
# Initial supervised fine-tuning of `model` on the STS training pairs using
# the cosine-similarity loss, for `num_epochs` epochs.
# The dev evaluator is supplied together with evaluation_steps=1000 and the
# output goes to `model_save_path` (see the sentence-transformers fit() docs
# for the exact evaluation/saving semantics).
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1438 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1438 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1438 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1438 [00:00<?, ?it/s]

In [15]:
# Self-training: for every fold of unlabelled pairs, score the pairs with the
# current model, add the confidently scored ones to the training set and
# continue training on the enlarged set.
# NOTE(review): `train_samples` is extended cumulatively across folds, so
# re-running this cell without re-running the STS loading cell appends the
# pseudo-labelled pairs a second time.
index = 0
for fold1, fold2 in zip(X_fold_s1, X_fold_s2):
    # Encode both sides of the fold with the current state of the model.
    embeddings1 = model.encode(fold1, convert_to_tensor=True)
    embeddings2 = model.encode(fold2, convert_to_tensor=True)
    
    #Compute cosine-similarits
    # NOTE(review): this produces the full all-pairs matrix, but
    # get_new_train_data() only reads its diagonal (x[i] of row i); a pairwise
    # similarity would presumably need far less memory — confirm before changing.
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)
    
    # Keep only pairs whose own score passes the thresholds in get_new_train_data().
    X_new_s1, y_new, X_new_s2 = get_new_train_data(cosine_scores, index)
    
    print(f"{len(X_new_s1)} high-probability predictions added to training data.")
    
    if(len(X_new_s1) != 0):
        train_samples = join_shuffle(train_samples, X_new_s1, y_new, X_new_s2)
    
    # Fresh loader and loss over the (possibly enlarged) training set.
    t_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
    t_loss = losses.CosineSimilarityLoss(model=model)

    # NOTE(review): `warmup_steps` is the value computed for the initial STS
    # run and is reused unchanged although the loader size differs here.
    model.fit(train_objectives=[(t_dataloader, t_loss)],
          evaluator=evaluator,
          epochs=2,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
    # Advance the fold index passed to get_new_train_data().
    index+=1

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

2588 high-probability predictions added to training data.


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2085 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2085 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

4306 high-probability predictions added to training data.


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3161 [00:00<?, ?it/s]

Iteration:   0%|          | 0/3161 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

6024 high-probability predictions added to training data.


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4667 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4667 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

7907 high-probability predictions added to training data.


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6644 [00:00<?, ?it/s]

Iteration:   0%|          | 0/6644 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

Batches:   0%|          | 0/790 [00:00<?, ?it/s]

9498 high-probability predictions added to training data.


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9018 [00:00<?, ?it/s]

Iteration:   0%|          | 0/9018 [00:00<?, ?it/s]

In [16]:
# Reload the model from `model_save_path` (the output_path given to the fit()
# calls) instead of continuing with the in-memory instance.
model = SentenceTransformer(model_save_path)
# Evaluate on the annotated arXiv pairs collected in `test`; the returned
# score is displayed as the cell output.
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test, name='arxiv-test')
test_evaluator(model, output_path=model_save_path)

0.5399054589118731

In [17]:
# Spot-check the model on two hand-picked sentence pairs.
# Each sentence is turned into an embedding with model.encode(); the first
# sentence of both pairs is the same text.
emb1 = model.encode("Unsupervised training data for the sentence encoding models are drawn from a variety of web sources")
emb2 = model.encode("We augment unsupervised learning with training on supervised data from the Stanford Natural Language Inference(SNLI) CORPUS")
emb3 = model.encode("Unsupervised training data for the sentence encoding models are drawn from a variety of web sources")
emb4 = model.encode("Rather the network maintains a scalable linear efficiency across all layers, realizing the transformer full potential")

# Cosine similarity of each pair, printed in pair order.
cos_sim_1, cos_sim_2 = util.pytorch_cos_sim(emb1, emb2), util.pytorch_cos_sim(emb3, emb4)
for cos_sim in (cos_sim_1, cos_sim_2):
    print("Cosine-Similarity:", cos_sim)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine-Similarity: tensor([[0.8941]])
Cosine-Similarity: tensor([[0.1726]])


In [38]:
# Two example paragraphs that are compared sentence by sentence below.
dic_para = {
    'paragraph1': 'Multi-objective optimization aims at finding trade-off solutions to conflicting objectives. These constitute the Pareto optimal set. In the context of expensive-to-evaluate functions, it is impossible and often non-informative to look for the entire set. As an end-user would typically prefer a certain part of the objective space, we modify the Bayesian multi-objective optimization algorithm which uses Gaussian Processes to maximize the Expected Hypervolume Improvement, to focus the search in the preferred region. The acumulated effects of the Gaussian Processes and the targeting strategy lead to a particularly efficient convergence to the desired part of the Pareto set. To take advantage of parallel computing, a multi-point extension of the targeting criterion is proposed and analyzed.',
    'paragraph2': 'We consider the problem of constrained multi-objective (MO) blackbox optimization using expensive function evaluations, where the goal is to approximate the true Pareto set of solutions satisfying a set of constraints while minimizing the number of function evaluations. We propose a novel framework named Uncertainty-aware Search framework for Multi-Objective Optimization with Constraints (USeMOC) to efficiently select the sequence of inputs for evaluation to solve this problem. The selection method of UseMOC consists of solving a cheap constrained MO optimization problem via surrogate models of the true functions to identify the most promising candidates and picking the best candidate based on a measure of uncertainty. We applied this framework to optimize the design of a multi-output switched-capacitor voltage regulator via expensive simulations. Our experimental results show that UseMOC is able to achieve more than 90 % reduction in the number of simulations needed to uncover optimized circuits.',
}
source_paragraph, target_paragraph = dic_para['paragraph1'], dic_para['paragraph2']

# Split each paragraph into its sentences.
source_sentences, target_sentences = (
    sent_tokenize(text) for text in (source_paragraph, target_paragraph)
)

# Embed the sentences of both paragraphs.
embeddings1 = model.encode(source_sentences, convert_to_tensor=True)
embeddings2 = model.encode(target_sentences, convert_to_tensor=True)

# Similarity of every source sentence (rows) with every target sentence (columns).
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

# For each source sentence keep the score of its best-matching target sentence.
max_scores = []
for scores in cosine_scores:
    best_match = torch.max(scores, dim=0)
    max_elements, max_indices = best_match.values, best_match.indices
    max_index = max_indices.item()
    max_scores.append(max_elements)

def count_average(lst):
    """Return the arithmetic mean of ``lst``, adding the items left to right.

    An empty ``lst`` raises ``TypeError`` from ``reduce``.
    """
    total = reduce(lambda running, item: running + item, lst)
    return total / len(lst)
# Paragraph similarity = mean of the per-source-sentence best-match scores;
# the dissimilarity is its complement.
avg = count_average(max_scores)
dis_score = 1 - avg
for template, value in (
    ("The two paragraphs have a similarity of {}", avg),
    ("The two paragraphs have a dissimilarity of {}", dis_score),
):
    print(template.format(value))


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

The two paragraphs have a similarity of 0.6267346739768982
The two paragraphs have a dissimilarity of 0.3732653260231018
