In [2]:
!pip install transformers sentence-transformers datasets

Collecting sentence-transformers
  Downloading sentence_transformers-2.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.5/171.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl 

In [3]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import datetime
import random
import numpy as np
import pandas as pd

In [4]:
# Pick the compute device once; every later cell moves tensors/modules to `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [5]:
# Load the English ("en") configuration of the "paws-x" dataset via
# `datasets.load_dataset`; the result is indexed by split name below.
dataset = load_dataset("paws-x", "en")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/310k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/307k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [6]:
# Show the available splits together with their columns and row counts.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 49401
    })
    test: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['id', 'sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})


In [7]:
# Inspect one raw example. The row is taken from the PAWS-X *validation* split
# (the split this notebook later trains on), so the message says exactly that.
print("A sample from the PAWS-X dataset's validation split:")
print(dataset['validation'][98])

A sample from the medical_question dataset's training split:
{'id': 414, 'sentence1': 'The chief editor is Herbert Wessels ( since 2009 ) , second editor is Markus Ermert ( since 2002 ) .', 'sentence2': 'Second editor is Herbert Wessels ( since 2009 ) , Chief Editor is Markus Ermert ( since 2002 ) .', 'label': 0}


In [8]:

# Tokenizer for the 'bert-base-uncased' checkpoint. It is read as a module-level
# global by the dataset class and by `predict_similarity` further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
class STSBDataset(torch.utils.data.Dataset):
    """Sentence-pair dataset yielding a tokenized pair and a normalized score.

    Each input row must provide the keys ``'sentence1'``, ``'sentence2'`` and
    ``'label'``. Tokenization is done lazily in ``__getitem__`` with the
    module-level ``tokenizer``.

    Args:
        dataset: Iterable of row mappings. It is consumed in a single pass, so
            one-shot iterables (generators, streaming datasets) work too.
        max_score: Value the raw label is divided by to obtain the regression
            target. Defaults to 5.0 (the previous hard-coded value).
            NOTE(review): the PAWS-X sample printed in this notebook carries an
            integer label of 0; if its labels are binary (0/1), the default maps
            them to 0.0/0.2 -- pass ``max_score=1.0`` in that case. Confirm.
        max_length: Length every tokenized sentence is padded/truncated to.

    Raises:
        ValueError: If ``max_score`` is not strictly positive.
    """

    def __init__(self, dataset, max_score=5.0, max_length=128):
        if max_score <= 0:
            raise ValueError(f"max_score must be positive, got {max_score!r}")

        self.max_length = max_length
        self.normalized_similarity_scores = []
        self.first_sentences = []
        self.second_sentences = []

        # Single pass: iterating the source once is cheaper for large datasets
        # and does not silently produce an empty dataset for one-shot iterables.
        for row in dataset:
            self.normalized_similarity_scores.append(row['label'] / max_score)
            self.first_sentences.append(row['sentence1'])
            self.second_sentences.append(row['sentence2'])

        # str() guards against non-string cells (e.g. None) reaching the tokenizer.
        self.concatenated_sentences = [
            [str(first), str(second)]
            for first, second in zip(self.first_sentences, self.second_sentences)
        ]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.concatenated_sentences)

    def get_batch_labels(self, idx):
        """Return the normalized score of pair ``idx`` as a tensor."""
        return torch.tensor(self.normalized_similarity_scores[idx])

    def get_batch_texts(self, idx):
        """Tokenize both sentences of pair ``idx``, padded to ``max_length``."""
        return tokenizer(
            self.concatenated_sentences[idx],
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return ``(tokenized_pair, label)`` for pair ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y


def collate_fn(texts):
    """Split a batched encoding into one feature dict per batch element.

    Args:
        texts: Mapping with the keys ``'input_ids'`` and ``'attention_mask'``,
            each iterable along the batch dimension.

    Returns:
        A list with one ``{'input_ids': ..., 'attention_mask': ...}`` dict per
        element, in batch order.
    """
    features = []
    for ids, mask in zip(texts['input_ids'], texts['attention_mask']):
        features.append({'input_ids': ids, 'attention_mask': mask})
    return features

In [10]:
class BertForS(torch.nn.Module):
    """Sentence encoder: a 'bert-base-uncased' transformer followed by pooling.

    The transformer and pooling modules are wrapped in a
    ``SentenceTransformer`` pipeline; ``forward`` returns that pipeline's
    ``'sentence_embedding'`` entry.
    """

    def __init__(self):
        super().__init__()
        self.bert = models.Transformer('bert-base-uncased', max_seq_length=128)
        # The pooling layer must be sized to the transformer's embedding width.
        embedding_dim = self.bert.get_word_embedding_dimension()
        self.pooling_layer = models.Pooling(embedding_dim)
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        """Encode ``input_data`` (a feature dict) and return the sentence embeddings."""
        pipeline_output = self.sts_bert(input_data)
        return pipeline_output['sentence_embedding']

In [11]:
# Build the sentence encoder and move its weights to the selected device.
# The bare `model.to(device)` expression also makes the cell display the module tree.
model = BertForS()
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  )
)

In [12]:
class CosineSimilarityLoss(torch.nn.Module):
    """Loss between the cosine similarity of embedding pairs and target scores.

    Args:
        loss_fn: Callable ``(predictions, targets) -> loss``; defaults to MSE.
        transform_fn: Callable applied to the raw cosine similarities before
            the loss; defaults to the identity.
    """

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super().__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        """Compute the loss for a batch.

        Args:
            inputs: Sequence with one entry per pair; ``entry[0]`` and
                ``entry[1]`` are the two sentence embeddings of that pair.
            labels: Target scores, one per pair.

        Returns:
            The value returned by ``loss_fn``.
        """
        emb_1 = torch.stack([inp[0] for inp in inputs])
        emb_2 = torch.stack([inp[1] for inp in inputs])
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2))
        # Reshape instead of squeeze(): squeeze() collapses a batch of one to a
        # 0-d tensor, which mismatches the (1,)-shaped predictions and makes
        # MSELoss fall back to broadcasting with a shape warning.
        return self.loss_fn(outputs, labels.reshape(outputs.shape))

In [13]:
# Wrap the raw splits in the `STSBDataset` class defined above (the previous
# name `SDataset` is not defined anywhere in this notebook and raised a
# NameError on a fresh kernel).
# NOTE(review): the 2,000-row 'validation' split is used for training and the
# 'test' split for validation; the 49,401-row 'train' split is unused.
# Confirm this is intentional (e.g. to keep the run short).
train_ds = STSBDataset(dataset['validation'])
val_ds = STSBDataset(dataset['test'])

train_size = len(train_ds)
val_size = len(val_ds)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

2,000 training samples
2,000 validation samples


In [14]:
batch_size = 8

# Training batches are drawn in shuffled order; both loaders use 4 worker processes.
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)

# Validation keeps the dataset order (no shuffling).
validation_dataloader = DataLoader(val_ds, batch_size=batch_size, num_workers=4)



In [15]:
# AdamW over all model parameters with a learning rate of 1e-6.
optimizer = AdamW(model.parameters(),
                  lr = 1e-6)

In [16]:
epochs = 8

# The scheduler is stepped once per batch, so its horizon is batches-per-epoch x epochs.
total_steps = len(train_dataloader) * epochs

# Linear decay over the whole run, with no warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [17]:
def format_time(elapsed):
    """Format a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds; rounded to the nearest whole second.

    Returns:
        ``str(datetime.timedelta(...))`` of the rounded duration.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [18]:
def _batch_loss(batch, labels, criterion):
    """Move one batch to `device`, embed every sentence pair and return its loss.

    `batch` is the mapping produced by the DataLoader (keys 'input_ids' and
    'attention_mask'); its tensors are replaced in place by their on-device
    versions. Uses the module-level `model`, `device` and `collate_fn`.
    """
    batch['input_ids'] = batch['input_ids'].to(device)
    batch['attention_mask'] = batch['attention_mask'].to(device)

    features = collate_fn(batch)
    # One forward pass per pair: each feature dict holds both sentences of a pair.
    embeddings = [model(feature) for feature in features]

    return criterion(embeddings, labels.to(device))


def train():
    """Fine-tune the global `model` for `epochs` epochs and validate after each.

    Relies on the module-level `model`, `optimizer`, `scheduler`,
    `train_dataloader`, `validation_dataloader`, `epochs` and `device`.

    Returns:
        tuple: ``(model, training_stats)`` where ``training_stats`` is a list
        with one dict per epoch holding the keys 'epoch', 'Training Loss',
        'Valid. Loss', 'Training Time' and 'Validation Time'.
    """
    seed_val = 42

    criterion = CosineSimilarityLoss().to(device)

    random.seed(seed_val)
    torch.manual_seed(seed_val)
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):
            model.zero_grad()

            loss = _batch_loss(train_data, train_label, criterion)
            total_train_loss += loss.item()

            loss.backward()
            # Clip the global gradient norm to 1.0 before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # The learning-rate schedule advances once per batch.
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.5f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        print("")
        print("Running Validation...")

        t0 = time.time()

        model.eval()

        total_eval_loss = 0
        for val_data, val_label in tqdm(validation_dataloader):
            # No gradients are needed for validation (forward pass and loss alike).
            with torch.no_grad():
                loss = _batch_loss(val_data, val_label, criterion)
            total_eval_loss += loss.item()

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.5f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return model, training_stats

In [19]:
# Run the full fine-tuning loop; `train()` returns the model together with the per-epoch statistics.
model, training_stats = train()


Training...


  self.pid = os.fork()
100%|██████████| 250/250 [01:55<00:00,  2.16it/s]



  Average training loss: 0.67285
  Training epoch took: 0:01:56

Running Validation...


100%|██████████| 250/250 [00:35<00:00,  7.10it/s]


  Validation Loss: 0.70822
  Validation took: 0:00:35

Training...


100%|██████████| 250/250 [01:50<00:00,  2.27it/s]



  Average training loss: 0.45361
  Training epoch took: 0:01:50

Running Validation...


100%|██████████| 250/250 [00:35<00:00,  7.08it/s]


  Validation Loss: 0.61373
  Validation took: 0:00:35

Training...


100%|██████████| 250/250 [01:50<00:00,  2.26it/s]



  Average training loss: 0.27719
  Training epoch took: 0:01:51

Running Validation...


100%|██████████| 250/250 [00:35<00:00,  7.14it/s]


  Validation Loss: 0.56417
  Validation took: 0:00:35

Training...


100%|██████████| 250/250 [01:50<00:00,  2.27it/s]



  Average training loss: 0.20464
  Training epoch took: 0:01:50

Running Validation...


100%|██████████| 250/250 [00:34<00:00,  7.15it/s]


  Validation Loss: 0.52593
  Validation took: 0:00:35

Training...


100%|██████████| 250/250 [01:49<00:00,  2.27it/s]



  Average training loss: 0.16013
  Training epoch took: 0:01:50

Running Validation...


100%|██████████| 250/250 [00:35<00:00,  7.12it/s]


  Validation Loss: 0.51221
  Validation took: 0:00:35

Training...


100%|██████████| 250/250 [01:50<00:00,  2.27it/s]



  Average training loss: 0.13128
  Training epoch took: 0:01:50

Running Validation...


100%|██████████| 250/250 [00:35<00:00,  7.12it/s]


  Validation Loss: 0.50441
  Validation took: 0:00:35

Training...


100%|██████████| 250/250 [01:50<00:00,  2.27it/s]



  Average training loss: 0.11662
  Training epoch took: 0:01:50

Running Validation...


100%|██████████| 250/250 [00:35<00:00,  7.13it/s]


  Validation Loss: 0.50475
  Validation took: 0:00:35

Training...


100%|██████████| 250/250 [01:49<00:00,  2.28it/s]



  Average training loss: 0.11433
  Training epoch took: 0:01:50

Running Validation...


100%|██████████| 250/250 [00:35<00:00,  7.12it/s]

  Validation Loss: 0.50695
  Validation took: 0:00:35

Training complete!
Total training took 0:19:28 (h:mm:ss)





In [20]:
# Tabulate the per-epoch statistics, one row per epoch, indexed by epoch number.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.672848,0.708223,0:01:56,0:00:35
2,0.453606,0.613733,0:01:50,0:00:35
3,0.27719,0.564173,0:01:51,0:00:35
4,0.204638,0.525925,0:01:50,0:00:35
5,0.160128,0.512213,0:01:50,0:00:35
6,0.131276,0.504406,0:01:50,0:00:35
7,0.116616,0.50475,0:01:50,0:00:35
8,0.114332,0.506945,0:01:50,0:00:35


In [21]:
# Sentence pairs used for the qualitative checks below.
# NOTE(review): despite the variable name, this loads the *train* split of
# "stsb_multi_mt" -- confirm that is intended.
test_dataset = load_dataset("stsb_multi_mt", name="en", split="train")

first_sent = [row['sentence1'] for row in test_dataset]
second_sent = [row['sentence2'] for row in test_dataset]
# Each entry is a two-element list [sentence1, sentence2], coerced to str.
full_text = [[str(first), str(second)] for first, second in zip(first_sent, second_sent)]

Downloading readme:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/470k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/108k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [22]:
model.eval()

def predict_similarity(sentence_pair):
    """Return the cosine similarity between the embeddings of two sentences.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        sentence_pair: Sequence of two strings; index 0 and index 1 of the
            model output are compared.

    Returns:
        float: Cosine similarity of the two sentence embeddings.
    """
    test_input = tokenizer(
        sentence_pair,
        padding='max_length',
        max_length=128,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    # Training fed the model only 'input_ids' and 'attention_mask', so drop the
    # segment ids when the tokenizer produced them.
    if 'token_type_ids' in test_input:
        del test_input['token_type_ids']

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(test_input)

    sim = torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()

    return sim

In [23]:
# Qualitative check: sentence pair at index 100 of `full_text`, score rounded to 2 decimals.
example_1 = full_text[100]
print(f"Sentence 1: {example_1[0]}")
print(f"Sentence 2: {example_1[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example_1), 2)}")

Sentence 1: A man is playing a guitar.
Sentence 2: Someoen is playing guitar.
Predicted similarity score: 0.68


In [24]:
# Qualitative check: sentence pair at index 222 of `full_text`, score rounded to 2 decimals.
example2 = full_text[222]
print(f"Sentence 1: {example2[0]}")
print(f"Sentence 2: {example2[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example2), 2)}")

Sentence 1: A man is playing the guitar.
Sentence 2: A man plays an acoustic guitar.
Predicted similarity score: 0.85


In [25]:
# Qualitative check: sentence pair at index 1083 of `full_text`, score rounded to 2 decimals.
example_3 = full_text[1083]
print(f"Sentence 1: {example_3[0]}")
print(f"Sentence 2: {example_3[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example_3), 2)}")

Sentence 1: A white animal running through snow covered woods.
Sentence 2: A white dog runs through a snow covered wood.
Predicted similarity score: 0.77
