<a href="https://colab.research.google.com/github/chandhinipj/Sentence_Similarity_LLM/blob/main/SemanticTextualSimilarity_stsb_bert_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Install and import the required packages

In [None]:
!pip install transformers sentence-transformers datasets

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m31.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading t

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from transformers import  AutoTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import datetime
import random
import numpy as np
import pandas as pd

### 2. Use Google Colab's GPU for training

In [None]:
# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    device = torch.device("cpu")
    print('No GPU available, using the CPU instead.')
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


### **3.** Load and preview the Semantic Textual Similarity Benchmark (STSB) dataset

In [None]:
# Load the English version of the STSB dataset
dataset = load_dataset("stsb_multi_mt", "en")

Downloading builder script:   0%|          | 0.00/7.43k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/19.0k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.98k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/74.0k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/52.9k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [None]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})


### **4.** Define the dataset loader class


In [None]:
# Instantiate the BERT tokenizer
# You can use larger variants of the model, here we're using the base model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/stsb-bert-base')

Downloading (…)okenizer_config.json:   0%|          | 0.00/377 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
class STSBDataset(torch.utils.data.Dataset):
    """Map-style dataset of sentence pairs with similarity labels divided by 5.0.

    Each item is ``(encoding, label)`` where ``encoding`` is the tokenizer
    output for the two sentences of a pair (tokenized together as a batch of
    two) and ``label`` is a scalar tensor holding ``similarity_score / 5.0``.

    Args:
        dataset: Iterable of mappings with the keys ``'sentence1'``,
            ``'sentence2'`` and ``'similarity_score'``. It is consumed in a
            single pass, so one-shot iterables work too.
        tokenizer: Optional callable used to encode a sentence pair. When
            omitted, the notebook-level ``tokenizer`` variable is looked up at
            call time, as in the original implementation.
        max_length: Length every sentence is padded/truncated to.
    """

    def __init__(self, dataset, tokenizer=None, max_length=128):
        self._tokenizer = tokenizer
        self.max_length = max_length

        self.first_sentences = []
        self.second_sentences = []
        self.normalized_similarity_scores = []

        # Walk the rows once instead of once per column: avoids decoding
        # every row three times and does not break on one-shot iterables.
        for row in dataset:
            self.first_sentences.append(row['sentence1'])
            self.second_sentences.append(row['sentence2'])
            # Normalize the similarity score (a raw 5.0 becomes 1.0).
            self.normalized_similarity_scores.append(row['similarity_score'] / 5.0)

        # str() guards against non-string cells before tokenization.
        self.concatenated_sentences = [[str(x), str(y)] for x, y in zip(self.first_sentences, self.second_sentences)]

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.concatenated_sentences)

    def get_batch_labels(self, idx):
        """Return the normalized similarity score of pair ``idx`` as a tensor."""
        return torch.tensor(self.normalized_similarity_scores[idx])

    def get_batch_texts(self, idx):
        """Tokenize the two sentences of pair ``idx`` as a batch of two."""
        # Fall back to the module-level tokenizer when none was injected.
        active_tokenizer = self._tokenizer if self._tokenizer is not None else tokenizer
        return active_tokenizer(self.concatenated_sentences[idx], padding='max_length', max_length=self.max_length, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for pair ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y


def collate_fn(texts):
    """Split a batched encoding into one feature dict per example.

    Args:
        texts: Mapping with ``'input_ids'`` and ``'attention_mask'`` entries
            that are iterated in parallel along their first axis.

    Returns:
        A list of ``{'input_ids': ..., 'attention_mask': ...}`` dicts, one per
        position; its length is that of the shorter of the two entries.
    """
    features = []
    paired_rows = zip(texts['input_ids'], texts['attention_mask'])
    for ids_row, mask_row in paired_rows:
        features.append({'input_ids': ids_row, 'attention_mask': mask_row})
    return features

### 5. Define the model class based on BERT

In [None]:
class BertForSTS(torch.nn.Module):
    """Sentence encoder: a transformer followed by a pooling layer.

    The two stages are kept as the attributes ``bert`` and ``pooling_layer``
    and are also chained inside ``sts_bert``, the ``SentenceTransformer``
    pipeline that ``forward`` runs.
    """

    def __init__(self):
        super().__init__()
        encoder = models.Transformer('sentence-transformers/stsb-bert-base', max_seq_length=128)
        # The pooling layer is sized from the encoder's embedding dimension.
        pooler = models.Pooling(encoder.get_word_embedding_dimension())
        self.bert = encoder
        self.pooling_layer = pooler
        self.sts_bert = SentenceTransformer(modules=[encoder, pooler])

    def forward(self, input_data):
        """Return the ``'sentence_embedding'`` entry of the pipeline output."""
        return self.sts_bert(input_data)['sentence_embedding']

In [None]:
# Instantiate the model and move it to GPU
model = BertForSTS()
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)

### 6. Define the Cosine Similarity loss function

In [None]:
class CosineSimilarityLoss(torch.nn.Module):
    """Loss between the cosine similarity of embedding pairs and target scores.

    Args:
        loss_fn: Module comparing predicted similarities with the labels
            (mean squared error by default).
        transform_fn: Module applied to the raw cosine similarities before
            the loss is computed (identity by default).
    """

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super().__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        """Compute the loss for one batch.

        Args:
            inputs: Sequence with one entry per example; entry ``[0]`` and
                entry ``[1]`` are the embeddings of the two sentences.
            labels: Target similarities, one per example (any shape that
                flattens to the batch size).

        Returns:
            The value produced by ``loss_fn``.
        """
        emb_1 = torch.stack([pair[0] for pair in inputs])
        emb_2 = torch.stack([pair[1] for pair in inputs])
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2))
        # reshape(-1) instead of squeeze(): squeeze() turns a batch of one
        # into a 0-d tensor, which mismatches the (1,)-shaped predictions and
        # makes the loss broadcast (PyTorch emits a size-mismatch warning).
        return self.loss_fn(outputs, labels.reshape(-1))

### 7. Prepare the training and validation data split

In [None]:
# Wrap the dataset's own 'train' and 'dev' splits; 'dev' serves as the
# validation set.
train_ds = STSBDataset(dataset['train'])
val_ds = STSBDataset(dataset['dev'])

# No random split is made here: the sizes below are those of the two
# predefined splits.
train_size = len(train_ds)
val_size = len(val_ds)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

5,749 training samples
1,500 validation samples


In [None]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 8

# Training loader: shuffling enabled, 4 worker processes.
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)

# Validation loader: same batch size and workers, shuffling left at its default.
validation_dataloader = DataLoader(val_ds, batch_size=batch_size, num_workers=4)



### 8. Define the Optimizer and Scheduler

In [None]:
# AdamW over every model parameter, with a learning rate of 1e-6.
optimizer = AdamW(model.parameters(), lr=1e-6)

In [None]:
epochs = 8

# The scheduler is stepped once per training batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
total_steps = epochs * len(train_dataloader)

# Linear learning-rate schedule with zero warm-up steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

### 9. Define a helper function for formatting the elapsed training time as `hh:mm:ss`

In [None]:
def format_time(elapsed):
    """Render a duration in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float); it is rounded to the
            nearest whole second first.

    Returns:
        ``str(datetime.timedelta(...))`` of the rounded duration,
        e.g. ``'0:05:42'``.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

### 10. Define the training function, and start the training loop

In [None]:
def _batch_loss(batch, labels, criterion):
    """Embed every sentence pair of ``batch`` and score it with ``criterion``."""
    batch['input_ids'] = batch['input_ids'].to(device)
    batch['attention_mask'] = batch['attention_mask'].to(device)

    # One feature dict per example; each is passed through the model on its own.
    features = collate_fn(batch)
    output = [model(feature) for feature in features]

    return criterion(output, labels.to(device))


def train():
    """Fine-tune the notebook-level ``model`` and validate after every epoch.

    Relies on the notebook-level ``model``, ``optimizer``, ``scheduler``,
    ``train_dataloader``, ``validation_dataloader``, ``epochs`` and ``device``.

    Returns:
        ``(model, training_stats)`` where ``training_stats`` is a list with
        one dict per epoch holding the keys ``'epoch'``, ``'Training Loss'``,
        ``'Valid. Loss'``, ``'Training Time'`` and ``'Validation Time'``.
    """
    seed_val = 42

    criterion = CosineSimilarityLoss()
    criterion = criterion.to(device)

    # Seed every RNG the notebook imports so runs are repeatable.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    # Per-epoch training loss, validation loss and timings.
    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):
            model.zero_grad()

            loss = _batch_loss(train_data, train_label, criterion)
            total_train_loss += loss.item()

            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        # Average loss over all training batches of this epoch.
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.5f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        t0 = time.time()
        total_eval_loss = 0

        model.eval()

        for val_data, val_label in tqdm(validation_dataloader):
            # Nothing is back-propagated here, so the forward pass and the
            # loss are both computed without building an autograd graph.
            with torch.no_grad():
                loss = _batch_loss(val_data, val_label, criterion)
            total_eval_loss += loss.item()

        # Average loss over all validation batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.5f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    return model, training_stats

In [None]:
# Launch the training
model, training_stats = train()


Training...


100%|██████████| 384/384 [05:42<00:00,  1.12it/s]



  Average training loss: 0.00320
  Training epoch took: 0:05:42

Running Validation...


100%|██████████| 100/100 [00:26<00:00,  3.70it/s]


  Validation Loss: 0.02524
  Validation took: 0:00:27

Training...


100%|██████████| 384/384 [05:14<00:00,  1.22it/s]



  Average training loss: 0.00307
  Training epoch took: 0:05:15

Running Validation...


100%|██████████| 100/100 [00:27<00:00,  3.64it/s]


  Validation Loss: 0.02513
  Validation took: 0:00:28

Training...


100%|██████████| 384/384 [05:12<00:00,  1.23it/s]



  Average training loss: 0.00296
  Training epoch took: 0:05:13

Running Validation...


100%|██████████| 100/100 [00:27<00:00,  3.70it/s]


  Validation Loss: 0.02514
  Validation took: 0:00:27

Training...


100%|██████████| 384/384 [05:11<00:00,  1.23it/s]



  Average training loss: 0.00286
  Training epoch took: 0:05:11

Running Validation...


100%|██████████| 100/100 [00:27<00:00,  3.69it/s]


  Validation Loss: 0.02524
  Validation took: 0:00:27

Training...


100%|██████████| 384/384 [05:09<00:00,  1.24it/s]



  Average training loss: 0.00287
  Training epoch took: 0:05:10

Running Validation...


100%|██████████| 100/100 [00:26<00:00,  3.77it/s]


  Validation Loss: 0.02522
  Validation took: 0:00:27

Training...


100%|██████████| 384/384 [05:10<00:00,  1.24it/s]



  Average training loss: 0.00283
  Training epoch took: 0:05:10

Running Validation...


100%|██████████| 100/100 [00:27<00:00,  3.70it/s]


  Validation Loss: 0.02526
  Validation took: 0:00:27

Training...


100%|██████████| 384/384 [05:08<00:00,  1.24it/s]



  Average training loss: 0.00271
  Training epoch took: 0:05:09

Running Validation...


100%|██████████| 100/100 [00:27<00:00,  3.70it/s]


  Validation Loss: 0.02528
  Validation took: 0:00:27

Training...


100%|██████████| 384/384 [05:09<00:00,  1.24it/s]



  Average training loss: 0.00280
  Training epoch took: 0:05:10

Running Validation...


100%|██████████| 100/100 [00:26<00:00,  3.78it/s]

  Validation Loss: 0.02527
  Validation took: 0:00:26

Training complete!
Total training took 0:45:36 (h:mm:ss)





In [None]:
# Tabulate the per-epoch statistics, one row per epoch, indexed by 'epoch'.
df_stats = pd.DataFrame(data=training_stats).set_index('epoch')

# Last expression of the cell, so the table is rendered.
df_stats

Unnamed: 0_level_0,Training Loss,Valid. Loss,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.003197,0.02524,0:05:42,0:00:27
2,0.00307,0.025127,0:05:15,0:00:28
3,0.002964,0.025139,0:05:13,0:00:27
4,0.002857,0.025243,0:05:11,0:00:27
5,0.002865,0.025223,0:05:10,0:00:27
6,0.002834,0.025256,0:05:10,0:00:27
7,0.002713,0.025276,0:05:09,0:00:27
8,0.002796,0.025268,0:05:10,0:00:26


In [None]:
test_dataset = load_dataset("stsb_multi_mt", name="en", split="test")


In [None]:
df_test_dataset=pd.DataFrame(test_dataset)

In [None]:
df_test_dataset.head(5)

Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


In [None]:
df_test_dataset['actual_score']=round(df_test_dataset['similarity_score']/5.0, 2)

In [None]:
df_test_dataset.head(5)

Unnamed: 0,sentence1,sentence2,similarity_score,actual_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5,0.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6,0.72
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0,1.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2,0.84
4,A man is playing a harp.,A man is playing a keyboard.,1.5,0.3


In [None]:
model.eval()

BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  )
)

In [None]:
def predict_similarity(a, b):
    """Return the cosine similarity of two sentences, rounded to 2 decimals.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        a: First sentence.
        b: Second sentence.

    Returns:
        ``round(similarity, 2)`` where ``similarity`` is the cosine similarity
        between the model outputs for ``a`` and ``b``.
    """
    # The two sentences are tokenized together as a batch of two.
    test_input = tokenizer((a, b), padding='max_length', max_length=128, truncation=True, return_tensors="pt").to(device)

    # Drop token_type_ids when the tokenizer produced them, so the model gets
    # only input_ids / attention_mask (the same keys collate_fn passes on).
    if 'token_type_ids' in test_input:
        del test_input['token_type_ids']

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output = model(test_input)
    sim = torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()

    return round(sim, 2)

In [None]:
# Score every test pair, in row order.
cl = [
    predict_similarity(first, second)
    for first, second in zip(df_test_dataset['sentence1'], df_test_dataset['sentence2'])
]

In [None]:
# Round each prediction to two decimals, one entry per test row.
score = [round(cl[row], 2) for row in range(len(df_test_dataset))]

In [None]:
df_test_dataset['predicted_score']= score

In [None]:
df_test_dataset.head(5)

Unnamed: 0,sentence1,sentence2,similarity_score,actual_score,predicted_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5,0.5,0.76
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6,0.72,0.86
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0,1.0,0.91
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2,0.84,0.96
4,A man is playing a harp.,A man is playing a keyboard.,1.5,0.3,0.24


In [None]:
df_test_dataset.to_csv('final_test_stsdataset_withScores_bert-base.csv')

# Active and Passive, Direct and Indirect


In [None]:
!pip install --q --upgrade gdown

In [None]:
!gdown 1O4XgBg1pn2RBV1hPvF0F_0Dv19at5mqm
!gdown 1ORTNm-X1bCKv4k3PkkSNVwatXimpwi-E
!gdown 1EU40KBcVvVOcAK5OOxtQCx8jVXfQtDhe

Downloading...
From: https://drive.google.com/uc?id=1O4XgBg1pn2RBV1hPvF0F_0Dv19at5mqm
To: /content/ActivePassiveFinal.csv
100% 17.4k/17.4k [00:00<00:00, 35.5MB/s]
Downloading...
From: https://drive.google.com/uc?id=1ORTNm-X1bCKv4k3PkkSNVwatXimpwi-E
To: /content/ActivePassiveShuffled.csv
100% 23.8k/23.8k [00:00<00:00, 45.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1EU40KBcVvVOcAK5OOxtQCx8jVXfQtDhe
To: /content/TwoWordChange.csv
100% 3.04k/3.04k [00:00<00:00, 10.4MB/s]


In [None]:
import pandas as pd


df_activepassive  = pd.read_csv('ActivePassiveFinal.csv')
df_activepassive.head(5)

Unnamed: 0,sentence1,sentence2,actual_score
0,The chef cooked the meal.,The meal was cooked by the chef.,0.94
1,She waters the plants every day.,The plants are watered every day by her.,0.93
2,The team won the championship.,The championship was won by the team.,0.87
3,They built a new house.,A new house was built by them.,0.95
4,The teacher explains the lesson.,The lesson is explained by the teacher.,0.93


In [None]:
df_activepassiveshuff  = pd.read_csv('ActivePassiveShuffled.csv')
df_activepassiveshuff.head(5)

Unnamed: 0,sentence1,sentence2,actual_score
0,The chef cooked the meal.,The meal was cooked by the chef.,1
1,The chef cooked the meal.,The plants are watered every day by her.,0
2,She waters the plants every day.,The plants are watered every day by her.,1
3,She waters the plants every day.,The championship was won by the team.,0
4,The team won the championship.,The championship was won by the team.,1


In [None]:
df_twoword  = pd.read_csv('TwoWordChange.csv')
df_twoword.head(5)

Unnamed: 0,sentence1,sentence2,actual_score
0,They cleaned the entire house.,They cleaned the entire home.,1
1,They cleaned the entire house.,They destroyed the entire ship.,0
2,He purchased a new car.,He bought a new car.,1
3,He purchased a new car.,He sold a new bike.,0
4,She traveled to Paris.,She journeyed to Paris.,1


In [None]:
df=df_activepassive

In [None]:
# Score every sentence pair of `df`, in row order.
cl = [
    predict_similarity(first, second)
    for first, second in zip(df['sentence1'], df['sentence2'])
]

In [None]:
# Round each prediction to two decimals, one entry per row of `df`.
score = [round(cl[row], 2) for row in range(len(df))]

In [None]:
df_activepassive['predicted_score']= score

In [None]:
df_activepassive.head(5)

Unnamed: 0,sentence1,sentence2,actual_score,predicted_score
0,The chef cooked the meal.,The meal was cooked by the chef.,0.94,0.95
1,She waters the plants every day.,The plants are watered every day by her.,0.93,0.82
2,The team won the championship.,The championship was won by the team.,0.87,0.95
3,They built a new house.,A new house was built by them.,0.95,0.93
4,The teacher explains the lesson.,The lesson is explained by the teacher.,0.93,0.97


In [None]:
# File name carries the model actually used in this notebook (stsb-bert-base),
# matching the suffix of the STSB test export above.
df_activepassive.to_csv('final_activepassive_withScores_bert-base.csv')

In [None]:
df=df_twoword
df.head(5)

Unnamed: 0,sentence1,sentence2,actual_score
0,They cleaned the entire house.,They cleaned the entire home.,1
1,They cleaned the entire house.,They destroyed the entire ship.,0
2,He purchased a new car.,He bought a new car.,1
3,He purchased a new car.,He sold a new bike.,0
4,She traveled to Paris.,She journeyed to Paris.,1


In [None]:
# Score every sentence pair of `df`, in row order.
cl = [
    predict_similarity(first, second)
    for first, second in zip(df['sentence1'], df['sentence2'])
]

In [None]:
# Round each prediction to two decimals, one entry per row of `df`.
score = [round(cl[row], 2) for row in range(len(df))]

In [None]:
df_twoword['predicted_score']= score
df_twoword.head(5)

Unnamed: 0,sentence1,sentence2,actual_score,predicted_score
0,They cleaned the entire house.,They cleaned the entire home.,1,0.97
1,They cleaned the entire house.,They destroyed the entire ship.,0,0.41
2,He purchased a new car.,He bought a new car.,1,0.99
3,He purchased a new car.,He sold a new bike.,0,0.58
4,She traveled to Paris.,She journeyed to Paris.,1,0.96


In [None]:
# File name carries the model actually used in this notebook (stsb-bert-base),
# matching the suffix of the STSB test export above.
df_twoword.to_csv('final_TwoWord_withScores_bert-base.csv')

In [None]:
# Point the shared `df` alias at the shuffled active/passive pairs.
df = df_activepassiveshuff

# Score every sentence pair, in row order.
cl = [
    predict_similarity(first, second)
    for first, second in zip(df['sentence1'], df['sentence2'])
]

# Preview the first rows of the frame that was just scored.
df.head(5)

Unnamed: 0,sentence1,sentence2,actual_score
0,The chef cooked the meal.,The meal was cooked by the chef.,1
1,The chef cooked the meal.,The plants are watered every day by her.,0
2,She waters the plants every day.,The plants are watered every day by her.,1
3,She waters the plants every day.,The championship was won by the team.,0
4,The team won the championship.,The championship was won by the team.,1


In [None]:
# Round each prediction to two decimals, one entry per row of `df`.
score = [round(cl[row], 2) for row in range(len(df))]

In [None]:
df_activepassiveshuff['predicted_score']= score
df_activepassiveshuff.head(5)


Unnamed: 0,sentence1,sentence2,actual_score,predicted_score
0,The chef cooked the meal.,The meal was cooked by the chef.,1,0.96
1,The chef cooked the meal.,The plants are watered every day by her.,0,0.11
2,She waters the plants every day.,The plants are watered every day by her.,1,0.82
3,She waters the plants every day.,The championship was won by the team.,0,0.03
4,The team won the championship.,The championship was won by the team.,1,0.96


In [None]:
# File name carries the model actually used in this notebook (stsb-bert-base),
# matching the suffix of the STSB test export above.
df_activepassiveshuff.to_csv('final_activepassiveshuff_withScores_bert-base.csv')

# spearmanr, pearsonr

In [None]:
pip install scipy



In [None]:
# !pip install scipy.stats
from scipy.stats import spearmanr, pearsonr


In [None]:

def correlation_score(a, b):
    """Compute the Spearman and Pearson correlations between two sequences.

    Args:
        a: First sequence of scores.
        b: Second sequence of scores, aligned with ``a``.

    Returns:
        ``(spearman_result, pearson_result)``: the objects returned by
        ``scipy.stats.spearmanr`` and ``scipy.stats.pearsonr``. Callers read
        the coefficient from index 0 of each.
    """
    return spearmanr(a, b), pearsonr(a, b)


# Correlation score for the test dataset

In [None]:
correlation_spear,correlation_pear  = correlation_score(df_test_dataset['actual_score'], df_test_dataset['predicted_score'])
print(f'Spearman_correlation_score: {correlation_spear[0]} \nPearson_correlation_score : {correlation_pear[0]}')

Spearman_correlation_score: 0.8499771577175912 
Pearson_correlation_score : 0.8411296634508838


# Correlation score for the ActivePassiveShuffled sentences

In [None]:
correlation_spear,correlation_pear  = correlation_score(df_activepassiveshuff['actual_score'], df_activepassiveshuff['predicted_score'])
print(f'Spearman_correlation_score: {correlation_spear[0]} \nPearson_correlation_score : {correlation_pear[0]}')

Spearman_correlation_score: 0.867494183343942 
Pearson_correlation_score : 0.987040909514471


# Correlation score for the TwoWord sentences

In [None]:
correlation_spear,correlation_pear  = correlation_score(df_twoword['actual_score'], df_twoword['predicted_score'])
print(f'Spearman_correlation_score: {correlation_spear[0]} \nPearson_correlation_score : {correlation_pear[0]}')

Spearman_correlation_score: 0.8551830263101141 
Pearson_correlation_score : 0.8903962709935814


# Testing Dill

In [None]:
!pip install dill



In [None]:
def predict_similarity(a, b):
    """Score two sentences with the model stored in ``bert-base_modelfile``.

    This variant is the one serialized with dill and called from ``app.py``,
    which imports neither torch nor transformers, so everything it needs is
    imported inside the function body.

    NOTE(review): the model is read from disk and the tokenizer is
    re-instantiated on every call; cache them if latency matters.

    Args:
        a: First sentence.
        b: Second sentence.

    Returns:
        Cosine similarity of the two sentence embeddings, rounded to two
        decimals.
    """
    import dill
    import torch
    from transformers import AutoTokenizer

    # Use the GPU when present instead of failing on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # SECURITY: unpickling executes arbitrary code; only load a model file
    # produced by this notebook. The context manager closes the handle.
    with open('bert-base_modelfile', 'rb') as model_file:
        model = dill.loads(model_file.read())

    # Keep the model and its inputs on the same device, and make sure the
    # model is in evaluation mode before predicting.
    model.to(device)
    model.eval()

    tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/stsb-bert-base')

    # The two sentences are tokenized together as a batch of two.
    test_input1 = tokenizer((a, b), padding='max_length', max_length=128,
                            truncation=True, return_tensors="pt").to(device)

    # Drop token_type_ids when the tokenizer produced them.
    if 'token_type_ids' in test_input1:
        del test_input1['token_type_ids']

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        output1 = model(test_input1)
    sim = torch.nn.functional.cosine_similarity(output1[0], output1[1], dim=0).item()

    return round(sim, 2)

In [None]:
import dill

# Serialize the fine-tuned model and the prediction function. The context
# managers flush and close each file as soon as its dump completes, so the
# files are complete on disk before anything reads them back.
with open('bert-base_modelfile', 'wb') as modelFile:
    dill.dump(model, modelFile)

with open('bert-base_PredictFile', 'wb') as predictFile:
    dill.dump(predict_similarity, predictFile)


In [None]:
# Read both artefacts back to check that they deserialize; the context
# managers close the file handles.
# SECURITY: unpickling executes arbitrary code; only load files written by
# this notebook.
with open('bert-base_modelfile', 'rb') as modelFile:
    modelLoad = dill.loads(modelFile.read())

with open('bert-base_PredictFile', 'rb') as predictFile:
    predictLoad = dill.loads(predictFile.read())

In [None]:
predictLoad('Two men are playing football.','Two men are practicing football.')

0.81

In [None]:
%%writefile app.py
import streamlit as st
import io
import dill

# Load the dill-serialized prediction function; the context manager closes
# the file handle.
# SECURITY: unpickling executes arbitrary code; only ship a file you created.
with open('bert-base_PredictFile', 'rb') as filePath:
    predict = dill.loads(filePath.read())


def main():
    """Render the page and score the two sentences when 'Process' is pressed."""
    st.set_page_config(page_title='Text Similarity Checker')
    st.header('Text Similarity Checker')

    sentence1 = st.text_input('Enter First Sentence')
    sentence2 = st.text_input('Enter second Sentence')
    sentence_Scoring = ''

    if st.button('Process'):
        # Empty or whitespace-only input counts as missing.
        has_inputs = bool(sentence1 and sentence1.strip() and sentence2 and sentence2.strip())
        if has_inputs:
            sentence_Scoring = predict(sentence1, sentence2)
            st.caption(f'Similarity score {sentence_Scoring}')
        else:
            st.caption('Please provide inputs')
    else:
        # Button not pressed yet: emit the (empty) placeholder caption.
        st.caption(sentence_Scoring)


if __name__ == '__main__':
    main()


Writing app.py


In [None]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.25.0-py2.py3-none-any.whl (8.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
Collecting pympler<2,>=0.9 (from streamlit)
  Downloading Pympler-1.0.1-py3-none-any.whl (164 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting tzlocal<5,>=1.1 (from streamlit)
  Downloading tzlocal-4.3.1-py3-none-any.whl (20 kB)
Collecting validators<1,>=0.2 (from streamlit)
  Downloading validators-0.21.2-py3-none-any.whl (25 kB)
Collecting gitpython!=3.1.19,<4,>=3.0.7 (from streamlit)
  Downloading GitPython-3.1.32-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydeck<1,>=0.8 (from streamlit)
  Downloading pydeck-0.8.0-py2.py3-none-any.whl (4.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
!npm install localtunnel


[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[34;40mnotice[0m[35m[0m created a lockfile as package-lock.json. You should commit this file.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
[K[?25h+ localtunnel@2.0.2
added 22 packages from 22 contributors and audited 22 packages in 3.411s

3 packages are looking for funding
  run `npm fund` for details

found [92m0[0m vulnerabilities

[K[?25h

In [None]:
!pip freeze > requirement.txt

In [None]:
!streamlit run app.py & npx localtunnel --port 8501

[##................] \ fetchMetadata: sill resolveWithNewModule ms@2.1.2 checki[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.204.63.114:8501[0m
[0m
[K[?25hnpx: installed 22 in 4.112s
your url is: https://spicy-dancers-travel.loca.lt
[34m  Stopping...[0m
^C
