In [22]:
# Import necessary libraries
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
# Check for CUDA and move model to device (GPU if available)
device = "cuda" if torch.cuda.is_available() else "cpu"

device

'cuda'

In [9]:
# Load pre-trained tokenizer and model for fine-tuning
model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Move the model to the selected device (GPU or CPU)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-mpnet-base-v2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Create a synthetic dataset with positive and negation pairs
# Load the dataset (first 8000 samples)
dataset = load_dataset("jinaai/negation-dataset", split="train")
positive_examples = dataset["entailment"][:4000]
negative_examples = dataset["negative"][:4000]

In [27]:
def tokenize_examples(examples, tokenizer):
    return tokenizer(list(examples), padding=True, truncation=True, return_tensors="pt")

positive_encodings = tokenize_examples(positive_examples, tokenizer)
negation_encodings = tokenize_examples(negative_examples, tokenizer)

# Move tensors to CUDA if available
positive_encodings = {k: v.to(device) for k, v in positive_encodings.items()}
negation_encodings = {k: v.to(device) for k, v in negation_encodings.items()}

# Create Dataset for training
train_dataset = Dataset.from_dict({
    'input_ids': positive_encodings['input_ids'].tolist() + negation_encodings['input_ids'].tolist(),
    'attention_mask': positive_encodings['attention_mask'].tolist() + negation_encodings['attention_mask'].tolist(),
    'labels': [1.0] * len(positive_examples) + [0.0] * len(negative_examples)  # 1 for positive, 0 for negation
})


In [None]:
# Define data collator
data_collator = DataCollatorWithPadding(tokenizer)

# Define Training Arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=500,
    evaluation_strategy="no",
)

# Define the Trainer
trainer = Trainer(
    model=model,                         # the model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    data_collator=data_collator,  # Use data collator for padding

)

# Train the model
trainer.train()



Step,Training Loss
1000,0.0754


TrainOutput(global_step=1500, training_loss=0.060680341084798174, metrics={'train_runtime': 67.3002, 'train_samples_per_second': 356.611, 'train_steps_per_second': 22.288, 'total_flos': 480995579376000.0, 'train_loss': 0.060680341084798174, 'epoch': 3.0})

In [31]:
# Fine-tuning completed, now testing cosine similarity between sentence pairs

# Test sentences for cosine similarity comparison
test_sentence_1 = "A man playing guitar on stage."
test_sentence_2 = "A man not playing guitar on stage."

# Tokenize the test sentences
test_encodings_1 = tokenizer(test_sentence_1, return_tensors='pt', padding=True, truncation=True).to(device)
test_encodings_2 = tokenizer(test_sentence_2, return_tensors='pt', padding=True, truncation=True).to(device)

# Obtain embeddings for test sentences
with torch.no_grad():
    model.eval()
    embedding_1 = model(**test_encodings_1).logits
    embedding_2 = model(**test_encodings_2).logits

# Compute cosine similarity
cosine_sim = cosine_similarity(embedding_1.cpu().numpy(), embedding_2.cpu().numpy())

print(f"Cosine Similarity between the two test sentences: {cosine_sim[0][0]}")






Cosine Similarity between the two test sentences: 1.0
