# Install necessary packages and import required libraries

In [1]:
!pip install torch transformers pandas scikit-learn scipy seaborn matplotlib datasets sentencepiece accelerate
import requests
import torch
import accelerate
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from scipy.stats import spearmanr
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset



  from .autonotebook import tqdm as notebook_tqdm


## Set device

In [2]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

Using device: cuda


## Download dataset

In [None]:
# Base URL of the data directory; later cells append file names to it.
url = 'https://jlu.myweb.cs.uwindsor.ca/8380/'

# Fail loudly on HTTP errors (and never hang forever) so that an error page
# is not silently parsed as if it were training text.
response = requests.get(url + 'vldb_train.txt', timeout=60)
response.raise_for_status()
lines = response.text.split('\n')

# One lower-cased, whitespace-trimmed document per non-blank line.
documents = [line.lower().strip() for line in lines if line.strip()]

# 1: Generate Embeddings using Llama

In [9]:
def generate_embeddings(texts, model, tokenizer, pooling_method="cls", batch_size=4):
    """Embed texts by pooling the model's last hidden states, batch by batch.

    Args:
        texts: Iterable of strings to embed.
        model: Callable exposing ``.device`` whose output has a
            ``last_hidden_state`` tensor of shape (batch, seq_len, hidden).
        tokenizer: Callable returning ``input_ids`` and ``attention_mask``
            tensors when invoked with ``return_tensors="pt"``.
        pooling_method: One of
            * ``"cls"``  - hidden state at sequence position 0. For a causal
              decoder this position cannot attend to later tokens, so prefer
              ``"mean"`` or ``"last"`` with such models.
            * ``"mean"`` - average over the positions the attention mask keeps.
            * ``"last"`` - hidden state of the last position the attention
              mask keeps (works for left- and right-padded batches).
        batch_size: Number of texts per forward pass (must be >= 1).

    Returns:
        ``np.ndarray`` of dtype float32 and shape (len(texts), hidden). For
        empty input the result has zero rows.

    Raises:
        ValueError: If ``pooling_method`` is unknown or ``batch_size`` < 1.
    """
    # Validate up front so a typo is reported before any model forward pass.
    if pooling_method not in ("cls", "mean", "last"):
        raise ValueError("Invalid pooling method. Use 'cls', 'mean' or 'last'.")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    texts = list(texts)
    if not texts:
        # np.vstack([]) raises; return a well-formed empty matrix instead.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings_list = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        encoded = tokenizer(
            batch, padding=True, truncation=True, max_length=512, return_tensors="pt"
        )

        input_ids = encoded["input_ids"].to(model.device)
        attention_mask = encoded["attention_mask"].to(model.device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)

        # Pool in float32: half-precision sums over long sequences (and the
        # norms/dot products callers compute afterwards) can overflow.
        hidden = outputs.last_hidden_state.float()
        mask = attention_mask.to(hidden.device)

        if pooling_method == "cls":
            batch_embeddings = hidden[:, 0, :]
        elif pooling_method == "mean":
            keep = mask.unsqueeze(-1).bool()
            # masked_fill (not multiplication) so a NaN at a padded position
            # cannot leak into the sum; clamp avoids 0/0 for fully masked rows.
            summed = hidden.masked_fill(~keep, 0.0).sum(dim=1)
            counts = mask.sum(dim=1, keepdim=True).clamp(min=1)
            batch_embeddings = summed / counts
        else:  # "last"
            positions = torch.arange(mask.shape[1], device=mask.device)
            # Largest position index whose mask entry is 1 (0 if none is).
            last_index = (mask * positions).argmax(dim=1)
            rows = torch.arange(hidden.shape[0], device=hidden.device)
            batch_embeddings = hidden[rows, last_index]

        embeddings_list.append(batch_embeddings.cpu().numpy())

    return np.vstack(embeddings_list)


In [11]:
# Load the LLaMA model and tokenizer
pretrained_weights = 'NousResearch/llama-2-7b-chat-hf'
tokenizer = AutoTokenizer.from_pretrained(pretrained_weights)
# generate_embeddings tokenizes with padding=True, which raises when the
# tokenizer defines no padding token. Fall back to EOS in that case; padded
# positions are excluded through the attention mask.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Load model in FP16 precision
model = AutoModel.from_pretrained(
    pretrained_weights,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="balanced"
)
# Switch to inference mode (disables dropout); the returned module is the cell output.
model.eval()

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.75s/it]


LlamaModel(
  (embed_tokens): Embedding(32000, 4096, padding_idx=0)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
        (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
        (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
    )
  )
  (norm): LlamaRMSNorm((4096,), eps=1e-05)
  (rotary_emb): Ll

In [12]:
print("Generating embeddings...")
# LLaMA is a causal decoder: the hidden state at position 0 cannot attend to
# any later token, so "cls" pooling carries no information about the document.
# Mean pooling over the attended tokens does.
embeddings = generate_embeddings(documents, model, tokenizer, pooling_method="mean", batch_size=2)
print(f"Embeddings shape: {embeddings.shape}")

Generating embeddings...


# 2: Evaluation in classification task

In [None]:
# Download both classes, failing loudly on HTTP errors so an error page is
# never treated as documents.
pos_response = requests.get(url + 'vldb.txt', timeout=60)
pos_response.raise_for_status()
neg_response = requests.get(url + 'icse.txt', timeout=60)
neg_response.raise_for_status()

# Drop blank lines (e.g. the empty string after a trailing newline) so that
# every sample is a real document.
pos = [line for line in pos_response.text.split('\n') if line.strip()]
neg = [line for line in neg_response.text.split('\n') if line.strip()]

k = 1000  # Work with 1000 samples initially for each class
pos_docs, neg_docs = pos[:k], neg[:k]
documents_class = pos_docs + neg_docs
# Build labels from the actual sample counts (a file may hold fewer than k
# lines) and give the positive class label 1, which is what scikit-learn's
# precision/recall/F1 treat as the positive class by default.
labels = [1] * len(pos_docs) + [0] * len(neg_docs)

In [None]:
print("Generating embeddings for classification task...")
# Use mean pooling: with a causal decoder the position-0 ("cls") hidden state
# cannot attend to the rest of the text, so it cannot separate the classes.
class_embeddings = generate_embeddings(documents_class, model, tokenizer, pooling_method="mean", batch_size=2)

In [None]:
# Hold out a quarter of the samples for testing (scikit-learn's default
# proportion, spelled out here), with a fixed seed for repeatability.
train_features, test_features, train_labels, test_labels = train_test_split(
    class_embeddings,
    labels,
    test_size=0.25,
    random_state=42,
)

# Fit a logistic-regression classifier on the training embeddings.
clf = LogisticRegression(solver='lbfgs', max_iter=500)
clf.fit(train_features, train_labels)

In [None]:
# Score the held-out predictions with the standard binary metrics.
y_pred = clf.predict(test_features)
accuracy = accuracy_score(test_labels, y_pred)
precision, recall, f1 = (
    metric(test_labels, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(test_labels, y_pred)

print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(cm)

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

# 3: Evaluation in STS task

In [None]:
# Each row of the file is "<word1> <word2> <similarity>"; collect the rows as
# ((word1, word2), similarity) tuples.
sts_data = []
response = requests.get(url + "ws/WS353-Sim.txt", timeout=60)
# Fail loudly on HTTP errors instead of trying to parse an error page.
response.raise_for_status()
lines = response.text.rstrip().split('\n')
for line in lines:
    fields = line.lower().split()
    if not fields:
        continue  # a blank line would otherwise crash the 3-way unpack below
    x, y, sim = fields
    sts_data.append(((x, y), float(sim)))

In [None]:
# `actual` holds the model's cosine similarities, `expected` the human ratings.
actual, expected = [], []

# Embed every distinct word once instead of re-running the model for each
# pair. Mean pooling is used because the position-0 ("cls") state of a causal
# decoder cannot attend to the rest of the input.
words = sorted({word for pair, _rating in sts_data for word in pair})
word_vectors = (
    generate_embeddings(words, model, tokenizer, pooling_method="mean", batch_size=8)
    if words else []
)
# Promote to float64 so norms and dot products cannot overflow in half precision.
word_to_vec = {
    word: np.asarray(vector, dtype=np.float64)
    for word, vector in zip(words, word_vectors)
}

for (x, y), sim in sts_data:
    x_emb, y_emb = word_to_vec[x], word_to_vec[y]
    denom = np.linalg.norm(x_emb) * np.linalg.norm(y_emb)
    # Treat a zero-norm vector as "no similarity" rather than dividing by zero.
    cosine_sim = float(np.dot(x_emb, y_emb) / denom) if denom else 0.0
    actual.append(cosine_sim)
    expected.append(sim)

In [None]:
# Rank correlation between the model's similarities and the human ratings;
# the p-value returned alongside it is discarded.
rank_result = spearmanr(actual, expected)
spearman_corr, _ = rank_result
print(f"Spearman's Correlation: {spearman_corr:.4f}")

In [None]:
# Scatter the model's similarity against the human rating for every pair.
fig, ax = plt.subplots()
ax.scatter(expected, actual)
ax.set_xlabel('Expected Similarity')
ax.set_ylabel('Actual Similarity')
ax.set_title('STS Evaluation: Actual vs. Expected Similarity')
plt.show()

## STS Benchmark (STSb) dataset from Hugging Face

In [None]:
# Fetch only the "test" split of the STS Benchmark from the Hugging Face Hub.
stsb_dataset = load_dataset(
    path="sentence-transformers/stsb",
    split="test",
)

In [None]:
# `actual` holds the model's cosine similarities, `expected` the human scores.
actual, expected = [], []

# The "sentence-transformers/stsb" dataset card lists the gold similarity as a
# 'score' column already scaled to [0, 1]; GLUE-style copies of STS-B instead
# use 'label' on a 0-5 scale. Accept either so the cell does not KeyError.
score_column = "score" if "score" in stsb_dataset.column_names else "label"
score_scale = 1.0 if score_column == "score" else 5.0

sentences1 = list(stsb_dataset["sentence1"])
sentences2 = list(stsb_dataset["sentence2"])
human_scores = [value / score_scale for value in stsb_dataset[score_column]]  # Normalize to [0, 1]

# Embed each side in batches rather than one forward pass per sentence. Mean
# pooling is used because the position-0 ("cls") state of a causal decoder
# cannot attend to the rest of the sentence.
sent1_embs = generate_embeddings(sentences1, model, tokenizer, pooling_method="mean", batch_size=8)
sent2_embs = generate_embeddings(sentences2, model, tokenizer, pooling_method="mean", batch_size=8)

for sent1_emb, sent2_emb, human_score in zip(sent1_embs, sent2_embs, human_scores):
    # Promote to float64 so norms and dot products cannot overflow in half precision.
    sent1_emb = np.asarray(sent1_emb, dtype=np.float64)
    sent2_emb = np.asarray(sent2_emb, dtype=np.float64)

    # Calculate cosine similarity (0.0 when either vector has zero norm)
    denom = np.linalg.norm(sent1_emb) * np.linalg.norm(sent2_emb)
    cosine_sim = float(np.dot(sent1_emb, sent2_emb) / denom) if denom else 0.0

    actual.append(cosine_sim)
    expected.append(human_score)

In [None]:
# Rank correlation between the model's similarities and the human scores;
# the p-value returned alongside it is discarded.
rank_result = spearmanr(actual, expected)
spearman_corr, _ = rank_result
print(f"Spearman's Correlation (STS Benchmark): {spearman_corr:.4f}")

In [None]:
# Scatter the model's similarity against the normalized human score;
# semi-transparent markers keep dense regions readable.
fig, ax = plt.subplots()
ax.scatter(expected, actual, alpha=0.5)
ax.set_xlabel('Expected Similarity (Normalized)')
ax.set_ylabel('Actual Similarity')
ax.set_title('STS Benchmark Evaluation: Actual vs. Expected Similarity')
plt.show()
