In [1]:
# ! pip install sentence-transformers

# Exp 1: Given a sentence, predict the time it takes to answer it

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import random
import math
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from scipy.stats import pearsonr

import pickle

  from tqdm.autonotebook import tqdm, trange


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


Using device: cpu


In [3]:
# # This is where the LLM will be saved
cache_dir = "C:\\LLMs"


## Create a dummy dataset. Replace this with the real one when you can!

In [4]:

# Dummy questions with made-up response times (in seconds)
data = [
    {"question": "Is the Earth round?", "response_time": 1.5},
    {"question": "Can humans breathe underwater without equipment?", "response_time": 2.0},
    {"question": "Is Python a programming language?", "response_time": 1.2},
    {"question": "Does the sun rise in the west?", "response_time": 1.8},
    {"question": "Are there 24 hours in a day?", "response_time": 1.3},
    {"question": "Is water wet?", "response_time": 1.1},
    {"question": "Do birds fly?", "response_time": 1.6},
    {"question": "Can a cat bark?", "response_time": 2.2},
    {"question": "Is ice hot?", "response_time": 1.9},
    {"question": "Is 2+2 equal to 4?", "response_time": 1.0}
]

with open('query_to_time_dataset.pkl', 'rb') as file:
    data = pickle.load(file)


# Convert to DataFrame
df = pd.DataFrame(data)

# Split the dataset into train and test sets (80/20 split)
train_df = df.sample(frac=0.8, random_state=42)
remaining_df = df.drop(train_df.index)
validation_df = remaining_df.sample(frac=0.5, random_state=42)
test_df = remaining_df.drop(validation_df.index)

# Display the datasets
print("Train Dataset:")
print(train_df)
print("\nTest Dataset:")
print(test_df)

Train Dataset:
                                               question  response_time
291   Are you interested in the following article? W...         49.516
2328  Do you enjoy reading articles about film, tele...          5.011
1462  Are you interested in the following article? W...         18.913
2736  Do you enjoy reading articles about sustainabl...          3.441
2282  Do you enjoy reading articles about the latest...          9.328
...                                                 ...            ...
814   Are you interested in the following article? W...         18.548
1852  Do you enjoy reading articles about philosophy...          2.467
2220  Do you enjoy reading articles about the latest...          1.751
2907  What types of books, podcasts, or documentarie...        212.342
2105  Do you enjoy reading articles about different ...          5.656

[2410 rows x 2 columns]

Test Dataset:
                                               question  response_time
4     Are you interest

## Sentence Transformer + MLP

In [5]:

# Load a pre-trained sentence transformer model
# sentence_model = SentenceTransformer('all-MiniLM-L6-v2', cache_folder = cache_dir)
sentence_model = SentenceTransformer('all-mpnet-base-v2', cache_folder = cache_dir)



In [6]:

class ResponseTimePredictor(nn.Module):
    def __init__(self, embedding_dim):
        super(ResponseTimePredictor, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Predict a single scalar value (response time)
        )

    def forward(self, x):
        return self.mlp(x)


## Train

In [7]:


# Initialize the SentenceTransformer model and the MLP model
embedding_dim = sentence_model.get_sentence_embedding_dimension()
model = ResponseTimePredictor(embedding_dim).to(device)  # Move model to device

# Encode the questions to get embeddings and targets on the correct device
def encode_questions(df):
    embeddings = sentence_model.encode(df['question'].tolist(), convert_to_tensor=True).to(device)
    targets = torch.tensor(df['response_time'].values, dtype=torch.float32).unsqueeze(1).to(device)
    return TensorDataset(embeddings, targets)

# Prepare DataLoader for train and test sets
train_dataset = encode_questions(train_df)
validation_dataset = encode_questions(validation_df)
test_dataset = encode_questions(test_df)
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

# Set up optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

# Training loop with correct device handling
def train(model, train_loader, optimizer, criterion, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for embeddings, targets in train_loader:
            # Ensure data is on the same device as the model
            embeddings, targets = embeddings.to(device), targets.to(device)

            optimizer.zero_grad()
            predictions = model(embeddings)
            loss = criterion(predictions, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for val_embeddings, val_targets in validation_loader:
                val_embeddings, val_targets = val_embeddings.to(device), val_targets.to(device)
                val_predictions = model(val_embeddings)
                val_loss = criterion(val_predictions, val_targets)
                total_val_loss += val_loss.item()

        avg_train_loss = total_loss / len(train_loader)
        avg_val_loss = total_val_loss / len(validation_loader)
        print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

        model.train()


In [8]:

# Train the model
train(model, train_loader, optimizer, criterion, 20)

Epoch 1, Training Loss: 492.3197, Validation Loss: 560.7288
Epoch 2, Training Loss: 369.3729, Validation Loss: 384.3930
Epoch 3, Training Loss: 298.4171, Validation Loss: 337.5547
Epoch 4, Training Loss: 276.1495, Validation Loss: 311.6008
Epoch 5, Training Loss: 261.3173, Validation Loss: 293.5452
Epoch 6, Training Loss: 250.9667, Validation Loss: 280.7455
Epoch 7, Training Loss: 243.2833, Validation Loss: 271.3485
Epoch 8, Training Loss: 237.5955, Validation Loss: 264.6963
Epoch 9, Training Loss: 233.2345, Validation Loss: 259.8240
Epoch 10, Training Loss: 229.9222, Validation Loss: 255.9068
Epoch 11, Training Loss: 228.2685, Validation Loss: 253.4319
Epoch 12, Training Loss: 225.8554, Validation Loss: 252.4332
Epoch 13, Training Loss: 223.7859, Validation Loss: 250.9240
Epoch 14, Training Loss: 222.2896, Validation Loss: 250.4499
Epoch 15, Training Loss: 221.1558, Validation Loss: 250.1460
Epoch 16, Training Loss: 220.0651, Validation Loss: 249.9001
Epoch 17, Training Loss: 218.8795

## Test

In [None]:
def evaluate(model, test_loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for embeddings, targets in test_loader:
            predictions = model(embeddings)
            loss = criterion(predictions, targets)
            total_loss += loss.item()
    print(f"Test MSE: {total_loss / len(test_loader)}")

# Evaluate the model on the test set
evaluate(model, test_loader, criterion)

Test MSE: 242.3903601922487
304


In [None]:
### COMPARISON BASED EVALUATION ###

model.eval()
pred = []
target = []
with torch.no_grad():
    for embeddings, targets in test_loader:
        predictions = model(embeddings)
        pred.extend(predictions.flatten().tolist())
        target.extend(targets.flatten().tolist())

good = 0
for i in range(len(pred)):
    for j in range(i+1, len(pred)):
        if (pred[i] - pred[j])*(target[i] - target[j]) >= 0:
            good += 1

n = len(pred)
print(good/(n*(n-1)/2))

0.7120930232558139


In [29]:
def predict_question(question):
    model.eval()
    embeddings = sentence_model.encode(question, convert_to_tensor=True).to(device)
    prediction = model(embeddings)
    return prediction.item()

print(predict_question("Please write down all the topics for websites that you would be interested in."))
print(predict_question("List what types of websites that you would enjoy reading."))
print(predict_question("Do you prefer articles on science or articles on art?"))
print(predict_question("Are websites about travel interesting to you?"))
print(predict_question("Do you like websites about travel?"))
print(predict_question("Do you prefer websites about travel?"))
print(predict_question("Are you interested in websites about travel?"))

33.10298156738281
34.23445129394531
20.700101852416992
12.748628616333008
8.246345520019531
11.962993621826172
14.487029075622559


In [32]:
torch.save(model.state_dict(), "model_state_dict.pth")

In [14]:
del model # clear Vram

NameError: name 'model' is not defined

# Exp 2: What does model perplexity tell you about human effort?

In [11]:
# Load Falcon-7B model and tokenizer from Hugging Face
model_name = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
model = AutoModelForCausalLM.from_pretrained(model_name, cache_dir=cache_dir).to(device)


Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.31s/it]


In [12]:

# Function to compute perplexity for a given sentence
# This code runs slow :( 3 min per query
def compute_perplexity(sentence):
    # Tokenize input and move input IDs to the same device as the model
    inputs = tokenizer(sentence, return_tensors="pt").to(device)
    input_ids = inputs["input_ids"]

    with torch.no_grad():
        # Model forward pass with inputs on the correct device
        outputs = model(input_ids, labels=input_ids)
        loss = outputs.loss.item()  # Cross-entropy loss

    perplexity = math.exp(loss)  # Perplexity = exp(loss)
    return perplexity


In [13]:
perplexity = compute_perplexity("Please write down all the topics for websites that you would be interested in")
print(perplexity)

32.277402750475154


In [41]:
perplexities = []
for question in tqdm(df['question'], desc="Computing Perplexity"):
    print(question)
    perplexity = compute_perplexity(question)
    print(perplexity)
    perplexities.append(perplexity)

# Add perplexity column to DataFrame
df['perplexity'] = perplexities

Computing Perplexity:   0%|          | 0/3013 [00:00<?, ?it/s]

Are you interested in the following article? Website: msn.com
Title: Race Rewind: Phoenix heats up and Denny delivers
Description: Relive Denny Hamlin's incredible day at ISM Raceway in this week's Monster Energy NASCAR Cup Series Race Rewind.


Computing Perplexity:   0%|          | 1/3013 [00:02<1:51:16,  2.22s/it]

13.390038461510132
Are you interested in the following article? Website: msn.com
Title: 15 Things Dog Shelters Need You to Know
Description: When you're looking for your new best friend, staffers and volunteers at dog shelters can help.


Computing Perplexity:   0%|          | 2/3013 [00:04<1:47:08,  2.13s/it]

13.505579852256371
Are you interested in the following article? Website: msn.com
Title: 2021 VW Golf GTI Caught At The 'Ring Perfecting Hot Hatch Recipe
Description: One of the very few eighth-gen Golfs coming to the U.S.


Computing Perplexity:   0%|          | 2/3013 [00:06<2:35:16,  3.09s/it]


KeyboardInterrupt: 

In [17]:
df

Unnamed: 0,question,response_time,perplexity
0,Is the Earth round?,1.5,44.170919
1,Can humans breathe underwater without equipment?,2.0,36.28654
2,Is Python a programming language?,1.2,21.043721
3,Does the sun rise in the west?,1.8,7.074077
4,Are there 24 hours in a day?,1.3,10.138601
5,Is water wet?,1.1,77.514559
6,Do birds fly?,1.6,167.622608
7,Can a cat bark?,2.2,122.176084
8,Is ice hot?,1.9,620.395673
9,Is 2+2 equal to 4?,1.0,9.60842


In [None]:
# Extract perplexity and response time values
perplexities = df['perplexity'].values
response_times = df['response_time'].values

# Compute Pearson correlation
corr, _ = pearsonr(perplexities, response_times)

print(f"Pearson Correlation between Perplexity and Response Time: {corr:.4f}")

# Ignore under this

# Train

In [15]:
# Prepare DataLoader
def prepare_data(df):
    perplexities = torch.tensor(df['perplexity'].values, dtype=torch.float32).unsqueeze(1).to(device)
    response_times = torch.tensor(df['response_time'].values, dtype=torch.float32).unsqueeze(1).to(device)
    return TensorDataset(perplexities, response_times)

# Create DataLoader
train_loader = DataLoader(prepare_data(df), batch_size=4, shuffle=True)

# Define the MLP model
class PerplexityResponseTimePredictor(nn.Module):
    def __init__(self):
        super(PerplexityResponseTimePredictor, self).__init__()
        self.mlp = nn.Sequential(
            nn.Linear(1, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)  # Predict response time
        )

    def forward(self, x):
        return self.mlp(x)

# Initialize model, optimizer, and loss function
model = PerplexityResponseTimePredictor().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.MSELoss()

# Training loop with proper device handling
def train(model, loader, optimizer, criterion, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for perplexities, targets in loader:
            # Ensure the data is on the same device as the model
            perplexities, targets = perplexities.to(device), targets.to(device)

            optimizer.zero_grad()
            predictions = model(perplexities)
            loss = criterion(predictions, targets)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(loader)}")

In [16]:
# Train the model
train(model, train_loader, optimizer, criterion)

Epoch 1, Loss: 862.4992268880209
Epoch 2, Loss: 845.4175694783529
Epoch 3, Loss: 752.4874006112417
Epoch 4, Loss: 747.1314239501953
Epoch 5, Loss: 1224.9714260101318
Epoch 6, Loss: 667.5557505289713
Epoch 7, Loss: 588.7677764892578
Epoch 8, Loss: 554.1025924682617
Epoch 9, Loss: 518.0783847967783
Epoch 10, Loss: 496.5027732849121


## Eval

Pearson Correlation between Perplexity and Response Time: 0.3782


# Boiler plate starter code

In [None]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Set a random seed
random_seed = 42
random.seed(random_seed)

# Set a random seed for PyTorch (for GPU as well)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)


In [None]:
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')


In [None]:
# Input text
text = "GeeksforGeeks is a computer science portal"

# Tokenize and encode text using batch_encode_plus
# The function returns a dictionary containing the token IDs and attention masks
encoding = tokenizer.batch_encode_plus(
    [text],                  # List of input texts
    padding=True,             # Pad to the maximum sequence length
    truncation=True,          # Truncate to the maximum sequence length if necessary
    return_tensors='pt',      # Return PyTorch tensors
    add_special_tokens=True   # Add special tokens CLS and SEP
)

input_ids = encoding['input_ids']  # Token IDs
# Print input IDs
print(f"Input ID: {input_ids}")

attention_mask = encoding['attention_mask']  # Attention mask
# Print attention mask
print(f"Attention mask: {attention_mask}")


In [None]:
# Generate embeddings using BERT model
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    word_embeddings = outputs.last_hidden_state  # This contains the embeddings

# Output the shape of word embeddings
print(f"Shape of Word Embeddings: {word_embeddings.shape}")


In [None]:
# Assuming the tokenizer is already defined and used
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])

# Print word embeddings for each token
for token, embedding in zip(tokens, word_embeddings[0]):
    print(f"Token: {token}")
    # print(f"Embedding: {embedding}\n")


In [None]:
# Compute the average of word embeddings to get the sentence embedding
sentence_embedding = word_embeddings.mean(dim=1)  # Average pooling along the sequence length dimension

# # Print the sentence embedding
# print("Sentence Embedding:")
# print(sentence_embedding)

# Output the shape of the sentence embedding
print(f"Shape of Sentence Embedding: {sentence_embedding.shape}")


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Example sentence for similarity comparison
example_sentence = "GeeksforGeeks is a technology website"

# Tokenize and encode the example sentence
example_encoding = tokenizer.batch_encode_plus(
    [example_sentence],
    padding=True,
    truncation=True,
    return_tensors='pt',
    add_special_tokens=True
)
example_input_ids = example_encoding['input_ids']
example_attention_mask = example_encoding['attention_mask']

# Generate embeddings for the example sentence
with torch.no_grad():
    example_outputs = model(example_input_ids, attention_mask=example_attention_mask)
    example_sentence_embedding = example_outputs.last_hidden_state.mean(dim=1)

# Compute cosine similarity between the original sentence embedding and the example sentence embedding
similarity_score = cosine_similarity(
    sentence_embedding.numpy(), example_sentence_embedding.numpy()
)

# Print the similarity score
print("Cosine Similarity Score:", similarity_score[0][0])
