In [1]:
import torch
from tqdm import tqdm
import pandas as pd
import numpy as np
# from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.decomposition import PCA
# import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
# tqdm.pandas()

from transformers import pipeline, AutoTokenizer,GPT2Tokenizer, GPT2Model, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.model_selection import train_test_split

import os
import time

import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForMaskedLM, AdamW
from tqdm import tqdm
import re

2024-03-14 04:55:18.069992: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()  # Releases all unused cached memory from PyTorch
print(torch.cuda.memory_summary())

device

|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 0            |        cudaMalloc retries: 0         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------------------|
| Active memory         |      0 B   |      0 B   |      0 B   |      0 B   |
|       from large pool |      0 B   |      0 B   |      0 B   |      0 B   |
|       from small pool |      0 B   |      0 B   |      0 B   |      0 B   |
|---------------------------------------------------------------

device(type='cuda')

In [3]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForCausalLM.from_pretrained('bert-base-uncased', is_decoder=True).to(device)

In [4]:
# df = pd.read_csv("data/rewrite_data.csv")
df = pd.read_csv("data/8000_data.csv")
df = df.dropna()
df['word_count'] = df['prompt'].apply(lambda x: len(x.split()))

# Get the distribution of word counts
word_count_distribution = df['word_count'].value_counts(normalize=True)
word_count_distribution

random_word_counts = np.random.choice(word_count_distribution.index, p=word_count_distribution.values, size=1)

# Print the generated random numbers
df['original_text'] = df['original_text'].apply(lambda x: x[:100])
df['rewritten_text'] = df['rewritten_text'].apply(lambda x: x[:100])

In [5]:
import re
import torch
from torch.utils.data import Dataset

class MaskedSequenceDataset(Dataset):
    def __init__(self, dataframe, tokenizer, mask_token='[MASK]'):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.mask_token = mask_token

    def combine_and_mask(self, original, rewrite, prompt_length):
        masks = " ".join([self.mask_token for _ in range(prompt_length)])
        masked_sequence = f"{original} The task is to rewrite this narrative with the given blanks: {masks} {rewrite}"
        return masked_sequence

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        prompt = row['prompt']
        
        # Tokenize prompt to get target ids for masked tokens
        target_subsequence_tokens = self.tokenizer.tokenize(prompt)
        target_subsequence_ids = self.tokenizer.convert_tokens_to_ids(target_subsequence_tokens)

        # Create a masked sequence combining original, mask tokens, and rewritten text
        masked_sequence = self.combine_and_mask(row['original_text'], row['rewritten_text'], len(target_subsequence_tokens))
        # Tokenize the combined text
        inputs = self.tokenizer(masked_sequence, truncation=True, padding='max_length', max_length=512)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        
        # Create labels with the same length as input_ids initialized to -100
        labels = [-100] * len(input_ids)
        
        # Find the indices of the mask tokens and replace -100 with the target ids
        mask_indices = [i for i, token_id in enumerate(input_ids) if token_id == self.tokenizer.mask_token_id]

        if len(target_subsequence_ids) > len(mask_indices):
            raise ValueError(f"Prompt tokenization resulted in {len(target_subsequence_ids)} tokens, which exceeds the available space of {len(mask_indices)} MASK tokens.")
        
        if len(mask_indices) >= len(target_subsequence_ids):
            for mask_index, target_id in zip(mask_indices, target_subsequence_ids):
                labels[mask_index] = target_id
        else:
            raise ValueError("Not enough mask tokens to fit the rewrite prompt")

        return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(labels)

# Usage:
# Assume you have a DataFrame 'df' and a tokenizer 'tokenizer' already defined
# dataset = MaskedSequenceDataset(dataframe=df, tokenizer=tokenizer)


In [6]:
train_df, test_df = train_test_split(df, test_size=0.2)  # Split with 20% data as test set

# Convert DataFrame to DataLoader for train and test sets
train_dataset = MaskedSequenceDataset(train_df, tokenizer)
test_dataset = MaskedSequenceDataset(test_df, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)



In [7]:
model.train()

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 15
for epoch in range(epochs):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}", unit="batch"):
        input_ids, attention_mask, labels = batch  # 'masks' is renamed to 'attention_mask' for clarity
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)  # Ensure attention_mask is also moved to the device
        labels = labels.to(device)
        
        model.zero_grad()
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Average loss at epoch {epoch + 1}: {avg_loss:.4f}")

# Save the model after training
# model.save_pretrained('path_to_save_model')
# tokenizer.save_pretrained('path_to_save_model')


Epoch 1/15:   0%|          | 1/775 [00:00<11:17,  1.14batch/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 476.00 MiB. GPU 0 has a total capacty of 10.91 GiB of which 204.06 MiB is free. Process 51791 has 2.71 GiB memory in use. Process 35162 has 8.00 GiB memory in use. Of the allocated memory 7.12 GiB is allocated by PyTorch, and 137.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Evaluation

In [None]:
def batch_cosine_similarity(x1, x2):

    # Normalize the embeddings to unit vectors
    x1_norm = torch.nn.functional.normalize(x1, p=2, dim=-1)
    x2_norm = torch.nn.functional.normalize(x2, p=2, dim=-1)
    
    # Compute the cosine similarity
    cos_sim = torch.mm(x1_norm, x2_norm.transpose(0, 1))
    
    return cos_sim

from sentence_transformers import SentenceTransformer

scs_model = SentenceTransformer("sentence-t5-base")

def sharpened_cosine_similarity_batch(scs_model, output_texts, target_texts, sharpen_factor=3):
    # Assuming scs_model.encode() returns PyTorch tensors
    target_embeddings = scs_model.encode(target_texts, convert_to_tensor=True)
    output_embeddings = scs_model.encode(output_texts, convert_to_tensor=True)
    
    # Calculate batch cosine similarities using the previously defined batch_cosine_similarity function
    cos_sims = batch_cosine_similarity(target_embeddings, output_embeddings)
    
    # Apply the sharpening factor to each similarity score and create a list of tensors
    sharpened_scores = [cos_sims[i][i].unsqueeze(0) ** sharpen_factor for i in range(cos_sims.size(0))]
    
    return sharpened_scores

In [None]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


In [None]:
model.eval()  # Set the model to evaluation mode

# Initialize lists to store predictions and targets for evaluation
predicts = []
targets = []

for batch in tqdm(test_loader, desc="Evaluating", unit="batch"):
    input_ids, attention_mask, labels = batch
    
    # Move tensors to the appropriate device
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)
    
    # Make predictions
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        
        # Identify the non-ignored labels
        valid_label_mask = labels != -100
        labels_index = valid_label_mask.nonzero(as_tuple=True)
        
        # Extract the valid labels and predictions
        valid_labels = labels[valid_label_mask]
        valid_predictions = predictions[valid_label_mask]
        
        # Convert labels and predictions to tokens and then to strings
        label_tokens = tokenizer.convert_ids_to_tokens(valid_labels.tolist())
        predicted_tokens = tokenizer.convert_ids_to_tokens(valid_predictions.tolist())
        
        label_sentences = tokenizer.convert_tokens_to_string(label_tokens)
        predicted_sentences = tokenizer.convert_tokens_to_string(predicted_tokens)
        
        # Store the sentences for later evaluation
        predicts.append(predicted_sentences)
        targets.append(label_sentences)
    
#     print("Predicted:", predicted_sentences)
#     print("Labels:", label_sentences)
#     print()

In [None]:
score = sharpened_cosine_similarity_batch(scs_model, predicts, targets, sharpen_factor=3)
stacked_tensors = torch.stack(score)
average = stacked_tensors.float().mean()

print(f"Avergage Sharpened Cosine Similarity Score: {average}")
