In [16]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import pandas as pd
from tqdm import tqdm
import json


In [17]:
# Notebook-wide configuration
MODEL_NAME = "gpt2"
# MODEL_NAME = "facebook/opt-350m"
BATCH_SIZE = 1
EPOCHS = 10
PROMPT_TOKEN = "[GENERATE]"
MAX_LEN = 1024

# Soft-prompt vocabulary: extend this list to register additional prompt tokens.
soft_prompt_vocab = ["[GENERATE]"]

# Token -> row index used to look the token up in the soft-prompt embedding table.
soft_prompt_word2idx = dict(zip(soft_prompt_vocab, range(len(soft_prompt_vocab))))

# PROMPT_TOKEN is split on whitespace; each piece must be a key of the vocabulary above.
# The index tensor is built once and the prompt length is read back from it.
prompt_id = torch.tensor([soft_prompt_word2idx[token] for token in PROMPT_TOKEN.split()])
num_prompts = len(prompt_id)


In [18]:
# Model Architecture
class GPT2WithSoftPrompt(torch.nn.Module):
    """GPT-2 language model with learnable soft-prompt embeddings prepended to the input.

    The module owns two sub-modules:

    * ``gpt2`` - a pretrained ``GPT2LMHeadModel`` loaded from ``model_name``.
    * ``soft_prompt`` - an ``Embedding(num_prompts, embedding_size)`` table whose rows
      are concatenated in front of the token embeddings.

    Args:
        model_name: Identifier passed to ``GPT2LMHeadModel.from_pretrained``.
        num_prompts: Number of rows in the soft-prompt embedding table.
        embedding_size: Width of each soft-prompt row. It has to equal the width of
            the rows produced by ``gpt2.transformer.wte`` because the two are
            concatenated along dim 0 in ``forward``.
    """

    def __init__(self, model_name, num_prompts, embedding_size=768):
        super().__init__()
        self.gpt2 = GPT2LMHeadModel.from_pretrained(model_name)
        self.soft_prompt = torch.nn.Embedding(num_prompts, embedding_size)

    def forward(self, input_ids, prompt_ids):
        """Run GPT-2 on ``[soft prompt rows ; token embeddings]``.

        Args:
            input_ids: Token ids, either 1-D ``(seq,)`` or 2-D with a single leading
                batch entry ``(1, seq)``.
            prompt_ids: 1-D tensor of row indices into ``soft_prompt``.

        Returns:
            Whatever ``self.gpt2(inputs_embeds=...)`` returns. The embeddings are
            passed unbatched, with shape ``(len(prompt_ids) + seq, embedding_size)``.
        """
        prompt_embeddings = self.soft_prompt(prompt_ids)
        base_embeddings = self.gpt2.transformer.wte(input_ids)
        # Drop the leading axis only when it really is a batch axis. An unconditional
        # squeeze(0) would also collapse an unbatched single-token input
        # (1, hidden) -> (hidden,), which then cannot be concatenated with the prompt.
        if base_embeddings.dim() == 3:
            base_embeddings = base_embeddings.squeeze(0)
        embeddings = torch.cat([prompt_embeddings, base_embeddings], dim=0)
        outputs = self.gpt2(inputs_embeds=embeddings)
        return outputs


In [19]:
# Data Loading and Preprocessing
def load_and_preprocess_data(file_path, num_prompts):
    """Tokenize, pad and split a JSON dataset of input/output pairs.

    The file must contain a JSON array of objects. Each object needs an ``"input"``
    entry (passed to the tokenizer as-is) and an ``"output"`` entry (serialized with
    ``json.dumps`` before tokenization).

    Inputs are truncated to 300 tokens and right-padded with the tokenizer's EOS id
    to ``MAX_LEN - num_prompts`` tokens. Outputs are truncated to
    ``MAX_LEN - num_prompts`` tokens and right-padded with EOS to ``MAX_LEN`` tokens.

    Args:
        file_path: Path of the JSON dataset.
        num_prompts: Number of soft-prompt positions to reserve in front of each input.

    Returns:
        A 6-tuple ``(train_inputs, train_outputs, val_inputs, val_outputs,
        test_inputs, test_outputs)`` of lists of token-id lists. The split follows
        file order without shuffling: the first 70% is train, the next 20% is
        validation and the remainder is test.
    """
    # The context manager closes the handle even if parsing fails. JSON text is
    # UTF-8, so decode it explicitly instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
    pad_id = tokenizer.eos_token_id

    # Loop-invariant lengths: inputs leave room for the soft prompt, outputs do not.
    max_length_article = MAX_LEN - num_prompts
    max_length_summary = MAX_LEN
    max_length_input_tokens = 300

    tokenized_inputs = []
    tokenized_outputs = []

    for item in data:
        output_tokens = tokenizer.encode(json.dumps(item["output"]), truncation=True, max_length=max_length_article)
        input_tokens = tokenizer.encode(item["input"], truncation=True, max_length=max_length_input_tokens)

        # Multiplying a list by a non-positive count yields [], so sequences that are
        # already long enough are left unpadded.
        tokenized_inputs.append(input_tokens + [pad_id] * (max_length_article - len(input_tokens)))
        tokenized_outputs.append(output_tokens + [pad_id] * (max_length_summary - len(output_tokens)))

    train_limit = int(len(tokenized_inputs) * 0.7)
    val_limit = int(len(tokenized_inputs) * 0.9)

    return (
        tokenized_inputs[:train_limit],
        tokenized_outputs[:train_limit],
        tokenized_inputs[train_limit:val_limit],
        tokenized_outputs[train_limit:val_limit],
        tokenized_inputs[val_limit:],
        tokenized_outputs[val_limit:],
    )


In [20]:
# Tokenizer shared by the training loop and the inference cells
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)

# Tokenize dataset.json and unpack the train / validation / test splits
(
    tokenized_inputs_train,
    tokenized_outputs_train,
    tokenized_inputs_validation,
    tokenized_outputs_validation,
    tokenized_inputs_test,
    tokenized_outputs_test,
) = load_and_preprocess_data("dataset.json", num_prompts)

# Prefer the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the soft-prompt model and move it to the selected device
model = GPT2WithSoftPrompt(MODEL_NAME, num_prompts)
model = model.to(device)


In [21]:
# Sanity check: inputs are padded to MAX_LEN - num_prompts tokens, leaving room for the soft prompt.
len(tokenized_inputs_train[0])

1023

In [22]:
# Sanity check: targets are padded to MAX_LEN tokens.
len(tokenized_outputs_train[0])

1024

In [23]:
# Training hyperparameters.
# BATCH_SIZE and EPOCHS are re-assigned here and replace the values set in the constants cell.
GRADIENT_CLIP_NORM = 1.0          # max gradient norm handed to clip_grad_norm_
GRADIENT_ACCUMULATION_STEPS = 1   # samples whose gradients are combined per optimizer step
EARLY_STOPPING_PATIENCE = 2       # epochs without validation improvement before stopping
EPOCHS = 1
BATCH_SIZE = 1

# Keep the soft-prompt index tensor on the same device as the model.
prompt_id = prompt_id.to(device)


def _sequence_loss(logits, labels):
    """Cross-entropy between per-position logits and labels, skipping EOS-padded labels."""
    ignore_index = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else -100
    return CrossEntropyLoss(ignore_index=ignore_index)(logits, labels)


def _token_overlap_percentage(logits, labels):
    """Percentage of distinct predicted token ids that also occur anywhere in ``labels``.

    This is a set overlap and ignores positions. It is the value the training log
    prints under the label "% Exact Match".
    """
    predicted_ids = set(torch.argmax(logits, dim=1).tolist())
    label_ids = set(labels.tolist())
    return (len(predicted_ids & label_ids) / len(predicted_ids)) * 100


def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when nothing was accumulated."""
    return total / count if count else float("nan")


def _evaluate_split(model, inputs, targets, desc):
    """Score one data split without gradients; returns ``(average loss, average overlap %)``."""
    model.eval()
    total_loss = 0.0
    total_overlap = 0.0
    evaluated = 0

    with torch.no_grad():
        for sample, target in tqdm(zip(inputs, targets), total=len(inputs), desc=desc, unit="batch"):
            input_ids = torch.tensor(sample).to(device)
            labels = torch.tensor(target).to(device)
            outputs = model(input_ids, prompt_id)

            total_loss += _sequence_loss(outputs.logits, labels).item()
            total_overlap += _token_overlap_percentage(outputs.logits, labels)
            evaluated += 1

    return _mean_or_nan(total_loss, evaluated), _mean_or_nan(total_overlap, evaluated)


def fine_tune_on_summarization(model, train_inputs, train_outputs, val_inputs, val_outputs, test_inputs, test_outputs):
    """Tune the soft-prompt embedding of ``model`` and report validation and test metrics.

    Only ``model.soft_prompt`` is handed to the Adam optimizer, so no other weight of
    ``model`` is updated here. Samples are processed one sequence at a time and
    gradients are accumulated over ``GRADIENT_ACCUMULATION_STEPS`` samples per
    optimizer step. After each epoch the validation split is scored. Training stops
    once the validation loss has not improved for ``EARLY_STOPPING_PATIENCE``
    consecutive epochs. The test split is scored once at the end.

    Reads the module-level names ``tokenizer``, ``device``, ``prompt_id``, ``EPOCHS``,
    ``GRADIENT_ACCUMULATION_STEPS``, ``GRADIENT_CLIP_NORM`` and
    ``EARLY_STOPPING_PATIENCE``.

    Args:
        model: Module exposing ``soft_prompt`` and callable as
            ``model(input_ids, prompt_id)``, returning an object with ``.logits``.
        train_inputs, val_inputs, test_inputs: Sequences of token-id lists.
        train_outputs, val_outputs, test_outputs: Sequences of label-id lists. Each
            label list needs one entry per row of the logits produced for the
            matching input.

    Returns:
        The same ``model`` instance, left in evaluation mode.

    NOTE(review): logits and labels are compared position by position, without the
    usual next-token shift. Confirm that this alignment is intended.
    """
    soft_prompt_params = list(model.soft_prompt.parameters())
    optimizer = torch.optim.Adam(soft_prompt_params)

    best_val_loss = float('inf')
    no_improvement_epochs = 0

    for epoch in range(EPOCHS):
        model.train()

        # Clear gradients on the whole model, not just the optimized parameters.
        # backward() also fills .grad on the GPT-2 weights, which the optimizer does
        # not own and therefore never resets.
        model.zero_grad()

        with tqdm(enumerate(zip(train_inputs, train_outputs)), total=len(train_inputs), desc=f"Epoch {epoch + 1}/{EPOCHS}", unit="batch") as progress:
            train_percentage_matched = 0
            train_percentage_matched_ct = 0

            for idx, (sample, target) in progress:
                input_ids = torch.tensor(sample).to(device)
                labels = torch.tensor(target).to(device)
                outputs = model(input_ids, prompt_id)

                # Backpropagate every sample right away. The gradients add up in .grad
                # exactly as if the losses had been summed first, and each graph can
                # be freed as soon as it has been used.
                loss = _sequence_loss(outputs.logits, labels)
                (loss / GRADIENT_ACCUMULATION_STEPS).backward()

                train_percentage_matched += _token_overlap_percentage(outputs.logits, labels)
                train_percentage_matched_ct += 1

                # Step every GRADIENT_ACCUMULATION_STEPS samples or at the end of the dataset
                if (idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0 or idx == len(train_inputs) - 1:
                    # Clip only what is being optimized, so gradients of the untouched
                    # GPT-2 weights cannot shrink the soft-prompt update.
                    torch.nn.utils.clip_grad_norm_(soft_prompt_params, GRADIENT_CLIP_NORM)
                    optimizer.step()
                    model.zero_grad()

            print("Train : % Exact Match: ", _mean_or_nan(train_percentage_matched, train_percentage_matched_ct))

        # Validation
        avg_val_loss, avg_val_match = _evaluate_split(model, val_inputs, val_outputs, "Validation")
        print("Val : % Exact Match: ", avg_val_match)
        print("Val Loss : ", avg_val_loss)

        # Without validation samples there is nothing to base early stopping on.
        if len(val_inputs) == 0:
            continue

        # Early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            no_improvement_epochs = 0
        else:
            no_improvement_epochs += 1
            if no_improvement_epochs >= EARLY_STOPPING_PATIENCE:
                print(f"Early stopping after {EARLY_STOPPING_PATIENCE} epochs without improvement.")
                break

    # Testing
    avg_test_loss, avg_test_match = _evaluate_split(model, test_inputs, test_outputs, "Test")
    print("Test : % Exact Match: ", avg_test_match)
    print("Test Loss : ", avg_test_loss)

    return model


In [24]:
# Run soft-prompt tuning on the prepared splits; the function hands back the model it was given.
fine_tuned_model = fine_tune_on_summarization(
    model,
    train_inputs=tokenized_inputs_train,
    train_outputs=tokenized_outputs_train,
    val_inputs=tokenized_inputs_validation,
    val_outputs=tokenized_outputs_validation,
    test_inputs=tokenized_inputs_test,
    test_outputs=tokenized_outputs_test,
)


Epoch 1/1:   1%|▏         | 3/219 [00:15<18:58,  5.27s/batch]


KeyboardInterrupt: 

# Saving Model

In [None]:
# Persist the wrapper's full state_dict (GPT-2 weights plus the soft-prompt embedding)
torch.save(
    fine_tuned_model.state_dict(),
    'fine_tuned_model.pth',
)


# Loading Model

In [None]:
# Initialize a new instance of the model
model = GPT2WithSoftPrompt(MODEL_NAME, num_prompts).to(device)

# Load the saved model state_dict.
# map_location lets a checkpoint written on a GPU be restored on a CPU-only machine.
# weights_only=True restricts unpickling to tensors and plain containers, which is
# all a state_dict holds.
model.load_state_dict(
    torch.load('fine_tuned_model.pth', map_location=device, weights_only=True)
)

# Make sure the model is in evaluation mode after loading
model.eval()

  model.load_state_dict(torch.load('fine_tuned_model.pth'))


GPT2WithSoftPrompt(
  (gpt2): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 768)
      (wpe): Embedding(1024, 768)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-11): 12 x GPT2Block(
          (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2SdpaAttention(
            (c_attn): Conv1D(nf=2304, nx=768)
            (c_proj): Conv1D(nf=768, nx=768)
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D(nf=3072, nx=768)
            (c_proj): Conv1D(nf=768, nx=3072)
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=768, out

# Inference

In [None]:
# Set the model to evaluation mode
model.eval()

# Instruction text to run through the model
input_text = "Transform into JSON including 'multi_cloud_controller', 'orchestration_policies', 'migration_strategies', and 'cost_optimizations': 'CloudHarmonizer managed 2 orchestration policies, used live migration and backup-restore strategies, optimizing costs by 15%.'"

# Tokenize and encode the input text. The model prepends num_prompts soft-prompt rows,
# so truncate to MAX_LEN - num_prompts to keep the combined sequence within MAX_LEN positions.
input_ids = tokenizer.encode(input_text, truncation=True, max_length=MAX_LEN - num_prompts)

# Convert the input_ids to a PyTorch tensor
input_ids = torch.tensor(input_ids)

# Single forward pass: this scores every position once, it does not decode autoregressively
with torch.no_grad():
    # Assuming single prompt
    outputs = model(input_ids.to(device), prompt_ids=prompt_id.to(device))
    pred_logits = outputs.logits
    print(pred_logits.shape)


# Get the token IDs with the highest probability for each position
predicted_token_ids = torch.argmax(pred_logits, dim=-1)

# Convert token IDs into text using the tokenizer
predicted_tokens = tokenizer.decode(predicted_token_ids.squeeze(0), skip_special_tokens=True)


torch.Size([64, 50257])


In [None]:
# Display the text decoded from the per-position argmax token ids.
predicted_tokens

"\n. the. the ,-tscloud_ 'multi 'stra_controllerredicties_ 'multi__mategyies_ ' 'm_strized', 'msugeon___.stration_ ' ' to_, ',optimore,, used the, '% to\n"

In [None]:
# Set the model to evaluation mode
model.eval()

# Instruction text to run through the model
input_text = "Convert the following sentence into a JSON object with clear key-value pairs: 'I bought 2 flowers and a flower pot.'"

# Tokenize and encode the input text. The model prepends num_prompts soft-prompt rows,
# so truncate to MAX_LEN - num_prompts to keep the combined sequence within MAX_LEN positions.
input_ids = tokenizer.encode(input_text, truncation=True, max_length=MAX_LEN - num_prompts)

# Convert the input_ids to a PyTorch tensor
input_ids = torch.tensor(input_ids)

# Single forward pass: this scores every position once, it does not decode autoregressively
with torch.no_grad():
    # Assuming single prompt
    outputs = model(input_ids.to(device), prompt_ids=prompt_id.to(device))
    pred_logits = outputs.logits
    print(pred_logits.shape)


# Get the token IDs with the highest probability for each position
predicted_token_ids = torch.argmax(pred_logits, dim=-1)

# Convert token IDs into text using the tokenizer
predicted_tokens = tokenizer.decode(predicted_token_ids.squeeze(0), skip_special_tokens=True)


torch.Size([27, 50257])


In [None]:
# Display the text decoded from the per-position argmax token ids.
predicted_tokens

"\n.. same day: the list string: the and:to::\n{'m a.: then new::\n"