# Importing Packages

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

In [2]:
# Load the pre-trained GPT-2 tokenizer (the model itself is loaded in a later cell)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [3]:
# Display the tokenizer's repr to inspect its vocabulary size and special tokens
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [4]:
# Encode a single word without special tokens; the bare expression displays the ids
token_ids = tokenizer.encode("hello", add_special_tokens=False)
token_ids

[31373]

In [5]:
# Round-trip check: decode the id produced for "hello" back into text
tokenizer.decode([31373])

'hello'

#Downloading GPT2 Model

In [6]:
import copy

In [7]:
# Load the GPT-2 checkpoint with its language-modelling head.
# Swap in the commented line below to use the larger variant instead.
# model = GPT2LMHeadModel.from_pretrained('gpt2-xl') # around 6.5gb
model = GPT2LMHeadModel.from_pretrained('gpt2') # around 500mb
# Bare expression: display the module tree
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
# Register dedicated pad/bos/eos special tokens; these replace the default
# '<|endoftext|>' bos/eos shown in the tokenizer repr earlier.
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})
# "<bot>:" is added through add_tokens (not as a special token); the value this
# cell displays is the number of tokens that call added.
tokenizer.add_tokens(["<bot>:"])

1

In [9]:
# The tokenizer grew by four entries, so resize the model's token embeddings to match
# (50257 -> 50261 rows in the displayed module tree).
model.resize_token_embeddings(len(tokenizer))
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50261, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50261, bias=False)
)

In [10]:
# Snapshot the resized, not-yet-fine-tuned model; later training cells deep-copy this
# snapshot so each run starts from the same weights.
model_copy = copy.deepcopy(model)

#Connecting to Google Drive

In [11]:
# Mount Google Drive (only works inside a Google Colab runtime, which provides
# the `google.colab` package).
from google.colab import drive as gdrive

# Mount point under which the Drive contents become visible
MOUNT_DIR = '/content/drive'
gdrive.mount(MOUNT_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import os

# Working directory on the mounted Drive; relative paths used in later cells
# (e.g. "chat_data.json") resolve against it.
BASE_DIR = "/content/drive/My Drive/AI/fine-tuned-gpt2"
os.chdir(BASE_DIR)
!ls

chat_data.json	generation_config.json	model.safetensors	 tokenizer_config.json
config.json	merges.txt		special_tokens_map.json  vocab.json


#Dataset Class

In [13]:
from torch.utils.data import Dataset
import json

class ChatData(Dataset):
    """
    Dataset of prompt/reply training strings built from a JSON file of dialogues.

    The JSON file must contain a list of conversations; each conversation is a
    mapping with a 'dialog' list whose items carry a 'text' field. Every pair of
    consecutive turns *within the same conversation* becomes one sample of the form
    "<startofstring> {turn} <bot>: {next turn} <endofstring>".
    """

    def __init__(self, path: str, tokenizer, max_sample=5000, max_length=40):
        """
        Load the dialogues, build the prompt/reply strings and tokenize them.

        Args:
            path (str): The file path to the JSON data.
            tokenizer: Callable tokenizer; it is invoked as
                ``tokenizer(texts, max_length=..., truncation=True,
                padding="max_length", return_tensors="pt")`` and must return a
                mapping with 'input_ids' and 'attention_mask'.
            max_sample (int | None): Keep at most this many samples
                (``None`` keeps all of them).
            max_length (int): Sequence length every sample is truncated/padded to.
        """
        # Load JSON data from the specified file path
        with open(path, "r", encoding="utf-8") as file:
            self.data = json.load(file)

        # Pair each turn with the turn that follows it, one conversation at a time.
        # Pairing over the flattened list would glue the last turn of one
        # conversation to the first turn of the next, and would leave the final
        # utterance as a raw, unformatted sample.
        samples = []
        for conversation in self.data:
            turns = [turn['text'] for turn in conversation['dialog']]
            samples.extend(
                f"<startofstring> {prompt} <bot>: {reply} <endofstring>"
                for prompt, reply in zip(turns, turns[1:])
            )

        # Cap the dataset size for manageability
        self.max_sample = max_sample
        self.max_length = max_length
        self.X = samples[:self.max_sample]

        # Tokenize the dialogues using the provided tokenizer
        self.X_encoded = tokenizer(self.X, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

        self.data_len = len(self.X)

    def __len__(self):
        """
        Return the number of samples in the dataset.

        Returns:
            int: Number of samples.
        """
        return self.data_len

    def __getitem__(self, idx):
        """
        Return a single sample of the dataset given an index.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            tuple: A tuple containing input_ids and attention_mask for the given index.
        """
        return self.input_ids[idx], self.attention_mask[idx]

    def print_original_text(self, idx):
        """
        Return the original (non-tokenized) training string for an index.

        Despite its name this method does not print; callers print the result.

        Args:
            idx (int): Index of the sample to retrieve.

        Returns:
            str: The formatted prompt/reply string.

        Raises:
            IndexError: If ``idx`` is negative or not smaller than the dataset length.
        """
        if 0 <= idx < self.data_len:
            return self.X[idx]
        else:
            raise IndexError("Index out of range.")


In [14]:
# Create an instance of the ChatData class.
# The path is relative, so it resolves against the working directory set earlier.
path_to_json  = "chat_data.json"
ds_chat = ChatData(path=path_to_json,
                        tokenizer=tokenizer,
                        max_sample=5000)

In [15]:
# Spot-check one sample: first the formatted text, then its
# (input_ids, attention_mask) tensors as returned by __getitem__.
idx = 28
print(ds_chat.print_original_text(idx))
print(ds_chat[idx])

<startofstring> /test <bot>: Text is not given. Please try to type /end and /test to reset the state and get text. <endofstring>
(tensor([50258,  1220,  9288,   220, 50260,  8255,   318,   407,  1813,    13,
         4222,  1949,   284,  2099,  1220,   437,   290,  1220,  9288,   284,
        13259,   262,  1181,   290,   651,  2420,    13,   220, 50259, 50257,
        50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))


#Preparing Dataloader for Batch Processing

In [16]:
from torch.utils.data import DataLoader

# Serve the chat dataset in shuffled mini-batches of (input_ids, attention_mask)
batch_size = 64
dl_chat = DataLoader(ds_chat,
                     batch_size=batch_size,
                     shuffle=True)


In [17]:
# Sanity check only: look at the tensor shapes of the first batch, then stop.
for batch in dl_chat:
    input_ids, attention_mask = batch
    print(input_ids.shape, attention_mask.shape)
    # No training happens here; the loop exits after one batch
    break


torch.Size([64, 40]) torch.Size([64, 40])


#Testing GPT2 Model

In [18]:
# Tokenize a single prompt into PyTorch tensors; the bare expression at the end
# displays the resulting input_ids and attention_mask.
text = "Hello, my name is GKV and yourself?"
encoded_input = tokenizer.encode_plus(
    text,
    return_tensors='pt',
    padding=True,
    truncation=True
)
encoded_input

{'input_ids': tensor([[15496,    11,   616,  1438,   318,   402,    42,    53,   290,  3511,
            30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [19]:
# Id of the padding token registered on the tokenizer. Reading the attribute
# avoids re-encoding the pad string and indexing into the resulting list.
pad_token_idx = tokenizer.pad_token_id
pad_token_idx

50257

In [20]:
# Generate text by sampling a continuation of the encoded prompt.
# `model` here is the resized but not yet fine-tuned GPT-2 from the earlier cells.
output_sequences = model.generate(
    input_ids=encoded_input['input_ids'],
    attention_mask=encoded_input['attention_mask'],
    max_length=50,  # Maximum length of the generated text
    num_return_sequences=1,  # Number of sequences to generate
    no_repeat_ngram_size=2,  # Prevent repeating the same n-gram
    top_k=50,  # Number of highest probability vocabulary tokens to keep for top-k-filtering
    top_p=0.95,  # Cumulative probability of parameter highest probability vocabulary tokens
    temperature=0.7,  # The value used to modulate the next token probabilities
    do_sample=True,  # Set to True for sampling; False for greedy decoding
    pad_token_id=pad_token_idx  # Setting pad token id
)

In [21]:
# Decode the generated text.
# skip_special_tokens drops the registered special tokens; "<bot>:" was added via
# add_tokens rather than as a special token, so it can still appear in the output.
generated_text = tokenizer.decode(output_sequences[0], skip_special_tokens=True)
print(generated_text)

Hello, my name is GKV and yourself? <bot>:  the – is a my name and me I who that you your what which it


#Train and Inference Functions

In [22]:
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

In [23]:
def infer(gptModel, tokenizer, inp, max_new_tokens=None):
    """
    Generate a response from the model given an input string.

    The prompt is wrapped as "<startofstring> {inp} <bot>: " before tokenization,
    mirroring the format of the training samples.

    Args:
        gptModel: Causal language model exposing ``eval()``, ``parameters()`` and
            ``generate()``.
        tokenizer: Tokenizer callable with ``return_tensors="pt"`` that also
            provides ``decode`` and ``pad_token_id``.
        inp (str): The input string for which the model needs to generate a response.
        max_new_tokens (int | None): Upper bound on generated tokens. ``None``
            leaves the length limit to the model's own generation defaults.

    Returns:
        str: The decoded sequence (prompt plus generated continuation).
    """
    gptModel.eval()
    # Prepare the input string by adding special tokens
    prompt = "<startofstring> " + inp + " <bot>: "

    # Tokenize the input string and convert it to PyTorch tensors
    encoded = tokenizer(prompt, return_tensors="pt")

    # Run on whatever device the model lives on instead of relying on a global
    # `device` variable that may not exist yet (or may disagree with the model).
    model_device = next(gptModel.parameters()).device
    X = encoded["input_ids"].to(model_device)
    a = encoded["attention_mask"].to(model_device)

    # Pass the pad id explicitly so generation does not have to guess one
    generate_kwargs = {"attention_mask": a, "pad_token_id": tokenizer.pad_token_id}
    if max_new_tokens is not None:
        generate_kwargs["max_new_tokens"] = max_new_tokens

    # Generate the output using the model
    output = gptModel.generate(X, **generate_kwargs)

    # Decode the output to get the generated response string
    return tokenizer.decode(output[0])

#Setting Device

In [24]:
# Use the GPU when one is available, otherwise fall back to the CPU;
# the bare expression displays which one was picked.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [None]:
import torch
import tqdm


def train(dataloader, model, tokenizer, device, epochs=10, lr=1e-3,
          max_batches=None, save=False, sample_prompt="hello how are you"):
    """
    Fine-tune a causal language model on batches of (input_ids, attention_mask).

    Args:
        dataloader: Iterable with a length that yields (input_ids, attention_mask)
            tensor pairs.
        model: Model called as ``model(ids, attention_mask=..., labels=...)`` whose
            output exposes ``.loss``.
        tokenizer: Tokenizer used for the per-epoch sample generation and saved
            next to the model when ``save`` is true.
        device: Device the model and every batch are moved to.
        epochs (int): Number of passes over the dataloader.
        lr (float): Adam learning rate.
        max_batches (int | None): Stop each epoch after this many batches
            (useful for smoke tests); ``None`` runs the full epoch.
        save (bool): Save the model and tokenizer after each epoch.
        sample_prompt (str): Prompt passed to ``infer`` after each epoch.

    Returns:
        The trained model (on ``device``).
    """
    model = model.to(device)
    optim = Adam(model.parameters(), lr=lr)

    batches_per_epoch = len(dataloader)
    if max_batches is not None:
        batches_per_epoch = min(batches_per_epoch, max_batches)
    # Roughly one progress dot per 10% of an epoch; never zero, so short
    # dataloaders cannot trigger a modulo-by-zero.
    progress_every = max(1, batches_per_epoch // 10)

    for epoch in range(epochs):
        running_loss = 0.0
        batches_seen = 0
        # infer() below switches the model to eval mode, so re-enable training mode
        model.train()

        print(f"Epoch: {epoch + 1}/{epochs} ")

        # Iterate over batches
        for batch_idx, (X, a) in enumerate(dataloader):
            if max_batches is not None and batch_idx >= max_batches:
                break

            # Move data to the device (GPU or CPU)
            X = X.to(device)
            a = a.to(device)

            # Positions with attention_mask == 0 are padding; label them -100 so
            # the language-model loss ignores them instead of learning to emit pads.
            labels = X.masked_fill(a == 0, -100)

            # Zero the parameter gradients
            optim.zero_grad()

            # Forward pass
            outputs = model(X, attention_mask=a, labels=labels)
            loss = outputs.loss

            # Backward pass and optimization
            loss.backward()
            optim.step()

            # Accumulate loss
            running_loss += loss.item()
            batches_seen += 1

            # Print progress for each ~10% completion
            if (batch_idx + 1) % progress_every == 0:
                print('.', end='', flush=True)

        if save:
            # Save the model and tokenizer after each epoch with epoch information in the directory name
            save_directory = f"./fine-tuned-gpt2-epoch-{epoch + 1}"
            model.save_pretrained(save_directory)
            tokenizer.save_pretrained("./tokenizer")

        # Average over the batches actually processed, not the dataloader length,
        # so the figure stays correct when max_batches cuts the epoch short.
        avg_loss = running_loss / max(1, batches_seen)
        print(f"\t Loss: {avg_loss:.4f}", flush=True)
        print("")
        print(infer(model, tokenizer, sample_prompt))

    return model


total_batches = len(dl_chat)
epochs = 10  # epoch budget of a full run; the smoke test below runs a single epoch
save = False

print("Training Started .... ")

# Smoke test: one epoch limited to the first 7 batches, starting from the
# pristine snapshot, to verify the pipeline end to end before a full run.
model = train(dl_chat, copy.deepcopy(model_copy), tokenizer, device,
              epochs=1, max_batches=7, save=save)

print("Training Completed .... ")

Training Started .... 
Epoch: 1/10 


In [None]:
# Full fine-tuning run, restarted from the pristine snapshot in `model_copy`.
# Requires a `train(dataloader, model, tokenizer, device, epochs=...)` function to
# have been defined by an earlier cell; its return value is used by the next cell
# as the fine-tuned model.
print("Training Started .... ")
model = copy.deepcopy(model_copy)
trained_gpt = train(dl_chat, model, tokenizer, device, epochs=10)
print("Training Completed .... ")

Training Started .... 
Epoch: 1/10 


In [None]:
# Interactive chat loop. An empty line, "quit" or "exit" (or EOF / Ctrl-C)
# ends the session instead of leaving the cell spinning forever.
print("infer from model : ")
while True:
  try:
    inp = input()
  except (EOFError, KeyboardInterrupt):
    break
  if inp.strip().lower() in {"", "quit", "exit"}:
    break
  res = infer(trained_gpt, tokenizer, inp)
  print(res)

In [None]:
# Load the SQuAD dataset
# NOTE(review): `load_dataset` is not imported in any cell visible in this notebook;
# presumably `from datasets import load_dataset` has to run first. TODO confirm.
squad_dataset = load_dataset("squad")

# Function to preprocess the dataset for GPT-2
def preprocess_function(examples):
    """
    Turn a batch of question/context/answers columns into prompt and target texts.

    Args:
        examples: Mapping with parallel lists under "question", "context" and
            "answers"; each answers entry has a "text" list whose first item is used.

    Returns:
        dict: {"input_texts": [...], "target_texts": [...]} with one prompt of the
        form "question: ... context: ... answer:" and one answer string per example.
    """
    question_batch = examples["question"]
    context_batch = examples["context"]
    first_answers = [entry["text"][0] for entry in examples["answers"]]

    # Build one prompt per (question, context) pair
    prompt_texts = []
    for q, c in zip(question_batch, context_batch):
        prompt_texts.append(f"question: {q} context: {c} answer:")

    # Targets are the string form of the first listed answer
    target_texts = list(map("{}".format, first_answers))

    return {"input_texts": prompt_texts, "target_texts": target_texts}

# Apply the preprocessing function to the dataset in batches; the result carries
# the new "input_texts" and "target_texts" columns returned by preprocess_function.
tt_datasets = squad_dataset.map(preprocess_function, batched=True)


In [None]:
# Display the preprocessed dataset object
tt_datasets

In [None]:
def print_dataset_info(dataset):
    """
    Print a four-line summary of a dataset.

    Args:
        dataset: Object supporting ``len()`` and exposing ``features`` and
            ``column_names`` attributes.
    """
    class_name = dataset.__class__.__name__
    print(f"Dataset name: {class_name}")

    example_count = len(dataset)
    print(f"Number of examples: {example_count}")

    schema = dataset.features
    print(f"Features: {schema}")

    columns = dataset.column_names
    print(f"Column names: {columns}")


# Summarise the training split of the preprocessed (not yet tokenized) dataset
print_dataset_info(tt_datasets["train"])

In [None]:
import random

def display_random_examples(dataset, num_examples=5, tokenized=False):
    """
    Print a few randomly chosen examples from a dataset.

    Args:
        dataset: Sequence-like dataset supporting ``len()`` and integer indexing,
            whose items are mappings.
        num_examples (int): How many examples to show. Clamped to the dataset
            size so asking for more than exist does not raise.
        tokenized (bool): When true, print the 'input_ids' and 'labels' fields;
            otherwise print 'question', 'context' and the first answer text.
    """
    sample_size = max(0, min(num_examples, len(dataset)))
    random_indices = random.sample(range(len(dataset)), sample_size)
    for idx in random_indices:
        example = dataset[idx]
        print(f"Example {idx}:")

        if tokenized:
            # Label each tokenized field by what it actually holds
            print(f"Input IDs: {example['input_ids']}")
            print(f"Labels: {example['labels']}")
        else:
            print(f"Question: {example['question']}")
            print(f"Context: {example['context']}")
            print(f"Answer: {example['answers']['text'][0]}")
        print()

# Show three random raw (question / context / answer) examples from the training split
display_random_examples(tt_datasets["train"], num_examples=3)

In [None]:
# Fixed sequence length (in tokens) that tokenize_function truncates and pads to
max_length = 64

In [None]:
# Tokenization function
def tokenize_function(examples):
    """
    Build fixed-length causal-LM training rows from prompt and answer texts.

    Each row is ``prompt tokens + answer tokens + eos``, padded to the global
    ``max_length``. ``labels`` is position-aligned with ``input_ids`` (which is
    what a causal-LM loss requires) and holds -100 on prompt and padding
    positions, so only the answer and the closing eos token are scored.

    Tokenizing the answers as a separate, independently padded sequence would
    produce labels whose positions have no relation to the input positions.

    Args:
        examples: Batch mapping with "input_texts" and "target_texts" lists.

    Returns:
        dict: Lists of lists under "input_ids", "attention_mask" and "labels",
        each inner list of length ``max_length``.
    """
    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    prompt_batch = tokenizer(examples["input_texts"],
                             truncation=True,
                             max_length=max_length,
                             add_special_tokens=False)["input_ids"]

    # Leading space so the answer tokenizes as a continuation of "answer:";
    # one slot is reserved for the eos token.
    answer_batch = tokenizer([" " + target for target in examples["target_texts"]],
                             truncation=True,
                             max_length=max_length - 1,
                             add_special_tokens=False)["input_ids"]

    input_ids, attention_mask, labels = [], [], []
    for prompt_ids, answer_ids in zip(prompt_batch, answer_batch):
        completion = answer_ids + [eos_id]

        # The answer always fits; the prompt is cut to whatever room remains.
        # NOTE: a long prompt therefore loses its tail, including " answer:".
        prompt_ids = prompt_ids[:max_length - len(completion)]

        row = prompt_ids + completion
        n_pad = max_length - len(row)

        input_ids.append(row + [pad_id] * n_pad)
        attention_mask.append([1] * len(row) + [0] * n_pad)
        labels.append([-100] * len(prompt_ids) + completion + [-100] * n_pad)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Tokenize the dataset.
# `rc` lists the text/metadata columns to drop, so only the columns returned by
# tokenize_function remain in the result.
rc = ["input_texts", "target_texts", "context", "question", "answers", "id", "title"]
tokenized_datasets = tt_datasets.map(tokenize_function,
                                            batched=True,
                                            remove_columns=rc)


In [None]:
# Display the tokenized dataset object
tokenized_datasets

In [None]:
# Show three random tokenized examples (their 'input_ids' and 'labels' fields)
display_random_examples(tokenized_datasets["train"],
                        num_examples=3,
                        tokenized=True)

In [None]:
# Pick the training and validation splits of the tokenized dataset
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

In [None]:
# Same split selection as in the earlier cell, repeated here next to the
# dataloader construction; batch_size is reused for both dataloaders below.
train_dataset = tokenized_datasets["train"]
eval_dataset = tokenized_datasets["validation"]
batch_size = 64
class QADataset(Dataset):
    """
    Thin torch ``Dataset`` view over an indexable collection of tokenized rows.

    Each row must be a mapping with 'input_ids' and 'labels'; both are converted
    to tensors on access.
    """

    def __init__(self, dataset):
        """Keep a reference to the underlying indexable dataset."""
        self.dataset = dataset

    def __len__(self):
        """Return the number of rows in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(input_ids, labels)`` tensors for the row at ``idx``."""
        record = self.dataset[idx]
        ids = torch.tensor(record['input_ids'])
        targets = torch.tensor(record['labels'])
        return ids, targets

# Batched loaders over the tokenized splits: the training loader reshuffles on
# every pass, the evaluation loader keeps the dataset order.
train_dataloader = DataLoader(QADataset(train_dataset),
                              batch_size=batch_size,
                              shuffle=True)
eval_dataloader = DataLoader(QADataset(eval_dataset),
                             batch_size=batch_size)


In [None]:
from transformers import AdamW

In [None]:
# Cross-entropy criterion applied to the flattened logits/labels in the manual training loop
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
optimizer = AdamW(model.parameters(), lr=1e-5)

# Training loop
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Label positions holding the pad token are rewritten to -100, the default
# ignore_index of CrossEntropyLoss, so padding does not contribute to the loss.
label_pad_id = tokenizer.pad_token_id

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Training
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        if label_pad_id is not None:
            labels = labels.masked_fill(labels == label_pad_id, -100)

        outputs = model(input_ids)
        logits = outputs.logits
        # Next-token objective: the logits at position t are scored against the
        # label at position t + 1, so drop the last logit and the first label.
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Training loss: {avg_train_loss}")

    # Evaluation (same loss computation, without gradient tracking or updates)
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            if label_pad_id is not None:
                labels = labels.masked_fill(labels == label_pad_id, -100)

            outputs = model(input_ids)
            logits = outputs.logits
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    print(f"Validation loss: {avg_eval_loss}")

In [None]:
# Save the fine-tuned model together with its tokenizer into the same directory,
# so both can later be reloaded from that one path.
model.save_pretrained("./fine-tuned-gpt2")
tokenizer.save_pretrained("./fine-tuned-gpt2")

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
model_path = "./fine-tuned-gpt2"

# The reloaded weights go into `loaded_model`, leaving the global `model` untouched;
# the global `tokenizer`, however, is rebound to the reloaded tokenizer.
loaded_model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

In [None]:
# Restart from the pristine snapshot and train with the model's built-in LM loss
model = copy.deepcopy(model_copy)
optimizer = AdamW(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Pad positions in the labels are rewritten to -100 so the model's loss skips them
# instead of training the model to emit padding.
label_pad_id = tokenizer.pad_token_id

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Training
    model.train()
    total_train_loss = 0
    train_steps = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)
        if label_pad_id is not None:
            labels = labels.masked_fill(labels == label_pad_id, -100)

        # Passing labels makes the model compute the (internally shifted) LM loss
        outputs = model(input_ids, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

        train_steps += 1

    # max(1, ...) keeps an empty dataloader from raising ZeroDivisionError
    avg_train_loss = total_train_loss / max(1, train_steps)
    print(f"Training loss: {avg_train_loss}")

    # Evaluation
    model.eval()
    total_eval_loss = 0
    eval_steps = 0

    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.to(device)
            if label_pad_id is not None:
                labels = labels.masked_fill(labels == label_pad_id, -100)

            outputs = model(input_ids, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()

            eval_steps += 1

    avg_eval_loss = total_eval_loss / max(1, eval_steps)
    print(f"Validation loss: {avg_eval_loss}")

# End of training loop


In [None]:
text = "Hello how are you?"
# loaded_model = loaded_model.eval().to(device)
model = model.eval().to(device)

# Two rounds of generation, each continuing from the text produced so far
for i in range(2):
  encoded_input = tokenizer.encode_plus(text,
                                        return_tensors='pt',
                                        padding=True,
                                        truncation=True)

  # Generate text
  output_sequences = model.generate(
      input_ids=encoded_input['input_ids'].to(device),
      attention_mask=encoded_input['attention_mask'].to(device),
      # Budget counts only NEW tokens, so the second round still has room to
      # generate even though its prompt already contains the first round's output
      max_new_tokens=50,
      num_return_sequences=1,  # Number of sequences to generate
      no_repeat_ngram_size=2,  # Prevent repeating the same n-gram
      top_k=50,  # Number of highest probability vocabulary tokens to keep for top-k-filtering
      top_p=0.95,  # Cumulative probability of parameter highest probability vocabulary tokens
      temperature=0.9,  # The value used to modulate the next token probabilities
      do_sample=True,  # Set to True for sampling; False for greedy decoding
      # Read the pad id from the tokenizer; no variable named `pad_token_id`
      # is defined anywhere in the notebook
      pad_token_id=tokenizer.pad_token_id
  )

  # Decode the generated text (the decoded sequence starts with the prompt)
  generated_text = tokenizer.decode(output_sequences[0],
                                    skip_special_tokens=True)

  print(i, "text", text, "\tgenerated_text", generated_text)

  # The decoded sequence already contains the prompt, so it replaces `text`;
  # appending it would duplicate the prompt on every round.
  text = generated_text


print(text)

In [None]:
# Debug check: display the type of `text` after the generation loop
type(text)

In [None]:
def generate_answer(question, context, model, tokenizer):
    """
    Generate an answer for a question given its context.

    The prompt uses the same "question: ... context: ... answer:" layout as the
    training inputs; whatever the model emits after the last "answer:" marker is
    returned.

    Args:
        question (str): The question to answer.
        context (str): Passage that should contain the answer.
        model: Causal language model exposing ``eval()``, ``parameters()`` and
            ``generate()``.
        tokenizer: Tokenizer callable with ``return_tensors="pt"`` that also
            provides ``decode`` and ``pad_token_id``.

    Returns:
        str: The stripped text following the final "answer:" in the decoded output.
    """
    input_text = f"question: {question} context: {context} answer:"

    # Encode the prompt built above (not some unrelated global string), keeping
    # the attention mask so generate() does not have to infer it.
    encoded_input = tokenizer(input_text, return_tensors="pt", truncation=True)

    # Run on the device the model already lives on rather than a global `device`
    model_device = next(model.parameters()).device

    model.eval()
    outputs = model.generate(input_ids=encoded_input['input_ids'].to(model_device),
                             attention_mask=encoded_input['attention_mask'].to(model_device),
                             max_length=512, num_return_sequences=1,
                             pad_token_id=tokenizer.pad_token_id)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer.split("answer:")[-1].strip()

# Example usage: ask a question whose answer is stated in the supplied context
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."
answer = generate_answer(question, context, model, tokenizer)
print(answer)
