# Introduction generation

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch transformers
!pip install datasets

In [None]:
from datasets import load_dataset

dataset = load_dataset("Iliasselyaa/CMUBookSummaryDataset")

In [6]:
import torch.nn as nn
import tensorflow as tf
import pandas as pd
from transformers import GPT2LMHeadModel, BertTokenizer
from nltk.tokenize import sent_tokenize
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import torch


Dataset Class and Preprocessing

In [None]:
# tokenizer should match model being used aka the generator
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

In [8]:


def text_transform(text):
  input_ids = tokenizer.encode(text, return_tensors="pt", padding="max_length", max_length=1024, truncation=True)
  return input_ids

def worker_init_fn(worker_id):
    global tokenizer
    from transformers import GPT2Tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokenizer.pad_token = tokenizer.eos_token

class MyDataset(Dataset):
  def __init__(self, dataset, text_transform=None):
    self.data = dataset
    self.text_transform = text_transform

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    summary = self.data[idx]['summary']
    if self.text_transform:
        transformed_output = self.text_transform(summary).squeeze(0)
    return {'input_ids': transformed_output}

train_test_split = dataset["train"].train_test_split(test_size=0.1)

# Access the split datasets
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

train_dataset = MyDataset(train_dataset, text_transform=text_transform)

val_dataset = MyDataset(val_dataset, text_transform=text_transform)

In [9]:
train_data_loader = DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=3, worker_init_fn=worker_init_fn)

val_data_loader = DataLoader(val_dataset, batch_size=1, shuffle=True, num_workers=3, worker_init_fn=worker_init_fn)



# For Summary

In [7]:
# Dataset meant for model that is only trained on last 20% of summary inputs in order to make conclusions

class Conclusionizer(Dataset):
  def __init__(self, dataset, text_transform=None):
    self.data = dataset
    self.text_transform = text_transform

  def __len__(self):
    return len(self.data)

  def __getitem__(self, idx):
    summary = self.data[idx]['summary']
    ending = int(len(summary)*0.8) # skip first 20% of summary
    summary = summary[ending:]
    if self.text_transform:
        transformed_output = self.text_transform(summary).squeeze(0)
    return {'input_ids': transformed_output}

train_test_split = dataset["train"].train_test_split(test_size=0.1)

# Access the split datasets
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

train_dataset_conclusion = Conclusionizer(train_dataset, text_transform=text_transform)

val_dataset_conclusion = Conclusionizer(val_dataset, text_transform=text_transform)

In [8]:
train_data_loader_conclusion = DataLoader(train_dataset_conclusion, batch_size=1, shuffle=True, num_workers=3, worker_init_fn=worker_init_fn)

val_data_loader_conclusion = DataLoader(val_dataset_conclusion, batch_size=1, shuffle=True, num_workers=3, worker_init_fn=worker_init_fn)



# Testing Dataloader

In [None]:
try:
    i = 0
    for batch in enumerate(train_data_loader):
        print(f"Batch {i} loaded successfully")
        i += 1
except Exception as e:
    print(f"Error during loading: {e}")

  self.pid = os.fork()


Batch 0 loaded successfully
Batch 1 loaded successfully
Batch 2 loaded successfully
Batch 3 loaded successfully
Batch 4 loaded successfully
Batch 5 loaded successfully
Batch 6 loaded successfully
Batch 7 loaded successfully
Batch 8 loaded successfully
Batch 9 loaded successfully
Batch 10 loaded successfully
Batch 11 loaded successfully
Batch 12 loaded successfully
Batch 13 loaded successfully
Batch 14 loaded successfully
Batch 15 loaded successfully
Batch 16 loaded successfully
Batch 17 loaded successfully
Batch 18 loaded successfully
Batch 19 loaded successfully
Batch 20 loaded successfully
Batch 21 loaded successfully
Batch 22 loaded successfully
Batch 23 loaded successfully
Batch 24 loaded successfully
Batch 25 loaded successfully
Batch 26 loaded successfully
Batch 27 loaded successfully
Batch 28 loaded successfully
Batch 29 loaded successfully
Batch 30 loaded successfully
Batch 31 loaded successfully
Batch 32 loaded successfully
Batch 33 loaded successfully
Batch 34 loaded successf

  self.pid = os.fork()


Batch 53 loaded successfully
Batch 54 loaded successfully
Batch 55 loaded successfully


KeyboardInterrupt: 

# Novelizer

In [10]:
class Novelizer(nn.Module):
  def __init__(self, generator, device):
    super(Novelizer, self).__init__()
    self.tokenizer = tokenizer
    self.generator = generator
    self.device = device

  def forward(self, input_ids, labels):
    input_ids = input_ids.to(self.device)

    outputs = self.generator(input_ids=input_ids, labels=input_ids)
    return outputs

  def generate_story_intro(self, story_type, num_return_sequences=1, device='cpu'): #Change num of sequences to get different answers
    # Tokenize the story type prompt
    input_ids = self.tokenizer.encode(story_type, return_tensors="pt").to(device)
    num_beams = max(num_return_sequences, 1)
    # Use the forward method to generate text
    self.generator.to(device)
    generated_sequences = self.generator.generate(
            input_ids,
            max_length=64, # max length
            num_return_sequences=num_return_sequences,
            num_beams=num_beams,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7
        )

    return [
            self.tokenizer.decode(generated_sequence, skip_special_tokens=True)
            for generated_sequence in generated_sequences
        ]


Training

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOT THIS ONE

In [None]:
# for when you need to load the latest model
model = Novelizer(generator=GPT2LMHeadModel.from_pretrained('gpt2'), device=device)
model.load_state_dict(torch.load('/content/drive/MyDrive/Junior/ACV/model6.pth')) # loading model 6

In [11]:
# where i set the generator
# model = Novelizer(generator=GPT2LMHeadModel.from_pretrained('gpt2'), device=device) # pretrained gpt2 for now

optimizer = Adam(model.parameters(), lr=3e-4) # optimizer and learning rate
criterion = nn.CrossEntropyLoss() # loss function
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

def train_model(model, train_loader, val_loader, optimizer, criterion, scheduler, num_epochs=1, device='cpu'):
  model.to(device)

  accumulation_steps = 4  # Adjust based on your needs
  optimizer.zero_grad()
  patience = 0
  best_val_loss = 100000
  for epoch in range(num_epochs):
    # initial training
    model.train()
    train_loss = 0.0
    c = 0
    for batch in train_loader:
      if c % 1000 == 0:
        print(c)
      c += 1
      input_ids = batch['input_ids'].to(device)

      outputs = model(input_ids=input_ids, labels=input_ids)
      loss = outputs.loss

      # Backward pass and optimize
      loss.backward()

      optimizer.step()
      optimizer.zero_grad()

      train_loss += loss.item()

    scheduler.step()
    train_loss /= len(train_loader.dataset)


    # validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
      for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        # Forward pass
        outputs = model(input_ids=input_ids, labels=input_ids)
        loss = outputs.loss

        val_loss += loss.item()

    val_loss /= len(val_loader.dataset)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience = 0
        # model checkpoint
        model_save_path = '/content/drive/MyDrive/Junior/ACV/model6.pth' # now saving model 3
        torch.save(model.state_dict(), model_save_path)
    else:
        patience += 1
    if patience > 3:
        break  # e arly stopping trigger

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

In [12]:
import gc
torch.cuda.empty_cache()
gc.collect()
torch.cuda.memory_summary(device=None, abbreviated=False)



In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
train_model(model, train_data_loader, val_data_loader, optimizer, criterion, scheduler, num_epochs=100, device=device)

Testing

In [32]:
###### STOP #######
import torch
model_save_path = '/content/drive/MyDrive/Junior/ACV/conclusionizer3.pth' # now saving model 3
torch.save(model.state_dict(), model_save_path)

In [None]:
model.generate_story_intro('Then cheryl left on a boat to moscow',num_return_sequences=2, device=device)