### Span detection using BERT
It is kind of extractive summerization in Q/A systems

In [None]:
pip install transformers torch


In [None]:
# Using Bert for span detection task. Note that, we use special model of large BERT for QA
from transformers import BertTokenizer, BertForQuestionAnswering
import torch

# Load pre-trained BERT model and tokenizer for question answering
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

In [None]:
# Define the question and the context
question = "Who wrote Romeo and Juliet?"
context = "Romeo and Juliet is a tragedy written by William Shakespeare early in his career about two young star-crossed lovers."

# Tokenize the input, add special tokens ([CLS] and [SEP]), and convert to tensor
inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"]
inputs

{'input_ids': tensor([[  101,  2040,  2626, 12390,  1998, 13707,  1029,   102, 12390,  1998,
         13707,  2003,  1037, 10576,  2517,  2011,  2520,  8101,  2220,  1999,
          2010,  2476,  2055,  2048,  2402,  2732,  1011,  4625, 10205,  1012,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Get the predicted start and end positions
with torch.no_grad():
    res = model(input_ids)

start_pos = torch.argmax(res.start_logits)
end_pos = torch.argmax(res.end_logits)

print("The probabilities for start are:", res.start_logits)
print("The strat index is:", start_pos)


The probabilities for start are: tensor([[-6.3058, -5.7577, -6.7796, -3.3953, -8.1165, -6.3404, -8.4627, -6.3057,
          0.1943, -5.3829, -3.9351, -5.0365, -3.2273, -2.6362, -1.4974, -2.2578,
          7.5604,  3.0763, -4.2326, -6.6545, -3.2653, -4.8068, -7.0875, -6.4545,
         -6.8650, -6.7581, -7.6815, -7.6125, -5.6213, -6.3056, -6.3057]])
The strat index is: tensor(16)


In [None]:
# Inference
# Convert the token IDs back to tokens and join to get the answer
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_pos:end_pos+1]))
print(answer)

william shakespeare


In [None]:
# Another Example



question = 'what services does Coles provied?'

context = "Coles Supermarkets Australia Pty Ltd, trading as Coles, is an Australian supermarket, retail and consumer services chain,\
             headquartered in Melbourne as part of the Coles Group. Founded in 1914 in Collingwood by George Coles, Coles operates 846[3] \
             supermarkets throughout Australia, including several now re-branded Bi-Lo Supermarkets. Coles has over 120,000 employees[3][4]\
              and accounts for around 27 per cent of the Australian market.[5] Coles' large head office site in Melbourne's inner south-east \
              has 4,000 employees of the workforce located inside.\
                Coles Online is the company's online shopping ('click & collect' and home delivery) service.\
              Between 1986 and 2006, Coles Supermarkets was a brand of Coles Myer, later Coles Group, prior to Wesfarmers purchasing Coles \
              Group in 2007. It became a subsidiary of Coles Group again after Wesfarmers spun-off the business in November 2018.[6]\
              In 2020, Coles changed its slogan to 'Value the Australian way'"

# Tokenize the input, add special tokens ([CLS] and [SEP]), and convert to tensor
inputs = tokenizer.encode_plus(question, context, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"]


# Get the predicted start and end positions
with torch.no_grad():
    res = model(input_ids)

start_pos = torch.argmax(res.start_logits)
end_pos = torch.argmax(res.end_logits)



# Inference
# Convert the token IDs back to tokens and join to get the answer
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_pos:end_pos+1]))
print(answer)


consumer services


### Abstractive Summerization for QA

In [None]:
pip install sentencepiece

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the pre-trained T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
# Define the context and the question
questions = ['what is Coles?',
            'What does it do?',
            'When it is established?',
            'Is Mayer mentioned in the cotext (Yes/NO)? ',
            'Summerize the context'
]

context = "Coles Supermarkets Australia Pty Ltd, trading as Coles, is an Australian supermarket, retail and consumer services chain,\
             headquartered in Melbourne as part of the Coles Group. Founded in 1914 in Collingwood by George Coles, Coles operates 846[3] \
             supermarkets throughout Australia, including several now re-branded Bi-Lo Supermarkets. Coles has over 120,000 employees[3][4]\
              and accounts for around 27 per cent of the Australian market.[5] Coles' large head office site in Melbourne's inner south-east \
              has 4,000 employees of the workforce located inside.\
                Coles Online is the company's online shopping ('click & collect' and home delivery) service.\
              Between 1986 and 2006, Coles Supermarkets was a brand of Coles Myer, later Coles Group, prior to Wesfarmers purchasing Coles \
              Group in 2007. It became a subsidiary of Coles Group again after Wesfarmers spun-off the business in November 2018.[6]\
              In 2020, Coles changed its slogan to 'Value the Australian way'"


# Format the input string: The convention for T5 is to prepend "question: [your question] context: [your context]" It is like prompting
for question in questions:
  input_str = f"question: {question} context: {context}"
  inputs = tokenizer.encode(input_str, return_tensors="pt", max_length=512, truncation=True)

  answer_ids = model.generate(inputs, max_length=500, num_beams=4, early_stopping=True)
  # print(answer_ids)

  # Decode and print the answer
  answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)
  print(f"Question: {question}\nAnswer: {answer}\n")

Question: what is Coles?
Answer: an Australian supermarket, retail and consumer services chain

Question: What does it do?
Answer: operates 846[3] supermarkets throughout Australia

Question: When it is established?
Answer: 1914

Question: Is Mayer mentioned in the cotext (Yes/NO)? 
Answer: yes

Question: Summerize the context
Answer: Coles Supermarkets Australia Pty Ltd, trading as Coles, is an Australian supermarket, retail and consumer services chain, headquartered in Melbourne



## Text Generation Sampling Strategies
 * top-k
 * top_p
 * Beam search

In [15]:
# Top-p sample and top-k sampling methods
# Note that the beam search strategy, i.e., num_beams, can not be used with top-k or top-p sampling methods.

# Define the context and the question
questions = ['what is Coles?',
            'What does it do?',
            'When it is established?',
            'Is Mayer mentioned in the cotext (Yes/NO)? ',
            'Summerize the context'
]

context = "Coles Supermarkets Australia Pty Ltd, trading as Coles, is an Australian supermarket, retail and consumer services chain,\
             headquartered in Melbourne as part of the Coles Group. Founded in 1914 in Collingwood by George Coles, Coles operates 846[3] \
             supermarkets throughout Australia, including several now re-branded Bi-Lo Supermarkets. Coles has over 120,000 employees[3][4]\
              and accounts for around 27 per cent of the Australian market.[5] Coles' large head office site in Melbourne's inner south-east \
              has 4,000 employees of the workforce located inside.\
                Coles Online is the company's online shopping ('click & collect' and home delivery) service.\
              Between 1986 and 2006, Coles Supermarkets was a brand of Coles Myer, later Coles Group, prior to Wesfarmers purchasing Coles \
              Group in 2007. It became a subsidiary of Coles Group again after Wesfarmers spun-off the business in November 2018.[6]\
              In 2020, Coles changed its slogan to 'Value the Australian way'"


# Format the input string: The convention for T5 is to prepend "question: [your question] context: [your context]" It is like prompting
for question in questions:
  input_str = f"question: {question} context: {context}"
  inputs = tokenizer.encode(input_str, return_tensors="pt", max_length=512, truncation=True)

  answer_ids = model.generate(inputs,
                              # top_p=0.19,  # Adjust the top-p value as needed
                              do_sample = True,
                              top_k = 100,  # the large the more randomness
                              max_length=500,
                              # num_beams=4,
                              # early_stopping=True
                              )
  # print(answer_ids)

  # Decode and print the answer
  answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)
  print(f"Question: {question}\nAnswer: {answer}\n")

Question: what is Coles?
Answer: an Australian supermarket, retail and consumer services chain

Question: What does it do?
Answer: operates 846[3] supermarkets throughout Australia

Question: When it is established?
Answer: 1914

Question: Is Mayer mentioned in the cotext (Yes/NO)? 
Answer: Coles Myer

Question: Summerize the context
Answer: Coles Supermarkets Australia Pty Ltd, trading as Coles, is an Australian supermarket, retail and consumer services chain, headquartered in Melbourne as part of the Coles Group



# Fine Tuning T5 model
### Reading PDF files to text

In [None]:
pip install PyPDF2 transformers

In [3]:
import PyPDF2
from transformers import T5Tokenizer, T5ForConditionalGeneration, TrainingArguments, Trainer

# 1. Extract text from PDF
def extract_text_from_pdf(pdf_path: str):
  total_text =''
  reader = PyPDF2.PdfReader(pdf_path)
  number_of_pages = len(reader.pages)

  for page_num in range(number_of_pages):
    page = reader.pages[page_num]
    text = page.extract_text()
    total_text+=text

  return total_text

pdf_path = '/content/AA Academy_Master Pack_Guests.pdf'
text = extract_text_from_pdf(pdf_path)


text

'Creating smarter business and  customer \nexperiences together!AA\nAcademy\nReady for \nreview\nSilvio Giorgio\nGeneral Manager\nData & IntelligenceDay 1Ready for \nreview\n       Time Topic Session / sub -topic Speaker\n9:00 –9:10am Welcome Welcome and kick -off Silvio Giorgio, GM Data and Intelligence\n9:10 –9:30am Keynote speaker Understanding our existing and next generation shoppers; their mindset, motivations, needs and expectations. Wendy Stops, Coles Board member\n9:30 –9:50am Our strategic vision How are we aligned with corporate planning and how are we supporting the Coles -wide strategy? The Coles value chain and AA’s \ninvolvement and investment in it; enhancements, opportunities, and priorities over the coming 2+ years.Sam Riethmuller, Head of Advanced Analytics\n9:50 –10:00am Benefits How is AA driving value back out to the business and our customers? Sam Riethmuller, Head of Advanced Analytics\n10:00 –10:45am Machine learning A quick guide to machine learning, modelling

In [10]:
# We can write it to use it for GPT fine-tunning
with open('pdf2text.txt', 'w') as file:
    file.write(text)

In [None]:
# 2. Tokenize the text into input-target pairs
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tokenized_text = tokenizer.tokenize(text)

In [44]:
len(tokenized_text)

26124

In [7]:
# Convert token strings to token IDs
token_ids = tokenizer.convert_tokens_to_ids(tokenized_text)

# Create sequences of length N and predict the N+1 token

input_ids = []
target_ids = []

In [8]:
SEQUENCE_LENGTH = 50
for i in range(0, len(token_ids) - SEQUENCE_LENGTH):
    input_ids.append(token_ids[i:i+SEQUENCE_LENGTH])
    target_ids.append(token_ids[i+SEQUENCE_LENGTH])

# Decode the input IDs to get the input texts
input_texts = [tokenizer.decode(ids) for ids in input_ids]
target_texts = [tokenizer.decode([tid]) for tid in target_ids]  # Note: tid is wrapped in a list since it's a single token ID

In [37]:
print(len(input_texts))
print(input_texts[1])
print(target_texts[1])

52148
Creating smarter business and customer experiences together!AA Academy Ready for review Silvio Giorgio General Manager Data & IntelligenceDay 1Ready for review Time Topic Session / sub -topic Speaker 9:00 –9:
10


In [9]:
inputs = tokenizer(input_texts, padding='max_length', truncation=True, max_length=SEQUENCE_LENGTH, return_tensors="pt")
labels = tokenizer(target_texts, padding='max_length', truncation=True, max_length=SEQUENCE_LENGTH, return_tensors="pt").input_ids


In [46]:
inputs

{'input_ids': tensor([[    3, 18120,  2592,  ...,     3,   104,     1],
        [    3, 18120,  2592,  ...,     3,   104,     1],
        [ 2592,    49,   268,  ...,  1298,    10,     1],
        ...,
        [    3,  5498,  3224,  ...,  2197,     1,     0],
        [ 3224,   475,    41,  ...,     1,     0,     0],
        [  475,    41, 10622,  ...,     1,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0]])}

In [10]:
# When using the Trainer from the transformers library, the dataset passed to it should be a Dataset object that contains
# both the inputs and the labels. Let's create a custom dataset for this purpose:


from torch.utils.data import Dataset


class TextDataset(Dataset):
    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        return len(self.inputs)


    def __getitem__(self, idx):
      return {
          "input_ids": self.inputs["input_ids"][idx][:SEQUENCE_LENGTH],  # Truncate or pad as needed
          "attention_mask": self.inputs["attention_mask"][idx][:SEQUENCE_LENGTH],  # Truncate or pad as needed
          "labels": self.labels[idx][:SEQUENCE_LENGTH]  # Truncate or pad as needed
      }




train_dataset = TextDataset(inputs, labels)

In [11]:
print(len(train_dataset))
train_dataset[3]

2


{'input_ids': tensor([    3,    49,   268,    11,   884,  2704,   544,    55,  5498,  4702,
         14476,    21,  1132, 25938,    32,  3156,   127, 10253,  2146,  3440,
          2747,     3,   184,  5869,  2825,  1433, 16803,   209, 19915,    63,
            21,  1132,  2900, 18059,   679,     7,  1938,     3,    87,   769,
             3,    18, 19710, 16778, 23239,     3,   104,  1298,    10,     1]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1]),
 'labels': tensor([5242,    1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0])}

### Fine tune the model

In [None]:
!pip install accelerate -U
!pip install transformers[torch] -U

In [12]:
# 3. Fine-tune T5
model = T5ForConditionalGeneration.from_pretrained("t5-large")

training_args = TrainingArguments(
    output_dir = '/content',
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Step,Training Loss,Validation Loss


TrainOutput(global_step=3, training_loss=33.508076985677086, metrics={'train_runtime': 24.2064, 'train_samples_per_second': 0.248, 'train_steps_per_second': 0.124, 'total_flos': 1268582400000.0, 'train_loss': 33.508076985677086, 'epoch': 3.0})

### Validating fine-tuning by asking questions

In [17]:
question = "What is AA vision?"
input_ids = tokenizer.encode(question, return_tensors="pt")

In [19]:
# Move model and input to GPU if available
# model = model.to(device)
# input_ids = input_ids.to(device)

# Generate the output token IDs
output_ids = model.generate(input_ids,
                            do_sample = True,
                            max_length=50,
                            num_beams=5,
                            temperature=0.9)

# Decode the output IDs to get the answer
answer = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(answer)

? What is AA vision? What is AA vision? What is AA vision?


## Fine tuning GPT2

In [8]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, TrainingArguments, Trainer

In [9]:
# Initialize the tokenizer
model_name = "gpt2-medium"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [11]:
# Prepare the dataset

train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='/content/pdf2text.txt', # See extract_text_from_pdf() in the earler cells
    block_size=128
)



In [12]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [13]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=2,
)

In [14]:
# fine tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

Step,Training Loss


TrainOutput(global_step=8, training_loss=4.445819854736328, metrics={'train_runtime': 376.4542, 'train_samples_per_second': 0.6, 'train_steps_per_second': 0.021, 'total_flos': 52471589240832.0, 'train_loss': 4.445819854736328, 'epoch': 1.0})

In [15]:
# Save the model and tokenizer after fine-tuning
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

## Validating the fine-tuning

In [47]:
input_text = "What is Smarter Forecast team in Coles Advanced Analytics?"
encoding = tokenizer.encode_plus(input_text, return_tensors="pt", max_length=100)
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]


output = model.generate(input_ids,
                        max_length=100,
                        num_return_sequences=1,
                        attention_mask = attention_mask,
                        do_sample = True,
                        temperature=0.5)
for i, sequence in enumerate(output):
    decoded_sequence = tokenizer.decode(sequence, skip_special_tokens=True)
    print(f"Generated Sequence {i + 1}: {decoded_sequence}")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Sequence 1: What is Smarter Forecast team in Coles Advanced Analytics?

The Smarter Forecast team is a group of professional data scientists, data analysts, and data engineers from Coles Advanced Analytics who are working to identify, identify, and improve our business.

The Smarter Forecast team is a team of data scientists, data analysts, and data engineers from Coles Advanced Analytics.

We are a small group of data scientists, data analysts, and data engineers working to identify


In [40]:
encoding = tokenizer.encode_plus(input_text, return_tensors="pt", max_length=100)
encoding

{'input_ids': tensor([[ 2061,   318,  2439,  2571,  4558,  2701,  1074,   287,  1623,   274,
         13435, 30437,    30]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}