#Importing Packages

In [None]:
!pip install --quiet  datasets #to access race dataset
!pip install --quiet pyarrow   #to deal with parquet files for saving dataset if required
!pip install --quiet  tqdm     #for progress bars
!pip install --quiet transformers # for t5 model
!pip install --quiet tokenizers  #tokenizers from HuggingFace
!pip install --quiet sentencepiece #subword tokenizer used by T5
!pip install --quiet pytorch-lightning # pytorch wrapper
!pip install --quiet torchtext # text utilities

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Fetching Datasets

In [3]:
#imports
import pandas as pd
import torch
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from pprint import pprint
import copy

#Fetching Custom Dataset

In [115]:
from google.colab import files
import io
import pandas as pd
#uploaded = files.upload()
#df2 = pd.read_csv(io.BytesIO(uploaded['Dataset - Sheet1.csv']))
df2 = pd.read_csv('/content/drive/MyDrive/Dataset - Sheet1.csv')

In [116]:
df3=df2.iloc[0:177, 1:4]
df3

Unnamed: 0,Context,Question,Answer
0,"We present QuAC, a dataset for Question Answer...",What is the purpose of the QuAC dataset?,The QuAC dataset was created for Question Answ...
1,"We present QuAC, a dataset for Question Answer...",How are the dialogs in the QuAC dataset struct...,The dialogs in the QuAC dataset involve a stud...
2,"We present QuAC, a dataset for Question Answer...",What are some unique challenges presented by t...,The QuAC dataset introduces several challenges...
3,"We present QuAC, a dataset for Question Answer...",How does the performance of the best model on ...,The best model evaluated on the QuAC dataset f...
4,"In information-seeking dialog, students repeat...",How does the QuAC dataset encourage natural an...,The QuAC dataset promotes natural and diverse ...
...,...,...,...
172,Photo sites such as Flickr are a rich source o...,What is the source of labeled data used in the...,The source of labeled data used in the image t...
173,Photo sites such as Flickr are a rich source o...,How does user-generated metadata differ from m...,"User-generated metadata, such as user-tags, ar..."
174,Photo sites such as Flickr are a rich source o...,What is the approach for representing image fe...,Image features are represented using a pre-tra...
175,Photo sites such as Flickr are a rich source o...,How is the conditional adversarial net trained...,The convolutional model and language model use...


In [23]:
def create_dataset(filename,  verbose = False):
  """Load the custom question-answering CSV, keeping only the columns used for training.

  Args:
    filename: Path or file-like object accepted by ``pd.read_csv``.
    verbose: Unused; kept so existing callers that pass it keep working.

  Returns:
    DataFrame holding the ``Context``, ``Question`` and ``Answer`` columns of the file.
  """
  # skipinitialspace drops the blanks that follow a delimiter, so "a, b" parses as "a" and "b".
  return pd.read_csv(filename, skipinitialspace=True, usecols=['Context','Question','Answer'])

In [117]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 70/30 train/validation split is identical on every run of the notebook.
train, valid = train_test_split(df2, test_size=0.3, random_state=42)

In [118]:
def create_pandas_dataset1(data, answer_threshold=512, verbose=False):
    """Build a (context, question, answer) frame from the custom dataset.

    Args:
        data: Either a DataFrame with ``Context``/``Question``/``Answer`` columns or an
            iterable of mappings with those keys.
        answer_threshold: Rows whose answer has this many whitespace-separated words or
            more are dropped.
        verbose: When True, also return the drop/keep counters.

    Returns:
        The resulting DataFrame with columns ``context``, ``question``, ``answer``; or the
        tuple ``(result_df, count_long, count_short)`` when ``verbose`` is True.

    Raises:
        AttributeError: If an ``Answer`` value is not a string (``.split()`` is called on it).
    """
    # Iterating a DataFrame directly yields its column labels, not its rows, so a DataFrame
    # input used to produce an empty result. Convert it to one dict per row first.
    if isinstance(data, pd.DataFrame):
        records = data.to_dict('records')
    else:
        records = data

    kept_rows = []
    count_long = 0
    for val in tqdm(records):
        # Records missing any of the three fields are skipped silently.
        if not ('Context' in val and 'Question' in val and 'Answer' in val):
            continue
        answer = val['Answer']
        if len(answer.split()) >= answer_threshold:
            count_long += 1
            continue
        kept_rows.append([val['Context'], val['Question'], answer])

    # Build the frame once instead of enlarging it row by row with .loc.
    result_df = pd.DataFrame(kept_rows, columns=['context', 'question', 'answer'])
    count_short = len(kept_rows)

    if verbose:
        return result_df, count_long, count_short
    return result_df




In [None]:
df_train1 , df_validation1 = create_pandas_dataset1(train) , create_pandas_dataset1(valid)
print(f"\n Total Train Samples:{df_train1.shape} , Total Validation Samples:{df_validation1.shape}")

In [59]:
device  = 'cuda' if torch.cuda.is_available() else "cpu"

In [60]:
pd.options.display.max_rows , pd.options.display.max_columns  = 100,100

#Importing Race Dataset

In [95]:
train_dataset = load_dataset('race','middle', split='train')
valid_dataset = load_dataset('race', 'middle', split='validation')
print(f"Total Train Samples:{len(train_dataset)} , Total Validation Samples:{len(valid_dataset)}")

Total Train Samples:25421 , Total Validation Samples:1436


In [96]:
print(train_dataset)

Dataset({
    features: ['example_id', 'article', 'answer', 'question', 'options'],
    num_rows: 25421
})


In [97]:
def create_pandas_dataset(data,
                          answer_threshold=512,
                          verbose = False):
  """Flatten multiple-choice records into a (context, question, answer) frame.

  Args:
    data: Iterable of records exposing ``article``, ``question``, ``answer`` and ``options``.
    answer_threshold: Records whose selected answer has this many whitespace-separated
      words or more are dropped.
    verbose: When True, also return the drop/keep counters.

  Returns:
    DataFrame with columns ``context``, ``question``, ``answer``; or the tuple
    ``(result_df, count_long, count_short)`` when ``verbose`` is True.
  """
  kept_rows = []
  count_long = 0
  for val in tqdm(data):
      # 'answer' holds a single character; its offset from 'A' indexes into 'options'.
      answer = val['options'][ord(val['answer']) - ord('A')]
      if len(answer.split()) >= answer_threshold:
          count_long += 1
          continue
      kept_rows.append([val['article'], val['question'], answer])

  # Build the frame once; enlarging it row by row with .loc copies data on every insert.
  result_df = pd.DataFrame(kept_rows, columns=['context', 'question', 'answer'])
  count_short = len(kept_rows)

  if verbose:
    return (result_df,
            count_long,
            count_short)
  return result_df

In [99]:
df_train , df_validation = create_pandas_dataset(train_dataset) , create_pandas_dataset(valid_dataset)
print(f"\n Total Train Samples:{df_train.shape} , Total Validation Samples:{df_validation.shape}")

Dataset({
    features: ['example_id', 'article', 'answer', 'question', 'options'],
    num_rows: 25421
})


100%|██████████| 25421/25421 [01:06<00:00, 382.60it/s]


Dataset({
    features: ['example_id', 'article', 'answer', 'question', 'options'],
    num_rows: 1436
})


100%|██████████| 1436/1436 [00:03<00:00, 359.24it/s]


 Total Train Samples:(25421, 3) , Total Validation Samples:(1436, 3)





In [None]:
# Saving data for future use
# NOTE(review): these relative paths write into the notebook's current working directory,
# whereas the cells that reload the data read from '/content/drive/MyDrive/...'. Confirm the
# files are copied to Drive (or saved there directly) before relying on the reload cells.
df_train.to_parquet('train_race.parquet')
df_validation.to_parquet('validation_race.parquet')

After saving, run the next cell to read the dataset back from Drive.

In [62]:
import pandas as pd
pd.read_parquet('/content/drive/MyDrive/train_race.parquet', engine='pyarrow')

Unnamed: 0,context,question,answer
0,Last week I talked with some of my students ab...,We can know from the passage that the author w...,teacher
1,Last week I talked with some of my students ab...,Many graduates today turn to cosmetic surgery ...,get an advantage over others in job-hunting
2,Last week I talked with some of my students ab...,"According to the passage, the author believes ...",media are to blame for misleading young people...
3,Last week I talked with some of my students ab...,Which' s the best title for the passage?.,Young Graduates Look to Surgery for Better Jobs
4,"YUZHOU, HENAN -An accident in a central China ...",What could be the best title for this passage?,A Coal Mine Accident in Central China
...,...,...,...
62440,What if I took that big jump on my bike?What's...,"According to the text,the teenager who explore...",have advantages over others.
62441,What if I took that big jump on my bike?What's...,What does the writer want to tell us by taking...,Mice also experience a period to explore the w...
62442,What if I took that big jump on my bike?What's...,What may the text discuss in the next part?,What really goes on in the teenage brain.
62443,"When officials in Richmond, B. C., Canada, tol...",Stephen Covey was doubtful at first because he...,youth crime was too complex for ordinary citiz...


# Creating a Pytorch DataSet for T5 Training and Validation

In [63]:
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

In [64]:
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small',model_max_length=512)
t5_model = T5ForConditionalGeneration.from_pretrained('t5-small')

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [65]:
class QuestionGenerationDataset(Dataset):
    """Pre-tokenized question-generation examples read from a parquet file.

    Each row becomes one example: the input is ``"context: <context>"`` and the target is
    ``"question: <question> answer: <answer>"``. All rows are tokenized once, in ``_build``.
    """

    def __init__(self, tokenizer, filepath, max_len_inp=512,max_len_out=512, max_rows=2000):
        """
        Args:
            tokenizer: Tokenizer exposing ``batch_encode_plus``.
            filepath: Parquet file with ``context``, ``question`` and ``answer`` columns.
            max_len_inp: Length inputs are padded/truncated to.
            max_len_out: Length targets are padded/truncated to.
            max_rows: Number of leading rows to use (the previously hard-coded 2000);
                pass None to use every row.
        """
        self.path = filepath

        self.passage_column = "context"
        self.answer = "answer"
        self.question = "question"

        # iloc[:None] selects everything, so max_rows=None disables the cap.
        self.data = pd.read_parquet(self.path).iloc[:max_rows,:]
        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one example as a dict.

        Keys: ``source_ids``, ``source_mask``, ``target_ids``, ``target_mask`` and ``labels``
        (a copy of ``target_ids`` with padding positions replaced by -100).
        """
        # squeeze(0) removes only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension whenever it has length 1.
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)

        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        # Use the tokenizer's own pad id, falling back to the previously hard-coded 0.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        # clone() so that masking the labels never touches the stored target ids.
        labels = target_ids.clone()
        labels[labels == pad_id] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        """Tokenize every row of ``self.data`` into ``self.inputs`` / ``self.targets``."""
        # total= lets tqdm show real progress; iterrows() alone has no known length.
        for _, row in tqdm(self.data.iterrows(), total=len(self.data)):
            passage = row[self.passage_column]
            answer = row[self.answer]
            question = row[self.question]

            input_ = f"context: {passage}" # T5 Input format for question answering tasks
            target = f"question: {str(question)} answer: {answer}" # Output format we require

            # Single-item batches: each encoding keeps a leading batch dimension of 1.
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len_input,padding='max_length',
                truncation = True,return_tensors="pt"
            )
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len_output,padding='max_length',
                truncation = True,
                return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [66]:
train_path = '/content/drive/MyDrive/train_race.parquet' # change this accordingly
validation_path = '/content/drive/MyDrive/validation_race.parquet'
train_dataset = QuestionGenerationDataset(t5_tokenizer,train_path)
validation_dataset = QuestionGenerationDataset(t5_tokenizer,validation_path)

2000it [00:07, 260.52it/s]
2000it [00:07, 278.86it/s]


In [67]:
# Data Sample

train_sample = train_dataset[10] # thanks to __getitem__
decoded_train_input = t5_tokenizer.decode(train_sample['source_ids'])
decoded_train_output = t5_tokenizer.decode(train_sample['target_ids'])

print(decoded_train_input)
print(decoded_train_output)

context: Understanding the process of making career choices and managing your career is a basic life skill that everyone should understand. Your career decisions have such a profound effect on all aspects of your life. It's important to have the knowledge and resources needed to make smart, informed decisions. Whether you are looking for a new job, aiming to take the next step at your current job or planning your retirement options, you are making career decisions. Using good resources and the guidance of a career counselor can help you to make those decisions well. Many people mistakenly believe that choosing a career is a one-time event that happens some time in early adulthood. However, career management is actually a life-long process, and we continue to make consequential career choices over the years. When people want to take action in their career, career management and job search are about so much more than writing a good resume. If you learn about and act on the following area

# Fine Tuning T5

In [68]:
import pytorch_lightning as pl
from torch.optim import AdamW
import argparse
from transformers import (
    get_linear_schedule_with_warmup
  )

class T5Tuner(pl.LightningModule):
    """Lightning wrapper that fine-tunes a seq2seq model on pre-tokenized batches.

    Batches are dicts with the keys ``source_ids``, ``source_mask``, ``target_mask`` and
    ``labels``.
    """

    def __init__(self,t5model, t5tokenizer,batchsize=4, train_data=None, val_data=None):
        """
        Args:
            t5model: Model to fine-tune; called with ``input_ids``, ``attention_mask``,
                ``decoder_attention_mask`` and ``labels``.
            t5tokenizer: Tokenizer kept alongside the model (not used by the training loop).
            batchsize: Batch size for both dataloaders.
            train_data: Optional training dataset. When None, the notebook-level
                ``train_dataset`` is used, as before.
            val_data: Optional validation dataset. When None, the notebook-level
                ``validation_dataset`` is used, as before.
        """
        super().__init__()
        self.model = t5model
        self.tokenizer = t5tokenizer
        self.batch_size = batchsize
        self.train_data = train_data
        self.val_data = val_data

    def forward( self, input_ids, attention_mask=None,
                decoder_attention_mask=None,
                lm_labels=None):
        """Run the wrapped model; ``lm_labels`` is passed to it as ``labels``."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _batch_loss(self, batch):
        """Forward one batch and return the first model output, which is used as the loss."""
        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._batch_loss(batch)
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._batch_loss(batch)
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        """Training loader; shuffled so every epoch sees the examples in a new order."""
        data = self.train_data if self.train_data is not None else train_dataset
        return DataLoader(data, batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=2)

    def val_dataloader(self):
        """Validation loader; order is left unchanged."""
        data = self.val_data if self.val_data is not None else validation_dataset
        return DataLoader(data,
                          batch_size=self.batch_size,
                          num_workers=2)

    def configure_optimizers(self):
        """AdamW over all module parameters with a fixed learning rate of 3e-4."""
        optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
        return optimizer

In [None]:
model = T5Tuner(t5_model,t5_tokenizer)

trainer = pl.Trainer(max_epochs = 3,accelerator=device)

trainer.fit(model)

In [None]:
# saving the model
# save_pretrained creates its target directory when it is missing, so no mkdir is needed.
# (The previous mkdir calls created unused folders in the working directory, not on Drive.)
model.model.save_pretrained('/content/drive/MyDrive/t5_trained_model')
t5_tokenizer.save_pretrained('/content/drive/MyDrive/t5_tokenizer')

# Inference / Predictions

Importing the Trained Model from Drive

In [69]:
trained_model_path = '/content/drive/MyDrive/t5_trained_model'
trained_tokenizer = '/content/drive/MyDrive/t5_tokenizer'
device = 'cpu'

In [70]:
model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer)

Text Sample

In [71]:
context ="Holi is considered as one of the most revered and celebrated festivals of India and it is celebrated in almost every part of the country. It is also sometimes called as the “festival of love” as on this day people get to unite together forgetting all resentments and all types of bad feeling towards each other. The great Indian festival lasts for a day and a night, which starts in the evening of Purnima or the Full Moon Day in the month of Falgun. It is celebrated with the name Holika Dahan or Choti Holi on first evening of the festival and the following day is called Holi. In different parts of the country it is known with different names. The vibrancy of colors is something that brings in a lot of positivity in our lives and Holi being the festival of colours is actually a day worth rejoicing. Holi is a famous Hindu festival that is celebrated in every part of India with utmost joy and enthusiasm. The ritual starts by lighting up the bonfire one day before the day of Holi and this process symbolizes the triumph of good over the bad. On the day of Holi people play with colours with their friends and families and in evening they show love and respect to their close ones with Abeer."

text = "context: "+context
print(text)

context: Holi is considered as one of the most revered and celebrated festivals of India and it is celebrated in almost every part of the country. It is also sometimes called as the “festival of love” as on this day people get to unite together forgetting all resentments and all types of bad feeling towards each other. The great Indian festival lasts for a day and a night, which starts in the evening of Purnima or the Full Moon Day in the month of Falgun. It is celebrated with the name Holika Dahan or Choti Holi on first evening of the festival and the following day is called Holi. In different parts of the country it is known with different names. The vibrancy of colors is something that brings in a lot of positivity in our lives and Holi being the festival of colours is actually a day worth rejoicing. Holi is a famous Hindu festival that is celebrated in every part of India with utmost joy and enthusiasm. The ritual starts by lighting up the bonfire one day before the day of Holi and

In [72]:
encoding = tokenizer.encode_plus(text,max_length =512,padding='max_length',
                                 truncation = True,
                                 return_tensors="pt").to(device)
print (encoding.keys())
input_ids,attention_mask  = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

dict_keys(['input_ids', 'attention_mask'])


In [73]:
model.eval()

beam_outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=72, # How long the generated questions should be
    early_stopping=True,
    num_beams=5,
    num_return_sequences=1
)

for beam_output in beam_outputs:
    sent = tokenizer.decode(beam_output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
    print(sent)

question: Are there any other interesting aspects about this article? answer: ['Holi is considered as one of the most revered and celebrated festivals of India']


# Testing before Deployment

In [74]:
def get_question(sentence,mdl,tknizer):
  """Generate one question/answer pair for a passage.

  Args:
    sentence: Passage text; it is prefixed with "context: " before encoding.
    mdl: Model exposing ``generate``.
    tknizer: Tokenizer exposing ``encode_plus`` and ``decode``.

  Returns:
    Tuple ``(question, answer)`` of stripped strings. ``answer`` is an empty string when
    the generated text contains no "answer:" marker.
  """
  text = "context: {}".format(sentence)
  max_len = 256
  # padding=False replaces the deprecated pad_to_max_length=False (same behaviour).
  encoding = tknizer.encode_plus(text, max_length=max_len, padding=False,
                                 truncation=True, return_tensors="pt")
  input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]

  outs = mdl.generate(input_ids=input_ids,
                      attention_mask=attention_mask,
                      early_stopping=True,
                      num_beams=5,
                      num_return_sequences=1,
                      no_repeat_ngram_size=2,
                      max_length=72)

  # Only one sequence is requested; decode it with the tokenizer that was passed in
  # (never a notebook-level global).
  generated = tknizer.decode(outs[0], skip_special_tokens=True).strip()

  # Expected shape: "question: <q> answer: <a>". Drop the prefix only when it is present
  # instead of blindly cutting a fixed number of characters.
  question_tag, answer_tag = "question:", "answer:"
  if generated.startswith(question_tag):
    generated = generated[len(question_tag):]

  # partition() yields an empty answer when the marker is missing; find() would return -1
  # there and corrupt both slices.
  question, _, answer = generated.partition(answer_tag)
  return question.strip(), answer.strip()

In [79]:
context = "Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.Machine learning approaches have been applied to many fields including large language models, computer vision, speech recognition, email filtering, agriculture, and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5] ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods. "

print("context: ",context)
ques,answer = get_question(context,model,tokenizer)
print ("question: ",ques)
print ("answer: ",answer)

context:  Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.Machine learning approaches have been applied to many fields including large language models, computer vision, speech recognition, email filtering, agriculture, and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5] ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods. 
question:  Are there any other interesting aspects about this article?
answer:  ["ML is known in its application across business problems under the 

#Fine Tuning on Custom Dataset

In [75]:
!pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m174.1/232.6 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [80]:
text_input = """ Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3]

Machine learning approaches have been applied to many fields including large language models, computer vision, speech recognition, email filtering, agriculture, and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5] ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods.

The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis through unsupervised learning.[7][8] From a theoretical point of view Probably approximately correct learning provides a framework for describing machine learning. """

In [100]:
def strip_into_paragraphs(text):
    """Split text into paragraphs separated by blank (or whitespace-only) lines.

    The lines of a paragraph are stripped and joined with single spaces. Returns the
    paragraphs in order as a list of strings; empty paragraphs are never emitted.
    """
    paragraphs = []
    pending = []

    # The trailing '' acts as a sentinel blank line that flushes the final paragraph.
    for raw_line in text.split('\n') + ['']:
        piece = raw_line.strip()
        if piece:
            pending.append(piece)
        elif pending:
            paragraphs.append(' '.join(pending))
            pending = []

    return paragraphs

# Example usage




In [114]:
! pip install pikepdf
from pikepdf import Pdf
pdf = Pdf.open('/content/drive/MyDrive/Machine Learning - Tom Mitchell_compressed.pdf')
len(pdf.pages)



421

In [85]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-2.3.1-py3-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-2.3.1


In [86]:
def create_dataset(filename,  verbose = False, max_rows = 177):
  """Load the custom question-answering CSV and keep its leading rows.

  Args:
    filename: Path or file-like object accepted by ``pd.read_csv``.
    verbose: Unused; kept so existing callers that pass it keep working.
    max_rows: Number of leading rows to keep (177, the previously hard-coded value).

  Returns:
    DataFrame holding the ``Context``, ``Question`` and ``Answer`` columns of the first
    ``max_rows`` rows of the file.
  """
  data  = pd.read_csv(filename, skipinitialspace=True, usecols=['Context','Question','Answer'] )
  # Positional row slicing needs .iloc; data[0:177, :] is not valid DataFrame indexing and raises.
  return data.iloc[0:max_rows, :]

In [87]:
class QuestionGenerationDataset_New(Dataset):
    """Pre-tokenized question-generation examples built from an in-memory DataFrame.

    Each row becomes one example: the input is ``"context: <Context>"`` and the target is
    ``"question: <Question> answer: <Answer>"``. All rows are tokenized once, in ``_build``.
    """

    def __init__(self, tokenizer, data, max_len_inp=512,max_len_out=512):
        """
        Args:
            tokenizer: Tokenizer exposing ``batch_encode_plus``.
            data: DataFrame with ``Context``, ``Question`` and ``Answer`` columns.
            max_len_inp: Length inputs are padded/truncated to.
            max_len_out: Length targets are padded/truncated to.
        """
        self.passage_column = "Context"
        self.answer = "Answer"
        self.question = "Question"

        self.data = data
        self.max_len_input = max_len_inp
        self.max_len_output = max_len_out
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []
        self._build()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of one example as a dict.

        Keys: ``source_ids``, ``source_mask``, ``target_ids``, ``target_mask`` and ``labels``
        (a copy of ``target_ids`` with padding positions replaced by -100).
        """
        # squeeze(0) removes only the leading batch dimension; a bare squeeze() would also
        # collapse the sequence dimension whenever it has length 1.
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)

        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        # Use the tokenizer's own pad id, falling back to the previously hard-coded 0.
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0

        # clone() so that masking the labels never touches the stored target ids.
        labels = target_ids.clone()
        labels[labels == pad_id] = -100

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask,"labels":labels}

    def _build(self):
        """Tokenize every row of ``self.data`` into ``self.inputs`` / ``self.targets``."""
        # total= lets tqdm show real progress; iterrows() alone has no known length.
        for _, row in tqdm(self.data.iterrows(), total=len(self.data)):
            passage = row[self.passage_column]
            answer = row[self.answer]
            question = row[self.question]

            input_ = f"context: {passage}" # T5 Input format for question answering tasks
            target = f"question: {str(question)} answer: {answer}" # Output format we require

            # Single-item batches: each encoding keeps a leading batch dimension of 1.
            tokenized_inputs = self.tokenizer.batch_encode_plus(
                [input_], max_length=self.max_len_input,padding='max_length',
                truncation = True,return_tensors="pt"
            )
            tokenized_targets = self.tokenizer.batch_encode_plus(
                [target], max_length=self.max_len_output,padding='max_length',
                truncation = True,
                return_tensors="pt"
            )

            self.inputs.append(tokenized_inputs)
            self.targets.append(tokenized_targets)

In [113]:
# Draw the training rows once and validate on the rows that were NOT drawn. Calling
# sample() a second time picks an unrelated 70% subset, so the "validation" rows would
# overlap the training rows. The fixed seed keeps the split reproducible.
train_rows = df2.sample(frac = 0.7, random_state = 42)
train_dataset1 = QuestionGenerationDataset_New(t5_tokenizer, train_rows)
validation_dataset1 = QuestionGenerationDataset_New(t5_tokenizer, df2.drop(train_rows.index))

124it [00:02, 58.72it/s]
53it [00:01, 52.17it/s]


In [121]:
import pytorch_lightning as pl
from torch.optim import AdamW
import argparse
from transformers import (
    get_linear_schedule_with_warmup
  )

class T5Tuner1(pl.LightningModule):
    """Lightning wrapper that fine-tunes a seq2seq model on the custom dataset.

    Batches are dicts with the keys ``source_ids``, ``source_mask``, ``target_mask`` and
    ``labels``.
    """

    def __init__(self,t5model, t5tokenizer,batchsize=4, train_data=None, val_data=None):
        """
        Args:
            t5model: Model to fine-tune; called with ``input_ids``, ``attention_mask``,
                ``decoder_attention_mask`` and ``labels``.
            t5tokenizer: Tokenizer kept alongside the model (not used by the training loop).
            batchsize: Batch size for both dataloaders.
            train_data: Optional training dataset. When None, the notebook-level
                ``train_dataset1`` is used, as before.
            val_data: Optional validation dataset. When None, the notebook-level
                ``validation_dataset1`` is used, as before.
        """
        super().__init__()
        self.model = t5model
        self.tokenizer = t5tokenizer
        self.batch_size = batchsize
        self.train_data = train_data
        self.val_data = val_data

    def forward( self, input_ids, attention_mask=None,
                decoder_attention_mask=None,
                lm_labels=None):
        """Run the wrapped model; ``lm_labels`` is passed to it as ``labels``."""
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _batch_loss(self, batch):
        """Forward one batch and return the first model output, which is used as the loss."""
        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            decoder_attention_mask=batch['target_mask'],
            lm_labels=batch['labels']
        )
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._batch_loss(batch)
        self.log('train_loss',loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss for one batch."""
        loss = self._batch_loss(batch)
        self.log("val_loss",loss)
        return loss

    def train_dataloader(self):
        """Training loader; shuffled so every epoch sees the examples in a new order."""
        data = self.train_data if self.train_data is not None else train_dataset1
        return DataLoader(data, batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=2)

    def val_dataloader(self):
        """Validation loader; order is left unchanged."""
        data = self.val_data if self.val_data is not None else validation_dataset1
        return DataLoader(data,
                          batch_size=self.batch_size,
                          num_workers=2)

    def configure_optimizers(self):
        """AdamW over all module parameters with a fixed learning rate of 3e-4."""
        optimizer = AdamW(self.parameters(), lr=3e-4, eps=1e-8)
        return optimizer

In [None]:
trained_model_path = '/content/drive/MyDrive/t5_trained_model'
trained_tokenizer = '/content/drive/MyDrive/t5_tokenizer'
device = 'cpu'

In [None]:
model = T5ForConditionalGeneration.from_pretrained(trained_model_path)
tokenizer = T5Tokenizer.from_pretrained(trained_tokenizer)

model1 = T5Tuner1(model,tokenizer)

trainer = pl.Trainer(max_epochs = 10,accelerator=device, log_every_n_steps=10)

trainer.fit(model1)

In [None]:
# saving the model
#!mkdir "t5_tokenizer1"
#!mkdir "t5_trained_model1"
model1.model.save_pretrained('/content/drive/MyDrive/t5_trained_model2')
t5_tokenizer.save_pretrained('/content/drive/MyDrive/t5_tokenizer2')

Fine-tuned for 10 epochs and saved to this location.

In [None]:
model1.model.save_pretrained('/content/drive/MyDrive/t5_trained_model2')
t5_tokenizer.save_pretrained('/content/drive/MyDrive/t5_tokenizer2')

Inference

In [104]:
trained_model_path1 = '/content/drive/MyDrive/t5_trained_model2'
trained_tokenizer1 = '/content/drive/MyDrive/t5_tokenizer2'
device = 'cpu'

In [105]:
model1 = T5ForConditionalGeneration.from_pretrained(trained_model_path1)
tokenizer1 = T5Tokenizer.from_pretrained(trained_tokenizer1)

In [106]:
def get_question1(sentence, mdl, tknizer, max_input_len=256, max_output_len=72):
  '''Generate one question/answer pair for a passage of text.

  The passage is wrapped as "context: <sentence>", tokenized with `tknizer`
  (truncated to `max_input_len` tokens, no padding) and passed to
  `mdl.generate` with 5-beam search. Only the first returned sequence is used.

  Args:
    sentence: passage the question should be about.
    mdl: model object exposing `generate(input_ids=..., attention_mask=..., ...)`.
    tknizer: tokenizer that is callable on text and exposes `decode`.
    max_input_len: maximum number of input tokens kept after truncation.
    max_output_len: `max_length` passed to `generate`.

  Returns:
    (question, answer) as stripped strings. `answer` is "" when the generated
    text contains no "answer:" marker.
  '''
  text = "context: {}".format(sentence)
  encoding = tknizer(text, max_length=max_input_len, padding=False,
                     truncation=True, return_tensors="pt")

  outs = mdl.generate(input_ids=encoding["input_ids"],
                      attention_mask=encoding["attention_mask"],
                      early_stopping=True,
                      num_beams=5,
                      num_return_sequences=1,
                      no_repeat_ngram_size=2,
                      max_length=max_output_len)

  # Always decode with the tokenizer that was passed in; a same-named global
  # may belong to a different model.
  decoded = tknizer.decode(outs[0], skip_special_tokens=True)

  # NOTE(review): assumes the fine-tuned model emits
  # "question: ... answer: ..." -- confirm against the training targets.
  question_tag = "question:"
  answer_tag = "answer:"

  # partition() keeps the whole text as the question when the marker is
  # missing, instead of slicing with a -1 index.
  question_part, _, answer_part = decoded.partition(answer_tag)
  question_part = question_part.strip()
  if question_part.startswith(question_tag):
    question_part = question_part[len(question_tag):].strip()

  return question_part, answer_part.strip()

Testing the Model

In [107]:
# Smoke test: generate a single question/answer pair for a sample paragraph
# using the reloaded model and tokenizer.
context = "Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to many fields including large language models, computer vision, speech recognition, email filtering, agriculture, and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5] ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods. The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (parallel) field of study, focusing on exploratory data analysis through unsupervised learning.[7][8] From a theoretical point of view Probably approximately correct learning provides a framework for describing machine learning."
print("context: ",context)
# get_question1 returns a (question, answer) tuple.
ques,answer = get_question1(context,model1,tokenizer1)
print ("question: ",ques)
print ("answer: ",answer)

context:  Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions.[1] Recently, generative artificial neural networks have been able to surpass many previous approaches in performance.[2][3] Machine learning approaches have been applied to many fields including large language models, computer vision, speech recognition, email filtering, agriculture, and medicine, where it is too costly to develop algorithms to perform the needed tasks.[4][5] ML is known in its application across business problems under the name predictive analytics. Although not all machine learning is statistically based, computational statistics is an important source of the field's methods. The mathematical foundations of ML are provided by mathematical optimization (mathematical programming) methods. Data mining is a related (par

In [108]:
def divide_into_paragraphs(text, num_paragraphs):
    """Split ``text`` into at most ``num_paragraphs`` chunks of whole words.

    Words are obtained with ``str.split()`` and re-joined with single spaces.
    The words are spread as evenly as possible: chunk sizes differ by at most
    one word, with the larger chunks first. Empty chunks are never returned,
    so a text with fewer words than ``num_paragraphs`` yields one chunk per
    word and an empty/whitespace-only text yields ``[]``.

    Args:
        text: the text to split.
        num_paragraphs: maximum number of chunks to produce (>= 1).

    Returns:
        List of non-empty strings, in original word order.

    Raises:
        ValueError: if ``num_paragraphs`` is less than 1.
    """
    if num_paragraphs < 1:
        raise ValueError("num_paragraphs must be at least 1")

    words = text.split()
    # `extra` chunks receive one additional word so no chunk absorbs the
    # whole remainder.
    base_size, extra = divmod(len(words), num_paragraphs)

    paragraphs = []
    start_index = 0
    for chunk_no in range(num_paragraphs):
        size = base_size + (1 if chunk_no < extra else 0)
        if size == 0:
            # All words are used up; remaining chunks would be empty.
            break
        end_index = start_index + size
        paragraphs.append(" ".join(words[start_index:end_index]))
        start_index = end_index

    return paragraphs


# Final Question & Answer Function without Similarity Check

In [110]:
!pip install PyPDF2
import PyPDF2
import math

def qa_func(start_page: int, end_page: int, num_qa: int,
            pdf_file_path: str = "/content/drive/MyDrive/Machine Learning - Tom Mitchell_compressed.pdf",
            max_qa_per_page: int = 5):
  """Generate up to `num_qa` question/answer pairs from a page range of a PDF.

  Each page's text is split into ceil(num_qa / number_of_pages) paragraphs and
  `get_question1` is called on each paragraph (with the module-level `model1`
  and `tokenizer1`) until `num_qa` pairs have been produced. No similarity
  filtering is done, so an identical question generated twice overwrites the
  earlier answer.

  NOTE: a later cell defines another `qa_func` (with similarity filtering);
  once that cell runs it replaces this definition.

  Args:
    start_page: first page index to read (index into `PdfReader.pages`).
    end_page: last page index to read, inclusive.
    num_qa: total number of pairs wanted.
    pdf_file_path: PDF to read; defaults to the textbook on Drive.
    max_qa_per_page: upper bound on pairs requested per page.

  Returns:
    dict mapping question -> answer. Empty when the request exceeds
    `max_qa_per_page` per page (a message is printed in that case).

  Raises:
    ValueError: if `end_page` is smaller than `start_page`.
  """
  if end_page < start_page:
    raise ValueError("end_page must be greater than or equal to start_page")

  qa_dict = {}
  num_qa_per_pg = math.ceil(num_qa / (end_page - start_page + 1))
  if num_qa_per_pg > max_qa_per_page:
    # Too many questions for this few pages; keep the original best-effort
    # behaviour of reporting and returning nothing.
    print('Enter less number of questions')
    return qa_dict

  count = 0
  with open(pdf_file_path, "rb") as file:
    pdf_reader = PyPDF2.PdfReader(file)

    for page_num in range(start_page, end_page + 1):
      if count >= num_qa:
        # Quota reached: skip text extraction for the remaining pages.
        break
      page = pdf_reader.pages[page_num].extract_text()

      for paragraph in divide_into_paragraphs(page, num_qa_per_pg):
        if count >= num_qa:
          break
        ques, answer = get_question1(paragraph, model1, tokenizer1)
        print(paragraph)
        qa_dict[ques] = answer
        count = count + 1
  return qa_dict



Calling qa_func

In [111]:
# Generate 2 question/answer pairs from PDF page indices 14-15 and show them.
my_dict={}
# The empty dict above is replaced by whatever qa_func returns.
my_dict=qa_func(14,15,2)
print(my_dict)

CHAPTER 1 INTRODUCITON 3 0 Learning to recognize spoken words. All of the most successful speech recognition systems employ machine learning in some form. For example, the SPHINX system (e.g., Lee 1989) learns speaker-specific strategies for recognizing the primitive sounds (phonemes) and words from the observed speech signal. Neural network learning methods (e.g., Waibel et al. 1989) and methods for learning hidden Markov models (e.g., Lee 1989) are effective for automatically customizing to,individual speakers, vocabularies, microphone characteristics, background noise, etc. Similar techniques have potential applications in many signal-interpretation problems. 0 Learning to drive an autonomous vehicle. Machine learning methods have been used to train computer-controlled vehicles to steer correctly when driving on a variety of road types. For example, the ALVINN system (Pomerleau 1989) has used its learned strategies to drive unassisted at 70 miles per hour for 90 miles on public high

# Final Question & Answer Function with Similarity Checking Mechanism




In [129]:
!pip install PyPDF2
import PyPDF2
import math
import re

def qa_func(start_page: int, end_page: int, num_qa: int,
            pdf_file_path: str = "/content/drive/MyDrive/Machine Learning - Tom Mitchell_compressed.pdf",
            max_qa_per_page: int = 5):
  """Generate up to `num_qa` mutually dissimilar Q/A pairs from a PDF page range.

  For every page, "Figure N" / "Fig. N" / "Table N" references are removed,
  the text is split into ceil(num_qa / number_of_pages) paragraphs and
  `get_question1` is run on each paragraph (with the module-level `model1`
  and `tokenizer1`). A generated question is kept only when it is non-empty
  and `process_question` reports no similar question among those already
  kept. Fewer than `num_qa` pairs are returned if the pages run out first.

  NOTE: relies on `process_question`, which is defined in a later cell of
  this notebook; that cell must have been run before calling this function.

  Args:
    start_page: first page index to read (index into `PdfReader.pages`).
    end_page: last page index to read, inclusive.
    num_qa: total number of pairs wanted.
    pdf_file_path: PDF to read; defaults to the textbook on Drive.
    max_qa_per_page: upper bound on pairs requested per page.

  Returns:
    dict mapping question -> answer. Empty when the request exceeds
    `max_qa_per_page` per page (a message is printed in that case).

  Raises:
    ValueError: if `end_page` is smaller than `start_page`.
  """
  if end_page < start_page:
    raise ValueError("end_page must be greater than or equal to start_page")

  qa_dict = {}
  figure_regex = r"\b(Fig(?:ure)?\.?\s\d+)\b"
  table_regex = r"\b(Table\s\d+)\b"
  num_qa_per_pg = math.ceil(num_qa / (end_page - start_page + 1))
  if num_qa_per_pg > max_qa_per_page:
    # Too many questions for this few pages; keep the original best-effort
    # behaviour of reporting and returning nothing.
    print('Enter less number of questions')
    return qa_dict

  count = 0
  with open(pdf_file_path, "rb") as file:
    pdf_reader = PyPDF2.PdfReader(file)

    for page_num in range(start_page, end_page + 1):
      if count >= num_qa:
        # Quota reached: skip text extraction for the remaining pages.
        break
      page = pdf_reader.pages[page_num].extract_text()
      print(page_num)

      # Drop figure and table references before generating questions.
      page = re.sub(figure_regex, "", page, flags=re.IGNORECASE)
      page = re.sub(table_regex, "", page, flags=re.IGNORECASE)

      for paragraph in divide_into_paragraphs(page, num_qa_per_pg):
        if count >= num_qa:
          break
        print(count)
        ques, answer = get_question1(paragraph, model1, tokenizer1)
        print(ques)
        if not ques:
          # process_question returns its " " sentinel for an empty question
          # as well, so filter empties here rather than storing a "" key.
          continue
        # " " is process_question's "no similar question found" sentinel.
        if process_question(ques, qa_dict.keys()) == " ":
          qa_dict[ques] = answer
          count = count + 1
  return qa_dict



In [None]:
# Generate 2 mutually dissimilar question/answer pairs from PDF page indices 15-20.
# NOTE: the qa_func defined in the cell above calls process_question, which is
# defined in a later cell; that cell (and the cells it depends on) must have
# been run before this one.
my_dict={}
my_dict=qa_func(15,20,2)
print(my_dict)

Question Similarity Implementation

In [122]:
# Inspect the generated dictionary: number of questions, then each question
# on its own line.
x = my_dict.keys()
print(len(x))
for i in x:
  print(i)
#print(x)
# All (question, answer) pairs.
y = my_dict.items()
print(y)#Full list
# Answers only.
z = my_dict.values()
print(z)

2
What are the advantages of machine learning in speech recognition systems?
How does the Computational complexity theory process handle the inherent complexity of different learning tasks?
dict_items([('What are the advantages of machine learning in speech recognition systems?', 'Machine learning methods have been used to train computer-controlled vehicles to steer correctly when driving on a variety of road types. These techniques have potential applications in signal-interpretation problems.'), ('How does the Computational complexity theory process handle the inherent complexity of different learning tasks?', "Optimal codes and their relationship to optimal training sequences for encoding a hypothesis are used to optimize predefined objectives and predict the next state of the process they are controlling. Occam's razor suggests that the simplest")])
dict_values(['Machine learning methods have been used to train computer-controlled vehicles to steer correctly when driving on a varie

In [134]:
!pip install torch
!pip install torchvision



In [135]:
class STSBertModel(torch.nn.Module):
    """Thin ``torch.nn.Module`` wrapper around a SentenceTransformer.

    The wrapped model is assembled from a ``bert-base-uncased`` Transformer
    module (``max_seq_length=128``) followed by a Pooling module sized to the
    Transformer's word-embedding dimension.
    """

    def __init__(self):
        super().__init__()

        encoder = models.Transformer('bert-base-uncased', max_seq_length=128)
        pooler = models.Pooling(encoder.get_word_embedding_dimension())
        self.sts_model = SentenceTransformer(modules=[encoder, pooler])

    def forward(self, input_data):
        """Pass ``input_data`` to the wrapped model and return its output unchanged."""
        return self.sts_model(input_data)

Loading the SBERT Model

In [None]:
!pip install sentence_transformers==2.2.1
import joblib
# Load the saved sentence-similarity model from Drive.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load files you created yourself or fully trust.
# NOTE(review): presumably this file is a pickled STSBertModel, in which case
# that class must already be defined in the session -- confirm.
trained_model = joblib.load('/content/drive/MyDrive/Copy of Sbert_Model_1')

In [None]:
!pip install datasets
!pip install sentence-transformers
!pip install transformers
!pip install torchmetrics.functional
import torch
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import load_dataset
import matplotlib.pyplot as plt
# BERT tokenizer used by predict_sts to encode sentence pairs.
# NOTE(review): this rebinds the global `tokenizer`, which the T5 training
# cell earlier bound to a T5Tokenizer; code that still expects the T5
# tokenizer under this name will silently get the BERT one instead.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [126]:
# Function to predict test data
def predict_sts(texts):
  """Return the cosine similarity between the first two texts.

  The texts are tokenized with the module-level `tokenizer` (padded and
  truncated to 128 tokens), `token_type_ids` is dropped from the encoding,
  and the result is passed to the module-level `trained_model` on CPU in
  eval mode. The similarity is computed between rows 0 and 1 of the model's
  'sentence_embedding' output.

  Args:
    texts: sequence of at least two strings; only the first two are compared.

  Returns:
    The cosine similarity as a Python float.

  Raises:
    ValueError: if fewer than two texts are given.
  """
  if len(texts) < 2:
    raise ValueError("predict_sts needs at least two texts to compare")

  trained_model.to('cpu')
  trained_model.eval()
  test_input = tokenizer(texts, padding='max_length', max_length = 128, truncation=True, return_tensors="pt")
  # The model is fed input_ids and attention_mask only.
  test_input.pop('token_type_ids', None)

  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    test_output = trained_model(test_input)['sentence_embedding']
  sim = torch.nn.functional.cosine_similarity(test_output[0], test_output[1], dim=0).item()

  return sim

In [127]:
def process_question(question,questions):
  """Find the first already-known question that is similar to `question`.

  Each candidate in `questions` is compared with `question` via
  `predict_sts`; a score above 0.8 counts as similar, and no further
  candidates are scored after the first hit.

  Args:
    question: the newly generated question.
    questions: iterable of questions to compare against.

  Returns:
    The first similar candidate, or the sentinel string " " (a single space)
    when `question` is empty or nothing similar is found.
  """
  no_match = " "
  if not len(question) > 0:
    return no_match
  for candidate in questions:
    pair = [question, candidate]
    if predict_sts(pair) > 0.8 :
      return candidate
  return no_match


In [128]:
def process_answer(answer,answers):
  """Report whether `answer` is similar to any of the given answers.

  Each candidate in `answers` is compared with `answer` via `predict_sts`;
  a score above 0.7 counts as similar, and no further candidates are scored
  after the first hit.

  Args:
    answer: the newly generated answer.
    answers: iterable of answers to compare against.

  Returns:
    True if a similar answer exists, otherwise False (including when
    `answers` is empty).
  """
  for value in answers:
    if predict_sts([answer, value]) > 0.7:
      return True
  return False