# Generate question-answer pairs from a text file using open-source LLMs

In [None]:
! pip install clean-text

import dataclasses
import logging
import math
import ast
import re
import os
import io
import sys
import time
import json
import tqdm
import copy
import pandas as pd

from typing import Optional, Sequence, Union
from cleantext import clean
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

## Load and preprocess

In [39]:
# Read the raw source document that the question/answer pairs are generated from.
dir_cur = os.getcwd()
fname = "umich.txt"
# Decode explicitly as UTF-8: the document contains non-ASCII punctuation
# (curly quotes), and the platform default encoding is not UTF-8 everywhere.
with open(os.path.join(dir_cur, fname), "r", encoding="utf-8") as file:
    context = file.read()


#### Personally Identifiable Information (PII) removal and other preprocessing using cleantext

In [40]:

def f_clean(context_raw):
    """Normalize raw text with cleantext and mask e-mail addresses and phone numbers.

    Args:
        context_raw: The text to clean.

    Returns:
        Whatever ``cleantext.clean`` returns for the options below: the text
        with unicode fixed, transliterated to ASCII and lowercased, and with
        e-mail addresses / phone numbers replaced by ``<EMAIL>`` / ``<PHONE>``.
    """
    return clean(
        context_raw,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=False,           # keep line breaks: paragraphs are split on them later
        no_urls=False,                  # replace all URLs with a special token
        no_emails=True,                 # replace all email addresses with a special token (PII)
        no_phone_numbers=True,          # replace all phone numbers with a special token (PII)
        no_numbers=False,               # replace all numbers with a special token
        no_digits=False,                # replace all digits with a special token
        no_currency_symbols=False,      # replace all currency symbols with a special token
        no_punct=False,                 # remove punctuations
        replace_with_punct="",          # instead of removing punctuations you may replace them
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en",                      # set to 'de' for German special handling
    )


# Opt-in switch: the cleaning pass is skipped unless this is set to True.
pii_remove = False
if pii_remove:
    context = f_clean(context)


#### Dataset-specific cleanup and split into a list of paragraphs

In [41]:
# Normalize the raw text, then split it into paragraphs.
context = context.lower()  # Lowercase
# Remove HTML tags/markup first so the whitespace they leave behind is
# normalized by the steps below. NOTE: this also strips any literal "<...>"
# span that sits on a single line.
context = re.sub(r'<.*?>', '', context)
context = re.sub(r'[ \t]+', ' ', context)  # Collapse runs of spaces/tabs while MAINTAINING NEW LINE CHARACTERS
context = context.strip()  # Remove leading/trailing whitespace

# A paragraph break is one or more blank lines. `\n\s*\n` also accepts blank
# lines that still contain whitespace, which a plain `\n{2,}` would miss.
paragraphs = [para.strip() for para in re.split(r'\n\s*\n', context)]
paragraphs = [para for para in paragraphs if para]  # Drop empty paragraphs
para_len_l = len(paragraphs)


In [42]:
# Bare expression: the notebook echoes the paragraph list so the split can be inspected.
paragraphs

['green card application process\nthere are essentially three steps in the employment-based green card application process:',
 '1. labor certification (perm)\nwith limited exceptions, all eb-2 and eb-3 green card applications require that the employer obtain a labor certification from the u.s. department of labor. for petitions requiring this step, the labor certification process is often the hardest and most arduous step. prior to being able to file the labor certification application, the employer must obtain a prevailing wage from the department of labor and prove that there are no minimally qualified u.s. workers available for the positions through the completion of a competitive recruitment process.',
 'in the case of positions that contain teaching duties, the employer must document that the selected applicant is the “best qualified” for the position. this process is commonly called “special handling.”',
 'in both the “basic” and the “special handling” process, the employer must 

## Question answer generation with lmqg

In [43]:
! pip install lmqg
from pprint import pprint
from lmqg import TransformersQG

# Download the en_core_web_sm model explicitly 
! python -m spacy download en_core_web_sm  # spacy is a counterpart of nltk

# initialize model
model = TransformersQG(model='lmqg/t5-base-squad-qg-ae', max_length=1024) # max length of a paragraph 
# paragraph to generate pairs of question and answer

context = context

question_answer = model.generate_qa(paragraphs)
# the output is a list of tuple (question, answer)
pprint(question_answer)
# [
#     ('Who was an English painter who specialised in watercolour landscapes?', 'William Turner'),
#     ('What is William Turner often known as?', 'William Turner of Oxford or just Turner of Oxford'),
#     ("What did many of Turner's paintings depict?", 'the countryside around Oxford'),
#     ("What is one of Turner's best known pictures?", 'a view of the city of Oxford from Hinksey Hill')
# ]

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m97.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 1683.41it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 31/31 [00:00<00:00, 1874.40it/s]


[[('How many steps are there in the employment-based green card application '
   'process?',
   'three')],
 [('What is the hardest and most arduous step for petitions requiring this '
   'step?',
   'labor certification'),
  ('What is often the hardest and most arduous step for petitions requiring '
   'this step?',
   'labor certification process'),
  ('What must an employer obtain before being able to file a labor '
   'certification application?',
   'prevailing wage')],
 [('What is special handling?',
   'the employer must document that the selected applicant is the “best '
   'qualified”'),
  ('What is the process called when an employer must document that an '
   'applicant is the best qualified for a position?',
   'special handling.')],
 [('What does the "basic" and "special handling" process require?',
   'the employer must complete a formal recruitment process to document that '
   'there are no minimally qualified u.s. workers available'),
  ('When must the formal recruitmen

In [44]:
 # Report how many question/answer pairs were produced for each paragraph.
 qa_counts = [len(pairs) for pairs in question_answer]
 print(f"For {len(paragraphs)} paragraphs, {qa_counts} questions are generated respectively")

For 12 paragraphs, [1, 3, 2, 2, 3, 2, 2, 5, 2, 5, 3, 1] questions are generated respectively


In [45]:
# Flatten the per-paragraph lists into a single list of (question, answer) tuples.
QApair_flat = []
for paragraph_pairs in question_answer:
    QApair_flat.extend(paragraph_pairs)
len(QApair_flat)  # echoed by the notebook: total number of pairs

31

## Write the generated question-answer pairs to a CSV file

In [46]:
# Persist the flattened (question, answer) tuples as a two-column CSV in the
# current working directory. The header was previously misspelled "Qustion".
QApair_df = pd.DataFrame(QApair_flat, columns=["Question", "Answer"])
dir_cur = os.getcwd()
QApair_df.to_csv(os.path.join(dir_cur, "output_qa_augment.csv"), index=False)  # index=False: no row-number column