In this notebook we are downloading PDF file, converting it to TXT and doing some "pre-cleaning": removing not meaningful parts of document and leaving just the most valuable leftovers for our future generator.
THe outcome of the below code is pre-processed but still raw data.


"extracted_text" variable has "StringIO" type: The StringIO object is part of Python's io module and is a class that provides an in-memory file-like object that can be used for reading from or writing to strings as if they were files. It allows you to treat strings as file-like objects, which can be useful in various situations, such as when you want to read from or write to a string in a way that mimics file operations.


In [159]:
# import of libraries
from io import StringIO # extracted_text is the main variable, contains the whole text of document in stringIO format in memory
import requests
import re  # provides reg. exp. support
import math

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
import fitz
import nltk
from nltk.corpus import stopwords

import spacy
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import sentencepiece
from transformers import T5ForConditionalGeneration, T5Tokenizer, BertForQuestionAnswering, BertTokenizer
from transformers import AutoTokenizer
from keybert import KeyBERT
import gradio as gr # UI part for the quize
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize

In [None]:
#!pip install -U pip setuptools wheel
#!pip install -U spacy
#!python -m spacy download en_core_web_sm
#!pip install PyMuPDF # this is fitz
#!pip install gradio
#!pip install keybert
#!pip install sentencepiece

In [160]:
# downloading pdf to '/data/' folder
url = 'https://astqb.org/assets/documents/ISTQB_CTFL_Syllabus-v4.0.pdf'
r = requests.get(url, allow_redirects=True)
open('data/ISTQB_CTFL_Syllabus-v4.0.pdf', 'wb').write(r.content)

1113747

r"Page \d{4,74} of 74"

In [161]:
#converting pdf to text and saving into .txt file initial version
output_string = StringIO()
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0.txt'
with open('data/ISTQB_CTFL_Syllabus-v4.0.pdf', 'rb') as in_file, open(output_file_path, 'w', encoding='utf-8') as out_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
    # Getting the extracted text from StringIO, it means the entire text extracted from the PDF is stored as a single string in memory.
    extracted_text = output_string.getvalue()
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()


# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' saved to '{output_file_path}'")

Extracted text for 'The Certified Tester Foundation Level in Software Testing' saved to 'data/ISTQB_CTFL_Syllabus-v4.0.txt'


In [162]:
# let us check size of StringIO on the full size of converted file, just out of curiosity
size_bytes = len(extracted_text.encode('utf-8'))
print ('The length of string in bytes : ' + str (size_bytes))

# function's code is taken from stackoverflow ---
def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 1024)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])
# ---
print("File size, document contains 70+ pages: ", convert_size(size_bytes))

The length of string in bytes : 198489
File size, document contains 70+ pages:  193.84 KB


In [163]:
# Looking up for the text to remove everything before it
target_text = "1.1. What is Testing?"

# Finding the position of the target text in the extracted text
start_position = extracted_text.find(target_text)

# Checking if the target text was found, just in case
if start_position != -1:
    # Removing everything before the target text
    extracted_text = extracted_text[start_position:]


# let us save the content to .txt file with prefix '_v0.1' for further debugging purpose and human evaluation process

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v01.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()

# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to '{output_file_path}'")



Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v01.txt'


In [None]:
# removing empty lines
# _ - is iterator, if s.strip(): This part of the list comprehension checks whether the line s contains any non-whitespace characters. 
# If it does, the line is included in the resulting list.

# extracted_text = "".join([_ for _ in extracted_text.strip().splitlines(True) if _.strip()])
# 
# output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v02.txt'
# with open('data/ISTQB_CTFL_Syllabus-v4.0_v01.txt', 'rb') as in_file, open(output_file_path, 'w', encoding='utf-8') as out_file:
 #   Writing the extracted text to the output file
    # out_file.write(extracted_text)
# 
#Closing the stream
# output_string.close()
# 
#Printing message to indicate that the text has been saved to the file
# print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.2 saved to '{output_file_path}'")

In [164]:
# Removing text from 'Page 56 of 74' till the end of the text

# Looking up for the text to remove everything after it
target_text = "Page 56 of 74"

# Finding the position of the target text in the extracted text
end_position = extracted_text.find(target_text)

# Checking if the target text was found, just in case
if end_position != -1:
    # Removing everything before the target text
    extracted_text = extracted_text[:end_position]


# let us save the content to .txt file with prefix '_v0.1' for further debugging purpose and human evaluation process

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v03.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()

# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v03.txt'


In [None]:
#Your stop words list
# stop_words = ["v4.0", "Page", "74", "18", "15", "of", "2023-04-21", "©", "Certified Tester", "Foundation", "Level", "International Software Testing Qualifications Board"]
# 
#Split the extracted_text into words
# words = extracted_text.split()
# 
#Filter out words that are in the stop words list
# filtered_words = [word for word in words if word.lower() not in stop_words]
# 
#Join the filtered words back into a text
# extracted_text = " ".join(filtered_words)
# 
#Print the cleaned text
#print(extracted_text)
# 
# 
# output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v05.txt'
# with open(output_file_path, 'w', encoding='utf-8') as out_file:
#    Writing the extracted text to the output file
    # out_file.write(extracted_text)
# 
#Closing the stream
# output_string.close()
# 
#Printing message to indicate that the text has been saved to the file
# print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.5 saved to '{output_file_path}'")

In [None]:
# convert to lower case all words in stringIO
#extracted_text = extracted_text.lower()

In [None]:

#punctuation
# Load the language model
#nlp = spacy.load("en_core_web_sm")

# Process the text with SpaCy
###doc = nlp(extracted_text)

# Create a list of tokens that are not punctuation
#filtered_tokens = [token.text for token in doc if not token.is_punct]

# Join the filtered tokens back into a text
#extracted_text = " ".join(filtered_tokens)

# Print the text without punctuation
#print(extracted_text)

#output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v04.txt'
#with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
#    out_file.write(extracted_text)

# Closing the stream
#output_string.close()

# Printing message to indicate that the text has been saved to the file
#print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to '{output_file_path}'")



Check results, looks like some parts are not removed

In [180]:
# built by chatgpt on provided context from my side, I used a part of text of file above, reviewed and customized by me as well
#stop_words = ["buxton", "a", "about", "above", "additional", "an", "and", "another", "are", "as", "be", "being", "by", "can", "common", "commonly", "do", "does", "each", "even", "for", "from", "has", "have", "in", "including", "is", "it", "its", "it's", "many", "may", "more", "most", "not", "of", "74", "often", "on", "or", "over", "such", "than", "that", "the", "there", "these", "this", "to", "under", "was", "we", "what", "when", "which", "who", "why", "will", "with", "within", "work", "you", "2023", "04", "21", "v4.0", "page", "2023-04-21", "©", "international", "qualifications", "board", "certified", "tester",  "foundation", "level", "FL-", "K2", "see", "section" , "didn't", "doesn't", "don't", "i.e.", "it's", "let's", "that's", "there's", "they're", "you're", "e.g."]
stop_words = [ "©", "15", "16", "17", "18", "19", "20", r"\b20\b", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36",
              "37", "38","39", "40","41", "42","43", "44","45", "46", "47", "48", "49", "50", "51", "52", "53", "54", 
              "International Software Testing Qualifications Board Certified Tester Foundation Level", "21.04.2023", "01.07.2021",
              "11.11.2019", "27.04.2018", "1.04.2011", "30.03.2010", "01.05.2007", "01.07.2005", "25.02.1999", "the", "market", "(", ")", "in", "or"]
 
# Regular expression pattern to match phrases like "15 74", "16 74", ..., "54 74"
pattern = re.compile(r"(?s)^v4.0.*Foundation Level$", re.DOTALL)
# Split the extracted_text into words
words = re.split(r'\s+', extracted_text)
# 
# Filter out words that match the regular expression pattern or are in the stop words list
filtered_words = [word for word in words if not re.match(pattern, word) and word.lower() not in stop_words]
# Join the filtered words back into a text
extracted_text = " ".join(filtered_words)

# Ph. removal
phrase_to_remove = "International Software Testing Qualifications Board Certified Tester Foundation Level"

# Replace the phrase with an empty string and comas removal (across the whole text)
extracted_text = extracted_text.replace(phrase_to_remove, "")
extracted_text = extracted_text.replace(",", "")

# Regular expression pattern to match and remove text inside brackets and brackets as well
pattern_brackets = r'\([^)]*\)'

# removal of text inside brackets and brackets
extracted_text = re.sub(pattern_brackets, '', extracted_text)

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v05.txt' 
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
output_string.close()
 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.5 saved to '{output_file_path}'")

Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.5 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v05.txt'


In [None]:
#pattern = r'[0-9]'

# Match all digits in the string and replace them with an empty string
#extracted_text = re.sub(pattern, '', extracted_text)

#extracted_text = ''.join((x for x in extracted_text if not x.isdigit()))


#output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v06.txt'
#with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
#    out_file.write(extracted_text)
# 
# Closing the stream
#output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
#print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.6 saved to '{output_file_path}'")


In [None]:



# # Load the language model
# nlp = spacy.load("en_core_web_sm")

# # Your text
# text = extracted_text

# # Process the text with SpaCy
# doc = nlp(text)

# # Create a StringIO object to store the NER results
# output_string = io.StringIO()

# # Extract named entities and write them to the StringIO object
# for ent in doc.ents:
#     output_string.write(f"Entity: {ent.text}, Type: {ent.label_}\n")

# # Get the NER results as a string
# ner_results = output_string.getvalue()

# output_file_path = 'data/NER.txt'
# with open(output_file_path, 'w', encoding='utf-8') as out_file:
#     # Writing the extracted text to the output file
#       out_file.write(ner_results)

# # Closing the stream
# output_string.close()

# # Printing message to indicate that the text has been saved to the file
# print(f"Extracted NER list for 'The Certified Tester Foundation Level in Software Testing; {output_file_path}")



At this point NER dict is saved into /data folder, edited manually and now let us import this file into stop_list StringIO

In [None]:


# Check the content of stop_list_stringio
#content = stop_list_stringio.getvalue()
#print(content)

In [None]:
#Markov Chain
# Sample text (replace with your extracted_text)
# Tokenize the text into words
#tokens = nltk.word_tokenize(extracted_text)

# Create a dictionary to store transition probabilities
#transition_probabilities = {}

# Build the transition probability matrix
#for i in range(len(tokens) - 1):
#    current_token = tokens[i]
#    next_token = tokens[i + 1]
    
#    if current_token in transition_probabilities:
#        transition_probabilities[current_token].append(next_token)
#    else:
#        transition_probabilities[current_token] = [next_token]

# Start with an initial word
#current_word = random.choice(tokens)

# Generate a sentence of a certain length
#generated_text = [current_word]
#sentence_length = 10

#for _ in range(sentence_length - 1):
#    if current_word in transition_probabilities:
#        next_word = random.choice(transition_probabilities[current_word])
#        generated_text.append(next_word)
#        current_word = next_word
#    else:
#        break

# Join the generated words into a sentence
#generated_sentence = " ".join(generated_text)
#print(generated_sentence)


In [166]:
# remove chapter 4 beginning

# Define the regular expression pattern for the text to remove
pattern = r'4\. Test Analysis and Design – 390 minutes.*?(K3) Use acceptance test-driven development (ATDD) to derive test cases'

# Use re.sub to replace the matched text with a marker (e.g., 'REMOVED')
extracted_text = re.sub(pattern, "", extracted_text, flags=re.DOTALL)

# Define the phrase you want to remove
phrase_to_remove = "Learning Objectives for Chapter 4:"

# Replace the phrase with an empty string
extracted_text = extracted_text.replace(phrase_to_remove, "")
# 
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v07.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.7 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.7 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v07.txt'


In [167]:

# remove chapter 4 beginning
# Define the regular expression pattern to remove the desired text
pattern = r'4\.1 Test Techniques Overview.*?4\.5\.3 \(K3\) Use acceptance test-driven development \(ATDD\) to derive test cases'

# Use re.sub to replace the matched text with an empty string
extracted_text = re.sub(pattern, '', extracted_text, flags=re.DOTALL)



output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v08.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.8 saved to '{output_file_path}'")



Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.8 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v08.txt'


In [168]:
# remove chapter 3 beginning
# Define the regular expression pattern for the text to remove
pattern = r'3\. Static Testing – 80 minutes.*?FL-3\.2\.5 \(K1\) Recall the factors that contribute to a successful review'

# Use re.sub to replace the matched text with a marker (e.g., 'REMOVED')
extracted_text = re.sub(pattern, "", extracted_text, flags=re.DOTALL)

# Define the phrase you want to remove
phrase_to_remove = "Learning Objectives for Chapter 4:"

# Replace the phrase with an empty string
extracted_text = extracted_text.replace(phrase_to_remove, "")
# 
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v09.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.9 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.9 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v09.txt'


In [169]:
# remove chapter 2 beginning

# Define the regular expression pattern for the text to remove
pattern = r'2\. Testing Throughout the Software Development Lifecycle.*?FL-2\.3\.1 \(K2\) Summarize maintenance testing and its triggers'
# Use re.sub to replace the matched text with a marker (e.g., 'REMOVED')
extracted_text = re.sub(pattern, "", extracted_text, flags=re.DOTALL)

# Define the phrase you want to remove
phrase_to_remove = "Learning Objectives for Chapter 4:"

# Replace the phrase with an empty string
extracted_text = extracted_text.replace(phrase_to_remove, "")
# 
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v10.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.10 saved to '{output_file_path}'")




Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.10 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v10.txt'


In [170]:
# remove chapter 2 beginning

# Define the regular expression pattern for the text to remove

pattern = r'5\. Managing the Test Activities – 335 minutes.*?FL-5\.5\.1 \(K3\) Prepare a defect report'
# Use re.sub to replace the matched text with a marker (e.g., 'REMOVED')
extracted_text = re.sub(pattern, "", extracted_text, flags=re.DOTALL)

# Define the phrase you want to remove
phrase_to_remove = "Learning Objectives for Chapter 4:"

# Replace the phrase with an empty string
extracted_text = extracted_text.replace(phrase_to_remove, "")
# 
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v11.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.11 saved to '{output_file_path}'")

Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.11 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v11.txt'


In [None]:


#pattern = r'\d+\.\d+\.\d+\.'
#matches = re.findall(pattern, extracted_text)

#for match in matches:
#    match_without_dot = match[:-1]  # Remove the last dot
#    print(match_without_dot)


In [172]:
# Remove bullet points using regular expressions
extracted_text = re.sub(r'•', '', extracted_text)

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v12.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.12 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.12 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v12.txt'


In [186]:
# Define a regular expression pattern to match section titles
#section_pattern = r'\d+\.\d+\.\d+\.'

# using combined reg. exp to extract 1.1.1. and 1.2.
section_pattern_3d = r'\d+\.\d+\.\d+\.'  # Pattern for "1.1.1."
section_pattern_2d = r'\d+\.\d+\.'    # Pattern for "1.2."
combined_pattern = f"({section_pattern_3d}|{section_pattern_2d})"

# Using re.finditer to find all section titles and their starting positions
section_matches = re.finditer(combined_pattern, extracted_text)

# Create lists to store sections
sections = []

# Iterate through section matches
for match in section_matches:
    start_pos = match.start()
    end_pos = (
        match.end()
        if match.end() < len(extracted_text)
        else len(extracted_text)
    )
    section_title = match.group().strip()
    
    # Remove the last dot from the section title
    section_title = section_title[:-1]  # Remove the last dot
    
    try:
        # Find the corresponding section content based on section title position
        next_match = next(section_matches)
        content_start = end_pos
        content_end = (
            next_match.start()
            if content_start < len(extracted_text)
            else len(extracted_text)
        )
        section_content = extracted_text[content_start:content_end].strip()
    except StopIteration:
        # Handle the case when there are no more matches
        section_content = extracted_text[end_pos:].strip()
    
    sections.append((section_title, section_content))

# Print the extracted sections
if sections:
    for section in sections:
        print("Section Title:", section[0])
        print("Section Content:", section[1])
        print("-" * 40)
else:
    print("No sections found in the text, you did something wrong check once more")


Section Title: 1.1
Section Content: What is Testing? Software systems are an integral part of our daily life. Most people have had experience with software that did not work as expected. Software that does not work correctly can lead to many problems including loss of money time business reputation and extreme cases even injury death. Software testing assesses software quality and helps reducing risk of software failure operation. Software testing is a set of activities to discover defects and evaluate quality of software artifacts. These artifacts when being tested are known as test objects. A common misconception about testing is that it only consists of executing tests . However software testing also includes other activities and must be aligned with software development lifecycle . Another common misconception about testing is that testing focuses entirely on verifying test object. Whilst testing involves verification i.e. checking whether system meets specified requirements it als

In [177]:
type(sections)

list

In [195]:
# adding name?

#text = "1.1.2. Testing and Debugging\nTesting and debugging are separate activities. Testing can trigger failures that are caused by defects in the software (dynamic testing) or can directly find defects in the test object (static testing). When dynamic testing (see chapter 4) triggers a failure, debugging is concerned with finding causes of this failure (defects), analyzing these causes, and eliminating the"

# Define a regular expression pattern to match section titles and names
# section_pattern = r'(\d+\.\d+\.\d+\.\s[^\n]+)'
# 
#Using re.finditer to find all section titles and their starting positions
# section_matches = re.finditer(section_pattern, extracted_text)
# 
#Create lists to store sections
# sections = []
# 
#Iterate through section matches
# for match in section_matches:
    # section_info = match.group(1).strip()
# 
 #   Find the corresponding section content based on section title position
    # start_pos = match.end()
    # end_pos = (
        # next(section_matches, None)
        # if start_pos < len(text)
        # else None
    # )
    # 
    # if end_pos:
        # section_content = text[start_pos:end_pos.start()].strip()
    # else:
        # section_content = text[start_pos:].strip()
# 
    # sections.append((section_info, section_content))
# 
#Print the extracted sections
# if sections:
    # for section in sections:
        # print("Section Info:", section[0])
        # print("Section Content:", section[1])
        # print("-" * 40)
# else:
    # print("No sections found in the text.")
# 

In [None]:
# DRAFT

# Sample text (replace this with your actual text)

# Define a regular expression pattern to match section titles
# section_pattern = r'\b\d+\.\d+(?:\.\d+)?(?=\s)'
# 
#Using re.finditer to find all section titles and their starting positions
# section_matches = re.finditer(section_pattern, extracted_text)
# 
#Create lists to store sections
# sections = []
# 
#Iterate through section matches
# for match in section_matches:
    # start_pos = match.start()
    # end_pos = (
        # match.end()
        # if match.end() < len(extracted_text)
        # else len(extracted_text)
    # )
    # section_title = match.group().strip()
    # 
#   Remove the last dot from the section title
    # section_title = section_title[:-1]  # Remove the last dot
    # 
#    Find the corresponding section content based on section title position
    # content_start = end_pos
    # content_end = (
        # next(section_matches).start()
        # if content_start < len(extracted_text)
        # else len(extracted_text)
    # )
    # section_content = extracted_text[content_start:content_end].strip()
    # 
    # sections.append((section_title, section_content))
# 
#Print the extracted sections
# if sections:
    # for section in sections:
        # print("Section Title:", section[0])
        # print("Section Content:", section[1])
        # print("-" * 40)
# else:
    # print("No sections found in the text.")
# 

!Base Modeling!

In [187]:
#T5

# Load the pre-trained T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Input text (use sections[0][1] as the content of the first section)
section_content = sections[1][1]

# Tokenize the input section
input_ids = tokenizer.encode("summarize: " + section_content, return_tensors="pt", max_length=1024, truncation=True)

# Generate the summary
summary_ids = model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Print the summary
print("Summary:", summary)


Summary: static testing can trigger failures that are caused by defects software can directly find defects test object. static testing can trigger failures that are caused by defects software can directly find defects test object.


In [None]:
# keywords extraction - missing so far, leads to HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out. A lot of hours are required. 
# Talk to teachers

# Load the pre-trained KeyBERT model
#model = KeyBERT("distilbert-base-nli-mean-tokens")

# Input text (use sections[0][1] as the content of the first section)
#section_content = sections[0][1]

# Extract keywords
#try:
#    keywords = model.extract_keywords(section_content, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, top_n=10, resume_download=True)
    
    # Print the extracted keywords
#    for keyword in keywords:
#        print(keyword)
#except Exception as e:
#    print("An error occurred:", e)


In [211]:
# Attempt of questions generation
#Construct questions using the n-grams,
#N-grams are continuous sequences of words or symbols or tokens in a document
#and are defined as the neighboring sequences of items in a document.
#https://www.scaler.com/topics/nlp/n-gram-model-in-nlp

# Tokenize the summary into sentences
sentences = sent_tokenize(summary)  # section_content = sections[0][1]
# Function to generate a fixed number of questions from sentences using trigrams
def generate_questions(text, num_questions=20):
    questions = []
    for sentence in text:
        # Tokenize each sentence into words
        words = nltk.word_tokenize(sentence)
        # Generate n-grams (trigrams) from the words
        n_grams = list(ngrams(words, 3))

        # Construct questions using the trigrams
        for n_gram in n_grams:
            question = f"What is {n_gram[0]} {n_gram[1]} {n_gram[2]}?"
            questions.append(question)

            # Stop generating questions if we reach the desired number
            if len(questions) >= num_questions:
                return questions

    return questions[:num_questions]  # Return only the specified number of questions

# Generate 20 questions from the sentences using trigrams
questions = generate_questions(sentences, num_questions=20)

# Print the generated questions
for i, question in enumerate(questions, start=1):
    print(f"Question {i}: {question}")


Question 1: What is static testing can?
Question 2: What is testing can trigger?
Question 3: What is can trigger failures?
Question 4: What is trigger failures that?
Question 5: What is failures that are?
Question 6: What is that are caused?
Question 7: What is are caused by?
Question 8: What is caused by defects?
Question 9: What is by defects software?
Question 10: What is defects software can?
Question 11: What is software can directly?
Question 12: What is can directly find?
Question 13: What is directly find defects?
Question 14: What is find defects test?
Question 15: What is defects test object?
Question 16: What is test object .?
Question 17: What is static testing can?
Question 18: What is testing can trigger?
Question 19: What is can trigger failures?
Question 20: What is trigger failures that?


In [210]:

sentences = sent_tokenize(summary) 

from nltk.corpus import stopwords

# Function to generate a fixed number of questions from sentences using trigrams
def generate_questions(text, num_questions=20):
    questions = []
    stop_words = set(stopwords.words('english'))
    

from nltk.corpus import stopwords

# Function to generate a fixed number of questions from sentences using trigrams
def generate_questions(text, num_questions=20):
    questions = []
    stop_words = set(stopwords.words('english'))
    
    # Define question templates with different prefixes
    question_templates = ["What is", "What can"]
    
    for sentence in text:
        # Tokenize each sentence into words
        words = nltk.word_tokenize(sentence)
        # Generate n-grams (trigrams) from the words
        n_grams = list(ngrams(words, 3))

        # Construct questions using the trigrams and different question templates
        for n_gram in n_grams:
            if (
                n_gram[-1].lower() not in stop_words 
                and n_gram[-1].lower() != n_gram[-2].lower()
                and "can" not in n_gram
            ):
                for template in question_templates:
                    question = f"{template} {n_gram[0]} {n_gram[1]} {n_gram[2]}?"
                    questions.append(question)

            # Stop generating questions if we reach the desired number
            if len(questions) >= num_questions:
                return questions

    return questions[:num_questions]  # Return only the specified number of questions

# Generate 20 questions from the sentences using trigrams and different prefixes
questions = generate_questions(sentences, num_questions=20)

# Print the generated questions
for i, question in enumerate(questions, start=1):
    print(f"Question {i}: {question}")




Question 1: What is that are caused?
Question 2: What can that are caused?
Question 3: What is caused by defects?
Question 4: What can caused by defects?
Question 5: What is by defects software?
Question 6: What can by defects software?
Question 7: What is directly find defects?
Question 8: What can directly find defects?
Question 9: What is find defects test?
Question 10: What can find defects test?
Question 11: What is defects test object?
Question 12: What can defects test object?
Question 13: What is test object .?
Question 14: What can test object .?
Question 15: What is that are caused?
Question 16: What can that are caused?
Question 17: What is caused by defects?
Question 18: What can caused by defects?
Question 19: What is by defects software?
Question 20: What can by defects software?


In [None]:
# !!! possible training of T5 !!!

import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

# Load your custom dataset using the datasets library
dataset = load_dataset('your_custom_dataset')

# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
config = T5Config.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['input_text'], examples['target_text'], padding='max_length', truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=4,
    output_dir="./t5-fine-tuned",
    num_train_epochs=3,
    evaluation_strategy="steps",
    save_steps=10_000,
    eval_steps=10_000,
    save_total_limit=2,
)

# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=tokenized_datasets.data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("t5-fine-tuned")
tokenizer.save_pretrained("t5-fine-tuned")


In [None]:
# here comes gradio + manual selection of correct questions

In [None]:
# Load the pre-trained model and tokenizer
#model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Provide a passage and a question
#passage = extracted_text
#question = "Which of the following statements describe a valid test objective?"

#Which of the following statements describe a valid test objective?
#What does not work as expected?

# Tokenize the passage and question
#inputs = tokenizer(question, passage, return_tensors="pt", padding=True, truncation=True)

# Get the answer from the model
#start_scores, end_scores = model(**inputs, return_dict = False)
#start_idx = torch.argmax(start_scores)
#end_idx = torch.argmax(end_scores)

# Decode the answer from the tokenized output
#answer_tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
#answer = tokenizer.decode(answer_tokens)

#print("Answer:", answer)


To process text and generate questions with answers, you can consider using pre-trained language models, such as GPT-3, GPT-4, BERT, T5, or similar models. Each of these models has its strengths and can be used for different aspects of question generation and answering:

T5 (Text-to-Text Transfer Transformer): T5 is a versatile language model that can be fine-tuned for various natural language processing tasks, including question generation. You can fine-tune a pre-trained T5 model on your specific dataset to generate high-quality questions.

GPT-3: OpenAI's GPT-3 is a powerful language model known for its natural language generation capabilities. You can prompt GPT-3 to generate questions based on your input text. It can produce contextually relevant questions, but it may require careful instruction and filtering of the generated output.

BART (Bidirectional and Auto-Regressive Transformers): BART is another transformer-based model that can be fine-tuned for question generation tasks. It excels in text generation tasks and can produce coherent and meaningful questions.

XLNet: XLNet is a transformer model that has achieved strong performance in various NLP tasks. It can be fine-tuned for question generation, and its bidirectional context modeling can lead to better question generation.

BERT (Bidirectional Encoder Representations from Transformers): BERT can also be used for question generation by fine-tuning. While it was originally designed for understanding context, it can be adapted for question generation with appropriate training data.

UniLM: UniLM is a model that can be used for various text generation tasks, including question generation. It combines unidirectional, bidirectional, and sequence-to-sequence learning, making it versatile for NLP tasks.


In [None]:
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# from io import StringIO

# # Load the pre-trained model and tokenizer
# model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# # Create a StringIO object with your text
# text_io = StringIO()
# text_io.write("Your text goes here.")
# text_io.seek(0)  # Reset the StringIO object to the beginning

# # Read the text from the StringIO object and convert it to a regular string
# text = text_io.read()

# # Provide a question
# question = "What is the answer to my question?"

# # Tokenize the text and question
# inputs = tokenizer(question, text, return_tensors="pt", padding=True, truncation=True)

# # Get the answer from the model
# start_scores, end_scores = model(**inputs)
# start_idx = torch.argmax(start_scores)
# end_idx = torch.argmax(end_scores)

# # Decode the answer from the tokenized output
# answer_tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
# answer = tokenizer.decode(answer_tokens)

# print("Answer:", answer)


In [None]:
# Step 1: Question Generation using Seq2Seq (T5)

# Load the pre-trained Seq2Seq model for question generation
# question_generation_model = T5ForConditionalGeneration.from_pretrained("t5-small")
# question_generation_tokenizer = T5Tokenizer.from_pretrained("t5-small")
# 
#ISTQB document (replace with your actual content)
# istqb_document = """
# 1.1. What is Testing? 
# 
# Software systems are an integral part of our daily life. Most people have had experience with software 
# that did not work as expected. Software that does not work correctly can lead to many problems, 
# including loss of money, time or business reputation, and, in extreme cases, even injury or death. 
# Software testing assesses software quality and helps reducing the risk of software failure in operation. 
# 
# Software testing is a set of activities to discover defects and evaluate the quality of software artifacts. 
# These artifacts, when being tested, are known as test objects. A common misconception about testing is 
# that it only consists of executing tests (i.e., running the software and checking the test results). However, 
# software testing also includes other activities and must be aligned with the software development lifecycle 
# (see chapter 2). 
# 
# Another common misconception about testing is that testing focuses entirely on verifying the test object. 
# Whilst testing involves verification, i.e., checking whether the system meets specified requirements, it also 
# involves validation, which means checking whether the system meets users’ and other stakeholders’ 
# needs in its operational environment. 
# """
# 
#Generate questions from the ISTQB document
# def generate_questions(document, max_length=64, num_questions=1):
    # inputs = question_generation_tokenizer.encode("generate questions: " + document, return_tensors="pt", max_length=max_length, truncation=True)
    # questions = question_generation_model.generate(inputs, max_length=max_length, num_return_sequences=num_questions)
    # return [question_generation_tokenizer.decode(question, skip_special_tokens=True) for question in questions]
# 
# generated_questions = generate_questions(istqb_document)
# 
#Print generated questions
# for question in generated_questions:
    # print("Question:", question)
# 

Here is template for Quize layout, needs to be re-worked

In [189]:
import gradio as gr

# Replace this with your actual list of section titles and content
#sections = {
#    ("1.1.1.", "Content of Section 1.1.1..."),
#    ("1.2.", "Content of Section 1.2..."),
    # Add more sections as needed
#}

def display_sections(section):
    # Find the selected section in the list based on the section title
    selected_section = next((s for s in sections if s[0] == section), None)

    if selected_section:
        section_title, section_content = selected_section
        return f"Section Title: {section_title}\nSection Content: {section_content}"
    else:
        return "Section not found"

# Get a list of section titles from the sections list
section_titles = [section[0] for section in sections]

# Create a Gradio interface with a dropdown list of section titles
iface = gr.Interface(
    fn=display_sections,
    inputs=gr.inputs.Dropdown(section),
    outputs="text",
    css=".gradio-container {background-color: lightblue}"
)

iface.launch(share=False)


  inputs=gr.inputs.Dropdown(section),
  inputs=gr.inputs.Dropdown(section),


Running on local URL:  http://127.0.0.1:7894

To create a public link, set `share=True` in `launch()`.




In [None]:
# draft metrics

import nltk

# Reference questions (human-generated)
reference_questions = [
    "What is the test objective?",
    "How do objectives vary?",
    "What does the context include?",
    # Add more reference questions here
]

# Automatically generated questions
generated_questions = [
    "What is test objectives?",
    "What is objectives vary?",
    "How do objectives depend?",
    # Add more generated questions here
]

# Initialize the NLTK BLEU scorer
bleu_scorer = nltk.translate.bleu_score.SmoothingFunction()

# Calculate BLEU score (a measure of similarity)
bleu_scores = [nltk.translate.bleu_score.sentence_bleu([r.split()], g.split(), smoothing_function=bleu_scorer.method1) for r, g in zip(reference_questions, generated_questions)]

# Calculate accuracy rate (percentage of questions that match reference questions)
accuracy_rate = sum(score == 1.0 for score in bleu_scores) / len(bleu_scores) * 100

print("Accuracy Rate: {:.2f}%".format(accuracy_rate))
