In this notebook we are downloading PDF file, converting it to TXT and doing some "pre-cleaning": removing not meaningful parts of document and leaving just the most valuable leftovers for our future generator.
THe outcome of the below code is pre-processed but still raw data.


"extracted_text" variable has "StringIO" type: The StringIO object is part of Python's io module and is a class that provides an in-memory file-like object that can be used for reading from or writing to strings as if they were files. It allows you to treat strings as file-like objects, which can be useful in various situations, such as when you want to read from or write to a string in a way that mimics file operations.


In [220]:
# import of libraries
from io import StringIO # extracted_text is the main variable, contains the whole text of document in stringIO format in memory
import requests
import re  # provides reg. exp. support
import math

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

import nltk
from nltk.corpus import stopwords


In [221]:
# downloading pdf to '/data/' folder
url = 'https://astqb.org/assets/documents/ISTQB_CTFL_Syllabus-v4.0.pdf'
r = requests.get(url, allow_redirects=True)
open('data/ISTQB_CTFL_Syllabus-v4.0.pdf', 'wb').write(r.content)

1113747

In [222]:
#converting pdf to text and saving into .txt file initial version
output_string = StringIO()
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0.txt'
with open('data/ISTQB_CTFL_Syllabus-v4.0.pdf', 'rb') as in_file, open(output_file_path, 'w', encoding='utf-8') as out_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
    # Getting the extracted text from StringIO, it means the entire text extracted from the PDF is stored as a single string in memory.
    extracted_text = output_string.getvalue()
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()


# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' saved to '{output_file_path}'")

Extracted text for 'The Certified Tester Foundation Level in Software Testing' saved to 'data/ISTQB_CTFL_Syllabus-v4.0.txt'


In [223]:
# let us check size of StringIO on the full size of converted file, just out of curiosity
size_bytes = len(extracted_text.encode('utf-8'))
print ('The length of string in bytes : ' + str (size_bytes))

# function's code is taken from stackoverflow ---
def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 1024)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])
# ---
print("Re-calculated size: ", convert_size(size_bytes))

The length of string in bytes : 200296
Re-calculated size:  195.6 KB


In [224]:
# Looking up for the text to remove everything before it
target_text = "1.1. What is Testing?"

# Finding the position of the target text in the extracted text
start_position = extracted_text.find(target_text)

# Checking if the target text was found, just in case
if start_position != -1:
    # Removing everything before the target text
    extracted_text = extracted_text[start_position:]


# let us save the content to .txt file with prefix '_v0.1' for further debugging purpose and human evaluation process

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v01.txt'
with open('data/ISTQB_CTFL_Syllabus-v4.0.txt', 'rb') as in_file, open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()

# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to '{output_file_path}'")



Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v01.txt'


In [225]:
# removing empty lines
# _ - is iterator, if s.strip(): This part of the list comprehension checks whether the line s contains any non-whitespace characters. 
# If it does, the line is included in the resulting list.

extracted_text = "".join([_ for _ in extracted_text.strip().splitlines(True) if _.strip()])

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v02.txt'
with open('data/ISTQB_CTFL_Syllabus-v4.0_v01.txt', 'rb') as in_file, open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()

# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.2 saved to '{output_file_path}'")

Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.2 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v02.txt'


In [226]:
# Removing text from 'Page 56 of 74' till the end of the text

# Looking up for the text to remove everything after it
target_text = "Page 56 of 74"

# Finding the position of the target text in the extracted text
end_position = extracted_text.find(target_text)

# Checking if the target text was found, just in case
if end_position != -1:
    # Removing everything before the target text
    extracted_text = extracted_text[:end_position]


# let us save the content to .txt file with prefix '_v0.1' for further debugging purpose and human evaluation process

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v03.txt'
with open('data/ISTQB_CTFL_Syllabus-v4.0_v02.txt', 'rb') as in_file, open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()

# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v03.txt'


In [227]:
# convert to lower case all words in stringIO
extracted_text = extracted_text.lower()

In [228]:
#  stop words dictionary
stop_words = {
    "2.1.3.": True,  # Section number
    "TDD": True,  # Acronym for Test-Driven Development
    "ATDD": True,  # Acronym for Acceptance Test-Driven Development
    "BDD": True,  # Acronym for Behavior-Driven Development
    "are": True,  # Common verb
    "as": True,  # Common conjunction
    "each": True,  # Common determiner
    "for": True,  # Common preposition
    "in": True,  # Common preposition
    "is": True,  # Common verb
    "of": True,  # Common preposition
    "on": True,  # Common preposition
    "the": True,  # Common determiner
    "to": True,  # Common preposition
    "which": True,  # Common pronoun
    "with": True,  # Common preposition
    "and": True,  # Common conjunction
    "an": True,  # Common determiner
    "by": True,  # Common preposition
    "in": True,  # Common preposition
    "then": True,  # Common adverb
    "of": True,  # Common preposition
    "as": True,  # Common conjunction
    "the": True,  # Common determiner
    "which": True,  # Common pronoun
    "to": True,  # Common preposition
    "for": True,  # Common preposition
    "all": True,  # Common determiner
    "above": True,  # Common preposition
    "an": True,  # Common determiner
    "as": True,  # Common conjunction
    "in": True,  # Common preposition
    "is": True,  # Common verb
    "of": True,  # Common preposition
    "on": True,  # Common preposition
    "the": True,  # Common determiner
    "to": True,  # Common preposition
    "which": True,  # Common pronoun
    "with": True,  # Common preposition
    "a": True,  # Common determiner
    "by": True,  # Common preposition
    "in": True,  # Common preposition
    "then": True,  # Common adverb
    "of": True,  # Common preposition
    "as": True,  # Common conjunction
    "the": True,  # Common determiner
    "which": True,  # Common pronoun
    "to": True,  # Common preposition
    "for": True,  # Common preposition
    "may": True,  # Modal verb
    "in": True,  # Common preposition
    "as": True,  # Common conjunction
    "to": True,  # Common preposition
    "in": True,  # Common preposition
    "by": True,  # Common preposition
    "to": True,  # Common preposition
    "and": True,  # Common conjunction
    "to": True,  # Common preposition
    "a": True,  # Common determiner
    "in": True,  # Common preposition
    "the": True,  # Common determiner
    "for": True,  # Common preposition
    "to": True,  # Common preposition
    "by": True,  # Common preposition
    "for": True,  # Common preposition
    "as": True,  # Common conjunction
    "in": True,  # Common preposition
    "the": True,  # Common determiner
    "of": True,  # Common preposition
    "on": True,  # Common preposition
    "to": True,  # Common preposition
    "for": True,  # Common preposition
    "all": True,  # Common determiner
    "above": True,  # Common preposition
}

# Apply the stop words to your text
# Iterate through your text and remove stop words
filtered_text = []
words = extracted_text.split()  # Split text into words
for word in words:
    # Check if the word is not a stop word (case-insensitive)
    if word.lower() not in stop_words:
        filtered_text.append(word)

# Rejoin the filtered words to form the filtered


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Elena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

#removing . , : etc

import re
from io import StringIO

# Assuming you have the extracted_text StringIO object with the text
# ...

# Create a StringIO object with the text
string_io = StringIO(extracted_text)

# Initialize a variable to store the cleaned text
cleaned_text = ""

# Flag to indicate whether we are in the header/footer section
in_header_footer = False

# Regular expression pattern to match the header pattern
header_pattern = r"v4\.0\nPage \d+ of \d+\n\d{4}-\d{2}-\d{2}\n© International Software Testing Qualifications Board\nCertified Tester\nFoundation Level"

# Iterate through each line of the StringIO object
for line in string_io:
    # Check if the line matches the header pattern
    if re.match(header_pattern, line) and not in_header_footer:
        in_header_footer = True
    # Check if the line is empty or contains only whitespace
    elif not line.strip():
        in_header_footer = False
    # If not in the header/footer section, add the line to cleaned_text
    if not in_header_footer:
        cleaned_text += line

# Update the extracted_text with the cleaned_text
extracted_text = cleaned_text

# Printing the cleaned text
print(extracted_text)
