In this notebook we are downloading PDF file, converting it to TXT and doing some "pre-cleaning": removing not meaningful parts of document and leaving just the most valuable leftovers for our future generator.
THe outcome of the below code is pre-processed but still raw data.


"extracted_text" variable has "StringIO" type: The StringIO object is part of Python's io module and is a class that provides an in-memory file-like object that can be used for reading from or writing to strings as if they were files. It allows you to treat strings as file-like objects, which can be useful in various situations, such as when you want to read from or write to a string in a way that mimics file operations.


In [84]:
# import of libraries
from io import StringIO # extracted_text is the main variable, contains the whole text of document in stringIO format in memory
import requests
import re  # provides reg. exp. support
import math
import api
from selenium import webdriver

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
import fitz
import nltk
from nltk.corpus import stopwords

import spacy
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import sentencepiece
from transformers import T5ForConditionalGeneration, T5Tokenizer, BertForQuestionAnswering, BertTokenizer
from transformers import AutoTokenizer
from keybert import KeyBERT
import gradio as gr # UI part for the quize
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize
import fuzzywuzzy

# libraries for conversion from csv to json, results of "Flag" button click CSV -> JSON
import csv
import json

In [None]:
#!pip install -U pip setuptools wheel
#!pip install -U spacy
#!python -m spacy download en_core_web_sm
#!pip install PyMuPDF # this is fitz
#!pip install gradio
#!pip install keybert
#!pip install sentencepiece

In [85]:
# downloading pdf to '/data/' folder
url = 'https://astqb.org/assets/documents/ISTQB_CTFL_Syllabus-v4.0.pdf'
r = requests.get(url, allow_redirects=True)
open('data/ISTQB_CTFL_Syllabus-v4.0.pdf', 'wb').write(r.content)

1113747

r"Page \d{4,74} of 74"

In [86]:
#converting pdf to text and saving into .txt file initial version
output_string = StringIO()
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0.txt'
with open('data/ISTQB_CTFL_Syllabus-v4.0.pdf', 'rb') as in_file, open(output_file_path, 'w', encoding='utf-8') as out_file:
    parser = PDFParser(in_file)
    doc = PDFDocument(parser)
    rsrcmgr = PDFResourceManager()
    device = TextConverter(rsrcmgr, output_string, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
    # Getting the extracted text from StringIO, it means the entire text extracted from the PDF is stored as a single string in memory.
    extracted_text = output_string.getvalue()
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()


# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' saved to '{output_file_path}'")

Extracted text for 'The Certified Tester Foundation Level in Software Testing' saved to 'data/ISTQB_CTFL_Syllabus-v4.0.txt'


In [87]:
# let us check size of StringIO on the full size of converted file, just out of curiosity
size_bytes = len(extracted_text.encode('utf-8'))
print ('The length of string in bytes : ' + str (size_bytes))

# function's code is taken from stackoverflow ---
def convert_size(size_bytes):
   if size_bytes == 0:
       return "0B"
   size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
   i = int(math.floor(math.log(size_bytes, 1024)))
   p = math.pow(1024, i)
   s = round(size_bytes / p, 2)
   return "%s %s" % (s, size_name[i])
# ---
print("File size, document contains 70+ pages: ", convert_size(size_bytes))

The length of string in bytes : 198489
File size, document contains 70+ pages:  193.84 KB


In [88]:
# Looking up for the text to remove everything before it
target_text = "1.1. What is Testing?"

# Finding the position of the target text in the extracted text
start_position = extracted_text.find(target_text)

# Checking if the target text was found, just in case
if start_position != -1:
    # Removing everything before the target text
    extracted_text = extracted_text[start_position:]


# let us save the content to .txt file with prefix '_v0.1' for further debugging purpose and human evaluation process

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v01.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()

# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to '{output_file_path}'")



Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v01.txt'


In [None]:
# removing empty lines
# _ - is iterator, if s.strip(): This part of the list comprehension checks whether the line s contains any non-whitespace characters. 
# If it does, the line is included in the resulting list.

# extracted_text = "".join([_ for _ in extracted_text.strip().splitlines(True) if _.strip()])
# 
# output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v02.txt'
# with open('data/ISTQB_CTFL_Syllabus-v4.0_v01.txt', 'rb') as in_file, open(output_file_path, 'w', encoding='utf-8') as out_file:
 #   Writing the extracted text to the output file
    # out_file.write(extracted_text)
# 
#Closing the stream
# output_string.close()
# 
#Printing message to indicate that the text has been saved to the file
# print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.2 saved to '{output_file_path}'")

In [89]:
# Removing text from 'Page 56 of 74' till the end of the text

# Looking up for the text to remove everything after it
target_text = "Page 56 of 74"

# Finding the position of the target text in the extracted text
end_position = extracted_text.find(target_text)

# Checking if the target text was found, just in case
if end_position != -1:
    # Removing everything before the target text
    extracted_text = extracted_text[:end_position]


# let us save the content to .txt file with prefix '_v0.1' for further debugging purpose and human evaluation process

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v03.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)

# Closing the stream
output_string.close()

# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.3 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.3 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v03.txt'


In [None]:
#Your stop words list
# stop_words = ["v4.0", "Page", "74", "18", "15", "of", "2023-04-21", "©", "Certified Tester", "Foundation", "Level", "International Software Testing Qualifications Board"]
# 
#Split the extracted_text into words
# words = extracted_text.split()
# 
#Filter out words that are in the stop words list
# filtered_words = [word for word in words if word.lower() not in stop_words]
# 
#Join the filtered words back into a text
# extracted_text = " ".join(filtered_words)
# 
#Print the cleaned text
#print(extracted_text)
# 
# 
# output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v05.txt'
# with open(output_file_path, 'w', encoding='utf-8') as out_file:
#    Writing the extracted text to the output file
    # out_file.write(extracted_text)
# 
#Closing the stream
# output_string.close()
# 
#Printing message to indicate that the text has been saved to the file
# print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.5 saved to '{output_file_path}'")

In [None]:
# convert to lower case all words in stringIO
#extracted_text = extracted_text.lower()

In [None]:

#punctuation
# Load the language model
#nlp = spacy.load("en_core_web_sm")

# Process the text with SpaCy
###doc = nlp(extracted_text)

# Create a list of tokens that are not punctuation
#filtered_tokens = [token.text for token in doc if not token.is_punct]

# Join the filtered tokens back into a text
#extracted_text = " ".join(filtered_tokens)

# Print the text without punctuation
#print(extracted_text)

#output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v04.txt'
#with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
#    out_file.write(extracted_text)

# Closing the stream
#output_string.close()

# Printing message to indicate that the text has been saved to the file
#print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.1 saved to '{output_file_path}'")



Check results, looks like some parts are not removed

In [90]:
# built by chatgpt on provided context from my side, I used a part of text of file above, reviewed and customized by me as well
#stop_words = ["buxton", "a", "about", "above", "additional", "an", "and", "another", "are", "as", "be", "being", "by", "can", "common", "commonly", "do", "does", "each", "even", "for", "from", "has", "have", "in", "including", "is", "it", "its", "it's", "many", "may", "more", "most", "not", "of", "74", "often", "on", "or", "over", "such", "than", "that", "the", "there", "these", "this", "to", "under", "was", "we", "what", "when", "which", "who", "why", "will", "with", "within", "work", "you", "2023", "04", "21", "v4.0", "page", "2023-04-21", "©", "international", "qualifications", "board", "certified", "tester",  "foundation", "level", "FL-", "K2", "see", "section" , "didn't", "doesn't", "don't", "i.e.", "it's", "let's", "that's", "there's", "they're", "you're", "e.g."]
stop_words = [ "©", "15", "16", "17", "18", "19", "20", r"\b20\b", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", "36",
              "37", "38","39", "40","41", "42","43", "44","45", "46", "47", "48", "49", "50", "51", "52", "53", "54", 
              "International Software Testing Qualifications Board Certified Tester Foundation Level", "21.04.2023", "01.07.2021",
              "11.11.2019", "27.04.2018", "1.04.2011", "30.03.2010", "01.05.2007", "01.07.2005", "25.02.1999", "the", "market", "(", ")", "in", "or"]
 
# Regular expression pattern to match phrases like "15 74", "16 74", ..., "54 74"
pattern = re.compile(r"(?s)^v4.0.*Foundation Level$", re.DOTALL)
# Split the extracted_text into words
words = re.split(r'\s+', extracted_text)
# 
# Filter out words that match the regular expression pattern or are in the stop words list
filtered_words = [word for word in words if not re.match(pattern, word) and word.lower() not in stop_words] #match
# Join the filtered words back into a text
extracted_text = " ".join(filtered_words)

# Ph. removal
phrase_to_remove = "International Software Testing Qualifications Board Certified Tester Foundation Level"
phrase_to_remove_v = "v4.0 Page of 74 2023-04-21"

# Replace the phrase with an empty string and comas removal (across the whole text)
extracted_text = extracted_text.replace(phrase_to_remove, "")
#extracted_text = extracted_text.replace(",", "")
extracted_text = extracted_text.replace(phrase_to_remove_v, "")
# Regular expression pattern to match and remove text inside brackets and brackets as well
pattern_brackets = r'\([^)]*\)'

# removal of text inside brackets and brackets
extracted_text = re.sub(pattern_brackets, '', extracted_text)

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v05.txt' 
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
output_string.close()
 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.5 saved to '{output_file_path}'")

Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.5 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v05.txt'


In [None]:
#pattern = r'[0-9]'

# Match all digits in the string and replace them with an empty string
#extracted_text = re.sub(pattern, '', extracted_text)

#extracted_text = ''.join((x for x in extracted_text if not x.isdigit()))


#output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v06.txt'
#with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
#    out_file.write(extracted_text)
# 
# Closing the stream
#output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
#print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.6 saved to '{output_file_path}'")


In [None]:



# # Load the language model
# nlp = spacy.load("en_core_web_sm")

# # Your text
# text = extracted_text

# # Process the text with SpaCy
# doc = nlp(text)

# # Create a StringIO object to store the NER results
# output_string = io.StringIO()

# # Extract named entities and write them to the StringIO object
# for ent in doc.ents:
#     output_string.write(f"Entity: {ent.text}, Type: {ent.label_}\n")

# # Get the NER results as a string
# ner_results = output_string.getvalue()

# output_file_path = 'data/NER.txt'
# with open(output_file_path, 'w', encoding='utf-8') as out_file:
#     # Writing the extracted text to the output file
#       out_file.write(ner_results)

# # Closing the stream
# output_string.close()

# # Printing message to indicate that the text has been saved to the file
# print(f"Extracted NER list for 'The Certified Tester Foundation Level in Software Testing; {output_file_path}")



At this point NER dict is saved into /data folder, edited manually and now let us import this file into stop_list StringIO

In [None]:


# Check the content of stop_list_stringio
#content = stop_list_stringio.getvalue()
#print(content)

In [None]:
#Markov Chain
# Sample text (replace with your extracted_text)
# Tokenize the text into words
#tokens = nltk.word_tokenize(extracted_text)

# Create a dictionary to store transition probabilities
#transition_probabilities = {}

# Build the transition probability matrix
#for i in range(len(tokens) - 1):
#    current_token = tokens[i]
#    next_token = tokens[i + 1]
    
#    if current_token in transition_probabilities:
#        transition_probabilities[current_token].append(next_token)
#    else:
#        transition_probabilities[current_token] = [next_token]

# Start with an initial word
#current_word = random.choice(tokens)

# Generate a sentence of a certain length
#generated_text = [current_word]
#sentence_length = 10

#for _ in range(sentence_length - 1):
#    if current_word in transition_probabilities:
#        next_word = random.choice(transition_probabilities[current_word])
#        generated_text.append(next_word)
#        current_word = next_word
#    else:
#        break

# Join the generated words into a sentence
#generated_sentence = " ".join(generated_text)
#print(generated_sentence)


In [91]:
# remove chapter 4 beginning

# Define the regular expression pattern for the text to remove
pattern = r'4\. Test Analysis and Design – 390 minutes.*?(K3) Use acceptance test-driven development (ATDD) to derive test cases'

# Use re.sub to replace the matched text with a marker (e.g., 'REMOVED')
extracted_text = re.sub(pattern, "", extracted_text, flags=re.DOTALL)

# Define the phrase you want to remove
phrase_to_remove = "Learning Objectives for Chapter 4:"

# Replace the phrase with an empty string
extracted_text = extracted_text.replace(phrase_to_remove, "")
# 
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v07.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.7 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.7 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v07.txt'


In [92]:

# remove chapter 4 beginning
# Define the regular expression pattern to remove the desired text
pattern = r'4\.1 Test Techniques Overview.*?4\.5\.3 \(K3\) Use acceptance test-driven development \(ATDD\) to derive test cases'

# Use re.sub to replace the matched text with an empty string
extracted_text = re.sub(pattern, '', extracted_text, flags=re.DOTALL)



output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v08.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.8 saved to '{output_file_path}'")



Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.8 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v08.txt'


In [93]:
# remove chapter 3 beginning
# Define the regular expression pattern for the text to remove
pattern = r'3\. Static Testing – 80 minutes.*?FL-3\.2\.5 \(K1\) Recall the factors that contribute to a successful review'

# Use re.sub to replace the matched text with a marker (e.g., 'REMOVED')
extracted_text = re.sub(pattern, "", extracted_text, flags=re.DOTALL)

# Define the phrase you want to remove
phrase_to_remove = "Learning Objectives for Chapter 4:"

# Replace the phrase with an empty string
extracted_text = extracted_text.replace(phrase_to_remove, "")
# 
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v09.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.9 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.9 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v09.txt'


In [94]:
# remove chapter 2 beginning

# Define the regular expression pattern for the text to remove
pattern = r'2\. Testing Throughout the Software Development Lifecycle.*?FL-2\.3\.1 \(K2\) Summarize maintenance testing and its triggers'
# Use re.sub to replace the matched text with a marker (e.g., 'REMOVED')
extracted_text = re.sub(pattern, "", extracted_text, flags=re.DOTALL)

# Define the phrase you want to remove
phrase_to_remove = "Learning Objectives for Chapter 4:"

# Replace the phrase with an empty string
extracted_text = extracted_text.replace(phrase_to_remove, "")
# 
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v10.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.10 saved to '{output_file_path}'")




Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.10 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v10.txt'


In [95]:
# remove chapter 2 beginning

# Define the regular expression pattern for the text to remove

pattern = r'5\. Managing the Test Activities – 335 minutes.*?FL-5\.5\.1 \(K3\) Prepare a defect report'
# Use re.sub to replace the matched text with a marker (e.g., 'REMOVED')
extracted_text = re.sub(pattern, "", extracted_text, flags=re.DOTALL)

# Define the phrase you want to remove
phrase_to_remove = "Learning Objectives for Chapter 4:"

# Replace the phrase with an empty string
extracted_text = extracted_text.replace(phrase_to_remove, "")
# 
output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v11.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.11 saved to '{output_file_path}'")

Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.11 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v11.txt'


In [None]:


#pattern = r'\d+\.\d+\.\d+\.'
#matches = re.findall(pattern, extracted_text)

#for match in matches:
#    match_without_dot = match[:-1]  # Remove the last dot
#    print(match_without_dot)


In [96]:
# Remove bullet points using regular expressions
extracted_text = re.sub(r'•', '', extracted_text)

output_file_path = 'data/ISTQB_CTFL_Syllabus-v4.0_v12.txt'
with open(output_file_path, 'w', encoding='utf-8') as out_file:
    # Writing the extracted text to the output file
    out_file.write(extracted_text)
# 
# Closing the stream
output_string.close()
# 
# Printing message to indicate that the text has been saved to the file
print(f"Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.12 saved to '{output_file_path}'")


Extracted text for 'The Certified Tester Foundation Level in Software Testing' pre processed version 0.12 saved to 'data/ISTQB_CTFL_Syllabus-v4.0_v12.txt'


In [None]:
# Define a regular expression pattern to match section titles
#section_pattern = r'\d+\.\d+\.\d+\.'

# using combined reg. exp to extract 1.1.1. and 1.2.
#section_pattern_3d = r'\d+\.\d+\.\d+\.'  # Pattern for "1.1.1."
#section_pattern_2d = r'\d+\.\d+\.'    # Pattern for "1.2."
#combined_pattern = f"({section_pattern_3d}|{section_pattern_2d})"#

## Using re.finditer to find all section titles and their starting positions
#section_matches = re.finditer(combined_pattern, extracted_text)#

## Create lists to store sections
#sections = []#

## Iterate through section matches
#for match in section_matches:
#    start_pos = match.start()
#    end_pos = (
#        match.end()
#        if match.end() < len(extracted_text)
#        else len(extracted_text)
#    )
#    section_title = match.group().strip()
#    
#    # Remove the last dot from the section title
#    section_title = section_title[:-1]  # Remove the last dot
#    
#    try:
#        # Find the corresponding section content based on section title position
#        next_match = next(section_matches)
#        content_start = end_pos
#        content_end = (
#            next_match.start()
#            if content_start < len(extracted_text)
#            else len(extracted_text)
#        )
#        section_content = extracted_text[content_start:content_end].strip()
#    except StopIteration:
#        # Handle the case when there are no more matches
#        section_content = extracted_text[end_pos:].strip()
#    
#    sections.append((section_title, section_content))#

## Print the extracted sections
#if sections:
#    for section in sections:
#        print("Section Title:", section[0])
#        print("Section Content:", section[1])
#        print("-" * 40)
#else:
#    print("No sections found in the text, you did something wrong check once more")#


In [None]:
type(sections)

In [None]:
# adding name?

#text = "1.1.2. Testing and Debugging\nTesting and debugging are separate activities. Testing can trigger failures that are caused by defects in the software (dynamic testing) or can directly find defects in the test object (static testing). When dynamic testing (see chapter 4) triggers a failure, debugging is concerned with finding causes of this failure (defects), analyzing these causes, and eliminating the"

# Define a regular expression pattern to match section titles and names
# section_pattern = r'(\d+\.\d+\.\d+\.\s[^\n]+)'
# 
#Using re.finditer to find all section titles and their starting positions
# section_matches = re.finditer(section_pattern, extracted_text)
# 
#Create lists to store sections
# sections = []
# 
#Iterate through section matches
# for match in section_matches:
    # section_info = match.group(1).strip()
# 
 #   Find the corresponding section content based on section title position
    # start_pos = match.end()
    # end_pos = (
        # next(section_matches, None)
        # if start_pos < len(text)
        # else None
    # )
    # 
    # if end_pos:
        # section_content = text[start_pos:end_pos.start()].strip()
    # else:
        # section_content = text[start_pos:].strip()
# 
    # sections.append((section_info, section_content))
# 
#Print the extracted sections
# if sections:
    # for section in sections:
        # print("Section Info:", section[0])
        # print("Section Content:", section[1])
        # print("-" * 40)
# else:
    # print("No sections found in the text.")
# 

In [None]:
# DRAFT

# Sample text (replace this with your actual text)

# Define a regular expression pattern to match section titles
# section_pattern = r'\b\d+\.\d+(?:\.\d+)?(?=\s)'
# 
#Using re.finditer to find all section titles and their starting positions
# section_matches = re.finditer(section_pattern, extracted_text)
# 
#Create lists to store sections
# sections = []
# 
#Iterate through section matches
# for match in section_matches:
    # start_pos = match.start()
    # end_pos = (
        # match.end()
        # if match.end() < len(extracted_text)
        # else len(extracted_text)
    # )
    # section_title = match.group().strip()
    # 
#   Remove the last dot from the section title
    # section_title = section_title[:-1]  # Remove the last dot
    # 
#    Find the corresponding section content based on section title position
    # content_start = end_pos
    # content_end = (
        # next(section_matches).start()
        # if content_start < len(extracted_text)
        # else len(extracted_text)
    # )
    # section_content = extracted_text[content_start:content_end].strip()
    # 
    # sections.append((section_title, section_content))
# 
#Print the extracted sections
# if sections:
    # for section in sections:
        # print("Section Title:", section[0])
        # print("Section Content:", section[1])
        # print("-" * 40)
# else:
    # print("No sections found in the text.")
# 

!Base Modeling!

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Read the text from the StringIO file (replace with your own text)
#extracted_text = "Your text goes here."

# Tokenize the text
inputs = tokenizer("summarize: " + extracted_text, return_tensors="pt", max_length=512, truncation=True)

# Generate keywords one at a time
num_keywords = 150  # Adjust as needed
keywords = []

for _ in range(num_keywords):
    output = model.generate(inputs["input_ids"], max_length=len(keywords) + 1, num_return_sequences=1, no_repeat_ngram_size=2)
    keyword = tokenizer.decode(output[0], skip_special_tokens=True)
    keywords.append(keyword)

# Print the generated keywords
print("Keywords:", keywords)


In [None]:
#paraphrasing text --> takes too long

from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
model_name = "t5-large"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)
 
# Tokenize the text
inputs = tokenizer("paraphrase: " + extracted_text, return_tensors="pt", max_length=512, truncation=True)

# Generate paraphrased sentences
num_paraphrases = 155  # Adjust as needed
paraphrases = []

for _ in range(num_paraphrases):
    output = model.generate(inputs["input_ids"], max_length=512, num_return_sequences=1, no_repeat_ngram_size=2)
    paraphrase = tokenizer.decode(output[0], skip_special_tokens=True)
    paraphrases.append(paraphrase)

# Print the generated paraphrases
for i, paraphrase in enumerate(paraphrases):
    print(f"Paraphrase {i + 1}: {paraphrase}")


In [None]:
# generation of summary
#Check if there are sections available before accessing them
if len(sections) >= 2:
    # Load the pre-trained T5 model and tokenizer
    model_name = "t5-large"
    model = T5ForConditionalGeneration.from_pretrained(model_name)
    tokenizer = T5Tokenizer.from_pretrained(model_name)

    for section_index, (section_title, section_content) in enumerate(sections):
        # Tokenize the input section
        input_ids = tokenizer.encode("summarize: " + section_content, return_tensors="pt", max_length=1024, truncation=True)

        # Generate the summary
        summary_ids = model.generate(input_ids, max_length=150, min_length=30, length_penalty=2.0, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        # Print the summary for each section
        print(f"Summary for Section {section_index + 1} - Title: {section_title}")
        print("Summary:", summary)
        print("-" * 40)
else:
    print("Not enough sections found in the list.")


In [97]:
# summary + keywords 0 T5 questions

from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the T5 model and tokenizer
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Sample summary and keywords (replace with your own data)
summary = "Software testing is the process of evaluating software to identify defects."
keywords = ["Software testing", "process", "defects"]

# Prepare input for question generation
input_text = f"Summary: {summary} Keywords: {', '.join(keywords)}"

# Tokenize the input
inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)

# Generate questions
output = model.generate(inputs["input_ids"], max_length=512, num_return_sequences=1, no_repeat_ngram_size=2)

# Decode and print the generated questions
generated_questions = [tokenizer.decode(question, skip_special_tokens=True) for question in output]
for i, question in enumerate(generated_questions, 1):
    print(f"Question {i}: {question}")


Question 1: software testing is the process of evaluating software to identify defects. Keywords: Software testing, process, defects, defect detection, and defect identification.


In [None]:
type(summary)

In [None]:
# questions generation

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the pre-trained model and tokenizer for question generation
tokenizer = AutoTokenizer.from_pretrained("PrimeQA/mt5-base-tydi-question-generator")
model = AutoModelForSeq2SeqLM.from_pretrained("PrimeQA/mt5-base-tydi-question-generator")

# Function to generate a question for a given summary
def generate_question(summary, max_length=64):
    # Tokenize the input text and generate the question
    features = tokenizer([summary], return_tensors='pt', padding=True, truncation=True, max_length=512)
    output = model.generate(input_ids=features['input_ids'], 
                            attention_mask=features['attention_mask'],
                            max_length=max_length,
                            num_return_sequences=1)
    
    return tokenizer.decode(output[0])

# Example usage:
summary = "defect management process includes a workflow for handling individual anomalies from their discovery to their closure and rules for their classification. process must be followed by all involved stakeholders. a defect report logged during dynamic testing typically includes: Unique identifier Title with a short summary of anomaly being reported."
question = generate_question(summary)
print("Generated Question:", question)



In [None]:
#experiment!!!

from transformers import BartTokenizer, BartForConditionalGeneration

# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"  # You can choose a different model size if needed
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Function to generate an answer for a given question and context
def generate_answer(question, context, max_length=64):
    # Prepare the input text by combining the question and context
    input_text = f"question: {question} context: {context}"

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate the answer
    answer_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True, num_return_sequences=1)

    # Decode the generated answer
    answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)

    return answer

# Example usage:
#context = "defect management process includes a workflow for handling individual anomalies from their discovery to their closure and rules for their classification. process must be followed by all involved stakeholders. a defect report logged during dynamic testing typically includes: Unique identifier Title with a short summary of anomaly being reported."
#question = "What is the most common defect management process?"

# Generate the answer
generated_answer = generate_answer(question, context)
print("Generated Answer:", generated_answer)









In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Load the pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = BartTokenizer.from_pretrained(model_name)

# Function to generate an answer for a given question and context
def generate_answer(question, context, max_length=64):
    # Prepare the input text by combining the question and context
    input_text = f"question: {question} context: {context}"

    # Tokenize the input text
    input_ids = tokenizer.encode(input_text, return_tensors="pt", max_length=1024, truncation=True)

    # Generate the answer
    answer_ids = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True, num_return_sequences=1)

    # Decode the generated answer
    answer = tokenizer.decode(answer_ids[0], skip_special_tokens=True)

    return answer

# Example usage:
#context = "defect management process includes a workflow for handling individual anomalies from their discovery to their closure and rules for their classification. process must be followed by all involved stakeholders. a defect report logged during dynamic testing typically includes: Unique identifier Title with a short summary of anomaly being reported."
context = "testing provides a cost-effective means of detecting defects."
question = "What provides a cost-effective means of detecting defects?"

# Generate the answer
generated_answer = generate_answer(question, context)
print("Generated Answer:", generated_answer)


In [None]:
# keywords extraction - missing so far, leads to HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out. A lot of hours are required. 
# Talk to teachers

# Load the pre-trained KeyBERT model
#model = KeyBERT("distilbert-base-nli-mean-tokens")

# Input text (use sections[0][1] as the content of the first section)
#section_content = sections[0][1]

# Extract keywords
#try:
#    keywords = model.extract_keywords(section_content, keyphrase_ngram_range=(1, 2), stop_words='english', use_mmr=True, top_n=10, resume_download=True)
    
    # Print the extracted keywords
#    for keyword in keywords:
#        print(keyword)
#except Exception as e:
#    print("An error occurred:", e)


In [None]:

sentences = sent_tokenize(summary) 

from nltk.corpus import stopwords

# Function to generate a fixed number of questions from sentences using trigrams
def generate_questions(text, num_questions=20):
    questions = []
    stop_words = set(stopwords.words('english'))
    

from nltk.corpus import stopwords

# Function to generate a fixed number of questions from sentences using trigrams
def generate_questions(text, num_questions=30):
    questions = []
    stop_words = set(stopwords.words('english'))
    
    # Define question templates with different prefixes
    question_templates = ["What", "What is", "What can", "What does"]
    
    for sentence in text:
        # Tokenize each sentence into words
        words = nltk.word_tokenize(sentence)
        # Generate n-grams (trigrams) from the words
        n_grams = list(ngrams(words, 3))

        # Construct questions using the trigrams and different question templates
        for n_gram in n_grams:
            if (
                n_gram[-1].lower() not in stop_words 
                and n_gram[-1].lower() != n_gram[-2].lower()
                and "can" not in n_gram
            ):
                for template in question_templates:
                    question = f"{template} {n_gram[0]} {n_gram[1]} {n_gram[2]}?"
                    questions.append(question)

            # Stop generating questions if we reach the desired number
            if len(questions) >= num_questions:
                return questions

    return questions[:num_questions]  # Return only the specified number of questions

# Generate 20 questions from the sentences using trigrams and different prefixes
questions = generate_questions(sentences, num_questions=30)

# Print the generated questions
for i, question in enumerate(questions, start=1):
    print(f"Question {i}: {question}")



In [None]:
#distractors generation

import random

def generate_distractors(correct_answer, num_distractors, word_embedding_model=None):
    """
    Generate distractors for a given correct answer.

    Args:
    - correct_answer (str): The correct answer.
    - num_distractors (int): The number of distractors to generate.
    - word_embedding_model: A pre-trained word embedding model (e.g., Word2Vec or GloVe).

    Returns:
    - distractors (list): A list of generated distractors.
    """
    distractors = []
    
    # You can start by making simple modifications to the correct answer, such as replacing words.
    for _ in range(num_distractors):
        # Randomly select a distractor generation strategy
        strategy = random.choice(["replace", "synonym"])
        
        if strategy == "replace":
            # Replace a random word in the correct answer with a random word
            correct_answer_words = correct_answer.split()
            word_to_replace = random.choice(correct_answer_words)
            replacement_word = "replacement_word"  # Replace this with logic to select a random word
            distractor = correct_answer.replace(word_to_replace, replacement_word)
        elif strategy == "synonym" and word_embedding_model:
            # Find a synonym for a random word in the correct answer using word embeddings
            correct_answer_words = correct_answer.split()
            word_to_replace = random.choice(correct_answer_words)
            synonyms = find_synonyms(word_to_replace, word_embedding_model)
            if synonyms:
                replacement_word = random.choice(synonyms)
                distractor = correct_answer.replace(word_to_replace, replacement_word)
            else:
                # If no synonyms are found, use a fallback strategy (e.g., random replacement)
                distractor = generate_random_distractor(correct_answer)
        else:
            # Use a fallback strategy (e.g., random replacement) if no other strategy is chosen
            distractor = generate_random_distractor(correct_answer)

        distractors.append(distractor)
    
    return distractors

def find_synonyms(word, word_embedding_model):
    """
    Find synonyms for a word using a word embedding model.

    Args:
    - word (str): The target word.
    - word_embedding_model: A pre-trained word embedding model (e.g., Word2Vec or GloVe).

    Returns:
    - synonyms (list): A list of synonyms for the target word.
    """
    # Replace this with code to find synonyms using the word embedding model
    # This may involve querying the model's word vectors to find similar words.
    synonyms = []
    return synonyms

def generate_random_distractor(correct_answer):
    """
    Generate a random distractor by replacing a random word in the correct answer.

    Args:
    - correct_answer (str): The correct answer.

    Returns:
    - distractor (str): The generated distractor.
    """
    correct_answer_words = correct_answer.split()
    word_to_replace = random.choice(correct_answer_words)
    replacement_word = "replacement_word"  # Replace this with logic to select a random word
    distractor = correct_answer.replace(word_to_replace, replacement_word)
    return distractor

# Example usage:
#correct_answer = "testing provides a Cost-effective Means of Detecting Defects."
num_distractors = 3
distractors = generate_distractors(correct_answer, num_distractors)

# Print the correct answer and distractors
print("Correct Answer:", correct_answer)
print("Distractors:", distractors)


In [None]:
# !!! possible training of T5 !!!

#import torch
#from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config
#from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
#from datasets import load_dataset
#
## Load your custom dataset using the datasets library
#dataset = load_dataset('your_custom_dataset')
#
## Initialize the tokenizer and model
#tokenizer = T5Tokenizer.from_pretrained("t5-small")
#config = T5Config.from_pretrained("t5-small")
#model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config)
#
## Tokenize the dataset
#def tokenize_function(examples):
#    return tokenizer(examples['input_text'], examples['target_text'], padding='max_length', truncation=True)
#
#tokenized_datasets = dataset.map(tokenize_function, batched=True)
#
## Define training arguments
#training_args = Seq2SeqTrainingArguments(
#    per_device_train_batch_size=4,
#    output_dir="./t5-fine-tuned",
#    num_train_epochs=3,
#    evaluation_strategy="steps",
#    save_steps=10_000,
#    eval_steps=10_000,
#    save_total_limit=2,
#)
#
## Initialize the trainer
#trainer = Seq2SeqTrainer(
#    model=model,
#    args=training_args,
#    data_collator=tokenized_datasets.data_collator,
#    train_dataset=tokenized_datasets["train"],
#    eval_dataset=tokenized_datasets["validation"],
#)
#
## Fine-tune the model
#trainer.train()
#
## Save the fine-tuned model
#model.save_pretrained("t5-fine-tuned")
#tokenizer.save_pretrained("t5-fine-tuned")
#

In [None]:
# here comes gradio + manual selection of correct questions

In [None]:
# Load the pre-trained model and tokenizer
#model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
#model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Provide a passage and a question
#passage = extracted_text
#question = "Which of the following statements describe a valid test objective?"

#Which of the following statements describe a valid test objective?
#What does not work as expected?

# Tokenize the passage and question
#inputs = tokenizer(question, passage, return_tensors="pt", padding=True, truncation=True)

# Get the answer from the model
#start_scores, end_scores = model(**inputs, return_dict = False)
#start_idx = torch.argmax(start_scores)
#end_idx = torch.argmax(end_scores)

# Decode the answer from the tokenized output
#answer_tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
#answer = tokenizer.decode(answer_tokens)

#print("Answer:", answer)


To process text and generate questions with answers, you can consider using pre-trained language models, such as GPT-3, GPT-4, BERT, T5, or similar models. Each of these models has its strengths and can be used for different aspects of question generation and answering:

T5 (Text-to-Text Transfer Transformer): T5 is a versatile language model that can be fine-tuned for various natural language processing tasks, including question generation. You can fine-tune a pre-trained T5 model on your specific dataset to generate high-quality questions.

GPT-3: OpenAI's GPT-3 is a powerful language model known for its natural language generation capabilities. You can prompt GPT-3 to generate questions based on your input text. It can produce contextually relevant questions, but it may require careful instruction and filtering of the generated output.

BART (Bidirectional and Auto-Regressive Transformers): BART is another transformer-based model that can be fine-tuned for question generation tasks. It excels in text generation tasks and can produce coherent and meaningful questions.

XLNet: XLNet is a transformer model that has achieved strong performance in various NLP tasks. It can be fine-tuned for question generation, and its bidirectional context modeling can lead to better question generation.

BERT (Bidirectional Encoder Representations from Transformers): BERT can also be used for question generation by fine-tuning. While it was originally designed for understanding context, it can be adapted for question generation with appropriate training data.

UniLM: UniLM is a model that can be used for various text generation tasks, including question generation. It combines unidirectional, bidirectional, and sequence-to-sequence learning, making it versatile for NLP tasks.


In [None]:
# from transformers import AutoTokenizer, AutoModelForQuestionAnswering
# from io import StringIO

# # Load the pre-trained model and tokenizer
# model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# # Create a StringIO object with your text
# text_io = StringIO()
# text_io.write("Your text goes here.")
# text_io.seek(0)  # Reset the StringIO object to the beginning

# # Read the text from the StringIO object and convert it to a regular string
# text = text_io.read()

# # Provide a question
# question = "What is the answer to my question?"

# # Tokenize the text and question
# inputs = tokenizer(question, text, return_tensors="pt", padding=True, truncation=True)

# # Get the answer from the model
# start_scores, end_scores = model(**inputs)
# start_idx = torch.argmax(start_scores)
# end_idx = torch.argmax(end_scores)

# # Decode the answer from the tokenized output
# answer_tokens = inputs["input_ids"][0][start_idx:end_idx + 1]
# answer = tokenizer.decode(answer_tokens)

# print("Answer:", answer)


In [None]:
# Step 1: Question Generation using Seq2Seq (T5)

# Load the pre-trained Seq2Seq model for question generation
# question_generation_model = T5ForConditionalGeneration.from_pretrained("t5-small")
# question_generation_tokenizer = T5Tokenizer.from_pretrained("t5-small")
# 
#ISTQB document (replace with your actual content)
# istqb_document = """
# 1.1. What is Testing? 
# 
# Software systems are an integral part of our daily life. Most people have had experience with software 
# that did not work as expected. Software that does not work correctly can lead to many problems, 
# including loss of money, time or business reputation, and, in extreme cases, even injury or death. 
# Software testing assesses software quality and helps reducing the risk of software failure in operation. 
# 
# Software testing is a set of activities to discover defects and evaluate the quality of software artifacts. 
# These artifacts, when being tested, are known as test objects. A common misconception about testing is 
# that it only consists of executing tests (i.e., running the software and checking the test results). However, 
# software testing also includes other activities and must be aligned with the software development lifecycle 
# (see chapter 2). 
# 
# Another common misconception about testing is that testing focuses entirely on verifying the test object. 
# Whilst testing involves verification, i.e., checking whether the system meets specified requirements, it also 
# involves validation, which means checking whether the system meets users’ and other stakeholders’ 
# needs in its operational environment. 
# """
# 
#Generate questions from the ISTQB document
# def generate_questions(document, max_length=64, num_questions=1):
    # inputs = question_generation_tokenizer.encode("generate questions: " + document, return_tensors="pt", max_length=max_length, truncation=True)
    # questions = question_generation_model.generate(inputs, max_length=max_length, num_return_sequences=num_questions)
    # return [question_generation_tokenizer.decode(question, skip_special_tokens=True) for question in questions]
# 
# generated_questions = generate_questions(istqb_document)
# 
#Print generated questions
# for question in generated_questions:
    # print("Question:", question)
# 

Here is template for Quize layout, needs to be re-worked

In [None]:
# Define a function to display sections
def display_sections(section_title, sections):
    # Find the selected section in the list based on the section title
    selected_section = next((s for s in sections if s[0] == section_title), None)

    if selected_section:
        _, section_content = selected_section
        return section_content
    else:
        return "Section not found"

# Get a list of section titles from the sections list
section_titles = [section for section in sections]

# Create a Gradio interface with a dropdown list of section titles and a textbox for content
iface = gr.Interface(
    fn=display_sections,
    inputs=[gr.inputs.Dropdown(section_titles, label="Select a Section"), gr.inputs.Textbox(lines = 10, default="Selected Section Content", label="Section Content")],
    outputs="text",
    flagging_options = ['yes', 'no'],
    theme=gr.themes.Soft()
)

iface.launch(share=False)


JSON converter from CSVlogger() to JSON, this is when the previous step w questions selection is finished via Flag button

In [None]:
# set path to both files
csv_file = 'flagged/log.csv'
json_file = 'flagged/questions_answers.json'

# empty list to store the JSON data
json_data = []

# Open the CSV file for reading
with open(csv_file, 'r') as csvfile:
    # Create a CSV reader object
    csvreader = csv.DictReader(csvfile)
    
    # Iterate through each row in the CSV file
    for row in csvreader:
        # Append each row as a dictionary to the JSON data list
        json_data.append(row)

# Open the JSON file for writing and save the JSON data
with open(json_file, 'w') as jsonfile:
    json.dump(json_data, jsonfile, indent=4)

print(f"CSV data has been converted to JSON successfully and saved as {json_file}. The file contains final result of Quiz generator. ")


In [None]:
# draft metrics

import nltk

# Reference questions (human-generated)
reference_questions = [
    "What is the test objective?",
    "How do objectives vary?",
    "What does the context include?",
    # Add more reference questions here
]

# Automatically generated questions
generated_questions = [
    "What is test objectives?",
    "What is objectives vary?",
    "How do objectives depend?",
    # Add more generated questions here
]

# Initialize the NLTK BLEU scorer
bleu_scorer = nltk.translate.bleu_score.SmoothingFunction()

# Calculate BLEU score (a measure of similarity)
bleu_scores = [nltk.translate.bleu_score.sentence_bleu([r.split()], g.split(), smoothing_function=bleu_scorer.method1) for r, g in zip(reference_questions, generated_questions)]

# Calculate accuracy rate (percentage of questions that match reference questions)
accuracy_rate = sum(score == 1.0 for score in bleu_scores) / len(bleu_scores) * 100

print("Accuracy Rate: {:.2f}%".format(accuracy_rate))


In [None]:
#from selenium import webdriver

#driver=webdriver.Chrome()

#driver.get("https://www.facebook.com/") 

In [None]:
#!pip show selenium
#!pip install selenium==4.12.0


In [None]:
#extract api key

#from pyChatGPT import ChatGPT
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.support import expected_conditions as EC
#from selenium.webdriver.common.action_chains import ActionChains
#from selenium.webdriver.support.ui import WebDriverWait##

#session_token = "eyJhbGciOiJkaXIiLCJlbmMiOiJBMjU2R0NNIn0..RSoIbYuWt0dZzv_n.yzUd3McMevS948NpJcruJnsILZCqj30VJnxYzcJRnQ38WO9xCyF1LazX-6kARlOnQccDdoakpHjCQ_1NsMO-8MMLm1RKVFdWG3QmH5CCfFAFwZlGiVo-Fj04fVnxFZCvw3j4ouaqA2XwELxW0m9Q_fhCqy8ZLaiF4YpJCmjubt4A9HAJZ0pbrNknQq-OL62DJXJuOL92t_pE-jZpgMIZA3jTZHdvZUzZqfcqM26KKikwJg_WD5wAmLAqz_whTt2p6mNei2Yt6reJQ_uP_5Cwr6Ae9uEF3rX-h0ylmz_di8Ntexgk5nlN2dU4gHEWoNUo0Nf8tqXQMHfoQn6LS3AnFIcDEAAA5s7QTmJZ4hkyCAUk1TOXRG3afrEJD1snnXvrJkv7skXMQfDYhneBE8lUnTQpTJzRxW3KUfXbx89vRXCcboP-LvhTZ_q3adKGiQT5ZhJ6Gb5pCrxFVS34Y7996VfseEbf7duaSW58UNp8mG95YyQtQM1JwaegR9TdE37L3-oNVpoJtS2CzbH6UyS4Ddk6z6IDT-3-5EMLOzO2Bz16CV87DsB---SCsnOll62LXaRHYrdc7Y43u7JMCWgQtLeO0Zm-8H25HkfYhm5YvqBW7pUTeOo43cWZPFDTIcDulunZh68S065c3GzvokIVoAkMieZBzwZQ-KAFjisxntaO69cYVJ_OVjX7aJvXHvsCZcX0jcHwlkooIqn_dbaXgl7718We_QGgLVz6hsDvUlsT3FWfKOP77PSWY20Vvd3Vn3LMzBkxIhLIENpPLHx6kHa_BIBlLdUDJaZKTGCLKvm7GGzgv7kAaQe4bIC-uXGjZDifyP__Wj5M3xxkwY7rAF-voYEvGGoArczz5sQuX2qWBLyQ7FpvgC2lZwarkz9ZWoLBWMgv2jf7Ypm7cBuXnLhzK1TV9tGMinrkWXqjNA9cFLqU3oQ-212hVorUOBj2gG8MO5Z6vIi1GGtrUGN4zvKyDikL1WrBLFZPOPnp2pNX63i2F-6PjqMI9u1X1ZKMqN-viz8JOuFmW6KMayCUWGrKtKXn0_wk8SkB8bzOmZ4w6WbZavlsS23v8NOJkwm2sZOoovKEsHNOQEbnZGGckGZwXNKHnkK4nLJrW-00MfBs3Lq597UbDpGKGqPHh8kOCjGnaACg27CCWckmglgnNwsU6NNv360jIKgFAuT5mJFFr1h8fUFWB0T7V3QbdXOGV9J4g0NP2cIGE-tXOiQquI84kpxjjOVOE_jPAVNEVF0hrBC6Eg4xMYas0mVQ2K-fmZrRtmNKi1UMq2iQALbrPm0UA0LToAqURFhAdUdyYoXj4hGVbvmKBGN8P-0fjEi4lYKBnPsBiWkejTnogTJc_5T2RePoHkopy6_9q3lCD2JcNHkkUlJbCYfmAC7NJNsM-3N-SJ0Glb7KmYx8sY3MqRJHvLPLeo7I5QxAgKeKurgRr3lxxNJbcliL28A5hauqEjUMJnWf6BqhpplcQXLQYlsLn4u9DLuMXXdhbNUp-CvGM0SrydLpy1Y819YaZI7ds88bNB5Hq4e35AqRBo_uDuU2MY5infiyv4-2DvoWcJVl6Dcmag_KwXaGvZewm7lPbG8dxDNPfWmxMBsqDRk62rbzLe3o-EphkTkyEfTreWOMrQSTkrEwdGMzG8sXiV8hHcfbAT_qL4vzNIrs0gxsKqt0O0ecGUG1wbW6PXhArlA0Z_ot-NK7g7aP3ntgdsdaIrqw6FCnKzQnXT35oVD0InGO4EbZnIcg4YyTepjWs1LrUjtTVagWPagr3c-kICpJfjw4jERhdsvQFS1V_QFG_SjT00XW52FvU5r5IeMLjz0r7Khphq5B8XveUAvl65X36mvS-npVHGdfrCBsCIxSXpKVT8o_acGzM9vFpVtdALy32dQap92Lqv-YtQfyMCkyfYXaH4nlEYpMfxNLzD_M98nD35PTr3ovA85JtYMP7fWzV7QiQzx5sqgtsXlbBQeMTA0httZZj8qZ7VtWKGv0BbWkPTQRlh-9JbPYJS0KB791Djdw-nfdRS1sSi7972WykK91W6iCrRff_BCWXzn83-YCk1_F4rn420cDLVlB3hDvIO-SiQmCuhH9xCnX9a0PKGPT82YvCYVuO3Ets0mYHyLd4GLpqvF0KHjqGnIOAzuzU7QfECz6JNCkm1yLa8QVY978gyi68pxJAVVVAVs546FwP2pLjv3RQkt5cfzGpkP4wMt-tXVJR7SCy-Sntqfpn5L4heisOjo6EHkpgP4OTyv-kUPa5HOxkIAVrmG7g2Qwvn-SjfkEG_jZHqFkl4unsECiSKN1fZMaTTIC9qOJ8YFGPsBijhR-toUd5EmtYQJ_rW8aosszMkQmPy1MRKv0Iwab6QYm1ncWyqLzfJMTFtYRRDnyclfqr1oEwS-TbBFRJZADN8I677a7CNiq_JIphlx5w8kB7ADtk--zAHmCVlycAza6EgArS4WvgRJ18QYNjN1-e-3v1ISrxSNjXRK8PAIGqInHGJ7EpkmXDmS5I4HlbDakdtPeAiLTLuaXvctbbSWg5vSgLLIBVwskpcLBhCpotHuhou_ZLTU6q40bfee8u0Gt99mTswb5BkS9DCJlpGNAr99GVSYpvhyPSbQfnScN4EnBK-aoySeoaqnTVdLCqTlhgJqLKlVyNf6-nASXvBiEQ0pjcdk3AxQ_j_7aCGj4Eo0hhSndlhZoXaU22y_DV6CY8CxDqJrbwU2NZDKvJZjZmVza6RH4FAVHgnlwsgKCb-I7Ccpw93EhrOmgpObGD3fEIhr0TKTxVeqxkjWK9wz672kndN__FQEnOA9Q664rCQcro349u0yOs50XtNZZ6nnuV3WfQZ1pkPxlmMyfgp0T.FCUbJELvyDNKws3nIQugCg"
##session_token = "eyJhbGciOiJkaXIiLCJlbmMiOiJBMjU2R0NNIn0..Tp3P5x2i7af5GTfN.y6_A8C625dFSW8oSdmTLRQvvjYS8QmcPFKEDpLl-PHD1KngeOZFs78Jmpcrx_AgAxNcK4aWi_EHRiRGHUcUx4fnpFH0artO90rhECxVFoVJ4iouZ0477aTbDpfC19-y7G9xnCdlmb3CHTbz1l7ifsq3N9HwThAuybG_uYzAkYH74HKtG28REuxEea5EfTSS-1gracdfxuFtGWb1DvXJ_xpczUvbI_7vrHeuD-wWyrnH9a1NU484r1YntPLG9hhwHmbQ8YbdQyDv-Jl7LqHM1GGSFx8Q0ebW1Y3nxYyQnZ13zEYSosudoQQ4pd80WD3a2VXbdDpSWlFIjDt5V9pXISzc85KUha45JLAEr0EPK5eJaaWRdHIgEYrg6vRWbMaGQ4kz7YWDgd_VzOyHYQeU11UMeFuqdlafQWBo-dv2FOb-GbDKy85vtwLYvIoTgRH3e7ixNWVzbnjWtR_bSBpMEBjQHJG0E7KWHFkamPoav1SyUy-SiOVJKBPbRgxnFC25LWfiJGmXgRV1L7wIYorMxJ3lJKjXH7in8cEl64dn2l7FJ5EH9eeZg1nIeIsSO5lSATkG1ueLY8cuQeEJPL6LZzVuVJV7v-K2C5vrgft0yoU9-LEyXry9bU5WUZf1PKOSr8VS37MRZwMlWmbBrLkQaqHSAB2l3HS5T9kO2ZafyGc6d8luoVvIYxeIXlPrI6j6k3PaPPclh9ACi8pIay17TmVfSbBVGgTZp2VbP37Yfri5tG0dYkZEjXDUkd-NDdxVfe1ChQLwLeN6NUHu-IgTOUK3sqPNU9HYptdz1yD3GfpJY5qOKtGt4D-OoECahIeSzxNYlo5JY4rXviSeGeW2PLvq3pkD4MN5jrgzjCZ9getdAua8yEGvT5_A-uzN8w9gFw8kdhgK3BwOhdfU1cY_OUDoDTIZ12MJbkAH9BzcMXU6P9CoNshyQM8dE1-D_WNxySwVcTSwfOuaZ9MQIhLWa8geqvhVqNDjLzYfKjaydyr6wPwDsBLh9dTqQmmp88aD3VJ-Is6aqRjUIUlSOTsA8eNL5vy-cNZufRX4OrXB9p_Az0PfFNvjSptzoMGmH-0YXiyP4yENYtNSTagawc_PlB66IuUlOmCuyXmVXlzRKTEiAbwNkVrfbBB2ezw6g4cQ6ROROEuCXfBCKh5GjQ-IJgxQRjf6OOaczo-G_YYUnZhuZK2NgEPeXfS18QT_dAKdJLB4PySxitP-MKjOGoWOWbm858Uod1NwQQUMmO7JYAzqxrfNxxFkEA60XTxM-QkZnssgsdjFy0uGEXuwBaoYuYX6_WX9k71ixZ47nNfhRVfTPvXb_8GWVuf_QmwKLYg6ZEcGgGyDiIjhdypr2bretXyR5AfYj1Cludvmc5frm2IXGEJTQIHUkx5sW70ycXtOWNqHfE5sXlt9zyM12FywccPoxUZjZM4YdSleaL-smbApxa-AKjxzTWIgPJHpd-nxputQJdRYjAHLCqPVCycNse5q1y5oVJUCkaDgd4maMA0kBRJf3ftQG1UnG_W2Ao_sf-Qqrflzu80xiv30AjTmoHwGih8u4rB-iMS6Arj2Bbcl3EO24svdqsVCCS-smAasHsDP2pVPCw2HWAXmNNv5IDTI4TlNYhabCnm1eG50rmOXWy_2Qhp9sI1trOrxFqyEmkMyK_iypLFIL-DlrpJ8KZc5KPW6s_prVv7rGRcvEvf_AZVfoJ9UWLxg0KtUH3xoFAFhpqpW1NO3CdjzeBbEbs9xKU7zLFeQwbZU3oZCzjJ3HAKc1enlU48D64aRRw2ZaTnlLHw5tJDSpchBUJX5jyAo9WXVhvjLCe3Coj-OfwqsUyPkkCal33h7O8twSIIFe7THN0rPux-Fjxk1s11GfB9QvppgibwV7L7m6ODTelMofI17eeMGxJcBCePZzVfSVIp3SbTsQhkkU-UErI27TdO3j0_r4d4ceBKSF2kGxRFVCqoKQ2O6ifv_vMdiOlDlCFHgiSs933MuAF5OrWavrGp59gWamUfJXMsX_CqeP8B8Fu-gI3nWR-qX5Xyu7gpphdkn60Dsnw_I174OY9p0RbS01kGiRGCA5wzv5OQaJYcvIOzLr9O9Pa_nWK6jIx4u7REN6Z6-B4Q6dK7Pc1nWPuUQNS1rPkxVYHq2gUDE7kLch5JvCLUImRFceTpiw1QeScTi_p3GHQXIzvrjHv0PlZ5c7fs2VTwM9wUcFEX24C-lJhUnKnPeyI3gK15hQSHe0EClUzrdf_159KKY6VgE4ig3lwAQYVU4zD_ZJTrs3clRRVNufg-XEDFEJ_MxCme-tjskTxUUglTDRKwD4Ij8Ok31yQiAOcflsF-CG6AY3HyVObVLqp3jCS9kPRzQ4gJRuhd4Zhh3QpmzVk4Q0iCjom4jfzVEUWAhtl8399LG6b3kNnbBWX_ON_1aj58JlU-QYeGHuZ7olbrKVkphljS9-lUjTMp5DJmG3D_L4yIVP25VDVceM4K1TGW4m_FaGjuKsmhLHgP4IMBRYgvHAeLoHqfwXORBnQ1M0mGAHsFt1i7kTOBVwJx1v8slmuFGdYgIJtRi7qPjOPvqaBmFg7_PT-tLC1R_8D0zZNBCkZAHYq7JQM09HV6qNj7s_zShfxN2crakn6J8VPDashRiYAysOq3V-ZB1Efai3bkWl90ae1WaWZtCH7ZVc3Z7ie3anyMSxYfLzGlaRm_dOgA0_SUws90cYbhWkzHBunGmUMCbf50PwC-InQ6-6Y1LIPEkH591wITeiqgI7SYcx30c12kA06-iRQ7yPZzD-bmWiEXd1_KQgb85vMf3wR3E-haRiVBYHVF_CHc-JM28GuVFmKrDilWt7VyEy5XwpVaZw0p4l0ukVUgkv.V0DZCdRFRZKFkglRGB0WIg"
#api = ChatGPT(session_token)#

##textbox_xpath = '//*[@id="radix-:rs:"]/div[2]/div/div[4]/button'
##script = 'alert("Hello, from JavaScript!");'
##script = f'document.evaluate(\'{textbox_xpath}\', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.click();'
##api.driver.execute_script(script)
##button = WebDriverWait(driver, 10).until(
##    EC.element_to_be_clickable((By.XPATH, '//*[@id="radix-:rs:"]/div[2]/div/div[4]/button'))
##)#

## Click the button
##button.click()#
#

## Click the button
##button.click()
#resp = api.send_message('Write an essay on Generative AI')#

## Locate and scroll to the "Okay, let's go" button
#button = api.driver.find_element(By.XPATH, '//*[@id="radix-:rl:"]/div[2]/div/div[4]/button')
##api.driver.execute_script("arguments[0].scrollIntoView();", button)#

## Click the button
#button.click()#
#
#

#resp = api.send_message(' AI')
##button1 = api.driver.find_element(By.XPATH, '//*[@id="__next"]/div[1]/div[2]/div/main/div[1]/div[2]/form/div/div[2]/div/button')
##button.click()
#api.driver.quit()#

## Close the browser when done
##api.driver.quit()#
#

##print(resp['message'])
##api.refresh_auth()  # refresh the authorization token
##api.reset_conversation()  # reset the conversation#

##button_xpath = '//*[@id="radix-:rs:"]/div[2]/div/div[4]/button/div'
##button_element = driver.find_element(By.XPATH, button_xpath)
##button_element.click()#