In [None]:
from transformers import AutoTokenizer
from transformers import pipeline
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import transformers

%pip install pypdf
%pip install python-docx
from docx import Document
import pypdf

## Loading of the files
Loading the files the same way as was done during the data understanding phase. We make a Path object, which we can then use to loop through every file in the "Trial" directory. This directory holds 2 files used during testing, mainly to check and try out the natural language processor. Next, basic metadata for each of these files is stored in a DataFrame.

In [None]:
# Root directory that is scanned recursively for documents.
path = Path('Trial')
files = []

for file in path.rglob("*"):
  # rglob also yields directories; only regular files get a metadata record.
  if not file.is_file():
    continue

  relative_path = file.relative_to(path)
  size_in_bytes = file.stat().st_size

  record = {
    "path": str(file),
    # First path component below the root directory.
    "project name": relative_path.parts[0],
    "extension": file.suffix.lower(),
    "size_bytes": size_in_bytes,
    "size_mb": size_in_bytes / (1024 * 1024),
    # Number of parent levels of the path relative to the root directory.
    "folder_depth": len(relative_path.parents),
  }
  files.append(record)

In [None]:
# Build one DataFrame row per metadata record collected in `files`.
files_df = pd.DataFrame(files)
# Bare last expression so the cell displays the resulting frame.
files_df

## Looping through the dataframe using pypdf
Looping through the dataframe row by row.
Creating a reader object using the pypdf `PdfReader`.
Looping through each page in the reader object and counting the number of pages.
Counting the words by splitting the text of each page in the reader object and summing the results.
Extracting the text using `extract_text` from pypdf.

First, we loop through every file in the given directory/dataframe. For each file, we first check what the file type is, after which several pieces of information are gathered:
1. Language: based on the most frequently occurring words in both the English and Dutch vocabularies, the language is selected depending on which language's words occur most often.
2. Number of pages: we can simply look at the number of values in the `pages` property of the reader object.
3. Word count: for each page, the number of words on that page is added to the total number of words.
4. Text: here, the actual content of the document is stored.



In [None]:
complete_file_df = files_df.copy()
pdf_files = []

# maybe not the best way to determine language, but it seems to work perfectly
most_occuring_english_words = ["the", "be", "to", "and"]
# "van" is used instead of "of": "of" is also one of the most frequent English
# words, so counting it as Dutch biased the score towards Dutch on English text.
most_occuring_dutch_words = ["de", "het", "van", "een"]


def _detect_language(text):
  """Classify `text` as 'english', 'dutch' or 'unknown'.

  Counts how many whitespace-separated words (lowercased) appear in the
  English and in the Dutch marker-word lists and returns the language with
  the higher count. Returns 'unknown' when both counts are equal, which
  includes empty text.
  """
  english_words = set(most_occuring_english_words)
  dutch_words = set(most_occuring_dutch_words)

  english_score = 0
  dutch_score = 0
  for word in text.split():
    lowered = word.lower()
    if lowered in english_words:
      english_score += 1
    elif lowered in dutch_words:
      dutch_score += 1

  if english_score > dutch_score:
    return 'english'
  if dutch_score > english_score:
    return 'dutch'
  return 'unknown'


# Create every result column up front so rows that are not PDFs hold well-defined
# values (empty string / 'unknown') instead of NaN, and so the columns exist even
# when the frame contains no PDF at all.
complete_file_df['num_pages'] = None
complete_file_df['word_count'] = None
complete_file_df['language'] = 'unknown'
complete_file_df['text'] = ''

for i, document in complete_file_df.iterrows():
  # Only PDF files are parsed; all other rows keep the defaults set above.
  if document['extension'] != '.pdf':
    continue

  reader_object = pypdf.PdfReader(document['path'])

  # Extract each page exactly once; pages without extractable text are skipped.
  page_texts = [page_text for page in reader_object.pages if (page_text := page.extract_text())]

  # Join with a newline so the last word of one page is not fused with the
  # first word of the next page.
  language_text = "\n".join(page_texts)

  # .at[index, 'column_name'] is used instead of ['column_name'].iloc[index],
  # because the latter gives FutureWarning: ChainedAssignmentError
  complete_file_df.at[i, 'language'] = _detect_language(language_text)
  complete_file_df.at[i, 'num_pages'] = len(reader_object.pages)
  complete_file_df.at[i, 'word_count'] = sum(len(page_text.split()) for page_text in page_texts)
  complete_file_df.at[i, 'text'] = language_text
 

In [None]:
# can be used as validation to check if the amount of words are even somewhat close to the files.
# for the 2 test files used, it was quite accurate (first file 10 words off, the second 500-ish)

for i, row in complete_file_df.iterrows():
    text = row.get('text')
    # Rows without extracted text may hold a missing value (NaN/None) rather
    # than a string; report 0 words for those instead of raising on .split().
    print(len(text.split()) if isinstance(text, str) else 0)

# preparing the text
After the text has been succesfully loaded, we can start actually preparing the text. For this, the following things were done:

- translation (found another currently better alternative): we initialy used GoogleTranslator to translate the text. There were however 2 issues with this: 1. 5000 max character limit | 2. very slow, taking atleast 30 min for the 2 documents. The code for this is currently commented out.
- tokenizing: instead of using the raw text as imput, we first tokanize it. While tokens are similar to words, they are not the same.
- chunking: based on the chunksize, the text is split into several chunks (for example, chunksize of 500 would return pieces of text 500 tokens long)

Regarding the warning, as far as we understand this shouldn't be a big issue:
since we tokenize the full text before chunking, the tokenizer warns that the sequence is longer than the model's limit (something like 512 tokens). However, because we chunk after tokenizing, the tokens are split into pieces of at most `chunksize` tokens anyway, so we never end up with a single chunk of 43192 tokens (which, as far as we know, is what the warning is about).

In [None]:
from deep_translator import GoogleTranslator

# Text chunks (decoded back to strings) collected over all documents.
chunks = []
# Maximum number of tokens per chunk.
chunksize = 500
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")


for i, document in complete_file_df.iterrows():
    print(f"document {i+1} of {len(complete_file_df)}")
    text = document.get('text')

    # Disabled translation step (GoogleTranslator, requests capped at 5000
    # characters); kept for reference only.
    # if document['language'] == 'dutch':
    #     words = text.split()
    #     sentences = []
    #     current = ""
    #     max_characters = 5000

    #     for word in words:
    #         if len(current) + len(word) + (1 if current else 0) <= max_characters:
    #             current += (" " if current else "") + word
    #         else:
    #             sentences.append(current)
    #             current = word
    #         if current:
    #             sentences.append(current)

    #     print(f"  split into {len(sentences)} sentences")
    #     text = ""
    #     for sentence in sentences:
    #         translate_text = GoogleTranslator(source='auto', target='en').translate(sentence)
    #         text += translate_text + " "

    #     print(f"  translated to {document['language']}")

    # Rows without extracted text can hold a missing value (NaN/None) instead
    # of a string; the tokenizer cannot handle that, so skip those rows.
    if not isinstance(text, str):
        print("no extracted text, skipping")
        print(" ")
        continue

    tokens = tokenizer.tokenize(text)
    print(f"tokenized into {len(tokens)} tokens")

    # Cut the token list into consecutive, non-overlapping windows of at most
    # `chunksize` tokens and turn every window back into a plain string.
    for j in range(0, len(tokens), chunksize):
        chunk = tokens[j:j+chunksize]
        chunk = tokenizer.convert_tokens_to_string(chunk)
        chunks.append(chunk)
        # print(chunk)
    print(" ")

qa_pipeline = pipeline("question-answering", model="deepset/xlm-roberta-base-squad2",tokenizer=tokenizer )

# defining the model
Here, we implement and use the question-answering pipeline from `from transformers import pipeline`. For this, we first defined our NLP model; `deepset/xlm-roberta-base-squad2` seemed to handle the different languages.
We first use `lower` to change everything in the given question to lowercase. Next:
1. loop through every single chunk
2. simply check if the chunk has matching keywords
3. if any keywords match, the chunk goes through the qa_pipeline mentioned above.
4. this pipeline returns the expected answer with the highest confidence, along with the chunk the answer was in.
5. Finally, the top 3 answers with the highest confidence are stored inside `best_answer`

example:
- question: `What is the budget of water nexus`
- response: `{'score': 1.6871259659528732, 'start': 1138, 'end': 1154, 'answer': '6.6 Million Euro', 'chunk': 1}`

This is very close to correct, but the model missed the additional 1.2-something million mentioned a few sentences later.

In [None]:
# Placeholder entry used for every rank that has no real answer.
_NO_ANSWER = {"score": 0.0, "answer": "No relevant information found.", "chunk": 0}

# Ranked answers of the most recent document_qa() call, best first.
best_answer = [dict(_NO_ANSWER) for _ in range(3)]


def document_qa(question, text_chunks, qa_model_pipeline, top_k=3):
    """Run the QA pipeline over the matching chunks and rank the answers.

    A chunk is processed only when at least one lowercased, whitespace-separated
    word of the question occurs in the lowercased chunk. This is a substring
    test, so short words in the question match very easily.

    Args:
        question: Question text handed to the pipeline.
        text_chunks: Iterable of context strings to search.
        qa_model_pipeline: Callable invoked as
            ``qa_model_pipeline(question=..., context=...)`` that returns a
            mapping containing at least a ``'score'`` entry.
        top_k: Number of ranked answers to keep (default 3).

    Returns:
        A list of ``top_k`` answer dicts sorted by descending score. Each real
        answer is the pipeline result plus a ``'chunk'`` key holding the index
        of the chunk it came from. Missing ranks are filled with the
        "No relevant information found." placeholder. The module-level
        ``best_answer`` list is updated in place with the same entries.

    Chunks whose processing raises are reported with ``print`` and skipped.
    """
    question_words = set(question.lower().split())
    candidates = []

    for i, chunk in enumerate(text_chunks):
        lowered_chunk = chunk.lower()
        # Simple check: if a chunk contains any key words, process it
        if not any(word in lowered_chunk for word in question_words):
            continue

        try:
            # Copy so the added 'chunk' key does not modify the pipeline's own object.
            result = dict(qa_model_pipeline(question=question, context=chunk))
            # Only answers scoring above the placeholder's 0.0 are kept.
            if result['score'] > 0.0:
                result['chunk'] = i
                candidates.append(result)
        except Exception as e:
            print(f"Error processing chunk {i}: {e}")
            continue

    # Rank all candidates at once. Overwriting a single slot per result would
    # drop the previous best answer whenever a better one shows up. The sort is
    # stable, so for equal scores the earlier chunk keeps the higher rank.
    candidates.sort(key=lambda candidate: candidate['score'], reverse=True)

    top_k = max(top_k, 0)
    top_answers = candidates[:top_k]
    top_answers.extend(dict(_NO_ANSWER) for _ in range(top_k - len(top_answers)))

    # Replace the contents rather than merging into them, so answers from an
    # earlier question never leak into the ranking of the current one.
    best_answer[:] = top_answers
    return top_answers
                 
# Question to ask; swap in one of the alternatives below to try another one.
# question = "What is the yield of growing vegetables in open ground?"
# question="Wat is een boon"
question = "What is the budget of water nexus"

result = document_qa(
    question=question,
    text_chunks=chunks,
    qa_model_pipeline=qa_pipeline,
)

# Report the question followed by every entry currently held in `best_answer`.
report_lines = [f"Question: {question}"]
report_lines.extend(f"Response: {answer}" for answer in best_answer)
print("\n".join(report_lines))