### Load necessary packages

In [3]:
import os
import numpy as np
import faiss
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import fitz  # PyMuPDF for extracting text from PDFs
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
import certifi
import spacy
from nltk.corpus import wordnet

### Ensure NLTK and Dependencies are Installed: Make sure you have NLTK installed correctly in the current virtual environment

In [4]:
import ssl

# Set the SSL context
ssl._create_default_https_context = ssl._create_unverified_context

# Now download some dependencies for NLTK
try:
    nltk.download('punkt')      # download the Punkt tokenizer models for sentence tokenization
    nltk.download('punkt_tab')  # specifically download punkt_tab for tokenization of tab-delimited files (e.g., TSV)
    nltk.download('wordnet')    # download the WordNet corpus for synsets and lemmas
    nltk.download('omw-1.4')    # download the Open Multilingual WordNet (OMW) corpus for synsets in multiple languages
except Exception as e:
    print(f"Error downloading NLTK data: {e}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/cyrillekonzeu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/cyrillekonzeu/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cyrillekonzeu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cyrillekonzeu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### 1 - Extracts text from all PDF files in the folder called "pdfs"

In [5]:
def extract_text_from_pdfs(pdf_folder):
    all_texts = []
    for filename in os.listdir(pdf_folder):
        if filename.endswith(".pdf"):
            file_path = os.path.join(pdf_folder, filename)
            with fitz.open(file_path) as pdf_file:
                text = ""
                for page in pdf_file:
                    text += page.get_text()
                all_texts.append(text)
    return all_texts

### 2 - Breaks the extracted text into paragraphs or sections

In [6]:
def break_into_paragraphs(text, min_length=50):
    paragraphs = text.split("\n")
    meaningful_paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > min_length]
    return meaningful_paragraphs

### 3 - Creates a vector database for the given paragraphs using TF-IDF vectors

In [7]:
def create_vector_database(paragraphs):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(paragraphs).toarray()

    # Create FAISS index
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)  # Using L2 distance
    index.add(np.array(vectors).astype('float32'))  # Convert to float32 for FAISS

    return index, vectorizer

### 4 - Summarizes the given text using the TextRank algorithm from the `sumy` library.

In [8]:
def summarize_text(text, sentence_count=3):
    parser = PlaintextParser.from_string(text, Tokenizer("german"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, sentence_count)
    summarized_response = " ".join(str(sentence) for sentence in summary)
    return summarized_response

### 5 - Queries the vector database and retrieves the most relevant paragraphs based on the query.

In [9]:
def query_vector_db(query, index, vectorizer):
    query_vector = vectorizer.transform([query]).toarray().astype('float32')
    _, indices = index.search(query_vector, k=5)  # Retrieve top 5 paragraphs
    return indices.flatten()

### Implement a function that takes a text query, vectorizes it, and retrieves the most relevant passages from the vector database. Combine the retrieved passages to create a coherent response to the query. Make sure that the response is meaningful and understandable.

In [15]:
from gensim.summarization import summarize

def main(pdf_folder, query, sentence_count=3):
    # Step 1: Extract text from the PDF documents
    all_texts = extract_text_from_pdfs(pdf_folder)

    # Combine all extracted texts into a single string
    combined_text = " ".join(all_texts)

    # Step 2: Break the extracted text into meaningful paragraphs
    paragraphs = break_into_paragraphs(combined_text)

    # Step 3: Create a vector database for the paragraphs
    index, vectorizer = create_vector_database(paragraphs)

    # Step 4: Query the vector database
    print(f"Querying for: {query}")
    indices = query_vector_db(query, index, vectorizer)

    # Retrieve and summarize the most relevant paragraphs
    relevant_paragraphs = [paragraphs[i] for i in indices]
    combined_relevant_text = " ".join(relevant_paragraphs)

    # Step 5: Summarize the relevant paragraphs
    # summarized_text = summarize_text(combined_relevant_text, sentence_count=sentence_count)
    summarized_text = summarize(combined_relevant_text, ratio=0.2)
    
    return summarized_text

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

###  Testing the Function

In [14]:
pdf_folder = "./pdfs"
query = "Wie hoch ist die Grundzulage?"
response = main(pdf_folder, query, sentence_count=3)
print(response)

Querying for: Wie hoch ist die Grundzulage?
Die monatliche Rente aus der Teilkapitalverrentung muss mindestens so hoch sein, wie Die monatliche Rente aus der Teilkapitalverrentung muss mindestens so hoch sein, wie Die monatliche Rente aus der Teilkapitalverrentung muss mindestens so hoch sein, eigenbeitrag, ist die für dieses Beitragsjahr zustehende Altersvorsorgezulage (Grundzulage eigenbeitrag, ist die für dieses Beitragsjahr zustehende Altersvorsorgezulage (Grundzulage
