In [2]:

from dotenv import load_dotenv
import os
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma


# Remove the OPENAI_API_KEY from the environment
# Load environment variables from the .env file
load_dotenv(dotenv_path='.env')

# Retrieve the API key from the environment variables
api_key = os.getenv("OPENAI_API_KEY")

# Check if the API key is retrieved successfully
if not api_key:
    raise ValueError("API key not found. Please ensure the .env file contains the OPENAI_API_KEY.")

print(api_key)

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


In [26]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, StorageContext, load_index_from_storage
from llama_index.core.node_parser import SimpleNodeParser
import openai
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
import os
import pickle
from tqdm import tqdm
import json
import time

# Define chunk size parameters
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

def analyze_document_metadata(text_chunks, llm, max_retries=3):
    """Analyze document chunks to determine party and document type using LLM."""
    system_prompt = """You are an expert in German political documents. Analyze the given text and determine:
    1. The German political party this document belongs to
    2. Whether this is a party manifesto or OTHER type of document
    
    Return your analysis in JSON format with two fields:
    - party: The full name of the German political party
    - doc_type: Either "manifesto" or "OTHER"
    """
    
    combined_text = "\n".join(text_chunks[:3])  # Use first 3 chunks
    user_prompt = f"Analyze this text from a German political document:\n\n{combined_text}"
    
    for attempt in range(max_retries):
        try:
            response = llm.chat.completions.create(
                model="gpt-4",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=0.2,
                timeout=30  # Add timeout parameter
            )
            
            try:
                metadata = json.loads(response.choices[0].message.content)
                return metadata
            except json.JSONDecodeError:
                if attempt == max_retries - 1:
                    return {"party": "Unknown", "doc_type": "Unknown"}
                time.sleep(2)  # Wait before retrying
                
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Error analyzing document: {str(e)}")
                return {"party": "Unknown", "doc_type": "Unknown"}
            time.sleep(2)  # Wait before retrying

# Load PDFs
print("Loading PDFs from the folder...")
pdf_folder = '../downloaded_pdfs'
reader = SimpleDirectoryReader(pdf_folder, filename_as_id=True)
documents = reader.load_data()
print(f"Loaded {len(documents)} documents.")

# Initialize parser and LLM
parser = SimpleNodeParser.from_defaults(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

llm = openai.OpenAI(api_key=api_key)

nodes = []
# Progress bar showing processed PDFs count
with tqdm(total=len(documents), desc=f"Processing PDFs (0/{len(documents)})", unit="pdf") as pdf_pbar:
    for i, doc in enumerate(documents, 1):
        try:
            # First get chunks for the document
            doc_nodes = parser.get_nodes_from_documents([doc])
            
            # Get text from first 3 chunks for analysis
            chunk_texts = [node.text for node in doc_nodes[:3]]
            
            # Analyze document using LLM with retry logic
            #metadata = analyze_document_metadata(chunk_texts, llm)
            
            # Add metadata to each node
            for node in doc_nodes:
                node.metadata = {
                    "file_name": doc.metadata.get("file_name", "Unknown"),
                    # "party": metadata["party"],
                    # "doc_type": metadata["doc_type"]
                    "party": "Christian Democratic Union and Christian Social Union",
                    "doc_type": "manifesto"
                }
            nodes.extend(doc_nodes)
            
            # Print metadata for each document
            print(f"\nDocument {i} Metadata:")
            print(f"File: {doc.metadata.get('file_name', 'Unknown')}")
            print(f"Party: Christian Democratic Union and Christian Social Union")
            print(f"Type: manifesto")
            
            pdf_pbar.set_description(f"Processing PDFs ({i}/{len(documents)})")
            pdf_pbar.update(1)
            
            # Add small delay between API calls
            time.sleep(1)
            
        except KeyboardInterrupt:
            print("\nProcessing interrupted by user. Saving progress...")
            break
        except Exception as e:
            print(f"\nError processing document {i}: {str(e)}")
            continue

print(f"Generated {len(nodes)} nodes (chunks) from the documents.")
index = VectorStoreIndex(nodes)

# Define the directory for saving embeddings
embeddings_directory = "./embeddings"

# Create the directory if it doesn't exist
if not os.path.exists(embeddings_directory):
    print(f"Creating directory for embeddings at {embeddings_directory}...")
    os.makedirs(embeddings_directory)

# Save the index to the directory using pickle
print("Saving the index to the directory using pickle...")
with open(os.path.join(embeddings_directory, 'index.pkl'), 'wb') as f:
    pickle.dump(index, f)
print("Index saved successfully.")


Loading PDFs from the folder...
Loaded 81 documents.


Processing PDFs (1/81):   0%|          | 0/81 [00:00<?, ?pdf/s]


Document 1 Metadata:
File: politikwechsel-fuer-deutschland-wahlprogramm-von-cdu-csu-1.pdf
Party: Christian Democratic Union and Christian Social Union
Type: manifesto


Processing PDFs (2/81):   2%|▏         | 2/81 [00:01<00:39,  1.98pdf/s]


Document 2 Metadata:
File: politikwechsel-fuer-deutschland-wahlprogramm-von-cdu-csu-1.pdf
Party: Christian Democratic Union and Christian Social Union
Type: manifesto


Processing PDFs (3/81):   4%|▎         | 3/81 [00:02<00:55,  1.40pdf/s]


Document 3 Metadata:
File: politikwechsel-fuer-deutschland-wahlprogramm-von-cdu-csu-1.pdf
Party: Christian Democratic Union and Christian Social Union
Type: manifesto


Processing PDFs (4/81):   5%|▍         | 4/81 [00:03<01:03,  1.21pdf/s]


Document 4 Metadata:
File: politikwechsel-fuer-deutschland-wahlprogramm-von-cdu-csu-1.pdf
Party: Christian Democratic Union and Christian Social Union
Type: manifesto


Processing PDFs (5/81):   6%|▌         | 5/81 [00:04<01:07,  1.12pdf/s]


Document 5 Metadata:
File: politikwechsel-fuer-deutschland-wahlprogramm-von-cdu-csu-1.pdf
Party: Christian Democratic Union and Christian Social Union
Type: manifesto


Processing PDFs (6/81):   7%|▋         | 6/81 [00:05<01:09,  1.07pdf/s]


Document 6 Metadata:
File: politikwechsel-fuer-deutschland-wahlprogramm-von-cdu-csu-1.pdf
Party: Christian Democratic Union and Christian Social Union
Type: manifesto


Processing PDFs (7/81):   9%|▊         | 7/81 [00:06<01:10,  1.04pdf/s]


Document 7 Metadata:
File: politikwechsel-fuer-deutschland-wahlprogramm-von-cdu-csu-1.pdf
Party: Christian Democratic Union and Christian Social Union
Type: manifesto


Processing PDFs (7/81):   9%|▊         | 7/81 [00:06<01:11,  1.04pdf/s]


Processing interrupted by user. Saving progress...
Generated 13 nodes (chunks) from the documents.





Saving the index to the directory using pickle...
Index saved successfully.


In [27]:
# Define system message for LLM
SYSTEM_MESSAGE = """You are an expert political analyst specializing in German politics and policy.
Your role is to analyze political documents and provide clear, factual insights about their content, 
focusing on party positions, policy proposals, and ideological stances.
Base your responses strictly on the provided document context rather than general knowledge.
Be objective and precise in your analysis."""

# Define query function
def query_index(query):
    print(f"Querying the index with: {query}")
    query_engine = index.as_query_engine(
        llm=llm,
        similarity_top_k=4,  # Specify how many chunks to retrieve
        system_prompt=SYSTEM_MESSAGE
    )
    response = query_engine.query(query)
    
    # Print metadata from source nodes
    print("\nSource Document Metadata:")
    for idx, source_node in enumerate(response.source_nodes):
        print(f"\nSource {idx + 1}:")
        print(f"Party: {source_node.node.metadata.get('party', 'Unknown')}")
        print(f"Document Type: {source_node.node.metadata.get('doc_type', 'Unknown')}")
        print(f"Score: {source_node.score}")
    
    return response

# LLM setup
class QueryModel(BaseModel):
    query: str

print("Setting up the language model...")
llm = ChatOpenAI(model='gpt-4o')
print("Language model setup complete.")


Setting up the language model...
Language model setup complete.


In [28]:

# Example query
query = 'What is the main topic of the document?'
response = query_index(query)
print("Query response received:")
print(response)

Querying the index with: What is the main topic of the document?

Source Document Metadata:

Source 1:
Party: Christian Democratic Union and Christian Social Union
Document Type: manifesto
Score: 0.7813683702128842

Source 2:
Party: Christian Democratic Union and Christian Social Union
Document Type: manifesto
Score: 0.7725054737388198

Source 3:
Party: Christian Democratic Union and Christian Social Union
Document Type: manifesto
Score: 0.7719914901034813

Source 4:
Party: Christian Democratic Union and Christian Social Union
Document Type: manifesto
Score: 0.77194446728305
Query response received:
The main topic of the document is a political manifesto outlining the Christian Democratic Union and Christian Social Union's plan for a political change in Germany. It covers various areas such as economic prosperity, security, social justice, and international relations, with a focus on improving the country's future in terms of freedom, peace, and societal cohesion.
