In [9]:
import os
import json
import re
import google.generativeai as genai
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
import pypdf
from dotenv import load_dotenv
from transformers import pipeline

In [3]:
def extract_text_from_pdf(pdf_path):
    """Load a PDF and return its text as one space-joined string.

    Args:
        pdf_path: Path handed to ``PyPDFLoader``.

    Returns:
        str: The ``page_content`` of every document the loader returns,
        concatenated in load order with a single space between them.
    """
    loaded_docs = PyPDFLoader(pdf_path).load()
    return " ".join(doc.page_content for doc in loaded_docs)

# Reports to ingest (relative paths).
pdf_files = [
    "data/1_FinancialResults_05022025142214.pdf",
    "data/Amaar raja Earnings Summary.pdf",
]

# Map each PDF path to the text extracted from it.
pdf_texts = dict(zip(pdf_files, map(extract_text_from_pdf, pdf_files)))


In [4]:
# NOTE: this rebinds HuggingFaceEmbeddings, which the first cell already
# imported from langchain.embeddings; from this cell on, the
# langchain_huggingface class is the one in use.
from langchain_huggingface import HuggingFaceEmbeddings 
# Embedding model instance; it is passed to FAISS.from_texts in a later cell.
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en")

In [5]:
# Build a FAISS store from the values of pdf_texts, i.e. one text per PDF
# (the full extracted text of each file), embedded with `embeddings`.
# NOTE(review): the texts are passed in unsplit; confirm the embedding model's
# input length limit does not truncate long reports.
vectorstore = FAISS.from_texts(list(pdf_texts.values()), embeddings)

In [7]:

# Expose the vector store through a retriever; no arguments are passed, so the
# library defaults apply. `retriever` is not referenced by any other cell shown
# in this notebook.
retriever = vectorstore.as_retriever()

In [8]:
def clean_gemini_response(response):
    """
    Cleans an LLM response to correct formatting issues before JSON parsing.

    - Unwraps a Markdown code fence (```json ... ```) around the payload.
    - Replaces ":" between two digits with "." (e.g. ``12:5`` -> ``12.5``).
    - Fixes period dates written with a stray "(" in place of the first "/"
      (e.g. ``(31(12/2024)`` -> ``(31/12/2024)``).

    Args:
        response: Raw model output (``str`` or ``None``).

    Returns:
        str: The cleaned text, or ``'{"error": "No data extracted"}'`` when the
        input is ``None``, empty, whitespace-only, or an empty fenced block.
    """
    if response:
        # A fenced block is not valid JSON as-is, so keep only what is inside
        # the fence when the whole response is a single fenced block.
        fenced = re.match(
            r'\s*```(?:json)?\s*(.*?)\s*```\s*$',
            response,
            re.DOTALL | re.IGNORECASE,
        )
        if fenced:
            response = fenced.group(1)

    if not response or not response.strip():
        return '{"error": "No data extracted"}'

    # Lookarounds leave the neighbouring digits unconsumed, so every colon in a
    # chain such as "1:2:3" is rewritten (a consuming pattern would skip the
    # second colon because its leading digit was already matched).
    response = re.sub(r'(?<=\d):(?=\d)', '.', response)

    # "(DD(MM/YYYY)" -> "(DD/MM/YYYY)" for any date, not just 31/12/2024.
    response = re.sub(r'\((\d{1,2})\((\d{1,2}/\d{2,4})\)', r'(\1/\2)', response)

    return response

In [None]:
from transformers import pipeline
from huggingface_hub import login

# Read the Hugging Face access token from the environment (optionally populated
# from a local .env file) rather than embedding it in the notebook, where it
# would be saved to disk and leaked when the notebook is committed or shared.
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN is not set. Export it or add it to a .env file before running this cell."
    )

# Authenticate (only needed once per session)
login(token=hf_token)

# Load model with authentication
llm_pipeline = pipeline(
    "text-generation",
    model="mistralai/Mistral-7B-v0.1",
    token=hf_token,
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def extract_financial_data(text):
    """Prompt the LLM for key financial figures in a report and parse its reply.

    Args:
        text: Report text interpolated into the extraction prompt.

    Returns:
        The value decoded by ``json.loads`` from the cleaned LLM reply;
        ``{"error": "No data extracted"}`` when the reply is empty or
        whitespace-only; ``{"error": "Invalid JSON format"}`` when the cleaned
        reply is not valid JSON.
    """
    prompt = f"""
    You are an AI tasked with extracting financial details from a report.
    
    Extract and return the following details as a **valid JSON object**:
    - Company Name
    - Report Date
    - Profit Before Tax
    - Revenue
    - Total Expenses
    - Net Profit
    - Tax Expense
    - Any declared Dividends (if available)

    Report:
    {text}

    **Output must be strictly in JSON format.** 
    """

    raw_reply = query_open_source_llm(prompt)

    # Nothing usable came back: report it and stop before cleaning/parsing.
    if not (raw_reply and raw_reply.strip()):
        print("❌ LLM returned an empty response! Check model.")
        return {"error": "No data extracted"}

    # Apply the shared formatting fixes before attempting to parse.
    candidate_json = clean_gemini_response(raw_reply)

    try:
        parsed = json.loads(candidate_json)
    except json.JSONDecodeError as err:
        print(f"❌ JSON Parsing Error: {err}")
        print("🔹 Raw Cleaned Response:\n", candidate_json)
        return {"error": "Invalid JSON format"}
    return parsed


In [None]:
# NOTE(review): `extracted_data` is not assigned in any cell shown in this
# notebook; confirm it is produced (e.g. from extract_financial_data) before
# this cell runs.
#
# Serialise before opening the file: if `extracted_data` holds a value JSON
# cannot encode, the error is raised here and any existing output file is left
# untouched instead of being truncated or half-written.
serialized = json.dumps(extracted_data, indent=4)

with open("extracted_financial_data.json", "w", encoding="utf-8") as f:
    f.write(serialized)

print("✅ JSON file saved: extracted_financial_data.json")
