In [6]:
import os
import time
import json
import torch
import requests
import fitz  # PyMuPDF
import pandas as pd
from pdf2image import convert_from_path
from transformers import AutoModelForCausalLM, AutoTokenizer
import pytesseract


In [9]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

^C


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.3.1%2Bcu118-cp310-cp310-win_amd64.whl (4.0 MB)
     ---------------------------------------- 0.0/4.0 MB ? eta -:--:--
     ----- ---------------------------------- 0.6/4.0 MB 11.5 MB/s eta 0:00:01
     ----------- ---------------------------- 1.1/4.0 MB 11.9 MB/s eta 0:00:01
     ---------------- ----------------------- 1.7/4.0 MB 13.3 MB/s eta 0:00:01
     ------------------- -------------------- 2.0/4.0 MB 10.3 MB/s eta 0:00:01
     ---------------------------- ----------- 2.8/4.0 MB 12.9 MB/s eta 0:00:01
     ------------------------------------- -- 3.7/4.0 MB 13.1 MB/s eta 0:00:01
     ---------------------------------------- 4.0/4.0 MB 12.0 MB/s eta 0:00:00
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.3.1%2Bcu118-cp310-cp310-win_amd64.whl (2673.0 MB)
     ---------------------------------------- 0.0/2.7 

  You can safely remove it manually.


In [10]:
!pip install torch torchvision




In [11]:
# Show which PyTorch build the kernel actually imports. The local version
# suffix in the printed string (e.g. "+cpu", "+cu118") identifies the build
# variant of the installed wheel.
import torch
print(torch.__version__)


2.3.1+cpu


In [13]:
!pip install transformers



In [14]:

# Set up Tesseract.
# The fallbacks below are the paths of a default Windows installation. Set the
# TESSERACT_CMD and/or TESSDATA_PREFIX environment variables before starting
# the kernel to use a different installation without editing this cell.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD', r'C:/Program Files/Tesseract-OCR/tesseract.exe'
)
# setdefault keeps a TESSDATA_PREFIX that is already configured on this machine.
os.environ.setdefault('TESSDATA_PREFIX', r'C:/Program Files/Tesseract-OCR/tessdata')

def extract_text_from_pdf(pdf_path, timeout=60):
    """Return the embedded text of a PDF, concatenated over its pages.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.
        timeout: Time budget in seconds. It is checked before each page; once
            it is exceeded a message is printed and the text collected so far
            is returned.

    Returns:
        The page texts joined in page order. The string is empty when no page
        yielded text or the budget ran out before the first page.
    """
    page_texts = []
    # The context manager closes the document on every exit path, including an
    # exception raised while reading a page.
    with fitz.open(pdf_path) as doc:
        # A monotonic clock is immune to wall-clock adjustments while measuring.
        start_time = time.monotonic()
        for page in doc:
            if time.monotonic() - start_time > timeout:
                print(f"Timeout reached for {pdf_path}")
                break
            page_texts.append(page.get_text())
    return "".join(page_texts)

def extract_text_from_images(pdf_path, poppler_path=r'C:/Program Files/poppler/Release-24.02.0-0/poppler-24.02.0/Library/bin', lang='deu'):
    """OCR every page of a PDF and return the recognised text.

    Args:
        pdf_path: Path of the PDF file to rasterise with ``convert_from_path``.
        poppler_path: Directory containing the poppler binaries. When falsy
            (``None`` or ``""``) the argument is not passed on at all.
        lang: Tesseract language code given to ``image_to_string``; defaults
            to ``'deu'`` for German texts.

    Returns:
        The OCR output of all pages, concatenated in page order.
    """
    # Forward poppler_path only when one was supplied.
    convert_kwargs = {'poppler_path': poppler_path} if poppler_path else {}
    images = convert_from_path(pdf_path, **convert_kwargs)

    return "".join(pytesseract.image_to_string(image, lang=lang) for image in images)

def predict_NuExtract(model, tokenizer, text, schema, example=("", "", "")):
    """Build a NuExtract prompt, run generation and return the decoded answer.

    Args:
        model: Object providing ``generate(**inputs)``; when it exposes a
            ``device`` attribute the tokenized inputs are moved to that device.
        tokenizer: Callable that tokenizes the prompt and provides ``decode``.
        text: Document text to extract information from.
        schema: JSON string describing the extraction template.
        example: Iterable of JSON strings with example outputs; empty strings
            are skipped.

    Returns:
        Everything in the decoded output after the first ``### Text:`` marker
        and before ``<|end-output|>``.

    Raises:
        json.JSONDecodeError: If ``schema`` or a non-empty example is not
            valid JSON.
        IndexError: If the decoded output does not contain ``### Text:``.
    """
    # ensure_ascii=False keeps umlauts and other non-ASCII characters readable
    # in the prompt instead of turning them into \uXXXX escapes.
    schema = json.dumps(json.loads(schema), indent=4, ensure_ascii=False)
    prompt_parts = ["\n### Template:\n", schema, "\n"]
    for sample in example:
        if sample != "":
            prompt_parts += [
                "### Example:\n",
                json.dumps(json.loads(sample), indent=4, ensure_ascii=False),
                "\n",
            ]
    prompt_parts += ["### Text:\n", text, "\n\n"]
    input_llm = "".join(prompt_parts)
    # NOTE(review): the prompt carries no <|input|> / <|output|> markers although
    # the result is cut at <|end-output|> - confirm against the NuExtract model card.

    input_ids = tokenizer(input_llm, return_tensors="pt", truncation=True, max_length=4000)
    # Follow the model's device instead of assuming CUDA, so CPU-only setups work
    # and inputs never end up on a different device than the weights.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    output = tokenizer.decode(model.generate(**input_ids)[0], skip_special_tokens=True)
    # maxsplit=1: keep the full remainder even if the document text itself
    # happens to contain the "### Text:" marker.
    return output.split("### Text:", 1)[1].split("<|end-output|>")[0]

# Load model and tokenizer exactly once.
# NOTE: trust_remote_code=True runs Python code shipped with the model
# repository - only keep it for sources you trust.
MODEL_NAME = "numind/NuExtract-tiny"
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
# Use the GPU only when this PyTorch build can actually reach one; a CPU-only
# build raises on an unconditional .to("cuda").
model.to("cuda" if torch.cuda.is_available() else "cpu")
model.eval()

# Extraction template handed to predict_NuExtract (via extract_data_from_text).
# It is a JSON object: each key names a field to extract, each value is a
# German instruction describing that field. The keys are the same ones that
# create_excel_file reads from every parsed model response.
# NOTE(review): "Megawattstungen" looks like a typo for "Megawattstunden"; it is
# left untouched here because the string is part of the prompt sent to the model.
schema = """{
    "Vertragsnummer": "Hier soll die Vertragsnummer stehen. Diese ist in der Regel ein Integer, könnte aber in Ausnahmefällen auch andere Zeichen enthalten",
    "Adresse Verbrauchsstelle": "Das hier ist die Adresse der Verbrauchsstelle. Verwechsele sie nicht mit der Rechnungsadresse! Sie besteht aus Straße, Hausnummer, Postleitzahl und Ort",
    "Verbrauchte Menge": "Hier soll die verbrauchte Menge in Kilowattstunden (kWh) stehen. Wenn die Menge in Megawattstungen (MWh) angegeben ist, multipliziere mal 1000",
    "Start abgerechneter Zeitraum": "Gebe hier den Anfang des Abgerechneten Zeitraums als JJJJ-MM-TT an",
    "Ende abgerechneter Zeitraum": "Gebe hier das Ende des Abgerechneten Zeitraums als JJJJ-MM-TT an",
    "Energieart": "Hier soll entweder Gas oder Strom stehen",
    "Fehler": "Gib hier an, ob du bestimmte Daten nicht gefunden hast. Gib dann True an, ansonsten False. Damit wollen wir fehlerhafte Daten filtern"
}"""

def extract_data_from_text(text):
    """Run NuExtract on ``text`` with the module-level model, tokenizer and schema.

    Returns whatever ``predict_NuExtract`` returns for these arguments.
    """
    return predict_NuExtract(model, tokenizer, text, schema)

def categorize_invoice(pdf_path):
    """Classify an invoice PDF as electricity or gas by counting keywords.

    The embedded text of all pages is lower-cased and every keyword that occurs
    in it as a substring counts one point for its category.

    Returns:
        "Strom" when more electricity keywords match, "Gas" when more gas
        keywords match, and "Nicht eindeutig" on a tie.
    """
    electricity_terms = ("Strom", "kWh", "Netzbetreiber", "Energie")
    gas_terms = ("Gas", "m³", "Heizkosten", "Gasanbieter", "therm", "brennwert", "kWh/m³", "Gasverbrauch", "Gasrechnung")

    with fitz.open(pdf_path) as pdf:
        content = "".join(page.get_text() for page in pdf).lower()

    def count_hits(terms):
        # One point per keyword present, regardless of how often it occurs.
        return sum(1 for term in terms if term.lower() in content)

    strom_hits = count_hits(electricity_terms)
    gas_hits = count_hits(gas_terms)

    if strom_hits == gas_hits:
        return "Nicht eindeutig"
    return "Strom" if strom_hits > gas_hits else "Gas"

def process_pdfs(pdf_folder):
    """Extract structured data from every PDF directly inside ``pdf_folder``.

    For each file the embedded text is read first; if that is empty or only
    whitespace the pages are OCR'd instead. The resulting text is passed to
    ``extract_data_from_text``.

    Args:
        pdf_folder: Directory to scan (not recursive).

    Returns:
        A list with one model response per PDF, ordered by file name.
    """
    # Sort for a reproducible order and match the extension case-insensitively
    # so that files named "*.PDF" are not silently skipped.
    pdf_files = [
        os.path.join(pdf_folder, name)
        for name in sorted(os.listdir(pdf_folder))
        if name.lower().endswith('.pdf')
    ]

    extracted_data = []
    for file_path in pdf_files:
        text = extract_text_from_pdf(file_path)
        if not text.strip():
            text = extract_text_from_images(file_path)  # Use OCR if no text was found
        data = extract_data_from_text(text)
        print(f"Extracted data from text for {file_path}:\n{data}\n")  # Debugging output
        extracted_data.append(data)
        # No pause between files: inference runs on the locally loaded model,
        # so there is no remote rate limit to respect.

    return extracted_data

def extract_dict_from_response(response):
    """Parse the first JSON object embedded in a model response.

    Scanning starts at each ``{`` in turn and lets the JSON decoder find the
    matching end, so nested objects and string values containing ``}`` are
    handled, and brace-delimited noise before the real object is skipped.

    Args:
        response: Text that contains a JSON object somewhere inside it.

    Returns:
        The decoded object as a ``dict``.

    Raises:
        ValueError: If no position in ``response`` starts a valid JSON object.
    """
    decoder = json.JSONDecoder()
    start = response.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON value beginning at `start` and ignores
            # whatever follows it.
            candidate, _ = decoder.raw_decode(response, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = response.find('{', start + 1)
    raise ValueError("No JSON object found in model response")

def create_excel_file(data, output_file_path):
    """Parse the model responses and write them to an Excel sheet.

    Args:
        data: Iterable of raw model responses; each one is parsed with
            ``extract_dict_from_response``.
        output_file_path: Destination path of the Excel file.

    A field missing from a response is stored as an empty cell, so a single
    incomplete answer no longer aborts the export with a ``KeyError``.
    """
    columns = [
        'Vertragsnummer',
        'Adresse Verbrauchsstelle',
        'Verbrauchte Menge',
        'Start abgerechneter Zeitraum',
        'Ende abgerechneter Zeitraum',
        'Energieart',
        'Fehler',
    ]

    records = []
    for entry in data:
        temp_dict = extract_dict_from_response(entry)
        print(temp_dict)
        # .get() yields None for fields the model left out.
        records.append({column: temp_dict.get(column) for column in columns})

    # Passing `columns` fixes the column order and keeps the header row even
    # when `data` is empty.
    df = pd.DataFrame(records, columns=columns)
    print(df)
    df.to_excel(output_file_path, index=False)
    print(f'Excel file created: {output_file_path}')

if __name__ == "__main__":
    # Folder where the PDF files are located
    pdf_folder = '../data'

    if os.path.exists(pdf_folder):
        # Run the extraction over all PDFs in the folder ...
        extracted_data = process_pdfs(pdf_folder)

        # ... and persist the results as an Excel sheet.
        output_file_path = 'extracted_data.xlsx'
        create_excel_file(extracted_data, output_file_path)
    else:
        print(f"The folder '{pdf_folder}' does not exist.")


ImportError: 
AutoModelForCausalLM requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.
