In [2]:
# Load necessary libraries
import pdfplumber
import json
import os
import regex as re
import requests
import textwrap

In [3]:
# Ollama endpoint that lists the models installed on the server
url = "http://llama-max-ollama.ai.wu.ac.at/api/tags"

# Start from an empty list so `models` is always defined, even if the request fails
models = []

try:
    # Without a timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(url, timeout=30)
except requests.RequestException as exc:
    # Connection errors, DNS failures and timeouts are reported like HTTP errors below
    print(f"Request failed: {exc}")
else:
    if response.status_code == 200:
        data = response.json()  # Convert response to JSON

        # Extract the list of models (empty when the key is missing)
        models = data.get("models", [])

        # Format and print each model's details
        print("\nAvailable Models:\n" + "="*50)
        for model in models:
            # Tolerate entries without a "details" object instead of raising KeyError
            details = model.get("details") or {}
            print(f"🔹 Model Name: {model['name']}")
            print(f"   🔸 Latest Version: {model['model']}")
            print(f"   🔸 Last Modified: {model['modified_at']}")
            print(f"   🔸 Size: {model['size'] / (1024**3):.2f} GB")  # Convert bytes to GB
            print(f"   🔸 Parameter Size: {details.get('parameter_size', 'Unknown')}")
            print(f"   🔸 Quantization Level: {details.get('quantization_level', 'Unknown')}")
            print(f"   🔸 Model Family: {details.get('family', 'Unknown')}")
            print("="*50)
    else:
        print(f"Error {response.status_code}: {response.text}")


Available Models:
🔹 Model Name: deepseek-r1:latest
   🔸 Latest Version: deepseek-r1:latest
   🔸 Last Modified: 2025-01-22T11:28:16.966385195Z
   🔸 Size: 4.36 GB
   🔸 Parameter Size: 7.6B
   🔸 Quantization Level: Q4_K_M
   🔸 Model Family: qwen2
🔹 Model Name: mistral:latest
   🔸 Latest Version: mistral:latest
   🔸 Last Modified: 2024-11-18T13:04:07.029310353Z
   🔸 Size: 3.83 GB
   🔸 Parameter Size: 7.2B
   🔸 Quantization Level: Q4_0
   🔸 Model Family: llama
🔹 Model Name: llama3.1:latest
   🔸 Latest Version: llama3.1:latest
   🔸 Last Modified: 2024-11-18T13:04:06.322299249Z
   🔸 Size: 4.34 GB
   🔸 Parameter Size: 8.0B
   🔸 Quantization Level: Q4_0
   🔸 Model Family: llama
🔹 Model Name: falcon2:latest
   🔸 Latest Version: falcon2:latest
   🔸 Last Modified: 2024-11-18T13:04:05.01127866Z
   🔸 Size: 5.94 GB
   🔸 Parameter Size: 11B
   🔸 Quantization Level: Q4_0
   🔸 Model Family: falcon
🔹 Model Name: gemma2:latest
   🔸 Latest Version: gemma2:latest
   🔸 Last Modified: 2024-11-18T13:04:05.674

In [6]:
# Extract the text
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF, passed unchanged to ``pdfplumber.open``.

    Returns:
        A single string containing each page's text followed by a newline,
        in page order. A page that yields no text contributes only the
        newline. Returns an empty string for a document without pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page with no text layer
        # (e.g. a scanned image); `or ""` keeps that from raising TypeError.
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    # One join instead of repeated string concatenation inside the loop
    return "".join(page_texts)

# Smoke-test the extractor on a single report and preview the start of the result
sample_report_path = "./reports_ie/IE-6218-200111 Collision RRME Rosslare.pdf"
pdf_text = extract_text_from_pdf(sample_report_path)

text_preview = pdf_text[:500]  # only the first 500 characters, to keep the output short
print("Extracted Text Sample:", text_preview)

Extracted Text Sample: Railway Accident
Investigation Unit
Ireland
INVESTIGATION REPORT
Collision between an Iarnród Éireann passenger
train and rail-mounted maintenance equipment,
Rosslare, Wexford, 11th January 2020
RAIU Investigation Report No: 2020 – R004
Issued: 16th December 2020
Collision between an IÉ passenger train and RMME, Rosslare, Wexford, 11th January 2020
Report Description
Report publication
This report is published by the Railway Accident Investigation Unit (RAIU). The copyright in
the enclosed repor


In [None]:
# Show only the leading 5000 characters (the slice used as model context)
# instead of dumping the whole report into the cell output.
pdf_text[:5000]

In [7]:
# Ollama text-generation endpoint
url = "http://llama-max-ollama.ai.wu.ac.at/api/generate"

# Request body for the generate call
payload = {
    "model": "deepseek-r1:latest",  # must match a model name listed by /api/tags
    # Interpolate the text itself: wrapping the slice in a list would embed the
    # list's repr (brackets, quotes, escaped "\n") in the prompt instead of the report.
    "prompt": f"Consider this context: {pdf_text[:5000]}. Please extract the entities date, time, and location. Output the information in JSON format.",
    "stream": False  # ask for a single reply, parsed below with response.json()
}

# Set headers
headers = {"Content-Type": "application/json"}

try:
    # (connect, read) timeouts: generation can be slow, but the cell must not hang forever
    response = requests.post(url, json=payload, headers=headers, timeout=(10, 600))
except requests.RequestException as exc:
    # Connection errors and timeouts are reported like HTTP errors below
    print(f"Request failed: {exc}")
else:
    if response.status_code == 200:
        try:
            data = response.json()  # Parse response JSON
        except ValueError:
            # Every JSON decode error response.json() can raise is a ValueError,
            # whichever JSON backend requests happens to be using.
            print(f"Invalid JSON response: {response.text}")
        else:
            if "response" in data:
                # Wrap line by line: textwrap.fill on the whole text would turn every
                # newline into a space and flatten the JSON the prompt asks for.
                formatted_response = "\n".join(
                    textwrap.fill(line, width=80)
                    for line in data["response"].splitlines()
                )
                print("\n" + "="*80)
                print("Generated Summary:")
                print("="*80)
                print(formatted_response)
                print("="*80)
            else:
                print("No 'response' key found in the JSON.")
    else:
        print(f"Error {response.status_code}: {response.text}")


Generated Summary:
<think> Alright, I need to figure out how to extract the date, time, and
location from this given context. Let's start by reading through the text
carefully.  First, I'll look for any dates mentioned. Scanning through, I see
"11th January 2020." That seems like a clear date: day, month, year. So that's
one part done.  Next, checking for time information. The context doesn't mention
any specific times when the collision occurred or when the report was issued. It
only states the date of the collision and the publication date of the report. No
hours or minutes are provided here, so I can’t extract a time from this text.
Now, for location. The collision happened in Rosslare, Wexford. Wexford is a
city in Ireland, so that gives me both the town and the country. Since it
doesn't specify a specific address within the town, just the general area as
Rosslare, that should be sufficient for the location.  Putting this all
together, I have the date as 11th January 2020, no time