In [3]:
import json
import os
import re

import fitz  # PyMuPDF
from openai import OpenAI
# Read the API key from the environment so the secret is never stored in the notebook.
key = os.environ.get("OPENAI_API_KEY")
if not key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )

# Shared client used by process_text() for every chat-completion request.
client = OpenAI(
  api_key=key,
)


In [31]:
def remove_special_characters(text):
    """Return ``text`` with every non-ASCII character removed.

    Any run of characters outside the range U+0000-U+007F is deleted (not
    replaced), so the surrounding ASCII characters end up adjacent.
    """
    non_ascii_run = re.compile(r'[^\x00-\x7F]+')
    return non_ascii_run.sub('', text)

def extract_text(pdf_path):
    """Read every page of the PDF at ``pdf_path`` and return its text.

    Pages are visited in index order, their ``get_text()`` results are
    concatenated with no separator, and the combined string is passed
    through ``remove_special_characters`` before being returned.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [
            document[index].get_text()
            for index in range(document.page_count)
        ]

    return remove_special_characters("".join(page_texts))

In [23]:
def process_text(extracted_text, prompt_template):
    """Send the extracted text to the chat model and return the reply content.

    ``prompt_template`` must contain one positional ``{}`` placeholder; it is
    filled with ``extracted_text`` via ``str.format`` to build the user
    message. The content of the first returned choice is handed back as-is.
    """
    create_completion = client.chat.completions.create
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that converts text into JSON format.",
    }
    user_message = {
        "role": "user",
        "content": prompt_template.format(extracted_text),
    }

    completion = create_completion(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        temperature=0.3,
        max_tokens=16384,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# PDF processed in this run; the path is relative to the notebook's working directory.
pdf_path = 'minutes/mn20230124.pdf'
# Whole document as one string with non-ASCII characters stripped (see extract_text).
extracted_text = extract_text(pdf_path)


# Prompt sent as the user message. It is rendered with str.format(), so literal
# braces in the example schema are doubled ({{ }}) and the single {} at the end
# is the slot for the extracted text.
prompt_template = """
The following text contains meeting minutes. Please carefully extract **all** text and format it into a detailed JSON object with the following structure:


{{
  "meeting": {{
    "date": "YYYY-MM-DD",            // The date of the meeting
    "members": [
      {{
        "name": "Member Name",       // Name of each member
        "role": "Position"           // Role or title of the member
      }}
    ],
    "counsel": [
      {{
        "name": "Counsel Name",      // Name of counsel members
        "role": "Position"
      }}
    ],
    "staff": [
      {{
        "name": "Staff Name",        // Name of staff members
        "role": "Position"
      }}
    ],
    "others": [
      {{
        "name": "Other Name",        // Name of others (e.g., guests or external participants)
        "role": "Position or Affiliation"
      }}
    ]
  }},
  "meeting_items": [
    {{
      "item_number": "A1",           // Item number (e.g., 'A1', 'B2')
      "title": "Item Title",         // Title or description of the item
      "action_item": true,           // true if it's an action item, false otherwise
      "text": "all text with this item",
      "result": "Approved"           // Result or outcome of the agenda item (e.g., 'Approved', 'Deferred', 'No Action Taken')
    }}
  ]
}}

Make sure you extract **all text** between each item

Focus on preserving the structure and details of the meeting minutes and reflect them accurately in the JSON.

Text:
{}
"""

json_output = process_text(extracted_text, prompt_template)

# The model replies with text, typically wrapped in a ```json ... ``` fence.
# Dumping that string directly would write one escaped JSON string instead of
# the object, so strip the fence and parse the payload before saving.
reply_text = json_output or ""
fence_match = re.search(r"```(?:json)?\s*(.*?)\s*```", reply_text, flags=re.DOTALL)
json_payload = fence_match.group(1) if fence_match else reply_text.strip()
try:
    parsed_output = json.loads(json_payload)
except json.JSONDecodeError as exc:
    # Best effort: keep the raw reply (e.g. truncated or non-JSON output)
    # rather than lose it, but make the problem visible.
    print(f"Warning: model reply is not valid JSON ({exc}); saving the raw text instead.")
    parsed_output = json_output

output_file = 'output.json'
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(parsed_output, json_file, indent=4)

print(f"Data extracted and saved to {output_file}")
print(json_output)

Data extracted and saved to output.json
```json
{
  "meeting": {
    "date": "2023-01-24",
    "members": [
      {
        "name": "Ms. Dawn N. S. Chang",
        "role": "Chairperson"
      },
      {
        "name": "Mr. Michael Buck",
        "role": "Commissioner"
      },
      {
        "name": "Mr. Neil Hannahs",
        "role": "Commissioner"
      },
      {
        "name": "Dr. Aurora Kagawa-Viviani",
        "role": "Commissioner"
      },
      {
        "name": "Mr. Wayne Katayama",
        "role": "Commissioner"
      },
      {
        "name": "Mr. Paul Meyer",
        "role": "Commissioner"
      },
      {
        "name": "Ms. Kathleen Ho",
        "role": "Commissioner"
      }
    ],
    "counsel": [
      {
        "name": "Ms. Julie China, Esq.",
        "role": "Counsel"
      }
    ],
    "staff": [
      {
        "name": "Deputy M. Kaleo Manuel",
        "role": "Deputy Director"
      },
      {
        "name": "Mr. Barrett Won",
        "role": "Staff"
     

In [32]:
# Same PDF as the earlier cell; re-extracted here so extracted_text is
# (re)defined before the rule-based structuring further down.
pdf_path = 'minutes/mn20230124.pdf'
extracted_text = extract_text(pdf_path)

In [33]:
# Bare expression: the notebook displays the full extracted string for inspection.
extracted_text



In [34]:
def structure_to_json(extracted_text, date_pattern):
    """Split text into sections keyed by ``<date_pattern> H:M:S`` timestamps.

    Args:
        extracted_text: The full text to split.
        date_pattern: Regular-expression fragment matching the part of each
            stamp that precedes the `` digits:digits:digits`` time. It may
            contain alternation or its own groups.

    Returns:
        A list of ``{"Date": ..., "Content": [...]}`` dicts in document
        order. Text before the first stamp, if it is not blank, comes first
        with ``"Date": "N/A"``. ``Content`` holds the section's lines with
        whitespace-only lines dropped.
    """
    # Wrap the caller's fragment in a non-capturing group so alternation inside
    # it cannot swallow the time suffix. Sections are sliced from match spans
    # rather than re.split output, so capture groups inside date_pattern cannot
    # shift the date/content pairing.
    stamp_re = re.compile(f"(?:{date_pattern}) \\d+:\\d+:\\d+")

    def clean_content(content):
        return [line for line in content.split("\n") if line.strip()]

    structured_data = []
    matches = list(stamp_re.finditer(extracted_text))

    # Anything before the first stamp has no date of its own.
    preamble_end = matches[0].start() if matches else len(extracted_text)
    preamble = extracted_text[:preamble_end].strip()
    if preamble:
        structured_data.append({
            "Date": "N/A",
            "Content": clean_content(preamble)
        })

    for index, match in enumerate(matches):
        # A section runs from the end of its stamp to the start of the next one.
        if index + 1 < len(matches):
            section_end = matches[index + 1].start()
        else:
            section_end = len(extracted_text)
        content = extracted_text[match.end():section_end].strip()
        structured_data.append({
            "Date": match.group(0).strip(),
            "Content": clean_content(content)
        })

    return structured_data

# Fixed prefix that precedes each " H:M:S" time in the stamps to split on
# (presumably the meeting date as MMDDYY for this PDF — confirm against the text).
date_pattern = "012423"
structured_data = structure_to_json(extracted_text, date_pattern)
# Serialized for display only; nothing is written to disk here.
# NOTE: this rebinds json_output, which the earlier cell used for the model reply.
json_output = json.dumps(structured_data, indent=4)

print(json_output)

[
    {
        "Date": "N/A",
        "Content": [
            "MINUTES ",
            "FOR THE MEETING OF ",
            " THE COMMISSION ON WATER RESOURCE MANAGEMENT ",
            "DATE: ",
            "January 24, 2023 ",
            "TIME: ",
            "2:00 pm ",
            "PLACE: ",
            "DLNR Boardroom ",
            "1151 Punchbowl Street, 1st Floor ",
            "Honolulu, Hawaii 96813 ",
            "& Online via Zoom-Meeting ID: 880 5691 5295 ",
            "Chairperson Chang called the meeting of the Commission on Water Resource Management to order at 2:00 ",
            "p.m. and stated it is a hybrid meeting being held in the Kalanimoku Building boardroom, remotely via Zoom ",
            "and others are viewing via YouTube.  It was noted that people may testify via the information provided ",
            "online.  Chairperson Chang reminded the public not to use the chat feature for any comments as it presents ",
            "a Sunshine Law issue.  Chairper