In [6]:
import pandas as pd

# Path to the Excel workbook (.xlsx) that is loaded into a DataFrame below
file_path = "../evaluation_data/documents.xlsx"

In [7]:
try:
    # Load the workbook into a DataFrame. header=None treats the first
    # spreadsheet row as data, and `names` supplies the two column labels.
    df = pd.read_excel(file_path, names=["name", "content"], header=None)

    print("✅ Successfully connected to and read the Excel file.")
    print("\n--- First 5 Rows ---")
    print(df.head())
    print("\n--- Data Structure ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called bare; wrapping it in print() appended a stray "None".
    df.info()

except FileNotFoundError:
    print(f"❌ Error: The file at '{file_path}' was not found.")
except pd.errors.EmptyDataError:
    # NOTE(review): EmptyDataError is documented for the CSV parser; kept for
    # backward compatibility — confirm whether read_excel can ever raise it.
    print("❌ Error: The file is empty.")

✅ Successfully connected to and read the CSV file.

--- First 5 Rows ---
     name                                            content
0  Test_A  <START OF DOCUMENT: Test_A.docx AltName:Source...
1  Test_C  <START OF DOCUMENT: Test_C.docx AltName:Source...
2  Test_D  <START OF DOCUMENT: Test_D.docx AltName:Source...
3  Test_F  <START OF DOCUMENT: Test_F.docx AltName:Source...

--- Data Structure ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     4 non-null      object
 1   content  4 non-null      object
dtypes: object(2)
memory usage: 196.0+ bytes
None


In [21]:
# Display the DataFrame's (rows, columns) dimensions.
# NOTE(review): the saved output of this cell reports 3 rows, while the saved
# output of the load cell reports 4 entries. The execution counts are far
# apart, so `df` was presumably changed or reloaded in between — re-run the
# notebook top to bottom to confirm.
df.shape

(3, 2)

In [24]:
from scripts.preprocess import clean_document_for_llm
# Run the cleaner over every document body in row order and report the length
# of each cleaned text. The loop variables stay bound after the loop, so the
# last raw/clean pair remains available for inspection.
for i, raw_text in enumerate(df["content"]):
    clean_text = clean_document_for_llm(raw_text)
    print(f"✅ {i} text length: {len(clean_text)}")

✅ 0 text length: 3311
✅ 1 text length: 1666
✅ 2 text length: 2053


In [13]:
from scripts.read_file import parse_label_document, parse_document_to_json

# Location of the plain-text label file that the next cell reads.
# NOTE(review): this rebinds `file_path`, which earlier held the spreadsheet
# path, so re-running the spreadsheet-loading cell afterwards would target
# this file instead. The relative base also differs ("evaluation_data/" here
# vs "../evaluation_data/" there) — confirm the intended working directory.
file_path = "evaluation_data/labels.txt"

In [17]:
try:
    # Read the whole label file as UTF-8 text; the handle is closed as soon
    # as the with-block exits.
    with open(file_path, "r", encoding="utf-8") as labels_file:
        raw_labels = labels_file.read()
except FileNotFoundError:
    print(f"❌ Error: The file at '{file_path}' was not found.")
except UnicodeDecodeError:
    print(
        "❌ Error: Could not decode using 'utf-8'. Try specifying a different encoding (e.g., 'latin-1')."
    )
else:
    # Only reached when the read succeeded: show a short preview of the text.
    print("✅ File successfully read.")
    print("\n--- File Content Snippet ---")
    print(f"{raw_labels[:500]}...")  # first 500 characters only

✅ File successfully read.

--- File Content Snippet ---
Test A

###Name #Orval O'Riocht
###Name #Liam O'Riocht
###Name #Michael Murphy
###Name #Sean O'Malley
###Company_Name #The Right Brothers
###Company_Name #Bank of Ireland
###Address #15 Grafton Street, Dublin 2, Ireland
###Address #42 Merrion Square, Dublin 2, Ireland
###Address #28 Fitzwilliam Square, Dublin 2, Ireland
###Address #17 Stephen's Green, Dublin 2, Ireland
###Address #Unit 7, Dublin Industrial Estate, Glasnevin, Dublin 11, Ireland
###PPS_Number #8472639T
###PPS_Number #6159287K
###L...


In [None]:
# Imported here so this cell runs on a fresh kernel: the notebook's only other
# `import json` sits in a later cell, so a top-to-bottom run would otherwise
# fail with a NameError at json.dumps below.
import json

# Parse the raw label text into a dictionary keyed by document.
structured_dict = parse_document_to_json(raw_labels)

# Convert the dictionary into a human-readable JSON string. ensure_ascii=False
# keeps non-ASCII characters unescaped, matching how the same dictionary is
# written to disk in the save cell.
json_output = json.dumps(structured_dict, indent=4, ensure_ascii=False)

print("\n--- Parsed and Structured JSON Output Snippet ---")
print(json_output[:1000] + "...")  # preview only the first 1000 characters


--- Parsed and Structured JSON Output Snippet ---
{
    "Test A": {
        "Name": [
            "Orval O'Riocht",
            "Liam O'Riocht",
            "Michael Murphy",
            "Sean O'Malley"
        ],
        "Company_Name": [
            "The Right Brothers",
            "Bank of Ireland"
        ],
        "Address": [
            "15 Grafton Street, Dublin 2, Ireland",
            "42 Merrion Square, Dublin 2, Ireland",
            "28 Fitzwilliam Square, Dublin 2, Ireland",
            "17 Stephen's Green, Dublin 2, Ireland",
            "Unit 7, Dublin Industrial Estate, Glasnevin, Dublin 11, Ireland"
        ],
        "PPS_Number": [
            "8472639T",
            "6159287K"
        ],
        "License_Number": [
            "8472639",
            "AML-IE-8472639",
            "6159287",
            "CA-IE-6159287"
        ],
        "Phone_Number": [
            "+353-1-485-2739",
            "+353-1-842-5731",
            "+353-1-673-4821",
            "+353

In [20]:
import json

output_file_path = "parsed_data.json"

try:
    # Stream the dictionary straight into the file as UTF-8 JSON:
    # 4-space indentation, non-ASCII characters written unescaped.
    with open(output_file_path, "w", encoding="utf-8") as json_file:
        json.dump(structured_dict, json_file, indent=4, ensure_ascii=False)
except Exception as err:
    print(f"❌ Error occurred while saving the file: {err}")
else:
    # Only reached when the write completed without raising.
    print(f"✅ Successfully saved structured data to: **{output_file_path}**")

✅ Successfully saved structured data to: **parsed_data.json**
