In [3]:
!pip install -q -U google-genai

In [26]:
import re
from google import genai
from typing import Dict, List
import pandas as pd
import json

# NOTE: this script calls the Google Gemini API (via google-genai), not OpenAI; supply your own Gemini API key.

def _parse_llm_json(raw: str) -> Dict:
    """Return the JSON object embedded in an LLM reply.

    The model may wrap its answer in a Markdown code fence and/or surround it
    with explanatory text, so the outermost ``{...}`` span is located
    explicitly instead of relying on ``str.strip`` (which strips a *set of
    characters*, not a prefix/suffix).

    Raises:
        ValueError: if the reply is empty, contains no JSON object, or the
            object is not valid JSON (``json.JSONDecodeError`` is a subclass).
    """
    if not raw or not raw.strip():
        raise ValueError("LLM returned an empty response")
    fenced = re.search(r"```(?:json)?\s*(.*?)```", raw, re.DOTALL | re.IGNORECASE)
    payload = fenced.group(1) if fenced else raw
    start, end = payload.find("{"), payload.rfind("}")
    if start == -1 or end < start:
        raise ValueError("LLM response does not contain a JSON object")
    return json.loads(payload[start:end + 1])


def _coerce_amount(value):
    """Convert *value* to ``float`` on a best-effort basis; ``None`` if impossible.

    Accepts real numbers as well as strings such as ``"$49,850.00 (USD)"``,
    from which the first numeric token is taken.
    """
    if isinstance(value, bool):
        # bool is an int subclass; True/False is never a meaningful amount.
        return None
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        token = re.search(r"-?\d[\d,]*(?:\.\d+)?", value)
        if token:
            return float(token.group(0).replace(",", ""))
    return None


def _as_text(value) -> str:
    """Return *value* as a string, mapping ``None`` (JSON null) to ``''``."""
    return "" if value is None else str(value)


def extract_structured_data(text: str, *, client=None, model: str = 'gemini-2.0-flash') -> Dict:
    """
    Extracts structured transaction data from unstructured text using hybrid approach
    (regex for simple fields + LLM for complex parsing).

    Args:
        text: Raw text of a single transaction record.
        client: Optional, already-constructed ``genai.Client`` (or any object
            exposing ``models.generate_content(model=..., contents=...)``).
            When omitted, a client is created from the ``GEMINI_API_KEY`` or
            ``GOOGLE_API_KEY`` environment variable. Passing a client lets
            callers reuse one connection across many transactions.
        model: Name of the Gemini model to query.

    Returns:
        A dict with the keys ``Transaction ID``, ``Payer Name``,
        ``Receiver Name``, ``Transaction Details``, ``Amount`` (formatted as
        ``$1,234.56``) and ``Receiver Country``.

    Raises:
        RuntimeError: if no client is supplied and no API key is configured.
        ValueError: if the model reply cannot be parsed as a JSON object.
    """
    # Regex patterns for fields that follow a fixed "Label: value" layout.
    # The receiver-country pattern is non-greedy so that it binds to the first
    # "Address:" line following "Receiver:" rather than the last one in the text.
    patterns = {
        'transaction_id': r'Transaction ID:\s*([^\n]+)',
        'amount': r'Amount:\s*([^\n]+)',
        'receiver_country': r'Receiver:.*?Address:[^\n]*,[ \t]*([^\n,]+)',
    }

    extracted = {}
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.DOTALL)
        if match:
            extracted[field] = match.group(1).strip()

    # Use LLM for complex entity extraction. The reply is parsed with
    # json.loads below, so the prompt asks for JSON explicitly.
    llm_prompt = f"""
    Extract the following fields from this transaction as a single JSON object with exactly these keys:
    - payer_name
    - receiver_name
    - transaction_details
    - amount (numeric only)
    - receiver_country (ISO country code)
    
    Transaction Text:
    {text}
    """

    if client is None:
        import os

        # SECURITY: never embed the API key in source. NOTE(review): a literal
        # key was previously committed on this line; treat it as compromised
        # and revoke it.
        api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise RuntimeError(
                "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable"
            )
        client = genai.Client(api_key=api_key)

    response = client.models.generate_content(
        model=model,
        contents=llm_prompt,
    )
    llm_data = _parse_llm_json(response.text)

    # Prefer the LLM's amount; fall back to the regex capture, then to 0 so
    # that formatting never fails on a missing / null / string value.
    amount = _coerce_amount(llm_data.get('amount'))
    if amount is None:
        amount = _coerce_amount(extracted.get('amount'))
    if amount is None:
        amount = 0.0

    # Merge regex and LLM extractions
    final_data = {
        "Transaction ID": extracted.get('transaction_id', ''),
        "Payer Name": _as_text(llm_data.get('payer_name')),
        "Receiver Name": _as_text(llm_data.get('receiver_name')),
        "Transaction Details": _as_text(llm_data.get('transaction_details')),
        "Amount": f"${amount:,.2f}",
        "Receiver Country": (
            _as_text(llm_data.get('receiver_country'))
            or extracted.get('receiver_country', '')
        ),
    }

    # Post-processing: normalise any spelling that mentions the Caymans.
    if 'Cayman' in final_data["Receiver Country"]:
        final_data["Receiver Country"] = "Cayman Islands"

    return final_data

def process_unstructured_file(file_path: str) -> pd.DataFrame:
    """Processes unstructured transaction file into structured DataFrame.

    The file is expected to contain one or more transaction records separated
    by lines of three or more dashes. Each non-empty record is passed to
    ``extract_structured_data``; records that fail are reported on stdout and
    skipped so that one bad record does not abort the whole run.

    Args:
        file_path: Path to the UTF-8 encoded text file to parse.

    Returns:
        A DataFrame with one row per successfully parsed transaction (empty if
        none could be parsed).
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # corrupts non-ASCII characters (e.g. en dashes) on some systems.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Split into individual transactions. Trailing blanks after the dashes and
    # a final separator without a newline are both accepted.
    transactions = re.split(r'---+[ \t]*(?:\n|\Z)', content)

    structured_data = []
    for tx in transactions:
        if not tx.strip():
            continue
        try:
            structured_data.append(extract_structured_data(tx))
        except Exception as e:
            # Deliberate best-effort boundary: report and move on.
            print(f"Error processing transaction: {e}")

    return pd.DataFrame(structured_data)

# Example Usage
if __name__ == "__main__":
    # Parse the sample dataset into a table of structured transactions.
    df = process_unstructured_file("unstructured_dataset.txt")

    # Show the result as a Markdown table, preceded by a blank line and a heading.
    print()
    print("Structured Output:")
    rendered = df.to_markdown(index=False)
    print(rendered)

    # Persist the same table as CSV (without the index column).
    df.to_csv(path_or_buf="structured_transactions.csv", index=False)


Structured Output:
| Transaction ID   | Payer Name                     | Receiver Name               | Transaction Details                                    | Amount      | Receiver Country   |
|:-----------------|:-------------------------------|:----------------------------|:-------------------------------------------------------|:------------|:-------------------|
| TXN-2023-5A9B    | Global Horizons Consulting LLC | Bright Future Nonprofit Inc | Consulting fees for project Aurora Charitable Donation | $49,850.00  | KY                 |
| TXN-2023-7C2D    | Quantum Holdings Ltd           | Golden Sands Trading FZE    | Commodity Trade Settlement – Contract #DX-889          | $950,000.00 | AE                 |
