In [None]:
# Mount Google Drive at /content/drive so the Excel files referenced by the
# path constants in this notebook can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Workbook paths on the mounted Drive: the first is the input workbook that is
# read, the second is the workbook the results are written to. The
# configuration cell further down aliases these to INPUT_EXCEL_FILE and
# OUTPUT_EXCEL_FILE.
INPUT_EXCEL_FILE1 ="/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready_facts_AI_sampled.xlsx"
OUTPUT_EXCEL_FILE1 ="/content/drive/MyDrive/worker_comp_work/WC_Final/research_ready_facts_AI_sampled_openai_gpt_4.1_mini.xlsx"

In [None]:
!pip install --upgrade openai

In [None]:
!pip install openai pandas openpyxl python-dotenv

In [None]:
import openai
import pandas as pd
from google.colab import userdata

In [None]:
# Read the OpenAI API key from Colab's userdata store under the name
# "OPENAI_API_KEY", so the key itself never appears in the notebook source.
openai_api_key = userdata.get("OPENAI_API_KEY")

In [None]:
!pip install python-dotenv

In [None]:
import openai
import pandas as pd
import time
import os
import json # To parse JSON output from the model
from dotenv import load_dotenv # For loading API key from .env file

# --- Configuration ---

# Load variables from a local .env file into the environment, if one exists.
# Nothing in this cell reads the API key from the environment afterwards.
load_dotenv()

# 1. OpenAI API Key Setup
# The key comes from `openai_api_key`, which an earlier cell fills via
# google.colab's userdata.get("OPENAI_API_KEY").
openai.api_key = openai_api_key

# 2. Specify the OpenAI Model(s)
# Every name in this list is processed in turn by the main loop and passed as
# the `model` argument of the chat-completions call.
MODEL_NAME_LIST = ["gpt-4.1-mini"] # Model used for this run

# 3. Excel File Paths
INPUT_EXCEL_FILE = INPUT_EXCEL_FILE1 # Your input file
# Output workbook; its path is defined alongside the input path in an earlier cell
OUTPUT_EXCEL_FILE = OUTPUT_EXCEL_FILE1

# 4. Column Names (Adjust if different in your Excel)
# The main script aborts if the input workbook has no column with exactly this name.
FACTS_COLUMN = 'Annonymized_Facts'

# 5. Time Delays (in seconds)
# Delay AFTER processing each row (in the main loop); 0 disables it
MAIN_LOOP_DELAY_SECONDS = 0
# Delay BEFORE each API call (inside the function)
API_CALL_DELAY_SECONDS = 0  # Currently no delay; raise this if rate-limit errors appear

# --- Function to get predictions from OpenAI ---
def get_openai_predictions(facts_text, MODEL_NAME):
    """
    Ask an OpenAI chat model to guess the identifying details of an anonymized case.

    The facts are sent with a system prompt that requests a JSON object holding
    the keys "Case_ID", "Year", "Plaintiff_Name" and "Defendant_Name". Those
    keys are then mapped onto the output column names used by the caller.

    Args:
        facts_text (str): The text from the 'Annonymized_Facts' column.
        MODEL_NAME (str): The OpenAI model to use for the prediction.

    Returns:
        dict: Always a dictionary with the keys 'Predicted_ID', 'Predicted_Year',
              'Predicted_Plaintiff' and 'Predicted_Defendent'. Every value is one of:
              - "N/A" when facts_text is empty or not a string (no API call is made);
              - the model's answer, or "Unknown" when the key is missing or null;
              - "Parsing Error" when the reply is not a JSON object;
              - "Error" when the API call failed (rate limit, API error, other).

    Raises:
        openai.AuthenticationError: Re-raised after logging, because every
            subsequent call would fail the same way.
    """
    # Maps each JSON key requested in the system prompt to the column it fills.
    key_to_column = {
        "Case_ID": "Predicted_ID",
        "Year": "Predicted_Year",
        "Plaintiff_Name": "Predicted_Plaintiff",
        "Defendant_Name": "Predicted_Defendent",
    }
    columns = list(key_to_column.values())

    if not isinstance(facts_text, str) or not facts_text.strip():
        print("Warning: Empty or invalid facts text provided. Returning default predictions.")
        return dict.fromkeys(columns, "N/A")

    # New prompt to extract specific entities
    system_prompt = """
    You are an expert at extracting case information from legal summaries. Based solely on your internal training data and knowledge (no web search), identify the following for each anonymized workers' compensation case:
    "Case_ID": (string, predicted Case ID, or "Unknown" if not found)
    "Year": (string, predicted Year the case was heard, or "Unknown" if not found)
    "Plaintiff_Name": (string, predicted Plaintiff's Name, or "Unknown" if not found)
    "Defendant_Name": (string, predicted Defendant's Name, or "Unknown" if not found)

    If a piece of information is explicitly stated as anonymized or cannot be confidently extracted, use "Unknown" for that specific key.
    Do NOT include any other text or explanation in your response, only the JSON object.
    """

    user_prompt = f"""
    You are given the following anonymized workers compensation case. Now predict Case ID, Year of this Case heard, Plaintiff Name and Defendant Name.

    Anonymized Case Facts:
    ---
    {facts_text}
    ---
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]

    # Pessimistic default: stays in place if the API call fails below.
    predicted_values = dict.fromkeys(columns, "Error")

    try:
        # Skip both the message and the sleep when no delay is configured,
        # mirroring how the main loop treats MAIN_LOOP_DELAY_SECONDS.
        if API_CALL_DELAY_SECONDS > 0:
            print(f"Waiting for {API_CALL_DELAY_SECONDS} second(s) before API call...")
            time.sleep(API_CALL_DELAY_SECONDS)

        print(f"Making API call to {MODEL_NAME} for extraction...")
        response = openai.chat.completions.create(
            model=MODEL_NAME,
            messages=messages,
            temperature=1, # Sampling is non-deterministic at 1; set to 0 for more repeatable output
            response_format={"type": "json_object"} # Instructs model to return JSON
        )
        print("API call complete.")

        # `content` can be None (e.g. when the model returns no text); treat
        # that like an unparseable reply instead of crashing on .strip().
        raw_content = response.choices[0].message.content
        response_content = (raw_content or "").strip()

        try:
            parsed_json = json.loads(response_content)
        except json.JSONDecodeError:
            parsed_json = None

        if isinstance(parsed_json, dict):
            for json_key, column in key_to_column.items():
                value = parsed_json.get(json_key)
                # A JSON null is as uninformative as a missing key.
                predicted_values[column] = "Unknown" if value is None else value
        else:
            # Covers invalid JSON as well as valid JSON that is not an object.
            print(f"Warning: Could not parse JSON from response: '{response_content}' for facts: '{facts_text[:100]}...'")
            predicted_values = dict.fromkeys(columns, "Parsing Error")

    except openai.RateLimitError as e:
        print(f"OpenAI API rate limit exceeded: {e}. Consider increasing API_CALL_DELAY_SECONDS.")
        # The "Error" placeholders are returned for this row.
    except openai.AuthenticationError as e:
        print(f"OpenAI Authentication Error: {e}. Check your API key.")
        # Re-raise rather than calling exit(): inside a Jupyter/Colab kernel
        # exit() tears down the kernel instead of simply stopping the run, and
        # there is no point in continuing if auth fails.
        raise
    except openai.APIError as e:
        print(f"OpenAI API returned an API Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during OpenAI API call for facts: '{facts_text[:100]}...'. Error: {e}")

    return predicted_values

# --- Main Processing Logic ---
# Reads the input workbook, asks the model for predictions row by row, writes
# them into four prediction columns and saves the result as a new workbook.

print(f"\nReading Excel file: {INPUT_EXCEL_FILE}")
try:
    df = pd.read_excel(INPUT_EXCEL_FILE)
except FileNotFoundError:
    print(f"Error: Input file not found at {INPUT_EXCEL_FILE}")
    # Re-raise instead of calling exit(): inside a Jupyter/Colab kernel exit()
    # tears down the kernel rather than cleanly stopping this cell.
    raise
except Exception as e:
    print(f"Error reading Excel file: {e}")
    raise
else:
    print(f"Successfully read {len(df)} rows.")

# Check for input column
if FACTS_COLUMN not in df.columns:
    print(f"Error: Column '{FACTS_COLUMN}' not found in the Excel file.")
    raise ValueError(f"Column '{FACTS_COLUMN}' not found in the Excel file.")

# Initialize new prediction columns
# These names are also the keys of the dict returned by get_openai_predictions.
output_columns = ["Predicted_ID", "Predicted_Year", "Predicted_Plaintiff", "Predicted_Defendent"]
for col in output_columns:
    if col not in df.columns:
        df[col] = "Not Processed" # Initialize with a placeholder

# NOTE(review): every model writes into the same columns and the same output
# file, so with more than one entry in MODEL_NAME_LIST only the last model's
# predictions survive. Confirm this is intended before adding models.
for selected_model in MODEL_NAME_LIST:
    print(f"\n===== Starting processing for Model: {selected_model} =====")
    # A single run per model; widen the range to repeat the extraction.
    for run_number in range(1, 2): # Just one run for prediction
        print(f"\n--- Model: {selected_model}, Run: {run_number} ---")
        print(f"Predicting details into columns: {', '.join(output_columns)}")

        print(f"\nProcessing {len(df)} rows using OpenAI model: {selected_model} (Run {run_number})...")

        total_rows = len(df)
        # `position` is the 1-based row counter used for progress output and
        # the last-row check; `index` is the label used to write results back.
        # Keeping them separate stays correct even without a default 0..n-1 index.
        for position, (index, raw_facts) in enumerate(df[FACTS_COLUMN].items(), start=1):
            print(f"\n--- Processing row {position} of {total_rows} (Model: {selected_model}, Run: {run_number}) ---")
            facts = str(raw_facts) if pd.notna(raw_facts) else ""

            # Get the predictions from OpenAI
            predictions = get_openai_predictions(facts, selected_model)

            # Update the DataFrame with the predicted values
            for col in output_columns:
                df.loc[index, col] = predictions.get(col, "Error")

            print(f"Row {position}: Predicted ID: {predictions.get('Predicted_ID', 'Error')}, "
                  f"Year: {predictions.get('Predicted_Year', 'Error')}, "
                  f"Plaintiff: {predictions.get('Predicted_Plaintiff', 'Error')}, "
                  f"Defendant: {predictions.get('Predicted_Defendent', 'Error')}")

            # No need to wait after the final row.
            if position < total_rows and MAIN_LOOP_DELAY_SECONDS > 0:
                print(f"Waiting for {MAIN_LOOP_DELAY_SECONDS} second(s) before next row...")
                time.sleep(MAIN_LOOP_DELAY_SECONDS)

        print(f"\n--- Finished Run {run_number} for Model: {selected_model} ---")

        # Save the DataFrame after each run. A failed save is only reported:
        # `df` stays in memory so the save can be retried without new API calls.
        print(f"Saving results to {OUTPUT_EXCEL_FILE}...")
        try:
            df.to_excel(OUTPUT_EXCEL_FILE, index=False)
            print(f"Successfully saved results to: {OUTPUT_EXCEL_FILE}")
        except Exception as e:
            print(f"Error saving results to Excel file: {e}")

    print(f"\n===== Finished all runs for Model: {selected_model} =====")

print("\nAll processing complete. Script finished.")