In [None]:
import openai
from dotenv import load_dotenv
import os
import streamlit as st
from openai import OpenAI, beta
import pandas as pd
import sys
import io
from pydantic import BaseModel, Field
import json
from IPython.display import Markdown as md
import streamlit
%matplotlib inline

# Load variables from a .env file (python-dotenv) into the process environment
# before any of the settings below are read.
load_dotenv()

# The model name is optional and defaults to gpt-4o-mini; the API key is
# mandatory, so a missing key fails fast with KeyError at import time.
MODEL = os.environ.get("MODEL_CHOICE", "gpt-4o-mini")
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
CLIENT = OpenAI(api_key=OPENAI_API_KEY)

# Structured-response schema for code generation. It is passed as
# `response_format` to the chat-completions parse call, so the model's reply is
# JSON carrying exactly these fields.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic would emit a docstring as the schema description.
class PandasCode(BaseModel):
    # Generated pandas source code.
    code: str = Field(description="Pandas code to be implemented")
    # Free-text explanation accompanying the generated code.
    explanation: str = Field(description="String explaining the code")

# Structured-response schema for the final write-up. It is passed as
# `response_format` when synthesizing the answer, so the reply is JSON with
# these four string fields.
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic would emit a docstring as the schema description.
class SynthesizeFinalAnswer(BaseModel):
    # The user's question, echoed back by the model.
    original_question: str = Field(description="The original question asked by the user")
    # The direct answer to the question.
    final_answer: str = Field(description="The final answer synthesized by the model")
    # Extra insights derived from the answer.
    analysis: str = Field(description="Additional analysis or insights as a result of the final answer")
    # How the answer was arrived at.
    method: str = Field(description="The method used to synthesize the final answer")

def generate_pandas_code(nl_instruction=None, df_preview=None, client=None,model=None):
    """
    Ask the model for pandas code that carries out a natural-language task.

    The model is told that a DataFrame named ``df`` already exists and that the
    code must finish by assigning its output to a variable named ``result``.

    Args:
        nl_instruction: The task to accomplish, in plain language.
        df_preview: Text preview of the DataFrame, included in the prompt.
        client: OpenAI client to use; defaults to the module-level ``CLIENT``.
        model: Model name to use; defaults to the module-level ``MODEL``.

    Returns:
        str: The generated Python source (the ``code`` field of the structured
        ``PandasCode`` response).
    """
    # Fall back to the module-level defaults so the helper can be called with
    # only an instruction and a preview instead of failing on ``None.beta``.
    if client is None:
        client = CLIENT
    if model is None:
        model = MODEL

    # Built by implicit concatenation so the prompt carries no stray quote
    # characters or source-code indentation.
    system_prompt = (
        "You are an expert Python programmer. "
        "Your response must contain only valid Python code that uses pandas. "
        "Assume a pandas DataFrame called 'df' already exists. "
        "IMPORTANT: Your code must end with an assignment to a variable named 'result'. "
        "Do not include any markdown formatting, explanations, or extra text."
    )

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": (
                f"Below is a preview of a pandas DataFrame:\n{df_preview}\n\n"
                f"Write Python code to accomplish the following task:\n{nl_instruction}"
            )
        }
    ]

    response = client.beta.chat.completions.parse(
        model=model,
        messages=messages,
        temperature=0.0,
        max_tokens=8000,
        response_format = PandasCode
    )

    # The structured reply arrives as a JSON string; only the code is returned,
    # the accompanying explanation is not used by callers.
    payload = json.loads(response.choices[0].message.content.strip())
    return payload['code']

def correct_code(code_str=None, nl_instruction=None, df_preview=None, client=None, model=None, error_message=""):
    """
    Ask the model to repair (or verify) previously generated pandas code.

    The original code, the error text, the DataFrame preview and the task
    description are all sent to the model. ``error_message`` may be empty, in
    which case the call acts as a verification pass over the code.

    Args:
        code_str: The Python source to correct.
        nl_instruction: The task the code is supposed to accomplish.
        df_preview: Text preview of the DataFrame, included in the prompt.
        client: OpenAI client to use; defaults to the module-level ``CLIENT``.
        model: Model name to use; defaults to the module-level ``MODEL``.
        error_message: Text of the error raised when the code was executed.

    Returns:
        str: The corrected Python source (the ``code`` field of the structured
        ``PandasCode`` response).
    """
    # Honour the caller's client/model; previously ``model`` was ignored in
    # favour of the global. The globals remain the fallback when omitted.
    if client is None:
        client = CLIENT
    if model is None:
        model = MODEL

    system_prompt = f"""You are an expert Python programmer. 
                The following code produced an error during execution. 
                Your task is to correct the code so that it runs successfully. 
                The code should use pandas and assume a DataFrame named 'df' exists. 
                Ensure the final corrected code assigns its output to a variable named 'result'.
                Verify the code answers the following task or question: {nl_instruction}, modifying the code as needed.
                Output only the corrected Python code, with no explanations."""

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": (
                f"Original code:\n{code_str}\n\n"
                f"Error encountered:\n{error_message}\n\n"
                f"DataFrame preview:\n{df_preview}\n\n"
                f"Task description:\n{nl_instruction}"
            )
        }
    ]

    response = client.beta.chat.completions.parse(
        model=model,
        messages=messages,
        temperature=0.0,
        max_tokens=8000,
        response_format = PandasCode
    )

    # The structured reply arrives as a JSON string; only the code is returned.
    payload = json.loads(response.choices[0].message.content.strip())
    return payload['code']

def execute_generated_code(code_str=None, exec_env=None, nl_instruction=None,
                           df_preview=None, client=None, model=None,
                           max_corrections=3):
    """
    Execute generated Python code and return the value it leaves in ``result``.

    If execution raises, the failing code and the error text are sent to
    ``correct_code`` and the corrected version is executed instead. This is
    repeated up to ``max_corrections`` times, each round starting from the most
    recent version of the code.

    Args:
        code_str: Python source to execute.
        exec_env: Dict used as the global namespace for ``exec``; it is mutated
            by the executed code. A fresh dict is used when omitted.
        nl_instruction: Task description forwarded to ``correct_code``. Falls
            back to ``exec_env["task_description"]`` (or ``""``).
        df_preview: DataFrame preview forwarded to ``correct_code``. Falls back
            to ``exec_env["df_preview"]`` (or ``""``).
        client: OpenAI client forwarded to ``correct_code``.
        model: Model name forwarded to ``correct_code``.
        max_corrections: Maximum number of correction round-trips after the
            first failed execution.

    Returns:
        tuple: ``(result, code)`` on success, where ``result`` is
        ``exec_env.get("result")`` and ``code`` is the source that ran cleanly;
        ``(None, error_text)`` when every attempt failed.
    """
    if exec_env is None:
        exec_env = {}

    # Retry context may be supplied through exec_env under these keys; explicit
    # arguments take precedence when given.
    if nl_instruction is None:
        nl_instruction = exec_env.get("task_description", "")
    if df_preview is None:
        df_preview = exec_env.get("df_preview", "")

    current_code = code_str
    corrections_left = max(0, max_corrections)

    while True:
        try:
            # SECURITY: this runs model-generated code with the full privileges
            # of the current process. Only use it in a trusted or sandboxed
            # environment.
            exec(current_code, exec_env)
        except Exception as exc:
            # Generated code can fail in arbitrary ways, so the catch is
            # deliberately broad; the message is fed back to the model.
            error_text = str(exc)
        else:
            return exec_env.get("result", None), current_code

        print(f"Error during code execution: {error_text}")
        if corrections_left == 0:
            return None, error_text
        corrections_left -= 1

        print(f"Code that failed: {current_code}")
        print("Attempting to correct the code...")
        # Keyword arguments keep the error text out of the ``client`` slot.
        current_code = correct_code(
            code_str=current_code,
            nl_instruction=nl_instruction,
            df_preview=df_preview,
            client=client,
            model=model,
            error_message=error_text,
        )
        print(f"Corrected code: {current_code}")

def synthesize_final_answer(original_question=None, execution_result=None, client = None, model = None):
    """
    Ask the model to turn an execution result into a written answer.

    Args:
        original_question: The question the user asked.
        execution_result: The value produced by the executed pandas code; it is
            interpolated into the prompt as text.
        client: OpenAI client to use; defaults to the module-level ``CLIENT``.
        model: Model name to use; defaults to the module-level ``MODEL``.

    Returns:
        tuple: ``(question, answer, analysis, method)`` taken from the
        structured ``SynthesizeFinalAnswer`` response.
    """
    # Fall back to the module-level defaults so the helper can be called with
    # only a question and a result instead of failing on ``None.beta``.
    if client is None:
        client = CLIENT
    if model is None:
        model = MODEL

    # Built by implicit concatenation so the prompt carries no stray quote
    # characters or source-code indentation.
    system_prompt = (
        "You are an expert data analyst. "
        "Based on the following execution result from a pandas operation and the original question, "
        "provide a clear and concise final answer. "
        "Expand upon the answer to draw out insights or implications as needed, "
        "but keep it relevant to the original question."
    )
    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": (
                f"Original question: {original_question}\n\n"
                f"Execution result: {execution_result}"
            )
        }
    ]
    response = client.beta.chat.completions.parse(
        model=model,
        messages=messages,
        temperature=0.0,
        max_tokens=4000,
        response_format = SynthesizeFinalAnswer
    )

    # The structured reply arrives as a JSON string with the four schema fields.
    payload = json.loads(response.choices[0].message.content.strip())
    return (
        payload['original_question'],
        payload['final_answer'],
        payload['analysis'],
        payload['method'],
    )

def process_user_instruction(instruction=None, csv_file_path=None , client = None, model=None):
    """
    Run the full pipeline for one instruction: generate pandas code, pass it
    through a correction/verification call, execute it, and synthesize the
    final answer.

    Args:
        instruction (str): The natural language instruction from the user.
        csv_file_path: Path to the CSV file, or any other input accepted by
            ``pd.read_csv`` (the Streamlit app passes an uploaded file object).
        client: OpenAI client passed to the generation, correction and
            synthesis calls.
        model (str): Model name passed to the same calls.

    Returns:
        dict: A dictionary with the keys ``Question``, ``Answer``, ``Analysis``,
        ``Method``, ``Code`` and ``Execution_Result``.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file_path)

    # Only the first rows are shown to the model, as plain text.
    df_preview = df.head().to_string()

    # Generate pandas code based on the instruction and preview
    code = generate_pandas_code(nl_instruction = instruction, df_preview=df_preview, client=client,model=model)

    # Verification pass: no error has occurred yet, so the error message is empty.
    corrected_code = correct_code(code_str=code, nl_instruction=instruction, df_preview=df_preview, client=client, model=model, error_message="")

    # Namespace for the generated code. Besides the DataFrame it carries the
    # task description and preview, which execute_generated_code reads from
    # this dict when it has to request a correction after a failed run.
    exec_env = {
        "df": df,
        "task_description": instruction,
        "df_preview": df_preview,
    }

    execution_result, final_code = execute_generated_code(code_str=corrected_code, exec_env=exec_env)

    # Synthesize the final answer
    question, answer, analysis, method = synthesize_final_answer(original_question=instruction, execution_result=execution_result, client = client, model = model)

    # Return the results
    return {
        "Question": question,
        "Answer": answer,
        "Analysis": analysis,
        "Method": method,
        "Code": final_code,
        "Execution_Result": execution_result
    }

def llm_analysis_app():
    """
    Streamlit front end for the analysis pipeline.

    Collects an instruction and a CSV upload, runs ``process_user_instruction``
    with the module-level ``CLIENT`` and ``MODEL`` once both are provided, and
    renders each part of the returned dictionary in its own section.
    """
    st.title("LLM Augmented Analysis")

    instruction = st.text_area("Enter your instruction:")

    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

    # Run only when there is both a file and a non-blank instruction.
    if uploaded_file is not None and instruction.strip():
        # The uploaded file object is handed over in place of a file path.
        result = process_user_instruction(
            instruction=instruction, 
            csv_file_path=uploaded_file, 
            client=CLIENT, 
            model=MODEL
        )

        if result:
            st.subheader("Results")
            
            # Display each category
            st.subheader("Question")
            st.write(result.get("Question", "No question generated."))

            st.subheader("Answer")
            st.write(result.get("Answer", "No answer generated."))

            st.subheader("Analysis")
            st.write(result.get("Analysis", "No analysis provided."))

            st.subheader("Method")
            st.write(result.get("Method", "No method provided."))

            st.subheader("Code")
            st.code(result.get("Code", "No code generated."), language="python")

            st.subheader("Execution Result")
            st.write(result.get("Execution_Result", "No execution result available."))
        else:
            st.error("No results were generated. Please check your input.")
    else:
        st.info("Please provide an instruction and upload a CSV file.")

In [267]:
result

{'Question': 'identify outliers in the data',
 'Answer': 'The identified outliers in the dataset are primarily characterized by their unusually high magnitudes and depths. Notably, the entries with magnitudes above 6.0, such as the earthquakes recorded at depths of 646.456 km and 394.464 km, stand out as significant outliers.',
 'Analysis': 'These outliers suggest that there are extreme seismic events occurring, which may indicate geological phenomena that warrant further investigation. The presence of earthquakes with such high magnitudes and depths could imply tectonic activity in less studied regions or unusual geological formations. Understanding these outliers can help in assessing seismic risks and improving predictive models for future earthquakes.',
 'Method': "The final answer was synthesized by analyzing the provided data for entries with extreme values in the 'mag' (magnitude) and 'depth' columns, identifying those that significantly deviate from the typical range observed i

In [258]:
# df = pd.read_csv('/Users/5616bit/Documents/DS/aidev/data/sample_earthquake_data.csv')
# df

In [208]:
# df_preview = df.head().to_string()

In [209]:
# nl_instruction = "What is the correlation between the magnitude and depth of the earthquake?"

In [210]:
# code = generate_pandas_code(nl_instruction, df_preview)
# print(code)

In [211]:
# corrected_code = correct_code(code_str=code, nl_instruction=nl_instruction, df_preview=df_preview, error_message="")
# corrected_code

In [212]:
# exec_env = {"df": df}

In [213]:
# execution_result = execute_generated_code(code_str=corrected_code, exec_env=exec_env)
# print(execution_result)

In [None]:
# examples
#"Visualize the earthquake magnitudes geo-spatially"

In [262]:
# End-to-end demo of the pipeline on the sample earthquake data. The client and
# model are passed explicitly to every call that talks to the API.
nl_instruction = "Identify outliers in the data" # input from user

df = pd.read_csv('/Users/5616bit/Documents/DS/aidev/data/sample_earthquake_data.csv') # input from user

df_preview = df.head().to_string()

code = generate_pandas_code(nl_instruction=nl_instruction, df_preview=df_preview, client=CLIENT, model=MODEL)

# Verification pass: no error has occurred yet, so the error message is empty.
corrected_code = correct_code(code_str=code, nl_instruction=nl_instruction, df_preview=df_preview, client=CLIENT, model=MODEL, error_message="")

# The task description and preview ride along in the namespace because
# execute_generated_code reads them from it when a retry is needed.
exec_env = {"df": df, "task_description": nl_instruction, "df_preview": df_preview}

execution_result, final_code = execute_generated_code(code_str=corrected_code, exec_env=exec_env) # display the final code used

question, answer, analysis, method = synthesize_final_answer(original_question=nl_instruction, execution_result=execution_result, client=CLIENT, model=MODEL)

print(f"Question: {question}")
print(f"Answer: {answer}")
print(f"Analysis: {analysis}")
print(f"Method: {method}")
print(f"Code: {final_code}")
print(f"Execution Result: {execution_result}")

Question: Identify outliers in the data
Answer: The identified outliers in the dataset are primarily characterized by their depth and magnitude values, with notable entries such as a depth of 646.456 km and a magnitude of 6.50, which are significantly higher than the typical ranges observed in the dataset.
Analysis: The presence of outliers, particularly those with extreme depth and magnitude, suggests potential geological phenomena that warrant further investigation. For instance, the outlier with a depth of 646.456 km could indicate a deep seismic event, possibly related to subduction zones or other tectonic activities. Understanding these outliers can provide insights into seismic patterns and help in assessing risks in affected regions.
Method: Outliers were identified by analyzing the depth and magnitude columns for values that significantly deviate from the mean or median, using statistical methods such as the interquartile range (IQR) or z-scores.
Code: import pandas as pd
from 