In [None]:
import os
import shutil
import tempfile
import uuid

import docker
import pandas as pd
from dotenv import load_dotenv
from langchain_experimental.agents import create_pandas_dataframe_agent
from langchain_experimental.tools.python.tool import PythonAstREPLTool, sanitize_input
from langchain_google_genai import ChatGoogleGenerativeAI
from pydantic import Field, PrivateAttr

In [None]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this is where the Google API key for the LLM comes from — confirm.
load_dotenv()

## Initialize LLM

In [None]:
# Chat model used by every later cell. Temperature 0 asks for the least random
# sampling; token count and request time are left unbounded, with two retries.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash-exp",
    temperature=0,
    max_retries=2,
    max_tokens=None,
    timeout=None,
)

In [None]:
# Smoke test: send one small translation request to the model.
system_instruction = "You are a helpful assistant that translates English to French. Translate the user sentence."
user_sentence = "I love programming."

messages = [("system", system_instruction), ("human", user_sentence)]
ai_msg = llm.invoke(messages)
ai_msg

In [None]:
# Show only the text of the reply (the previous cell displays the whole message object).
print(ai_msg.content)

## Dataframe Agent in Sandbox

In [None]:
# Titanic passenger data from the pandas documentation repository.
# The URL tracks the `main` branch, so the file contents are not pinned.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"

df = pd.read_csv(TITANIC_CSV_URL)

In [None]:
class DockerSandbox(PythonAstREPLTool):
    """Tool that runs agent-generated Python code inside a throwaway Docker container.

    The wrapped dataframe is written to a CSV file in a private, per-run
    directory on the host. Only that directory is bind-mounted into the
    container (at ``/data``), where the script reloads the CSV as ``df``
    before running the supplied code.

    Attributes are declared as pydantic fields because the base tool is a
    pydantic model; the Docker client is kept as a private attribute so it is
    not treated as a model field.
    """

    name: str = "sandbox_execution"
    description: str = "Executes Python code securely in Docker container"
    # Dataframe exposed to the sandboxed code as `df`.
    df: pd.DataFrame = Field(default_factory=pd.DataFrame)
    # Docker client; created in __init__ from the environment configuration.
    _client: docker.DockerClient = PrivateAttr(default=None)

    def __init__(self, dataframe: pd.DataFrame):
        """Create the tool for ``dataframe`` and connect to the Docker daemon.

        Args:
            dataframe: Data made available to the sandboxed code as ``df``.
        """
        super().__init__()
        self.df = dataframe
        self._client = docker.from_env()

    def _run(self, query: str, run_manager=None) -> str:
        """Execute ``query`` in a fresh container and return what it produced.

        Args:
            query: Python source code to run. It is passed through
                ``sanitize_input`` first when ``self.sanitize_input`` is set.
            run_manager: Accepted for interface compatibility; unused.

        Returns:
            The contents of ``result.txt`` if the code wrote that file into
            ``/data``; otherwise the container's combined stdout/stderr.
            If there is no code to run, or starting/running the container
            fails, a string starting with ``"Error: "`` is returned instead of
            raising.

        Note:
            ``containers.run`` is called without ``detach``, so this call
            blocks until the container exits; no wall-clock timeout is applied.
        """
        if self.sanitize_input:
            query = sanitize_input(query)

        # An empty body would leave the `try:` below without any statement,
        # which is a SyntaxError inside the container. Report it directly.
        if not query.strip():
            return "Error: no Python code was provided to execute."

        indented_query = "\n".join(f"    {line}" for line in query.splitlines())

        # Script executed in the container: reload the dataframe, then run the
        # user code, printing (rather than propagating) runtime exceptions.
        full_code = f"""
import pandas as pd
df = pd.read_csv('/data/data.csv')

try:
{indented_query}
except Exception as e:
    print(f"Error during execution: {{str(e)}}")
"""
        # Debugging: Print the final script
        print("Executing the following Python script in Docker:")
        print(full_code)

        # Private scratch directory for this run only. Mounting this instead of
        # the host's whole /tmp keeps unrelated host files out of the sandbox
        # and prevents concurrent runs from seeing each other's result files.
        run_dir = tempfile.mkdtemp(prefix="sandbox_")
        try:
            # mkdtemp creates the directory owner-only. The image may run as a
            # different (non-root) uid, so open it up for the container to read
            # the CSV and write result files. The directory name is random and
            # the directory is removed when the run finishes.
            os.chmod(run_dir, 0o777)
            self.df.to_csv(os.path.join(run_dir, "data.csv"), index=False)

            try:
                container_output = self._client.containers.run(
                    "jupyter/scipy-notebook:latest",
                    command=["python", "-c", full_code],
                    volumes={run_dir: {"bind": "/data", "mode": "rw"}},
                    mem_limit="100m",
                    # cpu_quota / cpu_period = 0.5, i.e. at most half of one CPU.
                    cpu_period=100000,
                    cpu_quota=50000,
                    network_mode="none",
                    remove=True,
                    stdout=True,
                    stderr=True,
                )

                # A result file written by the code takes precedence over stdout.
                result_path = os.path.join(run_dir, "result.txt")
                if os.path.exists(result_path):
                    with open(result_path, encoding="utf-8") as f:
                        return f.read()

                decoded_output = container_output.decode("utf-8", errors="replace")
                print("Container output:")
                print(decoded_output)
                return decoded_output

            except Exception as e:
                # Deliberately broad: any failure is handed back to the agent
                # as text so it can react, instead of aborting the agent loop.
                return f"Error: {str(e)}"
        finally:
            # Always remove the CSV and anything the container left behind.
            shutil.rmtree(run_dir, ignore_errors=True)

In [None]:
# Prefix prepended to the pandas agent's prompt. Written as explicit line pieces so
# the exact text (including the space before the first newline) is visible.
prompt = (
    "You are an expert data scientist working on a dataframe named `df`. "
    "Always use this dataframe for all your calculations and analyses. "
    "Avoid creating new dataframes or datasets unless explicitly instructed. \n"
    "You are generating Python code for execution in a secure sandbox. "
    "Double-check your syntax for common typos, such as using 'rint' instead of 'print'.\n"
    "Ensure all important and relevant results are printed."
)

In [None]:
# Docker-backed execution tool wrapping the Titanic dataframe.
sandbox = DockerSandbox(df)

# LangChain pandas agent over the same dataframe, with the custom prompt prefix.
# `allow_dangerous_code=True` is passed because the agent executes generated code.
agent = create_pandas_dataframe_agent(
    llm=llm,
    df=df,
    prefix=prompt,
    verbose=True,
    allow_dangerous_code=True,
)

In [None]:
# Redefine the tools: using our own sandbox rather than the default one
# NOTE(review): the agent's prompt was built by create_pandas_dataframe_agent before
# this swap, so it presumably still advertises the default tool's name rather than
# `sandbox_execution` — confirm the model ends up calling the sandbox.
agent.tools = [sandbox]

In [None]:
# Question for the agent; the generated analysis code runs through the agent's tools.
# ("causually" in the original question was a misspelling of "causally".)
question = (
    "What are the most (statistically) significant factors that causally affect "
    "the survival rate of passengers on the Titanic?"
)
agent.invoke(question)