In [1]:
import json
import logging
import os
import time
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Dict, List, Literal, Optional
from uuid import uuid4
import boto3
from dotenv import load_dotenv



logger = logging.getLogger()

In [2]:
# Load .env file
load_dotenv()
aws_profile = os.getenv('AWS_PROFILE')
llm_region = os.getenv('LLM_REGION')
llm_model = os.getenv('LLM_MODEL')
print(f"Using AWS Profile: {aws_profile}")
print(f"Using AWS Region: {llm_region}")
print(f"Using LLM model: {llm_model}")

Using AWS Profile: aws-prototype
Using AWS Region: us-east-1
Using LLM model: anthropic.claude-3-5-sonnet-20240620-v1:0


In [3]:
from langchain import hub
from langchain.agents import (
    AgentExecutor,
    tool,
    create_tool_calling_agent,
    create_json_chat_agent,
)
from langchain_core.tools import StructuredTool, Tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_aws import ChatBedrock
from langchain_experimental.utilities import PythonREPL
from langchain_core.messages import HumanMessage

In [4]:
from config import Config
from aws.bedrock import BedrockHandler
from tools.python_repl import PythonREPLTool
from tools.outputformatter import create_formatting_tool

In [5]:
import pandas as pd

class chartist:
    def __init__(self, region_name: str = "us-east-1"):
        """
        Initialize chartist with required configuration

        Parameters:
        region_name (str): AWS region name
        """
        logger.info("Chartist Start")
        self.config = Config()
        self.bedrock = BedrockHandler(self.config)
        self.llm = self.bedrock.get_llm()
        self.prompt = hub.pull("hwchase17/react-chat-json")
        
        self.python_repl = PythonREPLTool()
        self.tools = [
            self.python_repl.get_tool(),
            create_formatting_tool(),
            #Tool(
            #    name="parse_dataframe",
            #    func=self.parse_dataframe,
            #    description="Parses a DataFrame and returns its structure, columns, data types, summary statistics, and sample data in JSON format.",
            #    ),
        ]
        self.agent_executor = self.set_agent_executor()

        # Set up holder for visualisations
        self.visualizations = None

    def set_agent_executor(self, verbose=False, handle_parse=True):
        logger.info("Setting up chartist executor")
        try:
            agent = create_json_chat_agent(self.llm, self.tools, self.prompt)
            logger.debug("Structured chat agent created")

            executor = AgentExecutor(
                agent=agent,
                tools=self.tools,
                verbose=verbose,
                handle_parsing_errors=handle_parse,
            )
            logger.info("Agent executor successfully set up")
            return executor
        except Exception as e:
            logger.error(f"Error setting up agent executor: {str(e)}")
            raise

    def parse_dataframe(self, df: pd.DataFrame) -> dict:
        """
        Extract useful information from a DataFrame and return it as a JSON-like dictionary.

        Parameters:
        df: DataFrame to analyze.

        Returns:
        dict: A structured dictionary containing DataFrame details.
        """
        logger.info("Parsing DataFrame into structured JSON dictionary")
        try:
            try:
                summary = df.describe(include="all").to_dict()
            except ValueError as e:
                logger.warning(f"Could not generate full summary statistics: {e}")
                summary = {}
            
            data_info = {
                "columns": [{"name": col, "dtype": str(dtype)} for col, dtype in df.dtypes.items()],
                "shape": {"rows": df.shape[0], "columns": df.shape[1]},
                "summary": summary,
                "sample_data": df.head(10).to_dict(orient="records"),
            }
            logger.debug(f"DataFrame parsed: {data_info}")
            return data_info
        except Exception as e:
            logger.error(f"Error parsing DataFrame: {str(e)}")
            raise
        

    def handle_request(self, data=None, prompt=None, **kwargs):
        """
        Handle user input by deciding how to process it, based on the type of data provided.

        Parameters:
        data: Input data for the agent. Could be a DataFrame or other types.
        prompt: String prompt to guide the agent.

        Returns:
        Response generated by the agent executor.
        """
        if isinstance(data, pd.DataFrame):
            logger.info("Detected DataFrame input. Preparing to analyze...")

            # Limit the DataFrame to the top 300 rows if necessary
            max_rows = 300
            if len(data) > max_rows:
                logger.info(f"DataFrame has more than {max_rows} rows. Trimming for processing.")
                data = data.head(max_rows)

            # Convert DataFrame to JSON string
            dataframe_json = data.to_json(orient="split")

            # Construct the combined prompt - TODO: maybe instead of creating a separate prompt, we add the bits
            # to the prompt depending on what is being analysed as we do with citations.
            combined_prompt = f"""
                        User Prompt:
                        {prompt}

                        DataFrame Information (Top {len(data)} Rows):
                        The following is a JSON representation of the DataFrame. Use this to suggest suitable visualizations:
                        {data.to_json(orient="split")}

                        Instructions:
                        1. Analyze the DataFrame to understand its structure and content
                        2. Suggest meaningful visualizations based on the data and user's instructions
                        3. For each visualization, include:
                        - Clear purpose
                        - Chart type
                        - Variables used
                        - Expected insights
                        4. Use the format_visualization_output tool to structure your response
                        5. Make sure to provide concrete, specific suggestions based on the actual data

                        Remember to use the formatting tool for your final output.
                        """

            # Send to the agent executor
            try:
                logger.info("Sending prompt and DataFrame to agent executor...")
                response = self.agent_executor.invoke({"input": combined_prompt})
                return response
            except Exception as e:
                logger.error(f"Error in agent executor: {str(e)}")
                return f"Error processing visualization suggestions: {str(e)}"

        elif prompt:
            logger.info("No DataFrame detected. Using agent executor for the provided prompt...")
            try:
                response = self.agent_executor.invoke({"input": prompt})
                return response
            except Exception as e:
                logger.error(f"Error in agent executor: {str(e)}")
                return f"Error processing prompt: {str(e)}"

        else:
            logger.warning("No input provided to handle_request method.")
            return "Please provide a DataFrame or a prompt for the agent to process."
        

In [6]:
import numpy as np
def create_sample_dataset():
    """Create a sample dataset for testing"""
    np.random.seed(42)
    dates = pd.date_range(start="2023-01-01", periods=100, freq="D")

    sample_df = pd.DataFrame(
        {
            "date": dates,
            "sales": np.random.normal(1000, 100, 100),
            "customers": np.random.randint(50, 200, 100),
            "satisfaction_score": np.random.uniform(3.5, 5.0, 100),
            "category": np.random.choice(["A", "B", "C"], 100),
        }
    )

    return sample_df

In [7]:
chartist = chartist()



In [8]:
sample_df = create_sample_dataset()

In [9]:
prompt = "Analyze this dataset and suggest visualizations to explore trends and insights."
response = chartist.handle_request(data=sample_df, prompt=prompt)

In [10]:
print(response)

{'input': '\n            User Prompt:\n            Analyze this dataset and suggest visualizations to explore trends and insights.\n\n            DataFrame Information (Top 100 Rows):\n            The following is a JSON representation of the DataFrame. Use this to suggest suitable visualizations:\n            {"columns":["date","sales","customers","satisfaction_score","category"],"index":[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99],"data":[[1672531200000,1049.6714153011,186,4.352462905,"A"],[1672617600000,986.1735698829,111,3.6405121517,"A"],[1672704000000,1064.7688538101,100,4.0515737046,"A"],[1672790400000,1152.3029856408,108,3.8978035515,"A"],[1672876800000,976.5846625277,167,3.8659844651,"A"],[1672963200000,976.5863043051,145,4.9595158321

In [2]:
# Python Interpreter Tool
python_repl = PythonREPL()
repl_tool = StructuredTool.from_function(
    func=python_repl.run,
    name="python_repl",
    description="A Python shell. Use this whenever calculations are required by passing in the required python commands to solve the calculation. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`."
)