diff --git a/integrations/AgentCore/README.md b/integrations/AgentCore/README.md index 142d6d7..35f0bb7 100644 --- a/integrations/AgentCore/README.md +++ b/integrations/AgentCore/README.md @@ -9,6 +9,7 @@ AgentCore/ ├── Makefile # Setup and environment management ├── requirements.txt # Python dependencies ├── runtime_with_strands_and_fireworksai_models.ipynb # Main deployment notebook +├── strands-agent-advanced-data-analysis-code-interpreter.ipynb # Code interpreter notebook ├── strands_agents_fireworks_ai.py # AgentCore deployment script └── strands_agents_fireworks_ai_local.py # Local testing script ``` diff --git a/integrations/AgentCore/constants.py b/integrations/AgentCore/constants.py new file mode 100644 index 0000000..bf64bf9 --- /dev/null +++ b/integrations/AgentCore/constants.py @@ -0,0 +1,51 @@ +DATA_SCIENTIST_SYSTEM_PROMPT = """ + You are an expert data analysis AI assistant specializing in economic and statistical analysis. You have access to a GDP dataset containing country-level data from 2020-2025 with columns: 'Country', '2020', '2021', '2022', '2023', '2024', '2025'. + + You MUST validate all answers through code execution using the tools provided. DO NOT answer questions without using the tools. + + DATA ANALYSIS PRINCIPLES: + 1. Always load and examine the dataset before answering questions + 2. Verify all statistical calculations, trends, and comparisons through code + 3. Use pandas for data manipulation and analysis, and matplotlib for data visualization + 4. Create visualizations when helpful to illustrate findings + 5. Show your analytical work with actual code execution + 6. Validate data quality and handle missing values appropriately + + VALIDATION PRINCIPLES: + 1. When making claims about calculations or trends - write code to verify them + 2. Use execute_python to perform statistical analysis, data aggregations, and comparisons + 3. Create test scripts to validate your understanding before giving answers + 4. 
Always show your work with actual code execution + 5. If uncertain, explicitly state limitations and validate what you can + + APPROACH: + - Load the dataset and inspect it before performing analysis + - For questions about specific countries, filter and analyze the relevant data + - For trend analysis, calculate year-over-year changes programmatically + - For comparisons, compute statistics and rankings with code + - For aggregations (regional averages, totals), show the grouping and calculation logic + - Include data validation checks (null values, data types, outliers) + - Document your analytical process for transparency + - The sandbox maintains state between executions, so you can refer to previous results + - Only use the tools and python packages available + + TOOL AVAILABLE: + - execute_python: Run Python code and see output + + PYTHON PACKAGES AVAILABLE: + - pandas + - numpy + - matplotlib + + RESPONSE FORMAT: The execute_python tool returns a JSON response with: + - sessionId: The sandbox session ID + - id: Request ID + - isError: Boolean indicating if there was an error + - content: Array of content objects with type and text/data + - structuredContent: For code execution, includes stdout, stderr, exitCode, executionTime + + For successful code execution, the output will be in content[0].text and also in structuredContent.stdout. + Check isError field to see if there was an error. + + Be thorough, accurate, and always validate your answers with code. Provide clear, data-driven insights backed by actual calculations. 
+ """ \ No newline at end of file diff --git a/integrations/AgentCore/images/code-interpreter.png b/integrations/AgentCore/images/code-interpreter.png new file mode 100644 index 0000000..0e8da54 Binary files /dev/null and b/integrations/AgentCore/images/code-interpreter.png differ diff --git a/integrations/AgentCore/requirements.txt b/integrations/AgentCore/requirements.txt index c7f30f8..8c6fc9a 100644 --- a/integrations/AgentCore/requirements.txt +++ b/integrations/AgentCore/requirements.txt @@ -8,4 +8,6 @@ aws-opentelemetry-distro-genai-beta setuptools bedrock-agentcore jupyter -bedrock-agentcore-starter-toolkit \ No newline at end of file +bedrock-agentcore-starter-toolkit +pandas +kagglehub[pandas-datasets] \ No newline at end of file diff --git a/integrations/AgentCore/strands-agent-advanced-data-analysis-code-interpreter.ipynb b/integrations/AgentCore/strands-agent-advanced-data-analysis-code-interpreter.ipynb new file mode 100644 index 0000000..98d3153 --- /dev/null +++ b/integrations/AgentCore/strands-agent-advanced-data-analysis-code-interpreter.ipynb @@ -0,0 +1,1106 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "746626f6b18e1c8c", + "metadata": {}, + "source": [ + "## Data Analysis agent using FireworksAI + Amazon AgentCore Bedrock Code Interpreter\n", + "\n", + "This tutorial demonstrates how to create an AI agent that performs advanced data analysis through code execution using Python. We use Amazon Bedrock AgentCore Code Interpreter to run code that is generated by the LLM running on FireworksAI.\n", + "\n", + "This tutorial is an adaptation of the [agentcore data analysis tutorial](https://github.com/awslabs/amazon-bedrock-agentcore-samples/blob/main/01-tutorials/05-AgentCore-tools/01-Agent-Core-code-interpreter/03-advanced-data-analysis-with-agent-using-code-interpreter/strands-agent-advanced-data-analysis-code-interpreter.ipynb)\n", + "\n", + "We will demonstrate how to use AgentCore Bedrock Code Interpreter to:\n", + "1. 
Set up a sandbox environment\n", + "2. Configure a Strands-based agent that performs advanced data analysis by generating code based on the user query\n", + "3. Run top OSS coding models on FireworksAI (Qwen 3 Coder, Deepseek, Kimi, etc.)\n", + "4. Execute code in a sandbox environment using Code Interpreter\n", + "5. Display the results back to the user\n", + "\n", + "## Prerequisites\n", + "- AWS account with Bedrock AgentCore Code Interpreter access\n", + "- You have the necessary IAM permissions to create and manage code interpreter resources\n", + "- Required Python packages installed (including boto3, bedrock-agentcore & strands)\n", + "- IAM role should have permissions to invoke models on Amazon Bedrock\n", + "- FireworksAI API access key; if you don't have one, get one [here](https://app.fireworks.ai/settings/users/api-keys)\n", + "\n", + "## Your IAM execution role should have the following IAM policy attached" + ] + }, + { + "cell_type": "markdown", + "id": "f323388415caf3f7", + "metadata": {}, + "source": [ + "~~~json\n", + "{\n", + "\"Version\": \"2012-10-17\",\n", + "\"Statement\": [\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"bedrock-agentcore:CreateCodeInterpreter\",\n", + " \"bedrock-agentcore:StartCodeInterpreterSession\",\n", + " \"bedrock-agentcore:InvokeCodeInterpreter\",\n", + " \"bedrock-agentcore:StopCodeInterpreterSession\",\n", + " \"bedrock-agentcore:DeleteCodeInterpreter\",\n", + " \"bedrock-agentcore:ListCodeInterpreters\",\n", + " \"bedrock-agentcore:GetCodeInterpreter\"\n", + " ],\n", + " \"Resource\": \"*\"\n", + " },\n", + " {\n", + " \"Effect\": \"Allow\",\n", + " \"Action\": [\n", + " \"logs:CreateLogGroup\",\n", + " \"logs:CreateLogStream\",\n", + " \"logs:PutLogEvents\"\n", + " ],\n", + " \"Resource\": \"arn:aws:logs:*:*:log-group:/aws/bedrock-agentcore/code-interpreter*\"\n", + " }\n", + "]\n", + "}\n", + "~~~" + ] + }, + { + "cell_type": "markdown", + "id": "226b2cb86ff18d9c", + "metadata": {}, + "source": [ + "## How it
works\n", + "\n", + "The code execution sandbox enables agents to safely process user queries by creating an isolated environment with a code interpreter, shell, and file system. After a Large Language Model helps with tool selection, code is executed within this session, before being returned to the user or Agent for synthesis.\n", + "\n", + "![architecture local](images/code-interpreter.png)" + ] + }, + { + "cell_type": "markdown", + "id": "859482709c77b03d", + "metadata": {}, + "source": [ + "## 1. Setting Up the Environment\n", + "\n", + "First, let's import the necessary libraries and initialize our Code Interpreter client.\n", + "\n", + "The default session timeout is 900 seconds (15 minutes). However, we start the session with a slightly longer timeout of 1200 seconds (20 minutes), since we will perform detailed analysis on our data." + ] + }, + { + "cell_type": "code", + "id": "a13da423bac8ddd5", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:31.807246Z", + "start_time": "2025-09-30T02:22:31.436714Z" + } + }, + "source": "!make setup", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Setting up local environment...\r\n", + "'uv' is already installed.\r\n", + "Virtual environment already exists.\r\n", + "Installing dependencies...\r\n", + "uv pip install -r requirements.txt\r\n", + "\u001B[2mAudited \u001B[1m13 packages\u001B[0m \u001B[2min 86ms\u001B[0m\u001B[0m\r\n" + ] + } + ], + "execution_count": 1 + }, + { + "cell_type": "code", + "id": "bb006310a96750c", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T04:32:35.995555Z", + "start_time": "2025-09-30T04:32:33.585005Z" + } + }, + "source": [ + "from bedrock_agentcore.tools.code_interpreter_client import CodeInterpreter\n", + "from strands import Agent, tool\n", + "import json\n", + "from typing import Dict, Any\n", + "from strands.models.openai import OpenAIModel\n", + "from dotenv import load_dotenv\n", + "import os\n", + "\n", +
"load_dotenv()\n", + "\n", + "FIREWORKS_API_KEY = os.getenv(\"FIREWORKS_API_KEY\")\n", + "\n", + "assert FIREWORKS_API_KEY is not None, \"FIREWORKS_API_KEY not found in environment variables\"\n", + "\n", + "# Initialize the Code Interpreter within a supported AWS region.\n", + "code_client = CodeInterpreter('us-west-2')\n", + "code_client.start(session_timeout_seconds=1200)" + ], + "outputs": [ + { + "data": { + "text/plain": [ + "'01K6CE9JXACRSTCVNDDHV0GMA8'" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "execution_count": 46 + }, + { + "cell_type": "markdown", + "id": "dbd02b57bd10dda2", + "metadata": {}, + "source": [ + "## 2. Downloading data from Kaggle\n", + "\n", + "We will be using an open source dataset in kaggle which has GDP by country for the years 2020-2025. The link to the dataset is [here](https://www.kaggle.com/datasets/codebynadiia/gdp-per-country-20202025)" + ] + }, + { + "cell_type": "code", + "id": "bef3821f290a589b", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:35.122859Z", + "start_time": "2025-09-30T02:22:34.086056Z" + } + }, + "source": [ + "import kagglehub\n", + "from kagglehub import KaggleDatasetAdapter\n", + "\n", + "df = kagglehub.dataset_load(\n", + " KaggleDatasetAdapter.PANDAS,\n", + " handle=\"codebynadiia/gdp-per-country-20202025\",\n", + " path=\"2020-2025.csv\"\n", + ")\n", + "\n", + "# Create the target directory first; to_csv fails if data/ does not exist (e.g. on a fresh checkout)\n", + "os.makedirs(\"data\", exist_ok=True)\n", + "df.to_csv(\"data/gdp_data.csv\", index=False)\n", + "\n", + "# Drop rows with NaN for the local preview only; the CSV written above still contains them\n", + "df = df.dropna()\n", + "\n", + "print(f\"Dataset schema {df.columns}\")\n", + "print()\n", + "print(\"First 5 records:\", df.head())" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset schema Index(['Country', '2020', '2021', '2022', '2023', '2024', '2025'], dtype='object')\n", + "\n", + "First 5 records: Country 2020 2021 2022 2023 2024 \\\n", + "1 Albania 15271 18086.0 19185.0 23388.0 27259.0 \n", + "2 Algeria 164774 185850.0
225709.0 247789.0 264913.0 \n", + "3 Andorra 2885 3325.0 3376.0 3786.0 4038.0 \n", + "4 Angola 66521 84375.0 142442.0 109764.0 115946.0 \n", + "5 Antigua and Barbuda 1412 1602.0 1867.0 2006.0 2225.0 \n", + "\n", + " 2025 \n", + "1 28372.0 \n", + "2 268885.0 \n", + "3 4035.0 \n", + "4 113343.0 \n", + "5 2373.0 \n" + ] + } + ], + "execution_count": 3 + }, + { + "cell_type": "markdown", + "id": "b5fc91e0fb83fa05", + "metadata": {}, + "source": [ + "## 3. Preparing Files for Sandbox Environment\n", + "\n", + "We'll create a structure that defines the files we want to create in the sandbox environment." + ] + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:35.135732Z", + "start_time": "2025-09-30T02:22:35.134004Z" + } + }, + "cell_type": "code", + "source": [ + "def read_file(file_path: str) -> str:\n", + "    \"\"\"Return the UTF-8 text of file_path; on any error, print it and return an empty string.\"\"\"\n", + "    try:\n", + "        with open(file_path, 'r', encoding='utf-8') as handle:\n", + "            content = handle.read()\n", + "    except FileNotFoundError:\n", + "        print(f\"Error: The file '{file_path}' was not found.\")\n", + "        content = \"\"\n", + "    except Exception as err:\n", + "        print(f\"An error occurred: {err}\")\n", + "        content = \"\"\n", + "    return content\n" + ], + "id": "90b483d820eb836b", + "outputs": [], + "execution_count": 4 + }, + { + "cell_type": "code", + "id": "da44fb745b84c6ca", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:35.140621Z", + "start_time": "2025-09-30T02:22:35.139159Z" + } + }, + "source": [ + "files_to_create = [\n", + " {\n", + " \"path\": \"data/gdp_data.csv\",\n", + " \"text\": read_file(\"data/gdp_data.csv\")\n", + " }]" + ], + "outputs": [], + "execution_count": 5 + }, + { + "cell_type": "markdown", + "id": "f055bea34c93279", + "metadata": {}, + "source": [ + "## 4. Creating Helper Function for Tool Invocation\n", + "\n", + "This helper function will make it easier to call sandbox tools and handle their responses.
Within an active session, you can execute code in supported languages (Python, JavaScript), access libraries based on your dependencies configuration, generate visualizations, and maintain state between executions." + ] + }, + { + "cell_type": "code", + "id": "a74164c54b3b8ad5", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:35.145013Z", + "start_time": "2025-09-30T02:22:35.143494Z" + } + }, + "source": [ + "def call_tool(tool_name: str, arguments: Dict[str, Any]) -> str:\n", + "    \"\"\"Invoke a sandbox tool and return its first result as a JSON string.\n", + "\n", + "    Args:\n", + "        tool_name (str): Name of the tool to invoke\n", + "        arguments (Dict[str, Any]): Arguments to pass to the tool\n", + "\n", + "    Returns:\n", + "        str: JSON-encoded `result` of the first event in the response stream, or a\n", + "            JSON error payload (`isError` true) when the stream yields no events\n", + "    \"\"\"\n", + "    response = code_client.invoke(tool_name, arguments)\n", + "    # Only the first streamed event is returned; later events are not read\n", + "    for event in response[\"stream\"]:\n", + "        return json.dumps(event[\"result\"])\n", + "    # Empty stream: return an explicit error payload instead of falling through to None\n", + "    return json.dumps({\n", + "        \"content\": [{\"type\": \"text\", \"text\": f\"Tool '{tool_name}' returned no result\"}],\n", + "        \"isError\": True\n", + "    })" + ], + "outputs": [], + "execution_count": 6 + }, + { + "cell_type": "markdown", + "id": "cd33790785ac084a", + "metadata": {}, + "source": [ + "## 5. Write data file to Code Sandbox\n", + "\n", + "Now we'll write our data file into the sandbox environment and verify they were created successfully."
+ ] + }, + { + "cell_type": "code", + "id": "380afae3a5ba4934", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:35.602108Z", + "start_time": "2025-09-30T02:22:35.149105Z" + } + }, + "source": [ + "# Write files to sandbox\n", + "writing_files = call_tool(\"writeFiles\", {\"content\": files_to_create})\n", + "print(\"Writing files result:\")\n", + "print(writing_files)\n", + "\n", + "# Verify files were created\n", + "listing_files = call_tool(\"listFiles\", {\"path\": \"\"})\n", + "print(\"\\nFiles in sandbox:\")\n", + "print(listing_files)" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Writing files result:\n", + "{\"content\": [{\"type\": \"text\", \"text\": \"Successfully wrote all 1 files\"}], \"isError\": false}\n", + "\n", + "Files in sandbox:\n", + "{\"content\": [{\"type\": \"resource_link\", \"uri\": \"file:///log\", \"name\": \"log\", \"description\": \"Directory\"}, {\"type\": \"resource_link\", \"uri\": \"file:///data\", \"name\": \"data\", \"description\": \"Directory\"}, {\"type\": \"resource_link\", \"uri\": \"file:///.ipython\", \"name\": \".ipython\", \"description\": \"Directory\"}], \"isError\": false}\n" + ] + } + ], + "execution_count": 7 + }, + { + "cell_type": "markdown", + "id": "640eae7a52ce9f8d", + "metadata": {}, + "source": [ + "## 6. Perform Advanced Analysis using Strands based Agent\n", + "\n", + "Now we will configure an agent to perform data analysis on the data file that we uploaded into the sandbox(above)" + ] + }, + { + "cell_type": "markdown", + "id": "5a3b068faf4a5eaa", + "metadata": {}, + "source": [ + "### 6.1 System Prompt Definition\n", + "Define the behavior and capabilities of the AI assistant. We instruct our assistant to always validate answers through code execution and data based reasoning." 
+ ] + }, + { + "cell_type": "code", + "id": "6e6830a170b45ce3", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:35.609762Z", + "start_time": "2025-09-30T02:22:35.607229Z" + } + }, + "source": [ + "from constants import DATA_SCIENTIST_SYSTEM_PROMPT\n", + "print(DATA_SCIENTIST_SYSTEM_PROMPT)" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " You are an expert data analysis AI assistant specializing in economic and statistical analysis. You have access to a GDP dataset containing country-level data from 2020-2025 with columns: 'Country', '2020', '2021', '2022', '2023', '2024', '2025'.\n", + " \n", + " You MUST validate all answers through code execution using the tools provided. DO NOT answer questions without using the tools.\n", + " \n", + " DATA ANALYSIS PRINCIPLES:\n", + " 1. Always load and examine the dataset before answering questions\n", + " 2. Verify all statistical calculations, trends, and comparisons through code\n", + " 3. Use pandas for data manipulation and analysis, and matplotlib for data visualization\n", + " 4. Create visualizations when helpful to illustrate findings\n", + " 5. Show your analytical work with actual code execution\n", + " 6. Validate data quality and handle missing values appropriately\n", + " \n", + " VALIDATION PRINCIPLES:\n", + " 1. When making claims about calculations or trends - write code to verify them\n", + " 2. Use execute_python to perform statistical analysis, data aggregations, and comparisons\n", + " 3. Create test scripts to validate your understanding before giving answers\n", + " 4. Always show your work with actual code execution\n", + " 5. 
If uncertain, explicitly state limitations and validate what you can\n", + " \n", + " APPROACH:\n", + " - Load the dataset and inspect it before performing analysis\n", + " - For questions about specific countries, filter and analyze the relevant data\n", + " - For trend analysis, calculate year-over-year changes programmatically\n", + " - For comparisons, compute statistics and rankings with code\n", + " - For aggregations (regional averages, totals), show the grouping and calculation logic\n", + " - Include data validation checks (null values, data types, outliers)\n", + " - Document your analytical process for transparency\n", + " - The sandbox maintains state between executions, so you can refer to previous results\n", + " - Only use the tools and python packages available\n", + " \n", + " TOOL AVAILABLE:\n", + " - execute_python: Run Python code and see output\n", + " \n", + " PYTHON PACKAGES AVAILABLE:\n", + " - pandas\n", + " - numpy\n", + " - matplotlib\n", + " \n", + " RESPONSE FORMAT: The execute_python tool returns a JSON response with:\n", + " - sessionId: The sandbox session ID\n", + " - id: Request ID\n", + " - isError: Boolean indicating if there was an error\n", + " - content: Array of content objects with type and text/data\n", + " - structuredContent: For code execution, includes stdout, stderr, exitCode, executionTime\n", + " \n", + " For successful code execution, the output will be in content[0].text and also in structuredContent.stdout.\n", + " Check isError field to see if there was an error.\n", + " \n", + " Be thorough, accurate, and always validate your answers with code. Provide clear, data-driven insights backed by actual calculations.\n", + " \n" + ] + } + ], + "execution_count": 8 + }, + { + "cell_type": "markdown", + "id": "87157a0ea835ab5", + "metadata": {}, + "source": [ + "### 6.2 Code Execution Tool Definition\n", + "Next we define the function as tool that will be used by the Agent as tool, to run code in the code sandbox. 
We use the @tool decorator to annotate the function as a custom tool for the Agent.\n", + "\n", + "Within an active code interpreter session, you can execute code in supported languages (Python, JavaScript), access libraries based on your dependencies configuration, generate visualizations, and maintain state between executions." + ] + }, + { + "cell_type": "code", + "id": "750472cd96e873c9", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:35.622425Z", + "start_time": "2025-09-30T02:22:35.615947Z" + } + }, + "source": [ + "#Define and configure the code interpreter tool\n", + "@tool\n", + "def execute_python(code: str, description: str = \"\") -> str:\n", + "    \"\"\"Execute Python code in the sandbox.\n", + "\n", + "    Args:\n", + "        code: Python source to run in the code interpreter session\n", + "        description: Optional note prepended to the code as `#` comment lines\n", + "\n", + "    Returns:\n", + "        JSON-encoded `result` of the first event in the response stream, or a\n", + "        JSON error payload (`isError` true) when the stream yields no events\n", + "    \"\"\"\n", + "\n", + "    if description:\n", + "        # Prefix every line with '#': a multi-line description would otherwise run as code\n", + "        header = \"\\n\".join(f\"# {line}\" for line in description.splitlines())\n", + "        code = f\"{header}\\n{code}\"\n", + "\n", + "    #Print generated Code to be executed\n", + "    print(f\"\\n Generated Code: {code}\")\n", + "\n", + "\n", + "    # Call the Invoke method and execute the generated code, within the initialized code interpreter session\n", + "    response = code_client.invoke(\"executeCode\", {\n", + "        \"code\": code,\n", + "        \"language\": \"python\",\n", + "        \"clearContext\": False\n", + "    })\n", + "    # Only the first streamed event is returned; later events are not read\n", + "    for event in response[\"stream\"]:\n", + "        return json.dumps(event[\"result\"])\n", + "    # Empty stream: hand the agent an explicit error payload instead of None\n", + "    return json.dumps({\n", + "        \"content\": [{\"type\": \"text\", \"text\": \"Code execution returned no result\"}],\n", + "        \"isError\": True\n", + "    })" + ], + "outputs": [], + "execution_count": 9 + }, + { + "cell_type": "markdown", + "id": "e9bc47b82755730d", + "metadata": {}, + "source": [ + "### 6.3 Agent Configuration\n", + "We create and configure an agent using the Strands SDK.
We provide it with the system prompt and the tool we defined above to execute the generated code.\n", + "\n", + "We use [Qwen 3 Coder 480B](https://app.fireworks.ai/models/fireworks/qwen3-coder-480b-a35b-instruct), a SOTA OSS model from the Qwen family." + ] + }, + { + "cell_type": "code", + "id": "14c5e8f18b70dc01", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:22:36.501763Z", + "start_time": "2025-09-30T02:22:36.497637Z" + } + }, + "source": [ + "model = OpenAIModel(\n", + " client_args={\n", + " \"api_key\": FIREWORKS_API_KEY,\n", + " \"base_url\": \"https://api.fireworks.ai/inference/v1\",\n", + " },\n", + " model_id=\"accounts/fireworks/models/qwen3-coder-480b-a35b-instruct\",\n", + " params={\n", + " \"max_tokens\": 5000,\n", + " \"temperature\": 0.0,\n", + " }\n", + ")\n", + "\n", + "agent=Agent(\n", + " model=model,\n", + " tools=[execute_python],\n", + " system_prompt=DATA_SCIENTIST_SYSTEM_PROMPT,\n", + " callback_handler=None\n", + ")" + ], + "outputs": [], + "execution_count": 10 + }, + { + "cell_type": "markdown", + "id": "25693e10aa1e5689", + "metadata": {}, + "source": [ + "## 7.
Agent Invocation and Response Processing\n", + "We invoke the agent with our query and process the agent's response.\n", + "\n", + "\n", + "Note: the cell below uses a top-level `async for`, so it must run in an async-capable environment such as a Jupyter/IPython kernel." + ] + }, + { + "cell_type": "markdown", + "id": "bddfa7cb97ead950", + "metadata": {}, + "source": [ + "### 7.1 Query to perform Exploratory Data Analysis (EDA)" + ] + }, + { + "cell_type": "markdown", + "id": "89f98916e2cc3627", + "metadata": {}, + "source": [ + "Let's start with a query which instructs the agent to perform exploratory data analysis on the data file in the code sandbox environment" + ] + }, + { + "cell_type": "code", + "id": "7370ff964d06a1cd", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T02:24:31.543275Z", + "start_time": "2025-09-30T02:22:41.040563Z" + } + }, + "source": [ + "query = (\"Load the file 'gdp_data.csv' and perform some simple exploratory data analysis (EDA) on it. Tell me about distributions and outlier values. \"\n", + " \"Prepare a short final report with your findings.\")\n", + "\n", + "# Invoke the agent asynchronously and stream the response\n", + "response_text = \"\"\n", + "async for event in agent.stream_async(query):\n", + " if \"data\" in event:\n", + " # Stream text response\n", + " chunk = event[\"data\"]\n", + " response_text += chunk\n", + " print(chunk, end=\"\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I'll help you load and analyze the GDP dataset.
Let me start by loading the file and performing exploratory data analysis.\n", + "\n", + "\n", + " Generated Code: # Loading the GDP dataset and displaying basic information\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Load the GDP dataset\n", + "df = pd.read_csv('gdp_data.csv')\n", + "\n", + "# Display basic information about the dataset\n", + "print(\"Dataset Info:\")\n", + "print(df.info())\n", + "print(\"\\nFirst few rows:\")\n", + "print(df.head())\n", + "print(\"\\nDataset shape:\", df.shape)\n", + "I apologize for the error. It seems the file 'gdp_data.csv' doesn't exist in the current directory. Let me check what files are available for us to work with.\n", + "\n", + "\n", + " Generated Code: # Checking available files in the current directory\n", + "import os\n", + "\n", + "# List all files in the current directory\n", + "files = os.listdir('.')\n", + "print(\"Available files:\")\n", + "for file in files:\n", + " print(file)\n", + "I see that there's a 'data' directory. Let me check what's inside that directory, as the GDP data might be stored there.\n", + "\n", + "\n", + " Generated Code: # Checking files in the data directory\n", + "import os\n", + "\n", + "# List files in the data directory\n", + "try:\n", + " files = os.listdir('data')\n", + " print(\"Files in 'data' directory:\")\n", + " for file in files:\n", + " print(file)\n", + "except FileNotFoundError:\n", + " print(\"No 'data' directory found\")\n", + "Great! I found the GDP data file in the 'data' directory. 
Let me load it and perform the exploratory data analysis.\n", + "\n", + "\n", + " Generated Code: # Loading the GDP dataset and displaying basic information\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Load the GDP dataset from the data directory\n", + "df = pd.read_csv('data/gdp_data.csv')\n", + "\n", + "# Display basic information about the dataset\n", + "print(\"Dataset Info:\")\n", + "print(df.info())\n", + "print(\"\\nFirst few rows:\")\n", + "print(df.head())\n", + "print(\"\\nDataset shape:\", df.shape)\n", + "print(\"\\nColumn names:\")\n", + "print(df.columns.tolist())\n", + "Now let me perform a more detailed exploratory data analysis, including checking for missing values, basic statistics, and distributions.\n", + "\n", + "\n", + " Generated Code: # Checking missing values and basic statistics\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Load the GDP dataset\n", + "df = pd.read_csv('data/gdp_data.csv')\n", + "\n", + "# Check for missing values\n", + "print(\"Missing values per column:\")\n", + "print(df.isnull().sum())\n", + "print(\"\\nPercentage of missing values per column:\")\n", + "print((df.isnull().sum() / len(df)) * 100)\n", + "\n", + "# Basic statistics for numerical columns\n", + "print(\"\\nBasic statistics for GDP values:\")\n", + "gdp_columns = ['2020', '2021', '2022', '2023', '2024', '2025']\n", + "print(df[gdp_columns].describe())\n", + "\n", + "# Check data types\n", + "print(\"\\nData types:\")\n", + "print(df.dtypes)\n", + "Let me now identify which countries have missing values and examine the distribution of GDP values more closely, including visualization.\n", + "\n", + "\n", + " Generated Code: # Identifying countries with missing values and potential outliers\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Load the GDP dataset\n", + "df = 
pd.read_csv('data/gdp_data.csv')\n", + "\n", + "# Identify countries with missing values\n", + "print(\"Countries with missing GDP data:\")\n", + "missing_data = df[df.isnull().any(axis=1)]\n", + "print(missing_data[['Country'] + [col for col in df.columns if col != 'Country' and df[col].isnull().any()]])\n", + "\n", + "# Check for potential outliers using IQR method for 2025 data (most complete year)\n", + "gdp_2025 = df['2025'].dropna()\n", + "\n", + "Q1 = gdp_2025.quantile(0.25)\n", + "Q3 = gdp_2025.quantile(0.75)\n", + "IQR = Q3 - Q1\n", + "\n", + "lower_bound = Q1 - 1.5 * IQR\n", + "upper_bound = Q3 + 1.5 * IQR\n", + "\n", + "outliers = df[(df['2025'] < lower_bound) | (df['2025'] > upper_bound)]['Country'].tolist()\n", + "print(f\"\\nPotential outliers in 2025 GDP data (using IQR method):\")\n", + "print(outliers)\n", + "\n", + "# Let's also look at the top 10 countries by GDP in 2025\n", + "print(f\"\\nTop 10 countries by GDP in 2025:\")\n", + "top_10 = df.nlargest(10, '2025')[['Country', '2025']]\n", + "print(top_10)\n", + "Let me create some visualizations to better understand the GDP distributions and trends.\n", + "\n", + "\n", + " Generated Code: # Creating visualizations for GDP distributions\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Load the GDP dataset\n", + "df = pd.read_csv('data/gdp_data.csv')\n", + "\n", + "# Set up the plotting style\n", + "plt.style.use('seaborn-v0_8-darkgrid')\n", + "fig, axes = plt.subplots(2, 3, figsize=(18, 12))\n", + "fig.suptitle('GDP Distribution Across Years', fontsize=16)\n", + "\n", + "# Plot histograms for each year\n", + "years = ['2020', '2021', '2022', '2023', '2024', '2025']\n", + "axes = axes.flatten()\n", + "\n", + "for i, year in enumerate(years):\n", + " data = df[year].dropna()\n", + " axes[i].hist(data, bins=50, color=f'C{i}', alpha=0.7)\n", + " axes[i].set_title(f'GDP Distribution - {year}')\n", + " axes[i].set_xlabel('GDP Value')\n", + " 
axes[i].set_ylabel('Frequency')\n", + " axes[i].ticklabel_format(style='scientific', axis='x', scilimits=(0,0))\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Create a boxplot to show the distribution and outliers across years\n", + "plt.figure(figsize=(12, 8))\n", + "gdp_data = [df[year].dropna() for year in years]\n", + "plt.boxplot(gdp_data, labels=years)\n", + "plt.title('GDP Distribution Across Years (Boxplot)')\n", + "plt.ylabel('GDP Value')\n", + "plt.yscale('log') # Using log scale to better visualize the wide range of values\n", + "plt.show()\n", + "\n", + "# Plot the top 10 countries by 2025 GDP\n", + "top_10 = df.nlargest(10, '2025')\n", + "plt.figure(figsize=(12, 8))\n", + "plt.barh(top_10['Country'], top_10['2025'], color='skyblue')\n", + "plt.title('Top 10 Countries by GDP in 2025')\n", + "plt.xlabel('GDP Value')\n", + "plt.ticklabel_format(style='scientific', axis='x', scilimits=(0,0))\n", + "plt.gca().invert_yaxis() # To have the highest at the top\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Show basic statistics\n", + "print(\"Summary of GDP distributions:\")\n", + "for year in years:\n", + " data = df[year].dropna()\n", + " print(f\"{year}: Mean = {data.mean():.2f}, Median = {data.median():.2f}, Std = {data.std():.2f}\")\n", + "Let me also analyze the GDP growth trends for a better understanding of the data.\n", + "\n", + "\n", + " Generated Code: # Analyzing GDP growth trends\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Load the GDP dataset\n", + "df = pd.read_csv('data/gdp_data.csv')\n", + "\n", + "# Calculate year-over-year growth rates\n", + "gdp_columns = ['2020', '2021', '2022', '2023', '2024', '2025']\n", + "df_growth = df[['Country']].copy()\n", + "\n", + "# Calculate growth rates\n", + "for i in range(1, len(gdp_columns)):\n", + " current_year = gdp_columns[i]\n", + " previous_year = gdp_columns[i-1]\n", + " growth_rate_col = 
f'{current_year}_growth'\n", + " \n", + " # Calculate growth rate: (current - previous) / previous * 100\n", + " df_growth[growth_rate_col] = ((df[current_year] - df[previous_year]) / df[previous_year]) * 100\n", + "\n", + "# Display average growth rates by year\n", + "growth_columns = [col for col in df_growth.columns if 'growth' in col]\n", + "print(\"Average GDP Growth Rates by Year:\")\n", + "for col in growth_columns:\n", + " avg_growth = df_growth[col].mean()\n", + " print(f\"{col}: {avg_growth:.2f}%\")\n", + "\n", + "# Identify countries with the highest and lowest growth rates in 2025\n", + "print(\"\\nCountries with highest growth rates in 2025:\")\n", + "top_growth = df_growth.nlargest(5, '2025_growth')[['Country', '2025_growth']]\n", + "print(top_growth)\n", + "\n", + "print(\"\\nCountries with lowest growth rates in 2025:\")\n", + "bottom_growth = df_growth.nsmallest(5, '2025_growth')[['Country', '2025_growth']]\n", + "print(bottom_growth)\n", + "\n", + "# Check for any negative growth rates\n", + "negative_growth = df_growth[df_growth['2025_growth'] < 0]\n", + "print(f\"\\nNumber of countries with negative GDP growth in 2025: {len(negative_growth)}\")\n", + "\n", + "# Calculate overall GDP trends\n", + "print(\"\\nOverall GDP trends (sum of all countries by year):\")\n", + "total_gdp_by_year = {}\n", + "for year in gdp_columns:\n", + " total_gdp_by_year[year] = df[year].sum()\n", + " \n", + "for year, total in total_gdp_by_year.items():\n", + " print(f\"{year}: {total:,.0f}\")\n", + "\n", + "# Calculate global GDP growth rates\n", + "years = list(total_gdp_by_year.keys())\n", + "global_growth_rates = {}\n", + "for i in range(1, len(years)):\n", + " current_year = years[i]\n", + " previous_year = years[i-1]\n", + " growth_rate = ((total_gdp_by_year[current_year] - total_gdp_by_year[previous_year]) / total_gdp_by_year[previous_year]) * 100\n", + " global_growth_rates[current_year] = growth_rate\n", + " print(f\"Global GDP growth rate 
{previous_year}-{current_year}: {growth_rate:.2f}%\")\n", + "Now let me create a comprehensive final report summarizing all the findings from the exploratory data analysis.\n", + "\n", + "\n", + " Generated Code: # Generating final EDA report\n", + "import pandas as pd\n", + "\n", + "# Load the GDP dataset\n", + "df = pd.read_csv('data/gdp_data.csv')\n", + "\n", + "# Generate final report\n", + "report = \"\"\"\n", + "# GDP Dataset Exploratory Data Analysis Report\n", + "\n", + "## Dataset Overview\n", + "- **Dataset Shape**: 196 countries, 7 columns (Country name + 6 years of GDP data)\n", + "- **Time Period**: 2020-2025\n", + "- **Data Types**: Country names (object), GDP values (int64 and float64)\n", + "\n", + "## Data Quality Assessment\n", + "- **Missing Values**: \n", + " - 2020: 0 missing values\n", + " - 2021-2023: 2 missing values each (1.02%)\n", + " - 2024: 4 missing values (2.04%)\n", + " - 2025: 7 missing values (3.57%)\n", + " \n", + "- **Countries with Missing Data**:\n", + " - Afghanistan (missing 2024-2025 data)\n", + " - Eritrea (missing all data except country name)\n", + " - Lebanon (missing 2025 data)\n", + " - Pakistan (missing 2025 data)\n", + " - Sri Lanka (missing 2025 data)\n", + " - Syria (missing all GDP data)\n", + " - Palestine (missing 2024-2025 data)\n", + "\n", + "## GDP Distribution Analysis\n", + "- **Wide Range of Values**: GDP values range from just over 50 to over 30 million (USD)\n", + "- **High Variability**: Standard deviation is much larger than the mean, indicating high disparity between countries\n", + "- **Skewed Distribution**: Most countries have relatively low GDP values, with a few countries having extremely high values\n", + "- **Top Economies (2025)**:\n", + " 1. United States: ~30.5 million\n", + " 2. China: ~19.2 million\n", + " 3. Germany: ~4.7 million\n", + " 4. India: ~4.2 million\n", + " 5. 
Japan: ~4.2 million\n", + "\n", + "## Outliers\n", + "- Using the IQR method, 22 countries were identified as outliers in 2025, including major economies like the US, China, Japan, Germany, etc.\n", + "- These \"outliers\" are actually the largest economies and represent expected extreme values rather than data errors\n", + "\n", + "## GDP Growth Trends\n", + "- **Average Growth Rates**:\n", + " - 2020-2021: 13.86%\n", + " - 2021-2022: 8.46%\n", + " - 2022-2023: 6.41%\n", + " - 2023-2024: 5.72%\n", + " - 2024-2025: 4.43%\n", + " \n", + "- **Global GDP Growth**:\n", + " - 2020-2021: 14.00%\n", + " - 2021-2022: 4.19%\n", + " - 2022-2023: 4.40%\n", + " - 2023-2024: 3.85%\n", + " - 2024-2025: 2.45%\n", + " \n", + "- **Highest Growth (2025)**: Burundi (42.21%), Haiti (27.90%), Malawi (18.33%)\n", + "- **Negative Growth (2025)**: 23 countries, including South Sudan (-26.28%), Ethiopia (-17.93%), Iran (-15.03%)\n", + "\n", + "## Key Insights\n", + "1. The dataset shows a clear pattern of recovery and growth following the economic impacts of 2020\n", + "2. Growth rates have been gradually declining since 2021, which is typical for post-recovery periods\n", + "3. There's significant disparity in GDP values between countries, with a few major economies dominating global GDP\n", + "4. Some countries continue to face economic challenges, as evidenced by negative growth rates\n", + "5. The data quality is generally good, with only a small percentage of missing values\n", + "\n", + "## Recommendations for Further Analysis\n", + "1. Investigate the reasons behind missing data for specific countries\n", + "2. Analyze regional patterns by grouping countries by geographic regions\n", + "3. Examine per-capita GDP to account for population differences\n", + "4. 
Compare growth patterns between developed and developing economies\n", + "\"\"\"\n", + "\n", + "print(report)\n", + "# GDP Dataset Exploratory Data Analysis Report\n", + "\n", + "## Dataset Overview\n", + "- **Dataset Shape**: 196 countries, 7 columns (Country name + 6 years of GDP data)\n", + "- **Time Period**: 2020-2025\n", + "- **Data Types**: Country names (object), GDP values (int64 and float64)\n", + "\n", + "## Data Quality Assessment\n", + "- **Missing Values**: \n", + " - 2020: 0 missing values\n", + " - 2021-2023: 2 missing values each (1.02%)\n", + " - 2024: 4 missing values (2.04%)\n", + " - 2025: 7 missing values (3.57%)\n", + " \n", + "- **Countries with Missing Data**:\n", + " - Afghanistan (missing 2024-2025 data)\n", + " - Eritrea (missing all data except country name)\n", + " - Lebanon (missing 2025 data)\n", + " - Pakistan (missing 2025 data)\n", + " - Sri Lanka (missing 2025 data)\n", + " - Syria (missing all GDP data)\n", + " - Palestine (missing 2024-2025 data)\n", + "\n", + "## GDP Distribution Analysis\n", + "- **Wide Range of Values**: GDP values range from just over 50 to over 30 million (USD)\n", + "- **High Variability**: Standard deviation is much larger than the mean, indicating high disparity between countries\n", + "- **Skewed Distribution**: Most countries have relatively low GDP values, with a few countries having extremely high values\n", + "- **Top Economies (2025)**:\n", + " 1. United States: ~30.5 million\n", + " 2. China: ~19.2 million\n", + " 3. Germany: ~4.7 million\n", + " 4. India: ~4.2 million\n", + " 5. 
Japan: ~4.2 million\n", + "\n", + "## Outliers\n", + "- Using the IQR method, 22 countries were identified as outliers in 2025, including major economies like the US, China, Japan, Germany, etc.\n", + "- These \"outliers\" are actually the largest economies and represent expected extreme values rather than data errors\n", + "\n", + "## GDP Growth Trends\n", + "- **Average Growth Rates**:\n", + " - 2020-2021: 13.86%\n", + " - 2021-2022: 8.46%\n", + " - 2022-2023: 6.41%\n", + " - 2023-2024: 5.72%\n", + " - 2024-2025: 4.43%\n", + " \n", + "- **Global GDP Growth**:\n", + " - 2020-2021: 14.00%\n", + " - 2021-2022: 4.19%\n", + " - 2022-2023: 4.40%\n", + " - 2023-2024: 3.85%\n", + " - 2024-2025: 2.45%\n", + " \n", + "- **Highest Growth (2025)**: Burundi (42.21%), Haiti (27.90%), Malawi (18.33%)\n", + "- **Negative Growth (2025)**: 23 countries, including South Sudan (-26.28%), Ethiopia (-17.93%), Iran (-15.03%)\n", + "\n", + "## Key Insights\n", + "1. The dataset shows a clear pattern of recovery and growth following the economic impacts of 2020\n", + "2. Growth rates have been gradually declining since 2021, which is typical for post-recovery periods\n", + "3. There's significant disparity in GDP values between countries, with a few major economies dominating global GDP\n", + "4. Some countries continue to face economic challenges, as evidenced by negative growth rates\n", + "5. The data quality is generally good, with only a small percentage of missing values\n", + "\n", + "## Recommendations for Further Analysis\n", + "1. Investigate the reasons behind missing data for specific countries\n", + "2. Analyze regional patterns by grouping countries by geographic regions\n", + "3. Examine per-capita GDP to account for population differences\n", + "4. 
Compare growth patterns between developed and developing economies" + ] + } + ], + "execution_count": 11 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "Let's double-check that these insights are correct and not hallucinated", + "id": "84436a72e90348b8" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T04:20:33.062922Z", + "start_time": "2025-09-30T04:20:33.059867Z" + } + }, + "cell_type": "code", + "source": [ + "# Calculate growth per country year to year\n", + "for years in range(2021, 2026):\n", + " df[\"growth_\" + str(years)] = (100 * (df[str(years)] - df[str(years - 1)]) )/ df[str(years - 1)]" + ], + "id": "932e55ea3349b84f", + "outputs": [], + "execution_count": 22 + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": "### Double-check the agent's work", + "id": "cc466e4eecb393b1" + }, + { + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T04:24:26.800498Z", + "start_time": "2025-09-30T04:24:26.795581Z" + } + }, + "cell_type": "code", + "source": [ + "print(f'Countries with the highest growth in 2025:\\n {df.sort_values(by=\"growth_2025\", ascending=False).loc[:, [\"Country\", \"growth_2025\"]].head(3)}')\n", + "print()\n", + "print(f'Countries with the largest contraction in 2025:\\n {df.sort_values(by=\"growth_2025\", ascending=True).loc[:, [\"Country\", \"growth_2025\"]].head(3)}')" + ], + "id": "cc584a1667170799", + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Countries with the highest growth in 2025:\n", + " Country growth_2025\n", + "28 Burundi 42.209572\n", + "72 Haiti 27.904228\n", + "104 Malawi 18.326693\n", + "\n", + "Countries with the largest contraction in 2025:\n", + " Country growth_2025\n", + "159 South Sudan -26.276968\n", + "57 Ethiopia -17.932827\n", + "79 Iran -15.034994\n" + ] + } + ], + "execution_count": 35 + }, + { + "cell_type": "markdown", + "id": "2d6b3ce0963d4a83", + "metadata": {}, + "source": [ + "## 8.
Cleanup\n", + "\n", + "Finally, we'll clean up by stopping the Code Interpreter session. Once finished using a session, the session should be stopped to release resources and avoid unnecessary charges." + ] + }, + { + "cell_type": "code", + "id": "baa2ca7fce8b181d", + "metadata": { + "ExecuteTime": { + "end_time": "2025-09-30T04:32:40.259254Z", + "start_time": "2025-09-30T04:32:40.040546Z" + } + }, + "source": [ + "# Stop the Code Interpreter session\n", + "code_client.stop()\n", + "print(\"Code Interpreter session stopped successfully!\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Code Interpreter session stopped successfully!\n" + ] + } + ], + "execution_count": 47 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}