In [1]:
from google.genai import types
from google.adk.agents import LlmAgent
from google.adk.models.google_llm import Gemini
from google.adk.runners import InMemoryRunner
from google.adk.sessions import InMemorySessionService
from google.adk.tools import google_search, AgentTool, ToolContext
from google.adk.code_executors import BuiltInCodeExecutor
from google.adk.sessions import DatabaseSessionService
from google.adk.sessions import InMemorySessionService
from google.adk.runners import Runner

In [12]:
import os
from getpass import getpass

if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"]=getpass("ApiKey:")


ApiKey: ········


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
#Tool for read the CSV and inspect for missing values
def analyze_missing_data(df_path:str)->dict:
    """
    Reads a CSV file into a pandas DataFrame, prints the total number of rows 
    and columns, and returns a detailed dictionary summary of missing (NaN) values 
    for every column that has them. Returns "No missing values found." if clean.
    """
    try:
        df=pd.read_csv(df_path)
        
        #Calculate missing data sum
        missing_data=df.isnull().sum()
        missing_data=missing_data[missing_data>0]

        missing_dict=missing_data.to_dict()
        
        if missing_data.empty:
            return {
                "status":"success",
                "message":"No missing values found.",
                "missing_data":{}
            }
        else:
            return {
                "status":"success",
                "message":"Missing values detected",
                "missing_data": missing_dict,
                "total_rows":len(df)
            }
    except Exception as e:
        return {
            "status":"error",
            "message":str(e)
        }

In [5]:
#Tool to fill missing values using the mean
def impute_data_with_mean(df_path:str,column_name:str)->dict:
    """
    Loads the CSV file, fills missing values (NaNs) in the specified column 
    with the mean of that column, and saves the modified DataFrame back 
    to the original path. Reports the result and the new missing count.
    
    Args:
        df_path: Path to the CSV file (e.g., 'data/train.csv').
        column_name: The column in which to fill NaNs.
        
    Returns:
        A dict confirming the imputation result.
    """
    try:
        πrint(f"DEBUG: Trying to read {df_path}...")
        df=pd.read_csv(df_path)
        
        if column_name not in df.columns:
            return {
                "status":"error",
                "message":f"Column '{column_name}' not found in dataset"
            }

        #Calculate mean and impute
        impute_value=df[column_name].mean()
        initial_missing_count=df[column_name].isnull().sum()
        df[column_name]=df[column_name].fillna(impute_value)
    
        #Save back to the original path
        df.to_csv(df_path,index=False)
        final_missing_count=df[column_name].isnull().sum()
        
        return {
            "status":"success",
            "column":column_name,
            "imputed_value":impute_value,
            "filled_count":initial_missing_count,
            "message":f"Successfully filled {initial_missing} missing values with mean: {impute_value:.2f}"
        }
    
    except Exception as e:
        return {
            "status":"error",
            "message":str(e)
        }

In [6]:
retry_config = types.HttpRetryOptions(
    attempts=5,  # Maximum retry attempts
    exp_base=7,  # Delay multiplier
    initial_delay=1,
    http_status_codes=[429, 500, 503, 504],  # Retry on these HTTP errors
)

In [7]:
import sys
from io import StringIO

# Custom tool to execute code and return a DICT
def execute_python_code(code: str) -> dict:
    """
    Executes Python code and returns the output (stdout) or error in a structured dictionary.
    Useful for calculating stats, filtering data, or creating plots.
    """
    # Capture standard output (print statements)
    old_stdout = sys.stdout
    redirected_output = StringIO()
    sys.stdout = redirected_output

    try:
        # Execute the code
        # Note: We use a shared dictionary for variables if you want state to persist, 
        # but for simple tasks 'locals()' or a new dict is safer.
        exec(code, globals())
        
        # Get the output
        sys.stdout = old_stdout
        output = redirected_output.getvalue()
        
        return {
            "status": "success",
            "code_executed": code,
            "output": output if output else "Code executed successfully (no output).",
            "message": "Code ran without errors."
        }

    except Exception as e:
        sys.stdout = old_stdout
        return {
            "status": "error",
            "error_type": type(e).__name__,
            "message": str(e)
        }

In [8]:
data_analyst_agent = LlmAgent(
    name="DataAnalyst",
    model=Gemini(model="gemini-2.5-flash-lite", retry_options=retry_config),
    instruction="""You are an expert Data Analyst and data cleaning specialist. 
        Your primary goal is to inspect CSV files for missing data and suggest/execute 
        the best method to fill them, always prioritizing data integrity. 
        You must use the 'analyze_missing_data' tool first when asked about data quality.
    """,
    tools=[analyze_missing_data,impute_data_with_mean,execute_python_code],
)

In [14]:
df=pd.read_csv('train.csv')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [16]:
runner = InMemoryRunner(agent=data_analyst_agent)


file_path = 'train.csv' 
print("--- Agent Running ---")


_ = await runner.run_debug(
    f"Check '{file_path}' for missing values. If you find any in 'Age', fill them with the mean."
)

--- Agent Running ---

 ### Created new session: debug_session_id

User > Check 'train.csv' for missing values. If you find any in 'Age', fill them with the mean.




DataAnalyst > It looks like there was an error in my last attempt, and I couldn't fill the missing 'Age' values. I'll try that again, and this time it should work.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




DataAnalyst > It seems I'm having trouble with that command. I'll try a different approach to fill the missing 'Age' values.
DataAnalyst > I've filled the missing values in the 'Age' column with the mean. Let me know if there's anything else you need!


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
