In [42]:
# =====================================================
# üß† SASTA DS-STAR : Autonomous Data Science Agent
# =====================================================
# Works perfectly in Google Colab
# Requires a Gemini API key (get it free from https://aistudio.google.com/app/apikey)
# =====================================================

!pip install -q google-generativeai pandas matplotlib

import os, json, pandas as pd, matplotlib.pyplot as plt
import google.generativeai as genai


In [43]:
# üîë Configure your Gemini API key
genai.configure(api_key="AIzaSyAVThApeb354f4Wnvij5pBlWZsglEA5Xow")

# Load model
model = genai.GenerativeModel("gemini-2.5-flash")


In [44]:
# ---------------------------------
# 1Ô∏è‚É£ FILE ANALYZER
# ---------------------------------
def analyze_files(path="/content/sample_data/"):
    summaries = {}
    for file in os.listdir(path):
        file_path = os.path.join(path, file)
        if file.endswith(".csv"):
            try:
                df = pd.read_csv(file_path)
                summaries[file] = f"CSV with {df.shape[0]} rows, {df.shape[1]} columns, columns={list(df.columns)}"
            except Exception as e:
                summaries[file] = f"Error reading CSV: {e}"
        elif file.endswith(".json"):
            try:
                with open(file_path) as f:
                    data = json.load(f)
                summaries[file] = f"JSON with keys: {list(data.keys()) if isinstance(data, dict) else 'list items'}"
            except Exception as e:
                summaries[file] = f"Error reading JSON: {e}"
        elif file.endswith(".txt"):
            with open(file_path) as f:
                text = f.read()
            summaries[file] = f"Text file with {len(text.split())} words."
    return summaries


In [45]:
# ---------------------------------
# 2Ô∏è‚É£ PLANNER
# ---------------------------------
def plan_analysis(summary_dict, question="Find insights from the data"):
    prompt = f"""
You are a helpful data-science planner.
Given these file summaries:
{summary_dict}
Task: {question}
Write a concise 3‚Äì5 step plan to analyze the data, ending with visualization if possible.
"""
    resp = model.generate_content(prompt)
    return resp.text


In [46]:
# ---------------------------------
# 3Ô∏è‚É£ CODER
# ---------------------------------
def generate_code(plan_text):
    prompt = f"""
Write clean Python code using pandas and matplotlib for this plan:
{plan_text}
Use the available data files in the /content/sample_data/ folder.
Ensure the code prints or plots results clearly.
"""
    resp = model.generate_content(prompt)
    return resp.text


In [47]:
# ---------------------------------
# 4Ô∏è‚É£ EXECUTOR + VERIFIER + REFINER
# ---------------------------------
def execute_with_refinement(code, max_rounds=3):
    logs = []
    for round in range(1, max_rounds + 1):
        print(f"\n‚öôÔ∏è  Round {round}: Executing generated code...\n")
        try:
            exec(code, globals())
            print("\n‚úÖ Code executed successfully!")
            logs.append(f"Round {round}: Success")
            break
        except Exception as e:
            print("‚ùå Error:", e)
            logs.append(f"Round {round}: Failed with error {e}")
            fix_prompt = f"Fix this Python code error ({e}) in the following code:\n{code}"
            code = model.generate_content(fix_prompt).text
    return code, logs


In [48]:
# ---------------------------------
# üßæ REPORT GENERATOR
# ---------------------------------
def save_report(plan, code, logs, report_path="report.txt"):
    with open(report_path, "w") as f:
        f.write("==== SASTA DS-STAR REPORT ====\n\n")
        f.write("üß≠ PLAN:\n")
        f.write(plan + "\n\n")
        f.write("üíª FINAL CODE:\n")
        f.write(code + "\n\n")
        f.write("üß© EXECUTION LOG:\n")
        for log in logs:
            f.write(log + "\n")
    print(f"\nüìÑ Report saved as {report_path}")


In [49]:
# ---------------------------------
# 5Ô∏è‚É£ DRIVER FUNCTION
# ---------------------------------
def ds_star_main(question="Analyze and visualize the data", path="/content/sample_data/"):
    summaries = analyze_files(path)
    print("üìÅ File summaries:\n", summaries)

    plan = plan_analysis(summaries, question)
    print("\nüß≠ PLAN:\n", plan)

    code = generate_code(plan)
    print("\nüíª GENERATED CODE:\n", code)

    final_code, logs = execute_with_refinement(code)
    print("\nüèÅ FINAL CODE USED:\n", final_code)

    save_report(plan, final_code, logs)


In [None]:
# ---------------------------------
# üöÄ RUN
# ---------------------------------
ds_star_main("Find correlations and visualize relationships between variables.")


üìÅ File summaries:
 {'anscombe.json': 'JSON with keys: list items', 'california_housing_train.csv': "CSV with 17000 rows, 9 columns, columns=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']", 'california_housing_test.csv': "CSV with 3000 rows, 9 columns, columns=['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value']", 'mnist_test.csv': "CSV with 9999 rows, 785 columns, columns=['7', '0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '0.10', '0.11', '0.12', '0.13', '0.14', '0.15', '0.16', '0.17', '0.18', '0.19', '0.20', '0.21', '0.22', '0.23', '0.24', '0.25', '0.26', '0.27', '0.28', '0.29', '0.30', '0.31', '0.32', '0.33', '0.34', '0.35', '0.36', '0.37', '0.38', '0.39', '0.40', '0.41', '0.42', '0.43', '0.44', '0.45', '0.46', '0.47', '0.48', '0.49', '0.50', '0.51', '0.52', '0.53', 