# "What if 🎯🚀 we build an AI Agent 🤖 who can compete for me in this Kaggle competition 🏆📊?"



In [None]:
# Remove conflicting packages from the Kaggle base environment.
!pip uninstall -qqy kfp jupyterlab libpysal thinc spacy fastai ydata-profiling google-cloud-bigquery google-generativeai
!pip install -qU 'langgraph==0.3.21' 'langchain-google-genai==2.1.2' 'langgraph-prebuilt==0.1.7'


In [None]:
import numpy as np 
import pandas as pd 
import os
import operator
import re # Import regex for parsing
import json
import io
import sys



from langchain_core.messages import BaseMessage, HumanMessage, AIMessage # Import AIMessage
# from langchain_openai import ChatOpenAI # Replace with your desired LLM provider
from langgraph.graph import StateGraph, END
# Kaggle and Google AI
from kaggle_secrets import UserSecretsClient
from langchain_google_genai import ChatGoogleGenerativeAI
from google import genai
from google.genai import types
from google.api_core import retry

# IPython Display
from IPython.display import Markdown, Image, display

# PDF Processing
import pypdf

# ChromaDB
#import chromadb
#from chromadb import Documents, EmbeddingFunction, Embeddings


from typing import TypedDict, Annotated, Optional, Literal, List, Dict, Any
from typing_extensions import TypedDict # 
from langchain_core.messages import BaseMessage 
from contextlib import redirect_stdout

# Langchain and Langgraph
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage # Ensure these are imported


# Pretty Print
from pprint import pprint
# Read the GOOGLE_API_KEY secret through the Kaggle secrets client.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("GOOGLE_API_KEY")


# Load the three competition CSV files; all three are read before any name is bound.
train, test, submission = map(
    pd.read_csv,
    (
        '/kaggle/input/playground-series-s5e5/train.csv',
        '/kaggle/input/playground-series-s5e5/test.csv',
        '/kaggle/input/playground-series-s5e5/sample_submission.csv',
    ),
)

# **🎯🚀 We will define an agent that coordinates our data analyst with our data scientist 🎯🚀**
# **Let's build a 🤖 hierarchical 🤖 architecture**
# 1. supervisor
# 2. data scientist
# 3. data analyst
# 4. outputs to Human interpreter 

In [None]:
# Messages are kept as a list of BaseMessage objects (better practice than plain dicts).
# Alternative reducer, for example: messages: Annotated[list[BaseMessage], add_messages]
# --- State Definition ---
class GraphState(TypedDict):
    """
    🗂️ Central state definition for the CALOR-IA multi-agent workflow.
    Reflects using DataFrame variable names instead of file paths.

    Every node function takes this mapping as input and returns a dict that
    contains only the keys it wants to update.
    """
    # Conversation history. The `operator.add` annotation is the reducer for this key
    # (list concatenation), so an update is expected to carry only the NEW messages.
    messages: Annotated[List[BaseMessage], operator.add]
    # High-level task list; supervisor_node reads it and passes it through unchanged.
    supervisor_tasks: Optional[List[str]]
    # Task text set by supervisor_node for the next worker; workers reset it to None when done.
    current_task_description: Optional[str]
    # Result dict written by data_analyst_node (keys: "code", "processed_train_df_name",
    # "processed_test_df_name").
    analyst_output: Optional[Dict[str, Any]]
    # Result dict written by data_scientist_node (generated "code" plus placeholder fields,
    # or an "error" entry).
    scientist_output: Optional[Dict[str, Any]]
    # Legacy file-path hand-off. Still declared and still mentioned in the system prompts,
    # but none of the node functions shown alongside this class write it.
    intermediate_data_path: Optional[str] # Removed - using variable names now
    # NOTE(review): not written by supervisor_node, data_analyst_node or data_scientist_node — confirm it is still needed.
    processed_df_name: Optional[str]       # Name of the DataFrame variable ready for the scientist
    # Error text of the current step; supervisor_node overwrites it on every pass.
    error: Optional[str]
    # Routing decision produced by supervisor_node.
    next_agent: Optional[Literal["DataAnalyst",  "DataScientist", "Supervisor", "HumanInterpreter", "__end__"]] 
    # When truthy, supervisor_node routes straight to "__end__".
    final_answer_generated: bool # Added for clarity


In [None]:
# System prompt for the Supervisor agent, stored as a (role, text) pair.
# NOTE(review): rule 7 of the prompt lists "DataAnalyst", "DataScientist", "Supervisor" and
# "__end__" as possible `next_agent` values but not "HumanInterpreter", which supervisor_node
# routes to — confirm whether the prompt should mention it.
CALOR_IA_SUPERVISOR_SYSINT = (
    "system",
    """
    🤖 You are CALOR-IA Supervisor Bot, the orchestrator of a data science team.

    🎯 Mission
    1️⃣ Receive user requests and break them down into actionable steps for your team.
    2️⃣ Coordinate two AI workers:
        - CALOR-IA_DATA_ANALYST_SYSINT: Handles data exploration, cleaning, visualization, and preparation.
        - CALOR-IA_DATA_SCIENTIST_SYSINT: Handles model building, training, evaluation, and prediction.
    3️⃣ Decide which worker should handle the *next* step based on the conversation history and the overall goal (Kaggle win 🏆).
    4️⃣ Formulate a clear, specific `current_task_description` for the assigned worker.
    5️⃣ Review the `analyst_output` or `scientist_output` provided by the workers.
    6️⃣ Synthesize results, manage the overall `supervisor_tasks` list, and communicate progress or final results back to the user via the `messages` state.
    7️⃣ Determine the `next_agent` required: "DataAnalyst", "DataScientist", "Supervisor" (if waiting for user clarification or summarizing), or "__end__" when the overall task is complete.
    8️⃣ Guide the human user 🧑‍💻 and AI agents 🤖 towards success.

    📝 Workflow & Rules
    - Analyze the latest message(s) in the `messages` state.
    - Check `analyst_output` and `scientist_output` for recent worker results.
    - Based on the goal and current state:
        - If data analysis/prep is needed -> `next_agent` = "DataAnalyst". Create a task in `current_task_description`.
        - If modeling/prediction is needed (and data is ready, possibly using `intermediate_data_path`) -> `next_agent` = "DataScientist". Create a task in `current_task_description`.
        - If results need summarizing for the user or clarification is needed -> `next_agent` = "Supervisor". Update `messages`.
        - If the user's request is fully addressed -> `next_agent` = "__end__". Set `final_answer_generated` = True.
    - Provide clear, step-by-step instructions in `current_task_description`.
    - If workers need data from each other, ensure the relevant `intermediate_data_path` is mentioned or available.
    - Keep track of high-level goals in `supervisor_tasks`.
    - Stick to data science topics relevant to the user's goal.
    - Assign tasks in json format for better understanding
    - You will compile all python code used and returned to the user 

    Let’s coordinate this project to victory! ✨
    """
)
# System prompt for the Data Analyst agent, stored as a (role, text) pair.
# NOTE(review): the last two rules ask for a JSON hand-back plus code and then for
# "Just the python code" — these conflict; confirm which output format is intended.
CALOR_IA_DATA_ANALYST_SYSINT = (
    "system",
    """
    🤖 You are CALOR-IA Data Analyst Bot.

    🎯 Mission
    1️⃣ Execute the data analysis task provided in the `current_task_description` from the Supervisor.
    2️⃣ Write clean, runnable Python 🐍 code for data loading, cleaning, exploration, and visualization (using pandas, matplotlib, seaborn, etc.).
    3️⃣ Analyze data (potentially loaded from `intermediate_data_path` if provided) and extract valuable insights.
    4️⃣ Prepare data for the Data Scientist if requested (e.g., creating features, splitting data) and potentially save it, updating `intermediate_data_path`.
    5️⃣ Place your results (code, summary of findings, paths to saved plots or data) into the `analyst_output` dictionary in the graph state.
    6️⃣ Collaborate with CALOR_IA_DATA_SCIENTIST_SYSINT via the Supervisor by providing necessary data artifacts and insights.

    📝 Style & Rules
    - Focus solely on the task in `current_task_description`.
    - Separate numerical and categorical data for better handling 
    - Output results clearly structured within the `analyst_output` dictionary (e.g., `{"code": "...", "summary": "...", "plot_path": "/path/to/plot.png", "data_preview": "..."}`).
    - Generate runnable Python code within markdown code blocks (```python ... ```).
    - Explain your code and findings concisely.
    - If you save data or plots, mention the path clearly in your output summary.
    - Use matplotlib/seaborn for plots.
    - Stick to data analysis/preparation; defer modeling to the Scientist.
    - Return the task to supervisor in json format for better understanding and the python code used to develop the task
    - Return Just the python code
    
    Let’s crunch some data! ✨
    """
)

# System prompt for the Data Scientist agent, stored as a (role, text) pair.
# The scoring rule spells out RMSLE in plain ASCII so the model receives an
# unambiguous metric definition.
CALOR_IA_DATA_SCIENTIST_SYSINT = (
    "system",
    """
    🤖 You are CALOR-IA Data Scientist Bot.

    🎯 Mission
    1️⃣ Execute the machine learning task provided in the `current_task_description` from the Supervisor.
    2️⃣ Use data provided (potentially loaded from `intermediate_data_path` prepared by the Analyst) to build, train, and evaluate ML models (using scikit-learn, TensorFlow, PyTorch, etc.).
    3️⃣ Perform feature engineering if required and not already done by the Analyst.
    4️⃣ Generate predictions on test data as requested.
    5️⃣ Place your results (model summary, performance metrics, paths to saved models or predictions) into the `scientist_output` dictionary in the graph state.
    6️⃣ Collaborate with CALOR_IA_DATA_ANALYST_SYSINT via the Supervisor by requesting specific data views or providing model insights.

    📝 Style & Rules
    - Focus solely on the task in `current_task_description`.
    - Output results clearly structured within the `scientist_output` dictionary (e.g., `{"model_description": "...", "metrics": {"accuracy": 0.95, ...}, "predictions_path": "/path/to/preds.csv", "code": "..."}`).
    - Generate runnable Python code for model definition, training, and prediction within markdown code blocks (```python ... ```).
    - Explain your model choices, training procedures, and evaluation metrics step-by-step.
    - If you save models or predictions, mention the path clearly in your output summary.
    - Aim for models that generalize well.
    - Stick to machine learning tasks; defer data prep/exploration to the Analyst unless specified.
    - Return the task to supervisor in json format for better understanding and the python code used to develop the task
    - All models must be scored with RMSLE = sqrt((1/n) * sum_{i=1..n} (log(1 + y_pred_i) - log(1 + y_true_i))^2)
    - You will run the models on test data
    Let’s build some high-performing models! ✨
    """
)
# Prompt text for the code-compiling "human interpreter" step.
# NOTE(review): the parentheses here only group a single string literal, so this value is a
# plain `str` whose text begins with the word "system" in quotes. The other *_SYSINT constants
# in this file are ("system", text) tuples — confirm at the call site which shape is expected.
CALOR_IA_HUMAN_REDACTR_SYSINT = ("""
    "system"
    You are an expert on python code your task is gather all code and blen them to make it ready to be copy and paste, So you must create a proffesional pipeline of code that explain step by step 
    """)

In [None]:
def supervisor_node(state: GraphState) -> Dict[str, Any]:
    """
    Central decision-making node. Routes tasks to appropriate agents
    or ends the workflow based on the current state. Handles DataFrame variable names.

    Routing rules, checked in this order:
      1. `final_answer_generated` is truthy -> "__end__"
      2. `scientist_output` is present      -> "HumanInterpreter" (success or error)
      3. `analyst_output` is present        -> "DataScientist" on success,
                                               "HumanInterpreter" on error
      4. the last message is a HumanMessage -> "DataAnalyst"
      5. anything else                      -> "__end__", with `error` set

    Args:
        state: Current graph state.

    Returns:
        A partial state update with the keys `messages`, `supervisor_tasks`,
        `current_task_description`, `next_agent` and `error`.
        `messages` contains ONLY the message created by this call (or is empty):
        the `messages` key of GraphState is declared with the `operator.add`
        reducer, so returning the full history would append it to itself and
        duplicate every earlier message.
    """
    print("\n--- SUPERVISOR ---")
    messages = state.get("messages") or []
    last_message = messages[-1] if messages else None
    print(f"Supervisor reviewing state. Last message type: {type(last_message).__name__ if last_message else 'None'}")
    if hasattr(last_message, 'content'):
        content_display = str(last_message.content)
        print(f"Supervisor received content: {content_display[:200]}{'...' if len(content_display) > 200 else ''}")

    analyst_output_data = state.get('analyst_output')
    scientist_output_data = state.get('scientist_output')
    tasks = state.get('supervisor_tasks', [])

    print(f"Analyst output available: {analyst_output_data is not None}")
    print(f"Scientist output available: {scientist_output_data is not None}")
    print(f"Final answer requested: {state.get('final_answer_generated', False)}")

    next_agent = None
    task = None
    new_message_content = ""
    error_flag = None

    # --- Decision Logic ---

    if state.get("final_answer_generated", False):
        # The final message was already produced upstream; nothing to add here.
        print("Supervisor: Final answer signal received. Ending workflow.")
        next_agent = "__end__"

    elif scientist_output_data is not None:
        # Success and failure both go to the interpreter so a final report is produced.
        next_agent = "HumanInterpreter"
        if scientist_output_data.get('error') is not None:
            error_msg = f"Scientist encountered an error: {scientist_output_data['error']}"
            print(f"Supervisor: {error_msg}")
            error_flag = error_msg
            new_message_content = "There was an issue during model training/prediction. Preparing final report."
        else:
            print("Supervisor: Scientist completed work successfully.")
            new_message_content = "Scientist workflow complete. Preparing final report."

    elif analyst_output_data is not None:
        if analyst_output_data.get('error') is not None:
            error_msg = f"Analyst encountered an error: {analyst_output_data['error']}"
            print(f"Supervisor: {error_msg}")
            error_flag = error_msg
            next_agent = "HumanInterpreter"
            new_message_content = "There was an issue during data analysis. Preparing final report."
        else:
            print("Supervisor: Analyst successfully generated code and data names.")
            task = "Build and evaluate LightGBM and XGBoost models"
            next_agent = "DataScientist"
            new_message_content = "Analyst processing complete. Routing to Data Scientist."

    elif last_message and isinstance(last_message, HumanMessage):
        # Initial state: a fresh user request and no worker output yet.
        task = "Perform initial EDA and preprocessing on train/test data."
        next_agent = "DataAnalyst"
        new_message_content = "Assigning Analyst to initial task based on user request."

    else:
        print("Supervisor: Unexpected state - no initial message or prior agent output. Ending.")
        error_flag = "Unexpected workflow state reached."
        next_agent = "__end__"
        new_message_content = "Unexpected workflow state. Finalizing."

    # --- Prepare Return Dictionary ---
    # Emit only the delta for `messages`; the reducer appends it to the history.
    # No message is added when ending, or when it would repeat the last one.
    new_messages = []
    if next_agent != "__end__" and new_message_content:
        last_content = getattr(last_message, "content", None)
        if last_content != new_message_content:
            new_messages.append(AIMessage(content=str(new_message_content)))

    return_dict = {
        "messages": new_messages,
        "supervisor_tasks": tasks,
        "current_task_description": task,
        "next_agent": next_agent,  # the value the graph's routing follows
        "error": error_flag,
    }

    print(f"Supervisor decision: next_agent='{next_agent}', task='{task}'")
    return return_dict

In [None]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error # Added RMSE for XGBoost metric
import lightgbm as lgb
import xgboost as xgb

def data_analyst_node(state: GraphState) -> Dict[str, Any]:
    """Data Analyst Node - emits preprocessing code that works on DataFrame variable names.

    The node does not execute anything. It builds a preprocessing script as a
    string and reports the names of the DataFrames that script will create.

    Args:
        state: Current graph state; only `current_task_description` is read.

    Returns:
        On success, a partial state update with `analyst_output` (keys "code",
        "processed_train_df_name", "processed_test_df_name") and
        `current_task_description` reset to None.
        When no task is set, a dict with `error` and `next_agent`.
    """
    import textwrap  # local import: only needed to left-align the generated script

    print("\n--- DATA ANALYST ---")
    task = state.get('current_task_description')
    # Names of the raw DataFrames the generated script reads. They match the
    # module-level `train` / `test` variables created when the CSV files are loaded.
    initial_train_name = 'train'
    initial_test_name = 'test'

    if not task:
        error_output = {"error": "Data Analyst received no task.", "next_agent": "Supervisor"}
        print("Returning error output:", error_output)
        return error_output

    # Names of the DataFrames the generated script will create.
    processed_train_name = "processed_train_df"
    processed_test_name = "processed_test_df"

    print(f"Analyst expects input DataFrame variable: '{initial_train_name}'")
    print(f"Analyst expects input DataFrame variable: '{initial_test_name}'")
    print(f"Analyst will simulate creating output DataFrames: '{processed_train_name}' and '{processed_test_name}'")

    # --- Simulate Code ---
    # Single braces are substituted now; doubled braces survive as literal braces
    # for the f-strings inside the generated script.
    code_template = f"""
    import pandas as pd
    import numpy as np
    from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
    from sklearn.impute import KNNImputer
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline

    # Assume these two exist already in the notebook/environment with names:
    # {initial_train_name}  # Your initial training DataFrame
    # {initial_test_name}   # Your initial test DataFrame
    raw_train = {initial_train_name}
    raw_test = {initial_test_name}

    # 0) Columns that exist only in train (e.g. the target) are not features:
    #    keep them out of the pipeline and re-attach them untouched at the end.
    feature_cols = [c for c in raw_train.columns if c in raw_test.columns]
    target_cols = [c for c in raw_train.columns if c not in raw_test.columns]

    # 1) Identify numerical vs. categorical columns
    num_cols = raw_train[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = raw_train[feature_cols].select_dtypes(include=['object', 'category']).columns.tolist()

    print("Numerical columns:", num_cols)
    print("Categorical columns:", cat_cols)

    # 2) Build preprocessing pipelines
    num_pipeline = Pipeline([
        ('scaler', MinMaxScaler())
    ])
    cat_pipeline = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ])

    # 3) KNN imputer for any remaining NaNs
    imputer = KNNImputer()

    # 4) Fit on train, transform both train & test
    train_array = preprocessor.fit_transform(raw_train[feature_cols])
    train_array = imputer.fit_transform(train_array)

    test_array = preprocessor.transform(raw_test[feature_cols])
    test_array = imputer.transform(test_array)

    # 5) Recover feature names so we can build nice DataFrames
    num_feats = (preprocessor.named_transformers_['num']
                 .named_steps['scaler'].feature_names_in_)
    cat_feats = (preprocessor.named_transformers_['cat']
                 .named_steps['onehot'].get_feature_names_out(cat_cols))
    all_feats = list(num_feats) + list(cat_feats)

    # Create the processed DataFrames using the intended variable names
    {processed_train_name} = pd.DataFrame(train_array, columns=all_feats)
    {processed_test_name}  = pd.DataFrame(test_array,  columns=all_feats)

    # 6) Re-attach the train-only columns, unscaled (positional, not index-aligned)
    for col in target_cols:
        {processed_train_name}[col] = raw_train[col].to_numpy()

    print(f"Processed train shape: {{{processed_train_name}.shape}}")
    print(f"Processed test shape:  {{{processed_test_name}.shape}}")
    """
    # The template is indented to sit inside this function; strip that indent so
    # the script can be executed or pasted at top level without an IndentationError.
    simulated_code = textwrap.dedent(code_template).strip() + "\n"

    # --- Return Dictionary ---
    output = {
        "analyst_output": {
            "code": simulated_code,
            # The Scientist looks the processed DataFrames up by these names.
            "processed_train_df_name": processed_train_name,
            "processed_test_df_name": processed_test_name,
        },
        "current_task_description": None,  # Task is complete for Analyst
    }

    print("Analyst returning output:", output)
    return output

def process_update(update: Dict[str, Any]) -> None:
    """Print the recognised parts of a graph update.

    Anything that is not a dict is reported and ignored. For a dict, the
    entries 'messages', 'code', 'summary' and 'error' are printed in that
    order, each under its own heading, when the key is present.
    """
    if not isinstance(update, dict):
        print("Update is not a dictionary.")
        return

    # Messages are listed one per line, unlike the single-value sections below.
    if 'messages' in update:
        print("Messages from update:")
        for entry in update['messages']:
            print(entry)

    single_value_sections = (
        ('code', "Code snippet provided in update:"),
        ('summary', "Summary from update:"),
        ('error', "Error encountered:"),
    )
    for key, heading in single_value_sections:
        if key not in update:
            continue
        print(heading)
        print(update[key])

In [None]:
import os
from typing import Dict, Any

# Assuming GraphState class is defined elsewhere
# Assuming necessary library imports (like pandas, numpy etc.) are at the top level of your script

def data_scientist_node(state: GraphState) -> Dict[str, Any]:
    """
    Data Scientist Node - Generates modeling code (LightGBM, XGBoost)
    using processed data names provided by the DataAnalyst node via the state.
    This version DOES NOT execute the generated code.

    Args:
        state: Current graph state. Reads `current_task_description` and the
            "processed_train_df_name" / "processed_test_df_name" entries of
            `analyst_output`.

    Returns:
        A partial state update. On success: `scientist_output` holding the
        generated script under "code" (every execution-related field is None
        because nothing is run) and `current_task_description` reset to None.
        On a missing task or missing analyst data: `scientist_output` with an
        "error" entry and `next_agent` set to "Supervisor".
    """
    print("\n--- DATA SCIENTIST (Code Generation Only) ---")
    task = state.get('current_task_description')
    if not task:
        print("Scientist: No task received.")
        # Return an error structure within the expected output key
        return {
            "scientist_output": {"error": "Data Scientist received no task."},
            "next_agent": "Supervisor"
            }

    # --- Get processed DataFrame names from the analyst's output in state ---
    analyst_result = state.get('analyst_output')
    if not analyst_result or not isinstance(analyst_result, dict):
        error_msg = "Scientist Error: Analyst output not found or invalid in state."
        print(error_msg)
        return {
            "scientist_output": {"error": error_msg},
            "next_agent": "Supervisor"
            }

    processed_train_name = analyst_result.get('processed_train_df_name')
    processed_test_name = analyst_result.get('processed_test_df_name')

    if not processed_train_name or not processed_test_name:
        error_msg = "Scientist Error: Processed DataFrame names not found within analyst output in state."
        print(error_msg)
        return {
            "scientist_output": {"error": error_msg},
             "next_agent": "Supervisor"
             }

    print(f"Scientist generating code for task: {task}")
    print(f"Will use processed train DataFrame variable: '{processed_train_name}'")
    print(f"Will use processed test DataFrame variable : '{processed_test_name}'")

    # --- Generate Code String ---
    # Use single braces {} ONLY for variables substituted NOW ({processed_train_name}, {processed_test_name})
    # Use double braces {{}} for f-strings intended for LATER execution within the generated code.
    #
    # Choices inside the generated script that matter for correctness:
    #  * LightGBM is monitored on 'rmse' ('rmsle' is not a built-in LightGBM metric) and the
    #    history is captured with the record_evaluation callback, because lgb.train no longer
    #    accepts an evals_result keyword in LightGBM 4.
    #  * XGBoost's best_iteration is a 0-based index, so the prediction range ends at
    #    best_iteration + 1 to include the best tree.
    #  * Fatal conditions raise SystemExit instead of calling the interactive-only exit().
    generated_code =f'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error # Added RMSE for XGBoost metric
import lightgbm as lgb
import xgboost as xgb
import warnings
import os # Import os to create directories if needed

# --- Suppress Warnings ---
warnings.filterwarnings('ignore', category=UserWarning, module='lightgbm')
warnings.filterwarnings('ignore', category=FutureWarning)

print("\\n--- Data Scientist Code Execution Started ---")

# --- Configuration ---
# These names are substituted when the string is created
PROCESSED_TRAIN_NAME = '{processed_train_name}'
PROCESSED_TEST_NAME  = '{processed_test_name}'
TARGET_COL = 'Calories' # !! IMPORTANT: Adjust this if your target column name is different !!
OUTPUT_DIR = 'model_outputs'

# --- Setup ---
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Ensuring output directory exists: {{OUTPUT_DIR}}") # Inner f-string: Use {{}}

# --- Data Loading ---
print(f"Loading data: train='{{PROCESSED_TRAIN_NAME}}', test='{{PROCESSED_TEST_NAME}}'") # Inner f-string: Use {{}}
try:
    # Attempt to load from the global scope where exec runs
    train_df = globals()[PROCESSED_TRAIN_NAME]
    test_df  = globals()[PROCESSED_TEST_NAME]
    print(f"Loaded train shape: {{train_df.shape}}") # Inner f-string: Use {{}}
    print(f"Loaded test shape: {{test_df.shape}}")   # Inner f-string: Use {{}}
except KeyError as e:
    print(f"[ERROR] Processed DataFrame '{{e}}' not found in the execution environment.") # Inner f-string: Use {{}}
    print("Please ensure the Data Analyst code ran successfully and created these variables.")
    # Decide how to proceed: raise error, exit, or try to continue? For now, exit.
    raise SystemExit(1) # Exit the executed script with an error code

# --- Feature/Target Split ---
if TARGET_COL not in train_df.columns:
    print(f"[ERROR] Target column '{{TARGET_COL}}' not found in training data ('{{PROCESSED_TRAIN_NAME}}')") # Inner f-string: Use {{}}
    raise SystemExit(1) # Exit if target column is missing

X = train_df.drop(TARGET_COL, axis=1)
y = train_df[TARGET_COL]

# --- Align Test Columns ---
train_cols_expected = X.columns # Get columns in order from X
X_test = test_df.copy() # Start with a copy of the test set

# Add missing columns with NaN
for col in train_cols_expected:
    if col not in X_test.columns:
        print(f"[Warning] Column '{{col}}' missing in test set. Adding with NaN.") # Inner f-string: Use {{}}
        X_test[col] = np.nan

# Select and reorder columns to match training data exactly
X_test = X_test[train_cols_expected]

print(f"Feature shape (X): {{X.shape}}")             # Inner f-string: Use {{}}
print(f"Test Feature shape (X_test): {{X_test.shape}}") # Inner f-string: Use {{}}

# --- Train/Validation Split ---
X_tr, X_val, y_tr, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(f"Train split shape: {{X_tr.shape}}, Validation split shape: {{X_val.shape}}") # Inner f-string: Use {{}}

# --- Metrics ---
def rmsle(y_true, y_pred):
    y_pred_safe = np.maximum(0, y_pred) # Ensure non-negative predictions
    epsilon = 1e-9 # Add epsilon to avoid log(0)
    return np.sqrt(mean_squared_log_error(y_true + epsilon, y_pred_safe + epsilon))

def rmse(y_true, y_pred):
     return np.sqrt(mean_squared_error(y_true, y_pred))

# --- Model Flags ---
lgbm_success = False
xgb_success = False
can_run_xgb = True # Assume XGBoost can run initially
lgb_error = None
xgb_error = None
gbm = None # Initialize model variables
bst = None

# --- LightGBM ---
print("\\n>>> Training LightGBM...")
try:
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val   = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
    lgb_params = {{ # Dictionary literal, braces are fine
        'objective':'regression_l1', 'metric':'rmse', 'verbosity':-1,
        'n_estimators': 1000, 'learning_rate':0.05, 'feature_fraction': 0.8,
        'bagging_fraction': 0.8, 'bagging_freq': 1, 'seed': 42, 'n_jobs': -1,
        'boosting_type': 'gbdt',
    }}
    lgb_evals = {{}} # Dictionary literal, braces are fine
    callbacks = [
        lgb.log_evaluation(period=100, show_stdv=False),
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        lgb.record_evaluation(lgb_evals) # Fills lgb_evals with the per-round metric history
    ]

    gbm = lgb.train(
        params=lgb_params, train_set=lgb_train, valid_sets=[lgb_train, lgb_val],
        valid_names=['train','val'], callbacks=callbacks
    )
    print("LightGBM training complete.")
    lgbm_success = True # Mark success

    # Plotting
    metrics_to_plot = [lgb_params['metric']] if isinstance(lgb_params['metric'], str) else lgb_params['metric']
    valid_metrics_to_plot = [m for m in metrics_to_plot if m in lgb_evals.get('train', {{}})]

    if valid_metrics_to_plot:
        metric_key = valid_metrics_to_plot[0]
        plt.figure(figsize=(8, 5))
        plt.plot(lgb_evals['train'][metric_key], label=f'train {{metric_key}}') # Inner f-string: Use {{}}
        plt.plot(lgb_evals['val'][metric_key],   label=f'val {{metric_key}}')   # Inner f-string: Use {{}}
        plt.title(f'LightGBM {{metric_key.upper()}}')                          # Inner f-string: Use {{}}
        plt.legend()
        plt.xlabel('Boosting Rounds')
        plt.ylabel(metric_key.upper())
        plt.grid(True)
        lgb_plot_path = os.path.join(OUTPUT_DIR, 'lgb_metric_plot.png')
        plt.savefig(lgb_plot_path)
        plt.close()
        print(f"Saved LightGBM plot to {{lgb_plot_path}}")                     # Inner f-string: Use {{}}
    else:
        print(f"[Warning] Metrics {{metrics_to_plot}} not found in LightGBM evals results for plotting.") # Inner f-string: Use {{}}

    # Predictions & Validation
    best_iter = gbm.best_iteration if gbm.best_iteration else lgb_params.get('n_estimators', 1000)
    y_val_pred_lgb  = gbm.predict(X_val, num_iteration=best_iter)
    y_test_pred_lgb = gbm.predict(X_test, num_iteration=best_iter)
    y_test_pred_lgb_safe = np.maximum(0, y_test_pred_lgb) # Ensure non-negative
    val_rmsle_lgb = rmsle(y_val, y_val_pred_lgb)
    print(f"LightGBM Validation RMSLE = {{val_rmsle_lgb:.5f}} (using best iter: {{best_iter}})") # Inner f-string: Use {{}}
    lgb_pred_path = os.path.join(OUTPUT_DIR, 'lgb_test_preds.csv')
    pd.DataFrame({{'prediction': y_test_pred_lgb_safe}}).to_csv(lgb_pred_path, index=False) # Dict literal OK
    print(f"Saved LightGBM predictions to {{lgb_pred_path}}") # Inner f-string: Use {{}}

except Exception as e:
    lgb_error = str(e)
    print(f"[ERROR] During LightGBM training or prediction: {{e}}") # Inner f-string: Use {{}}

# --- XGBoost ---
print("\\n>>> Training XGBoost...")
try:
    # Check if DMatrix creation is possible
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval   = xgb.DMatrix(X_val, label=y_val)
    dtest  = xgb.DMatrix(X_test)
except Exception as e:
    xgb_error = f"DMatrix creation failed: {{e}}" # Use {{e}} for escaping
    print(f"[ERROR] Creating XGBoost DMatrix: {{e}}") # Inner f-string: Use {{}}
    can_run_xgb = False # Cannot run XGBoost if DMatrix fails

if can_run_xgb:
    try:
        xgb_params = {{ # Dictionary literal, braces are fine
            'objective':'reg:squarederror', 'eval_metric':'rmse', 'eta':0.05,
            'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8, 'seed': 42
        }}
        xgb_evals = {{}} # Dictionary literal, braces are fine

        bst = xgb.train(
            params=xgb_params, dtrain=dtrain, num_boost_round=1000,
            evals=[(dtrain,'train'), (dval,'val')], early_stopping_rounds=50,
            evals_result=xgb_evals, verbose_eval=100
        )
        print("XGBoost training complete.")
        xgb_success = True # Mark success

        # Plotting
        metrics_to_plot_xgb = [xgb_params['eval_metric']] if isinstance(xgb_params['eval_metric'], str) else xgb_params['eval_metric']
        valid_metrics_to_plot_xgb = [m for m in metrics_to_plot_xgb if m in xgb_evals.get('train', {{}})]

        if valid_metrics_to_plot_xgb:
            metric_key_xgb = valid_metrics_to_plot_xgb[0]
            plt.figure(figsize=(8, 5))
            plt.plot(xgb_evals['train'][metric_key_xgb], label=f'train {{metric_key_xgb}}') # Inner f-string: Use {{}}
            plt.plot(xgb_evals['val'][metric_key_xgb],   label=f'val {{metric_key_xgb}}')   # Inner f-string: Use {{}}
            plt.title(f'XGBoost {{metric_key_xgb.upper()}}')                              # Inner f-string: Use {{}}
            plt.legend()
            plt.xlabel('Boosting Rounds')
            plt.ylabel(metric_key_xgb.upper())
            plt.grid(True)
            xgb_plot_path = os.path.join(OUTPUT_DIR, 'xgb_metric_plot.png')
            plt.savefig(xgb_plot_path)
            plt.close()
            print(f"Saved XGBoost plot to {{xgb_plot_path}}")                         # Inner f-string: Use {{}}
        else:
            print(f"[Warning] Metrics {{metrics_to_plot_xgb}} not found in XGBoost evals results for plotting.") # Inner f-string: Use {{}}

        # Predictions & Validation
        best_iter_xgb = bst.best_iteration
        # best_iteration is a 0-based index and iteration_range is half-open, hence the + 1
        y_val_pred_xgb  = bst.predict(dval, iteration_range=(0, best_iter_xgb + 1))
        y_test_pred_xgb = bst.predict(dtest, iteration_range=(0, best_iter_xgb + 1))
        y_test_pred_xgb_safe = np.maximum(0, y_test_pred_xgb) # Ensure non-negative
        val_rmsle_xgb = rmsle(y_val, y_val_pred_xgb)
        print(f"XGBoost Validation RMSLE = {{val_rmsle_xgb:.5f}} (using best iter: {{best_iter_xgb}})") # Inner f-string: Use {{}}
        xgb_pred_path = os.path.join(OUTPUT_DIR, 'xgb_test_preds.csv')
        pd.DataFrame({{'prediction': y_test_pred_xgb_safe}}).to_csv(xgb_pred_path, index=False) # Dict literal OK
        print(f"Saved XGBoost predictions to {{xgb_pred_path}}") # Inner f-string: Use {{}}

    except Exception as e:
        xgb_error = str(e)
        print(f"[ERROR] During XGBoost training or prediction: {{e}}") # Inner f-string: Use {{}}
else:
    print("Skipping XGBoost training due to previous error.")

# --- Final Summary ---
if lgb_error:
    print(f"LightGBM Error Summary: {{lgb_error}}") # Inner f-string: Use {{}}
if xgb_error:
    print(f"XGBoost Error Summary: {{xgb_error}}") # Inner f-string: Use {{}}

print("\\n--- Data Scientist Code Execution Finished ---")
''' # End of the generated_code f-string literal

    # --- Code Generation Complete ---
    print("--- Code Generation Complete ---")

    # --- Prepare Results (Code Only) ---
    # Since we are not executing, we only return the generated code.
    # No stdout, error capture, or artifact paths from execution.
    scientist_result = {
        "code": generated_code,
        "stdout": None, # No execution, no stdout
        "error": None, # No execution attempt, no execution error
        "execution_success": None, # Execution was not attempted
        "lgb_predictions_path": None,
        "xgb_predictions_path": None,
        "lgb_plot_path": None,
        "xgb_plot_path": None,
    }

    # --- Return state update ---
    return {
        "scientist_output": scientist_result,
        "current_task_description": None, # Clear task description
    }

In [None]:
def _as_comment_lines(text: Any, first_prefix: str = "# ", rest_prefix: str = "# ") -> str:
    """Render *text* as Python comment lines.

    The text is stripped, split on newlines, and every line is prefixed so
    that multi-line content (e.g. a traceback) cannot leak uncommented lines
    into the generated script.

    Args:
        text: Content to comment out; converted with ``str()``.
        first_prefix: Prefix for the first line.
        rest_prefix: Prefix for every following line.

    Returns:
        The commented lines joined with newlines (no trailing newline).
    """
    raw_lines = str(text).strip().split('\n')
    commented = [f"{first_prefix}{raw_lines[0]}"]
    commented.extend(f"{rest_prefix}{line}" for line in raw_lines[1:])
    return "\n".join(commented)


def bot_to_human_interpreter(state: GraphState) -> Dict[str, Any]:
    """
    Compile the Analyst and Scientist code into one copy-pasteable script.

    Reads 'analyst_output' and 'scientist_output' from the state (each is
    treated as an empty dict when missing or not a dict), concatenates their
    'code' entries with explanatory comments, and wraps the result in a
    markdown ```python fence.

    If an output carries an 'error', its code is omitted and the error text is
    emitted as comments instead. Every line of the error is commented, so a
    multi-line error cannot break the generated script.

    Args:
        state: Current graph state; only accessed through ``.get``.

    Returns:
        A state update with:
        - "messages": the existing messages plus one AIMessage holding the script.
        - "final_answer_generated": True.
    """
    print("\n--- BOT TO HUMAN INTERPRETER (Code Compiler) ---")

    # --- Robustly retrieve and validate outputs from the state ---
    analyst_output_raw = state.get('analyst_output')
    scientist_output_raw = state.get('scientist_output')

    # Anything that is not a dict (including None) is replaced by an empty dict
    # so the .get() calls below are always safe.
    analyst_output = analyst_output_raw if isinstance(analyst_output_raw, dict) else {}
    scientist_output = scientist_output_raw if isinstance(scientist_output_raw, dict) else {}

    # Pieces of the combined script, joined once at the end.
    combined_code_parts = []

    # Script header
    combined_code_parts.append("# Combined Data Science Workflow Script\n")
    combined_code_parts.append("# Generated by CALOR-IA's Agent Team\n\n")
    combined_code_parts.append("# This script combines preprocessing and modeling steps.\n")
    combined_code_parts.append("# Ensure you have the necessary libraries installed (e.g., pandas, numpy, sklearn, lightgbm, xgboost, matplotlib).\n")
    combined_code_parts.append("# Make sure your initial 'train' and 'test' DataFrames (or equivalent data loading) exist before running.\n\n")

    # --- Section 1: Data Preprocessing (from Data Analyst) ---
    combined_code_parts.append("# --- 1. Data Preprocessing ---\n")
    combined_code_parts.append("# This section handles steps like identifying column types, imputation, scaling, and encoding.\n")
    combined_code_parts.append("# It prepares the raw data into processed DataFrames ready for modeling.\n\n")

    analyst_code = analyst_output.get('code', '')
    analyst_error = analyst_output.get('error')
    # Names of the variables the Analyst's code is expected to produce; the
    # placeholder defaults are used when the Analyst did not report them.
    processed_train_name = analyst_output.get('processed_train_df_name', "'processed_train_variable_name_unavailable'")
    processed_test_name = analyst_output.get('processed_test_df_name', "'processed_test_variable_name_unavailable'")

    if analyst_error:
        combined_code_parts.append("# !!! Data Analyst Error during preprocessing execution.\n")
        # Comment every line of the error, not just the first one.
        combined_code_parts.append(
            _as_comment_lines(analyst_error, "# !!! Error Details: ", "# !!!   ") + "\n\n"
        )
    elif not analyst_code:
        combined_code_parts.append("# Data preprocessing code was not provided by the Analyst.\n\n")
    else:
        combined_code_parts.append(analyst_code.strip())
        combined_code_parts.append("\n\n")  # Spacing after the code block
        combined_code_parts.append("# Expected outputs from this section (as variables in your environment):\n")
        combined_code_parts.append(f"# - {processed_train_name}: Processed training features (potentially including target)\n")
        combined_code_parts.append(f"# - {processed_test_name}: Processed test features\n\n")

    # --- Section 2: Model Training and Evaluation (from Data Scientist) ---
    combined_code_parts.append("# --- 2. Model Training and Evaluation ---\n")
    combined_code_parts.append("# This section uses the processed data to train machine learning models (LightGBM, XGBoost) and evaluate them.\n")
    combined_code_parts.append(f"# It assumes the variables '{processed_train_name}' and '{processed_test_name}' exist from the previous section.\n\n")

    scientist_code = scientist_output.get('code', '')
    scientist_error = scientist_output.get('error')
    # May be None or empty; the truthiness checks below cover both.
    scientist_stdout = scientist_output.get('stdout', '')

    if scientist_error:
        combined_code_parts.append("# !!! Data Scientist Error during modeling execution or code generation.\n")
        combined_code_parts.append(
            _as_comment_lines(scientist_error, "# !!! Error Details: ", "# !!!   ") + "\n\n"
        )
        # Include stdout even on error; it might contain clues.
        if scientist_stdout:
            combined_code_parts.append("# Captured output (may contain error details):\n")
            combined_code_parts.append(_as_comment_lines(scientist_stdout) + "\n\n")
    elif not scientist_code:
        combined_code_parts.append("# Model training code was not provided by the Scientist.\n\n")
    else:
        combined_code_parts.append(scientist_code.strip())
        combined_code_parts.append("\n\n")  # Spacing after the code block
        combined_code_parts.append("# Expected outputs/artifacts from this section:\n")
        combined_code_parts.append("# - Console output showing training progress and validation scores.\n")
        combined_code_parts.append("# - 'lgb_metric_plot.png', 'xgb_metric_plot.png': Plots showing model performance during training.\n")
        combined_code_parts.append("# - 'lgb_test_preds.csv', 'xgb_test_preds.csv': CSV files with predictions on the test set.\n\n")
        if scientist_stdout:
            combined_code_parts.append("# Captured Output from Model Training Execution:\n")
            combined_code_parts.append(_as_comment_lines(scientist_stdout) + "\n\n")

    # --- Section 3: Summary Notes ---
    combined_code_parts.append("# --- 3. Summary Notes ---\n")
    if analyst_error or scientist_error:
        combined_code_parts.append("# NOTE: Errors occurred during the automated workflow. Please review the script and error messages carefully.\n")
    else:
        combined_code_parts.append("# NOTE: The automated workflow appears to have completed. Review the generated files and console output.\n")

    combined_code_parts.append("\n# --- End of Script ---")
    combined_code_parts.append("\n# Copy and paste the entire block above into your environment/notebook to run the workflow!\n")

    # Combine all parts into a single string representing the full script
    final_python_script = "".join(combined_code_parts)

    # Wrap the entire script in a single markdown code block for easy copy-pasting
    human_message_content = "Here is the complete Python script generated by the workflow, combining preprocessing and modeling steps:\n\n"
    human_message_content += "```python\n"  # Start markdown code block
    human_message_content += final_python_script
    human_message_content += "```\n"        # End markdown code block

    print("Interpreter compiled code and generated final message for human.")

    # State update: existing messages plus the new AI message, and the done flag.
    return {
        "messages": state.get("messages", []) + [AIMessage(content=human_message_content)],
        "final_answer_generated": True,
    }

In [None]:
def route_tasks(state: GraphState) -> Literal["DataAnalyst", "DataScientist", "HumanInterpreter", "Supervisor", "__end__"]:
    """Determine the next node to execute from ``state['next_agent']``.

    Returns the value of 'next_agent' when it names one of the graph's routing
    targets. A missing/None or unrecognised value is reported on stdout and
    routed to "__end__".

    "HumanInterpreter" is accepted because the graph registers that node and
    maps it in the Supervisor's conditional edges.

    Args:
        state: Current graph state; only 'next_agent' is read (via ``.get``).

    Returns:
        The name of the next node, or "__end__".
    """
    # A tuple (not a set) so an unhashable bogus value still fails the
    # membership test cleanly instead of raising TypeError.
    valid_targets = ("DataAnalyst", "DataScientist", "HumanInterpreter", "Supervisor", "__end__")

    next_agent = state.get('next_agent')
    print("\n--- ROUTING ---")
    # Handle None before the membership check so it gets its own message.
    if next_agent is None:
        print("Routing Error: next_agent is None. Ending.")
        return "__end__"
    print(f"Supervisor decided next step is: {next_agent}")
    if next_agent not in valid_targets:
        print(f"Routing Error: Invalid next_agent '{next_agent}'. Ending.")
        return "__end__"  # Default to end if invalid state
    return next_agent

# --- Build the Graph ---
# Hub-and-spoke layout: every worker node hands control back to the
# Supervisor, and only the Supervisor chooses the next node (or the end).

workflow = StateGraph(GraphState)

# Add nodes
workflow.add_node("Supervisor", supervisor_node)
workflow.add_node("DataAnalyst", data_analyst_node)
workflow.add_node("DataScientist", data_scientist_node)
workflow.add_node("HumanInterpreter", bot_to_human_interpreter)  # Name must match the routing map below

# Define entry point
workflow.set_entry_point("Supervisor")

# Unconditional edges: each worker returns to the Supervisor when it finishes.
workflow.add_edge("DataAnalyst", "Supervisor")
workflow.add_edge("DataScientist", "Supervisor")
# The interpreter sets final_answer_generated; presumably the Supervisor
# checks that flag to end the run - verify against supervisor_node.
workflow.add_edge("HumanInterpreter", "Supervisor")

# Conditional edges FROM SUPERVISOR ONLY
workflow.add_conditional_edges(
    "Supervisor",
    # The route is whatever value is stored under state["next_agent"].
    lambda state: state.get("next_agent"),
    # Mapping from that value to the node to run
    {
        "DataAnalyst": "DataAnalyst",
        "DataScientist": "DataScientist",
        "HumanInterpreter": "HumanInterpreter",  # Must match the add_node name
        "Supervisor": "Supervisor",  # Allow looping back if needed (e.g., waiting)
        "__end__": END  # Map "__end__" string to the graph's end state
    }
)

# Compile the graph
app = workflow.compile()

# --- Visualize the Graph (Optional) ---
# Bind img up front so the trailing display expression cannot raise NameError
# when rendering fails; a None cell result simply shows nothing.
img = None
try:
    # NOTE: this rebinds the module-level name `Image` (imported from
    # IPython.display at the top of the file) to PIL's Image module.
    from PIL import Image
    import io
    img_bytes = app.get_graph().draw_mermaid_png()
    img = Image.open(io.BytesIO(img_bytes))
except Exception as e:
    # Visualization is best-effort: report the problem and carry on.
    print(f"\nCould not generate graph visualization: {e}. (Might need `pip install pygraphviz` and graphviz system library)")

img

In [None]:
import json # Make sure json is imported if you haven't already

# Raised by LangGraph when a run exceeds its recursion limit without reaching END.
from langgraph.errors import GraphRecursionError

# Assuming imports, node definitions, graph setup, and compilation are done above

# --- Run the Graph ---

# Maximum number of graph steps before LangGraph aborts the run.
RECURSION_LIMIT = 15

# Initial instruction given to the agent team (sent as the first HumanMessage).
initial_input_message = """Alright team! Let's kick off this project to build a model predicting 'Calories'.

Here's the plan, broken down into stages, leveraging our expert "agents":

1.  🚀  **Data Setup & Target ID:**
    *   We're starting with our training data (`train`) and test data (`test).
    *   Your first step is to identify the 'Calories' column within `train_df`. This is our target variable!

2.  🛠️  **Data Preprocessing (with Data Analyst):**
    *   Now, collaborate closely with the **Data Analyst** persona.
    *   Focus on cleaning and preparing *both* `train_df` and `test_df` for modeling. This involves tasks like handling missing values, encoding, feature engineering, etc.
    *   We need to iterate on these steps until we achieve high-quality, model-ready data.

3.  🧠  **Model Building (with Data Scientist):**
    *   Once the preprocessing is solid, pass the *processed* data over to the **Data Scientist** persona.
    *   Their mission is to train *two* predictive models: a LightGBM model and an XGBoost model, using the high-quality training data.

4.  🧩  **Code Integration (with HumanInterpreter):**
    *   Finally, gather the distinct code sections generated by the Data Analyst (preprocessing) and the Data Scientist (modeling).
    *   Use the **HumanInterpreter** to seamlessly integrate these sections into one complete, executable Python script that represents the full workflow.

Your final output should be this complete, integrated Python code. Let's build something awesome! 💪
"""
initial_state = {
    "messages": [HumanMessage(content=initial_input_message)],
    "final_answer_generated": False,
    "current_task_description": "Preprocess and analyze training and test data.",
}

print("\n--- STARTING WORKFLOW (Using DataFrame Variable Names) ---")

# Message contents already printed. Nodes may return the full message history
# in their update, so this avoids reprinting the same text at every step.
seen_message_contents = set()

try:
    for step, event in enumerate(app.stream(initial_state, {"recursion_limit": RECURSION_LIMIT})):
        print(f"\n--- Workflow Step {step + 1} ---")
        for node_name, update in event.items():
            print(f"Processing update from node: {node_name}")

            if update is None:
                print("Node returned None, no state update.")
                continue  # Nothing to process for this node

            # Print only messages whose content has not been shown yet. The
            # list is built before marking anything as seen, so every unseen
            # message in this update is printed.
            if 'messages' in update:
                new_messages_in_update = [
                    msg for msg in update['messages']
                    if isinstance(msg, (AIMessage, HumanMessage)) and msg.content not in seen_message_contents
                ]
                for msg in new_messages_in_update:
                    if isinstance(msg, AIMessage):
                        print(f"🤖 {node_name} says: {msg.content}")
                    elif isinstance(msg, HumanMessage):
                        print(f"🧑‍💻 User says (via state): {msg.content}") # User messages might reappear if state is passed
                    seen_message_contents.add(msg.content)

            # The HumanInterpreter's final script arrives as a regular
            # AIMessage, so the loop above prints it; no special handling here.

        print("\n" + "="*50 + "\n")
except GraphRecursionError as exc:
    # Hitting the recursion limit raises instead of ending the stream quietly;
    # report it so the cell still finishes with a clear explanation.
    print(f"\n[WARNING] Workflow stopped before reaching the end: recursion limit of {RECURSION_LIMIT} steps was hit ({exc}).")

# Reached when the graph hit __end__ or the recursion limit was reported above.
print("\n--- Workflow Finished ---")


# **Ready to Run!**

**This code is designed to be copied directly into a cell in your Kaggle notebook. Just make sure you have already loaded your initial train.csv and test.csv files into DataFrames named train_df and test_df respectively, before this code block.**

**The script will then:**

**🚀 Preprocess train_df and test_df using the defined pipelines and KNN Imputer, saving the results as processed_train_df and processed_test_df.**
**🧠 Train both LightGBM and XGBoost models on the processed training data.**
**📊 Evaluate models on a validation split and save performance plots.**
**💾 Generate predictions on the processed test data and save them to CSV files (lgb_test_preds.csv, xgb_test_preds.csv) in a model_outputs directory.**
**What's Next?**

**After running this code, you'll have the test predictions saved. The final step for your Kaggle submission would be to load one of the prediction CSVs (e.g., lgb_test_preds.csv), merge it with the original test.csv's 'id' column, and save it in the required submission.csv format. You could even explore averaging the predictions from both models for potentially better results!**

**This workflow shows just how powerful collaborative AI agents can be in jumpstarting your machine learning projects. Go ahead, copy the code, run it, and see the magic happen! ✨ Let us know what you build next!**

**Note:** If you are here, it is because you are viewing a version where I took the output and used it with a simple prompt on Claude 3.5 Sonnet.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, mean_squared_error
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import warnings
import os

# Suppress warnings: with no category or module filter, the 'ignore' action
# applies to every warning raised from here on, so deprecation notices from
# the libraries used below will not be shown.
warnings.filterwarnings('ignore')

def preprocess_data(train, test, target_col='Calories'):
    """Scale numeric columns and one-hot encode categorical columns.

    The transformer is fitted on the training features only and then applied
    to the test frame, so both outputs share the same column layout.

    Args:
        train: Training DataFrame; must contain ``target_col``.
        test: Test DataFrame with the same feature columns as ``train``.
        target_col: Name of the target column in ``train``.

    Returns:
        A tuple ``(X_processed, X_test_processed, y, preprocessor)``: the
        transformed training features, the transformed test features, the
        target Series, and the fitted ColumnTransformer.
    """
    # Identify columns by dtype. The target is filtered out rather than
    # removed with list.remove(), so a non-numeric target does not raise.
    # NOTE(review): any numeric identifier column (e.g. an 'id') is scaled and
    # kept as a feature - confirm this is intended.
    num_cols = [
        col for col in train.select_dtypes(include=[np.number]).columns
        if col != target_col
    ]
    cat_cols = [
        col for col in train.select_dtypes(include=['object', 'category']).columns
        if col != target_col
    ]

    # scikit-learn renamed OneHotEncoder's `sparse` argument to
    # `sparse_output` (1.2) and later removed the old name; try the new
    # spelling first and fall back for older installations.
    try:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

    # Create preprocessor
    preprocessor = ColumnTransformer(transformers=[
        ('num', MinMaxScaler(), num_cols),
        ('cat', encoder, cat_cols)
    ])

    # Prepare data
    X = train.drop(target_col, axis=1)
    y = train[target_col]

    # Fit on train only, then reuse the fitted transformer on test.
    X_processed = preprocessor.fit_transform(X)
    X_test_processed = preprocessor.transform(test)

    return X_processed, X_test_processed, y, preprocessor

def train_lightgbm(X_train, y_train, X_val, y_val):
    """Train a LightGBM regressor with early stopping.

    Args:
        X_train, y_train: Training features and target.
        X_val, y_val: Validation features and target, used for evaluation.

    Returns:
        The trained LightGBM booster returned by ``lgb.train``.
    """
    lgb_params = dict(
        objective='regression',
        metric='rmse',
        boosting_type='gbdt',
        learning_rate=0.05,
        num_leaves=31,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        verbose=-1,
    )

    # The validation set references the training set.
    fit_set = lgb.Dataset(X_train, label=y_train)
    holdout_set = lgb.Dataset(X_val, label=y_val, reference=fit_set)

    # Early stopping after 50 rounds; evaluation results logged every 100 rounds.
    stop_early = lgb.early_stopping(stopping_rounds=50)
    log_progress = lgb.log_evaluation(period=100)

    return lgb.train(
        lgb_params,
        fit_set,
        num_boost_round=1000,
        valid_sets=[fit_set, holdout_set],
        callbacks=[stop_early, log_progress],
    )

def train_xgboost(X_train, y_train, X_val, y_val):
    """Train an XGBoost regressor with early stopping.

    Args:
        X_train, y_train: Training features and target.
        X_val, y_val: Validation features and target, used for evaluation.

    Returns:
        The trained booster returned by ``xgb.train``.
    """
    xgb_params = dict(
        objective='reg:squarederror',
        eval_metric='rmse',
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
    )

    fit_matrix = xgb.DMatrix(X_train, label=y_train)
    holdout_matrix = xgb.DMatrix(X_val, label=y_val)

    # Both sets are evaluated each round; the validation set is listed last.
    watchlist = [(fit_matrix, 'train'), (holdout_matrix, 'val')]

    return xgb.train(
        xgb_params,
        fit_matrix,
        num_boost_round=1000,
        evals=watchlist,
        early_stopping_rounds=50,
        verbose_eval=100,
    )

def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred``.

    Negative predictions are clipped to zero before scoring.
    """
    clipped_pred = np.maximum(0, y_pred)
    msle = mean_squared_log_error(y_true, clipped_pred)
    return np.sqrt(msle)

# Load data
train, test, submission = (
    pd.read_csv('/kaggle/input/playground-series-s5e5/train.csv'),
    pd.read_csv('/kaggle/input/playground-series-s5e5/test.csv'),
    pd.read_csv('/kaggle/input/playground-series-s5e5/sample_submission.csv')
)

# Create output directory
os.makedirs('model_outputs', exist_ok=True)

# Preprocess data (transformer is fitted on train, then applied to test)
X_processed, X_test_processed, y, preprocessor = preprocess_data(train, test)

# Hold out 20% of the training rows for validation; fixed seed for repeatability
X_train, X_val, y_train, y_val = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Train LightGBM
print("Training LightGBM...")
lgb_model = train_lightgbm(X_train, y_train, X_val, y_val)
lgb_val_pred = lgb_model.predict(X_val)
lgb_test_pred = lgb_model.predict(X_test_processed)

# Train XGBoost
print("\nTraining XGBoost...")
xgb_model = train_xgboost(X_train, y_train, X_val, y_val)
xgb_val_pred = xgb_model.predict(xgb.DMatrix(X_val))
xgb_test_pred = xgb_model.predict(xgb.DMatrix(X_test_processed))

# Print validation scores (rmsle clips negative predictions to zero)
print(f"\nLightGBM Validation RMSLE: {rmsle(y_val, lgb_val_pred):.5f}")
print(f"XGBoost Validation RMSLE: {rmsle(y_val, xgb_val_pred):.5f}")

# Create submission: average the two models. Each model's predictions are
# clipped to zero first, matching how rmsle() scores them on validation;
# negative values are not valid under a log-error metric.
lgb_test_pred = np.maximum(0, lgb_test_pred)
xgb_test_pred = np.maximum(0, xgb_test_pred)
submission['Calories'] = (lgb_test_pred + xgb_test_pred) / 2
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created!")