<a href="https://colab.research.google.com/github/dteso/AI-Mini-Trainer/blob/main/AI_mini_trainer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gradio scikit-learn pandas plotly atomicwrites

In [None]:
### ML Mini Trainer - v1.0.0
### DAVID TESO POZO


import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, json, pickle, warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from atomicwrites import atomic_write

# ---------------------------
# IMPORTACIONES DE SKLEARN
# ---------------------------
from sklearn.datasets import (
    load_iris, load_wine, load_breast_cancer, load_digits,
    load_diabetes, fetch_california_housing, load_linnerud,
    make_regression, make_friedman1
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, r2_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA

# ---------------------------
# CONFIGURACIÓN: DATASETS Y MODELOS
# ---------------------------
def _friedman1_dataset():
    """Build the synthetic Friedman #1 regression dataset as a dict.

    The generator is called once and its (X, y) pair is unpacked, instead of
    being regenerated separately for "data" and for "target".
    """
    X, y = make_friedman1(n_samples=200, n_features=10, random_state=42)
    return {
        "data": X,
        "target": y,
        "feature_names": [f"X{i}" for i in range(10)],
    }


def _make_regression_dataset():
    """Build the synthetic linear regression dataset as a dict (one generator call)."""
    X, y = make_regression(n_samples=200, n_features=8, noise=0.1, random_state=42)
    return {
        "data": X,
        "target": y,
        "feature_names": [f"X{i}" for i in range(8)],
    }


# UI label -> zero-argument loader. Each loader returns a mapping exposing at
# least "data" and "target" (and optionally "feature_names"/"target_names").
available_datasets = {
    "Iris (Clasificación)": load_iris,
    "Wine (Clasificación)": load_wine,
    "Breast Cancer (Clasificación)": load_breast_cancer,
    "Digits (Clasificación)": load_digits,
    # NOTE(review): scikit-learn documents Linnerud as a multi-output
    # regression dataset with a 2-D target, not a classification one —
    # confirm that load_dataset can actually handle it.
    "Linnerud (Clasificación - multietiqueta)": load_linnerud,
    "Diabetes (Regresión)": load_diabetes,
    "California Housing (Regresión)": fetch_california_housing,
    "Friedman1 (Regresión sintética)": _friedman1_dataset,
    "Make Regression (Regresión sintética)": _make_regression_dataset,
}

# UI label -> estimator class (instantiated later by the training functions).
classification_models = {
    "Logistic Regression": LogisticRegression,
    "KNN Classifier": KNeighborsClassifier,
    "Decision Tree Classifier": DecisionTreeClassifier,
    "Random Forest Classifier": RandomForestClassifier,
    "SVC": SVC,
    "Naive Bayes": GaussianNB,
}
regression_models = {
    "Linear Regression": LinearRegression,
    "Random Forest Regressor": RandomForestRegressor,
}
# Combined lookup used when a model has to be resolved by name only.
available_models = {**classification_models, **regression_models}

# ---------------------------
# FUNCIONES DE APOYO
# ---------------------------
def load_dataset(name):
    """Load the dataset registered under ``name``.

    Returns:
        A ``(features, target)`` pair: a DataFrame built from the loader's
        "data" entry and a Series named "target" built from its "target"
        entry. Columns come from "feature_names" when the loader provides
        them, otherwise they are named X0, X1, ...
    """
    bunch = available_datasets[name]()
    data = bunch["data"]
    fallback_columns = [f"X{i}" for i in range(data.shape[1])]
    columns = bunch.get("feature_names", fallback_columns)
    features = pd.DataFrame(data, columns=columns)
    target = pd.Series(bunch["target"], name="target")
    return features, target

def is_classification_task(target):
    """Heuristically decide whether ``target`` describes a classification task.

    A target counts as classification when it has fewer than 20 distinct
    values and an integer dtype.
    """
    distinct_values = pd.Series(target).nunique()
    if distinct_values >= 20:
        return False
    return pd.api.types.is_integer_dtype(target)

def show_dataset_with_target(dataset_name):
    """Return the feature table plus a "class" column and a caption.

    When the loader exposes "target_names", each target value is mapped to
    its name; if that mapping is not possible (for example the target is not
    an integer index), the raw target values are shown instead.

    Returns:
        ``(df, caption)`` where ``caption`` reports the number of rows.
    """
    df, _ = load_dataset(dataset_name)
    # The raw loader output is needed again because load_dataset does not
    # return "target_names".
    dataset = available_datasets[dataset_name]()
    labels = dataset["target"]
    if "target_names" in dataset:
        names = dataset["target_names"]
        try:
            labels = pd.Series(dataset["target"]).apply(lambda code: names[code])
        except (IndexError, KeyError, TypeError, ValueError):
            # Best effort only: keep the raw targets when they cannot be
            # used as indices into target_names.
            labels = dataset["target"]
    df["class"] = labels
    return df, f"Vista del Dataset ({len(df)} elementos)"

def update_model_choices(dataset_name):
    """Return a Gradio update listing the models that fit the dataset's task.

    Classification datasets get the classifier names, everything else the
    regressor names; the first name of the chosen list becomes the value.
    """
    _, target = load_dataset(dataset_name)
    if is_classification_task(target):
        catalog = classification_models
    else:
        catalog = regression_models
    names = list(catalog.keys())
    return gr.update(choices=names, value=names[0])

def train_multiple_models(dataset_name, model_names, test_size):
    """Train every model in ``model_names`` with default settings and score it.

    The dataset is split once (``random_state=42``) and the same split is
    reused for all models.

    Returns:
        A list of dicts with keys "Modelo" (name), "Precisión" (accuracy for
        classification, R2 for regression) and "Reporte" (the classification
        report as a dict, or ``{"R2 Score", "MSE"}`` for regression).
    """
    features, target = load_dataset(dataset_name)
    split = train_test_split(features, target,
                             test_size=test_size,
                             random_state=42)
    X_train, X_test, y_train, y_test = split
    classifying = is_classification_task(target)

    results = []
    for name in model_names:
        estimator = available_models[name]()
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        if classifying:
            score = accuracy_score(y_test, predictions)
            report = classification_report(y_test, predictions, output_dict=True)
        else:
            score = r2_score(y_test, predictions)
            error = mean_squared_error(y_test, predictions)
            report = {"R2 Score": score, "MSE": error}
        results.append({"Modelo": name, "Precisión": score, "Reporte": report})
    return results

def plot_accuracy_comparison(results):
    """Draw a bar chart comparing the score of each trained model.

    Args:
        results: list of dicts with at least "Modelo" and "Precisión"; the
            optional "Reporte" dict is used to tell regression runs apart.

    Returns:
        The matplotlib Figure containing the chart.
    """
    fig, ax = plt.subplots()
    metrics = [r["Precisión"] for r in results]
    sns.barplot(x=[r["Modelo"] for r in results], y=metrics, ax=ax)

    in_unit_range = all(0 <= m <= 1 for m in metrics)
    # Regression results carry an "R2 Score" entry in their report. Relying
    # only on the value range would mislabel an R2 that happens to fall
    # inside [0, 1] as "Precisión".
    is_regression = bool(results) and all(
        "R2 Score" in (r.get("Reporte") or {}) for r in results
    )

    if in_unit_range:
        ax.set_ylim(0, 1)
    if is_regression or not in_unit_range:
        ax.set_ylabel("R2 Score")
    else:
        ax.set_ylabel("Precisión")
    ax.set_title("Comparación de Modelos")
    ax.tick_params(axis="x", rotation=45)
    plt.tight_layout()
    return fig

def export_reports_as_csv(results):
    """Serialize the model name and score of each result as CSV text.

    Only the "Modelo" and "Precisión" entries are exported; the detailed
    report is left out.
    """
    summary_rows = []
    for entry in results:
        summary_rows.append({"Modelo": entry["Modelo"],
                             "Precisión": entry["Precisión"]})
    summary = pd.DataFrame(summary_rows)
    return summary.to_csv(index=False)

def full_training(dataset_name, selected_models, test_size):
    """Train the selected models and build the table, chart and CSV report.

    Args:
        dataset_name: key of the dataset to train on.
        selected_models: model names to train; a single name is accepted too.
        test_size: fraction of the data held out for evaluation.

    Returns:
        ``(results_table, figure, csv_path)``. With no model selected the
        table is empty and the figure and path are ``None``.
    """
    if isinstance(selected_models, str):
        selected_models = [selected_models]
    if not selected_models:
        # Nothing to train: sorting an empty, column-less frame by
        # "Precisión" would raise KeyError, so return an empty result.
        return pd.DataFrame(columns=["Modelo", "Precisión"]), None, None

    results = train_multiple_models(dataset_name, selected_models, test_size)
    rows = [{"Modelo": r["Modelo"], "Precisión": round(r["Precisión"], 4)}
            for r in results]
    df_res = pd.DataFrame(rows).sort_values(by="Precisión", ascending=False)
    fig = plot_accuracy_comparison(results)
    csv_str = export_reports_as_csv(results)
    csv_path = "report.csv"
    # Explicit UTF-8 because the header contains "Precisión"; newline=""
    # writes the CSV text as produced, without platform newline translation.
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        f.write(csv_str)
    return df_res, fig, csv_path

def run_eda(dataset_name):
    """Return descriptive statistics of the dataset's features.

    The statistic names are moved from the index into a regular column so
    they appear in the displayed table.
    """
    features, _target = load_dataset(dataset_name)
    summary = features.describe()
    return summary.reset_index()

def plot_eda(dataset_name):
    """Draw one boxplot per feature of the dataset and return the Figure."""
    features, _target = load_dataset(dataset_name)
    # Long format: one row per (feature, value) pair, as boxplot expects.
    long_form = features.melt(var_name="feature", value_name="valor")

    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(
        data=long_form,
        x="feature",
        y="valor",
        hue="feature",
        palette="Set2",
        dodge=False,
        ax=ax,
    )
    # hue is only used for colouring, so drop the legend if one was created.
    legend = ax.get_legend()
    if legend:
        legend.remove()

    ax.set_title("Distribución por Feature (Boxplot)", fontsize=14, weight="bold")
    ax.set_xlabel("Feature")
    ax.set_ylabel("Valor")
    ax.tick_params(axis="x", rotation=30)
    plt.tight_layout()
    return fig

def run_pca(dataset_name):
    """Project the dataset onto two principal components and plot them.

    Points are coloured by class name when the loader exposes
    "target_names" and the target can be used to index them; otherwise the
    raw target values are used.

    Returns:
        The matplotlib Figure with the scatter plot.
    """
    df, _ = load_dataset(dataset_name)
    pca = PCA(n_components=2)
    comps = pca.fit_transform(df)
    df_pca = pd.DataFrame(comps, columns=["PC1", "PC2"])

    # The raw loader output is needed again because load_dataset does not
    # return "target_names".
    dataset = available_datasets[dataset_name]()
    labels = dataset["target"]
    if "target_names" in dataset:
        names = dataset["target_names"]
        try:
            labels = pd.Series(dataset["target"]).apply(lambda code: names[code])
        except (IndexError, KeyError, TypeError, ValueError):
            # Best effort only: keep the raw targets when they cannot be
            # used as indices into target_names.
            labels = dataset["target"]
    df_pca["Clase"] = labels

    sns.set(style="whitegrid")
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=df_pca, x="PC1", y="PC2",
                    hue="Clase", palette="Set2", s=60, ax=ax)
    ax.set_title("PCA - 2 Componentes", fontsize=14, weight="bold")
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.legend(title="Clase")
    plt.tight_layout()
    return fig

# ---------------------------
# HIPERPARÁMETROS DE MODELOS
# ---------------------------
def _slider_spec(low, high, default, step, label):
    """Return the slider description dict used by model_params_demo."""
    return {"type": "slider", "min": low, "max": high,
            "value": default, "step": step, "label": label}


# Model name -> hyperparameter name -> slider description (range, default
# value, step and display label).
model_params_demo = {
    "Logistic Regression": {
        "C": _slider_spec(0.01, 10.0, 1.0, 0.01, "C (Regularización)"),
        "max_iter": _slider_spec(100, 2000, 1000, 100, "Iteraciones Máx."),
    },
    "KNN Classifier": {
        "n_neighbors": _slider_spec(1, 30, 5, 1, "n_neighbors"),
    },
    "Decision Tree Classifier": {
        "max_depth": _slider_spec(1, 20, 10, 1, "max_depth"),
        "min_samples_split": _slider_spec(2, 20, 2, 1, "min_samples_split"),
    },
    "Random Forest Classifier": {
        "n_estimators": _slider_spec(10, 200, 100, 10, "n_estimators"),
        "max_depth": _slider_spec(1, 20, 10, 1, "max_depth"),
    },
    "SVC": {
        "C": _slider_spec(0.1, 10.0, 1.0, 0.1, "C"),
        "gamma": _slider_spec(0.001, 1.0, 0.01, 0.001, "gamma"),
    },
    "Random Forest Regressor": {
        "n_estimators": _slider_spec(10, 200, 100, 10, "n_estimators"),
        "max_depth": _slider_spec(1, 20, 5, 1, "max_depth"),
    },
}

def fix_slider_params(params):
    """Return a copy of ``params`` using Gradio's slider keyword names.

    "min" becomes "minimum" and "max" becomes "maximum"; every other key is
    copied unchanged and the input dict is not modified.
    """
    renamed = params.copy()
    for short_key, gradio_key in (("min", "minimum"), ("max", "maximum")):
        if short_key in renamed:
            renamed[gradio_key] = renamed.pop(short_key)
    return renamed

# ---------------------------
# GUARDAR/REGISTRAR MODELOS
# ---------------------------
def save_model_locally(model, model_save_name, headers, target_names=None):
    """Pickle a trained model under ``models/`` and record it in the registry.

    Args:
        model: the fitted estimator to store.
        model_save_name: file name chosen by the user; ".pkl" is appended
            when missing and any directory components are dropped.
        headers: feature column names the model was trained on.
        target_names: optional class names (list, tuple or numpy array).

    Returns:
        ``(model_path, registry)`` — the pickle path and the updated
        registry dict that was written to ``model_registry.json``.
    """
    model_dir = "models"
    os.makedirs(model_dir, exist_ok=True)

    # The name comes straight from a text box: keep only its last path
    # component so something like "../x" cannot write outside model_dir.
    model_save_name = os.path.basename(model_save_name.replace("\\", "/"))
    if not model_save_name.endswith(".pkl"):
        model_save_name += ".pkl"
    model_path = os.path.join(model_dir, model_save_name)

    with open(model_path, "wb") as f:
        pickle.dump({"model": model, "headers": headers, "target_names": target_names}, f)

    # JSON cannot store numpy arrays, so convert target names to a plain
    # sequence (or drop them when they are of an unexpected type).
    if isinstance(target_names, np.ndarray):
        tns = target_names.tolist()
    elif isinstance(target_names, (list, tuple)):
        tns = target_names
    else:
        tns = None

    rp = "model_registry.json"
    if os.path.exists(rp):
        # Context manager so the handle is closed before the file is replaced.
        with open(rp, encoding="utf-8") as f:
            registry = json.load(f)
    else:
        registry = {}
    registry[model_save_name] = {"model_path": model_path, "headers": headers, "target_names": tns}
    with atomic_write(rp, overwrite=True, encoding="utf-8") as f:
        json.dump(registry, f, indent=4)
    return model_path, registry

# ---------------------------
# ENTRENAMIENTO INDIVIDUAL
# ---------------------------
def train_one_model_with_save(dataset_name, model_name, test_size,
                              val1, val2, val3, val4):
    """Train one model with the given hyperparameters and evaluate it.

    Args:
        dataset_name: key of the dataset to train on.
        model_name: key of the model in ``available_models``.
        test_size: fraction of the data held out for evaluation.
        val1, val2: hyperparameter values for the classifiers, in the order
            listed for each model in ``model_params_demo``.
        val3, val4: ``n_estimators`` and ``max_depth`` for
            "Random Forest Regressor". When either is ``None`` the matching
            ``val1``/``val2`` is used instead.

    Returns:
        ``(message, figure, table, state, hyper)``: the formatted metric, a
        confusion matrix (classification) or actual-vs-predicted plot
        (regression), a table of true and predicted values, a dict with the
        fitted model, headers and target names, and the hyperparameters used.
        For an unknown model the message is "Modelo no válido" and the other
        items are empty.

    NOTE(review): the UI only shows sliders for Logistic Regression and
    Random Forest Regressor, so for the other classifiers val1/val2 carry
    the hidden Logistic Regression slider values — confirm this is intended.
    """
    ModelClass = available_models.get(model_name)
    if not ModelClass:
        return "Modelo no válido", None, pd.DataFrame(), None, None

    df, target = load_dataset(dataset_name)
    headers = df.columns.tolist()
    dataset = available_datasets[dataset_name]()
    tns = dataset.get("target_names", None)
    X_train, X_test, y_train, y_test = train_test_split(df, target,
                                                        test_size=test_size,
                                                        random_state=42)

    def as_int(value):
        # Sliders may deliver floats; estimators expect integer counts.
        return int(round(value))

    # Hyperparameters. No lookup in model_params_demo here: it has no entry
    # for "Naive Bayes", so indexing it would raise KeyError for that model.
    if model_name == "Logistic Regression":
        hyper = {"C": val1, "max_iter": as_int(val2)}
    elif model_name == "KNN Classifier":
        hyper = {"n_neighbors": as_int(val1)}
    elif model_name == "Decision Tree Classifier":
        hyper = {"max_depth": as_int(val1),
                 "min_samples_split": as_int(val2)}
    elif model_name == "Random Forest Classifier":
        hyper = {"n_estimators": as_int(val1),
                 "max_depth": as_int(val2)}
    elif model_name == "SVC":
        hyper = {"C": val1, "gamma": val2}
    elif model_name == "Random Forest Regressor":
        # The UI passes this model's own sliders as the 3rd and 4th values;
        # val1/val2 hold the Logistic Regression sliders there.
        n_estimators = val1 if val3 is None else val3
        max_depth = val2 if val4 is None else val4
        hyper = {"n_estimators": as_int(n_estimators),
                 "max_depth": as_int(max_depth)}
    else:
        hyper = {}

    model = ModelClass(**hyper)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Metric and figure
    if model_name in classification_models:
        metric = accuracy_score(y_test, y_pred)
        mname = "Accuracy"
        from sklearn.metrics import confusion_matrix
        cm = confusion_matrix(y_test, y_pred)
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_title("Confusion Matrix")
        table_df = pd.DataFrame({"y_true": y_test, "y_pred": y_pred})
    else:
        metric = r2_score(y_test, y_pred)
        mname = "R2 Score"
        fig, ax = plt.subplots()
        ax.scatter(y_test, y_pred)
        # Dashed diagonal marks where predicted equals actual.
        ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()],
                "k--", lw=2)
        ax.set_xlabel("Actual")
        ax.set_ylabel("Predicted")
        ax.set_title("Actual vs Predicted")
        table_df = pd.DataFrame({"Actual": y_test, "Predicted": y_pred})

    state = {"model": model, "headers": headers, "target_names": tns}
    return f"{mname}: {metric:.4f}", fig, table_df, state, hyper

# ---------------------------
# UTILIDADES PARA PREDICCIÓN
# ---------------------------
# Number of feature text boxes created for single predictions; saved models
# with more headers than this get no input box for the extra features.
MAX_FEATURES = 20

def load_model_registry():
    """Return the names of the saved models listed in the registry file.

    Returns:
        The keys of ``model_registry.json`` as a list, or an empty list when
        the file is missing, unreadable, not valid JSON or not a JSON object.
    """
    rp = "model_registry.json"
    try:
        # Context manager so the file handle is always closed.
        with open(rp, encoding="utf-8") as f:
            return list(json.load(f).keys())
    except (OSError, ValueError, AttributeError):
        # OSError: missing/unreadable file; ValueError: invalid JSON or
        # encoding; AttributeError: top-level JSON value is not an object.
        return []

def refresh_model_list():
    """Return a Gradio update whose choices are the currently saved models."""
    saved_models = load_model_registry()
    return gr.update(choices=saved_models)

def update_textboxes_from_saved_model(model_name):
    """Build the updates that relabel the feature text boxes for a model.

    Returns:
        A list of ``MAX_FEATURES`` Gradio updates: one visible, emptied and
        relabelled box per stored header, and hidden boxes for the rest.
        All boxes are hidden when there is no registry file or the model is
        not listed in it.
    """
    rp = "model_registry.json"
    if not os.path.exists(rp):
        return [gr.update(visible=False) for _ in range(MAX_FEATURES)]
    # Context manager so the registry handle is closed after reading.
    with open(rp, encoding="utf-8") as f:
        reg = json.load(f)
    headers = reg.get(model_name, {}).get("headers", [])
    return [
        gr.update(label=headers[i], visible=True, value="")
        if i < len(headers)
        else gr.update(visible=False)
        for i in range(MAX_FEATURES)
    ]

def predict_model_combined(model_name, excel_file, *features):
    """Predict with a saved model, from an Excel file or from typed values.

    Args:
        model_name: registry key of the saved model.
        excel_file: uploaded file (an object with ``.name`` or a path), or
            ``None`` to predict a single row from ``features``.
        *features: one value per model header, in header order. Missing,
            ``None`` or blank values count as 0.0.

    Returns:
        ``(df, message)``. Bulk mode returns the input table with a
        "Predicción" column and an empty message; single mode returns
        ``None`` and the prediction text. On failure ``df`` is ``None`` and
        the message explains why.
    """
    rp = "model_registry.json"
    if not os.path.exists(rp):
        return None, "No hay modelos guardados."
    with open(rp, encoding="utf-8") as f:
        reg = json.load(f)
    if model_name not in reg:
        return None, "Modelo no encontrado."

    # NOTE(security): unpickling runs arbitrary code from the file. This is
    # only safe while model files are written exclusively by this app.
    with open(reg[model_name]["model_path"], "rb") as f:
        data = pickle.load(f)
    model, headers, tns = data["model"], data["headers"], data.get("target_names", None)

    def mp(p):
        # Map a predicted class index to its name when names are available
        # and the prediction can be used as an index.
        if tns is None:
            return p
        try:
            return tns[int(p)]
        except (TypeError, ValueError, IndexError, KeyError, OverflowError):
            return p

    # Bulk
    if excel_file is not None:
        # Accept both a file-like object exposing .name and a plain path.
        excel_path = getattr(excel_file, "name", excel_file)
        df = pd.read_excel(excel_path)
        if not set(headers).issubset(df.columns):
            return None, f"Faltan columnas: {headers}"
        preds = model.predict(df[headers])
        df["Predicción"] = [mp(p) for p in preds]
        return df, ""

    # Single row: pad with blanks when fewer values than headers arrive.
    n_headers = len(headers)
    raw_values = list(features[:n_headers]) + [""] * max(0, n_headers - len(features))
    vals = []
    for raw in raw_values:
        text = "" if raw is None else str(raw).strip()
        if not text:
            vals.append(0.0)
            continue
        try:
            vals.append(float(text))
        except ValueError:
            return None, "Todos los features deben ser numéricos."
    row = pd.DataFrame([vals], columns=headers)
    p = model.predict(row)[0]
    return None, f"Predicción: {mp(p)}"

def predict_individual(model_name, *features):
    """Run a single-row prediction and wrap the resulting message in <h2>."""
    outcome = predict_model_combined(model_name, None, *features)
    message = outcome[1]
    return "<h2>{}</h2>".format(message)

def predict_bulk(model_name, excel_file):
    """Run a bulk prediction from an uploaded Excel file.

    Returns:
        The uploaded table with the prediction column added, or ``None``
        when no file was uploaded.

    Raises:
        gr.Error: when the prediction could not be produced (no saved
            models, unknown model, missing columns), carrying the message
            so the UI shows it instead of an empty table.
    """
    if excel_file is None:
        # Without a file there is nothing to predict in bulk.
        return None
    df, msg = predict_model_combined(model_name, excel_file)
    if df is None and msg:
        raise gr.Error(msg)
    return df

def generate_excel_template(model_name):
    """Write an empty Excel template with the model's feature columns.

    Returns:
        The path of the generated ``.xlsx`` file, or ``None`` when there is
        no registry file or the model is not listed in it.
    """
    rp = "model_registry.json"
    if not os.path.exists(rp):
        return None
    # Context manager so the registry handle is closed after reading.
    with open(rp, encoding="utf-8") as f:
        reg = json.load(f)
    if model_name not in reg:
        return None
    headers = reg[model_name].get("headers", [])
    df = pd.DataFrame(columns=headers)
    # File name derived from the model name, with every non-alphanumeric
    # character replaced by "_".
    safe = "".join(c if c.isalnum() else "_" for c in model_name)
    fname = f"{safe}_plantilla.xlsx"
    df.to_excel(fname, index=False)
    return fname

# Saved-model names read once at start-up (original note: the model dropdown
# starts empty).
# NOTE(review): not referenced elsewhere in the visible file; the dropdown in
# the prediction tab calls load_model_registry() itself — confirm it is needed.
initial_models = load_model_registry()

# ---------------------------
# INTERFAZ GRADIO PRINCIPAL
# ---------------------------
with gr.Blocks() as demo:

    # Tab 1: dataset + ML models
    with gr.Tab("Dataset + Modelos ML"):
        gr.Markdown("## Visualización de Dataset y Comparación de Modelos (múltiples)")
        with gr.Row():
            ds_sel = gr.Dropdown(list(available_datasets.keys()), value="Iris (Clasificación)", label="Dataset")
            models_cb = gr.CheckboxGroup(list(classification_models.keys()), value=list(classification_models.keys()), label="Modelos")
            ts = gr.Slider(0.1,0.5,0.3,0.05, label="Test size")
        btn_train = gr.Button("Entrenar y Comparar")
        tbl_res = gr.Dataframe(headers=["Modelo","Precisión"], label="Resultados")
        plot_res = gr.Plot(label="Gráfica")
        file_res = gr.File(label="Reporte CSV")
        btn_train.click(full_training, [ds_sel, models_cb, ts], [tbl_res, plot_res, file_res])

        # The Iris preview is computed while the UI is being built so the
        # table is populated before any dataset change event fires.
        df0, lbl0 = show_dataset_with_target("Iris (Clasificación)")
        dt = gr.Dataframe(value=df0); md = gr.Markdown(lbl0)
        # Changing the dataset refreshes the preview and the model choices.
        ds_sel.change(show_dataset_with_target, ds_sel, [dt, md])
        ds_sel.change(update_model_choices, ds_sel, models_cb)

        with gr.Tabs():
            with gr.Tab("EDA"):
                eda_tbl = gr.Dataframe(value=run_eda("Iris (Clasificación)"))
                eda_plt = gr.Plot(value=plot_eda("Iris (Clasificación)"))
            with gr.Tab("PCA"):
                pca_plt = gr.Plot(value=run_pca("Iris (Clasificación)"))
        # The same dataset dropdown also drives the EDA table/plot and the PCA plot.
        ds_sel.change(run_eda, ds_sel, eda_tbl)
        ds_sel.change(plot_eda, ds_sel, eda_plt)
        ds_sel.change(run_pca, ds_sel, pca_plt)

    # Tab 2: single-model training
    with gr.Tab("Entrenamiento"):
        gr.Markdown("## Entrenamiento de un solo modelo")
        ds_tr = gr.Dropdown(list(available_datasets.keys()), value="Iris (Clasificación)", label="Dataset")
        mdl_tr = gr.Dropdown(list(classification_models.keys()), value=list(classification_models.keys())[0], label="Modelo")
        ds_tr.change(update_model_choices, ds_tr, mdl_tr)
        ts_tr = gr.Slider(0.1,0.5,0.3,0.05, label="Test size")

        # Logistic Regression sliders
        lr_Cp = fix_slider_params(model_params_demo["Logistic Regression"]["C"])
        lr_Itp = fix_slider_params(model_params_demo["Logistic Regression"]["max_iter"])
        c_sl = gr.Slider(minimum=lr_Cp["minimum"], maximum=lr_Cp["maximum"],
                         step=lr_Cp["step"], value=lr_Cp["value"], label=lr_Cp["label"])
        it_sl = gr.Slider(minimum=lr_Itp["minimum"], maximum=lr_Itp["maximum"],
                          step=lr_Itp["step"], value=lr_Itp["value"], label=lr_Itp["label"])
        # Random Forest Regressor sliders (hidden until that model is selected)
        rf_Nep = fix_slider_params(model_params_demo["Random Forest Regressor"]["n_estimators"])
        rf_Mdp = fix_slider_params(model_params_demo["Random Forest Regressor"]["max_depth"])
        ne_sl = gr.Slider(minimum=rf_Nep["minimum"], maximum=rf_Nep["maximum"],
                          step=rf_Nep["step"], value=rf_Nep["value"], label=rf_Nep["label"], visible=False)
        md_sl = gr.Slider(minimum=rf_Mdp["minimum"], maximum=rf_Mdp["maximum"],
                          step=rf_Mdp["step"], value=rf_Mdp["value"], label=rf_Mdp["label"], visible=False)

        # Returns visibility updates for (c_sl, it_sl, ne_sl, md_sl): the
        # first pair is shown for Logistic Regression, the second pair for
        # Random Forest Regressor, and all four are hidden otherwise.
        # NOTE(review): models without sliders of their own (KNN, Decision
        # Tree, Random Forest Classifier, SVC) still receive the hidden
        # Logistic Regression slider values as val1/val2 on click — confirm
        # this is intended.
        def show_params(name):
            if name=="Logistic Regression":
                return gr.update(visible=True), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
            elif name=="Random Forest Regressor":
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
            else:
                return (gr.update(visible=False),)*4

        mdl_tr.change(show_params, mdl_tr, [c_sl, it_sl, ne_sl, md_sl])

        btn_tr1 = gr.Button("Entrenar")
        md_out = gr.Markdown()
        plt_out = gr.Plot()
        tbl_out = gr.Dataframe()
        # Holds the dict with the last trained model so it can be saved later.
        st = gr.State()
        # The four sliders are passed positionally as val1..val4.
        # NOTE(review): train_one_model_with_save returns five values
        # (message, figure, table, state, hyper) but only four outputs are
        # wired here, so the hyperparameter dict is not displayed.
        btn_tr1.click(train_one_model_with_save,
                      inputs=[ds_tr, mdl_tr, ts_tr, c_sl, it_sl, ne_sl, md_sl],
                      outputs=[md_out, plt_out, tbl_out, st])

        save_name = gr.Textbox(label="Nombre modelo")
        btn_save = gr.Button("Guardar")
        # Saves the model held in the state under the typed name and returns
        # a status message; does nothing when no model has been trained yet.
        def do_save(state, name):
            if not state: return "Nada que guardar"
            path, _ = save_model_locally(state["model"], name, state["headers"], state["target_names"])
            return f"Guardado en {path}"
        btn_save.click(do_save, [st, save_name], md_out)

    # Tab 3: prediction (single / bulk)
    with gr.Tab("Predicción"):
        gr.Markdown("## Predicción\n- **Individual** vs **Masiva**")

        # Now without a placeholder
        model_dd = gr.Dropdown(
            choices=load_model_registry(),
            value=None,
            label="Modelos guardados"
        )

        # Re-reads the registry so models saved after start-up become selectable.
        btn_ref = gr.Button("Refrescar modelos")
        btn_ref.click(refresh_model_list, [], model_dd)

        with gr.Tabs():
            # Single prediction
            with gr.Tab("Individual"):
                # Fixed pool of hidden text boxes; selecting a model relabels
                # and shows one box per stored feature header.
                feature_inputs = [
                    gr.Textbox(label=f"Feature {i}", visible=False)
                    for i in range(MAX_FEATURES)
                ]
                model_dd.change(update_textboxes_from_saved_model,
                                inputs=[model_dd],
                                outputs=feature_inputs)
                btn_ind = gr.Button("Predecir")
                out_ind = gr.HTML()
                btn_ind.click(predict_individual,
                              inputs=[model_dd] + feature_inputs,
                              outputs=[out_ind])

            # Bulk prediction
            with gr.Tab("Masiva"):
                tpl_btn = gr.Button("Generar plantilla Excel")
                tpl_file = gr.File(label="Descargar plantilla Excel")
                tpl_btn.click(generate_excel_template,
                              inputs=[model_dd],
                              outputs=[tpl_file])
                uploader = gr.File(label="Cargar Excel",
                                   file_types=[".xlsx", ".xls"])
                btn_bulk = gr.Button("Predecir Masivo")
                out_bulk = gr.Dataframe(label="Resultados Masivos")
                btn_bulk.click(predict_bulk,
                               inputs=[model_dd, uploader],
                               outputs=[out_bulk])

# NOTE(review): share=True presumably requests a public Gradio link, which
# exposes model saving and prediction to anyone with the URL — confirm.
demo.launch(share=True, debug=True)
