In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.model_selection import TimeSeriesSplit
from datetime import timedelta
from itertools import product
from google.colab import files

In [None]:
# Input CSV that is passed to generar_probabilidades_compra further down.
# NOTE(review): the path lives under /content/drive, but no visible cell calls
# drive.mount(...) — confirm Drive is mounted before running on a fresh kernel.
csv_path = "/content/drive/MyDrive/Proyecto Final/btc_enriched_with_target.csv"

In [None]:
import pandas as pd
import numpy as np
import os
import joblib
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import average_precision_score
from sklearn.model_selection import TimeSeriesSplit
from datetime import timedelta
from itertools import product

# Columns that are never fed to the model as features.
_NON_FEATURE_COLS = ['Target', 'Date', 'proba_compra']


def _features(frame: pd.DataFrame) -> pd.DataFrame:
    """Return `frame` without the label, date and prediction columns."""
    return frame.drop(columns=_NON_FEATURE_COLS, errors='ignore')


def _select_best_params(X_train, y_train, X_val, y_val) -> dict:
    """Grid-search XGBoost hyper-parameters on one train/validation split.

    Fits one XGBClassifier per combination of the grid below (2**5 = 32
    candidates) and keeps the combination with the highest average precision
    on the validation split.

    Raises:
        ValueError: if no candidate produced a comparable (non-NaN) score.
    """
    param_grid = {
        "n_estimators": [100, 200],
        "max_depth": [3, 6],
        "learning_rate": [0.01, 0.1],
        "subsample": [0.8, 1],
        "colsample_bytree": [0.8, 1]
    }
    best_score, best_params = -1, None
    keys = list(param_grid)
    for combo in product(*param_grid.values()):
        params = dict(zip(keys, combo))
        model = XGBClassifier(
            objective="binary:logistic",
            eval_metric="logloss",
            random_state=42,
            **params
        )
        model.fit(X_train, y_train)
        proba = model.predict_proba(X_val)[:, 1]
        score = average_precision_score(y_val, proba)
        # A NaN score never compares greater, so it can never be selected.
        if score > best_score:
            best_score, best_params = score, params

    if best_params is None:
        # Fail here with a clear message instead of a TypeError at `**best_params`.
        raise ValueError(
            "Hyper-parameter search produced no valid average-precision score; "
            "check that the validation split contains usable labels."
        )
    return best_params


def generar_probabilidades_compra(
    csv_path,
    guardar_csv=False,
    train_end="2020-12-31",
    val_end="2021-06-30",
    rolling_start="2021-01-01",
) -> pd.DataFrame:
    """Produce monthly walk-forward buy probabilities from an enriched CSV.

    Steps:
      1. Load the CSV, drop rows without `Target`, sort by `Date`.
      2. Pick XGBoost hyper-parameters once, training on rows up to
         `train_end` and scoring on rows in (`train_end`, `val_end`].
      3. For every calendar month from `rolling_start` to the last date:
         train on all rows strictly before the first day of the month, skip
         that first day (the "embargo" day), and predict the rest of the
         month with a sigmoid-calibrated XGBoost model.
      4. Store the predictions in the `proba_compra` column.

    Months whose test rows already all have a `proba_compra` value (for
    example when the CSV comes from a previous run) are skipped.

    NOTE(review): with the default dates, the hyper-parameter validation
    window (2021-01 .. 2021-06) overlaps the first walk-forward test months,
    so those months are scored with parameters tuned on their own labels.
    Pass `rolling_start` after `val_end` (or move `val_end` earlier) to avoid
    that look-ahead. Defaults are kept for backward compatibility.

    Args:
        csv_path: Path to a CSV with at least `Date` and `Target` columns;
            every other column (except `proba_compra`) is used as a feature.
        guardar_csv: When True, write the full frame to
            "predicciones_actualizadas.csv" and, if running inside Google
            Colab, trigger a browser download of it.
        train_end: Last date (inclusive) of the hyper-parameter training split.
        val_end: Last date (inclusive) of the hyper-parameter validation split.
        rolling_start: Date of the first month to predict.

    Returns:
        DataFrame with columns `Date` and `proba_compra`, sorted by date.
        Dates that were never predicted (history before `rolling_start`,
        embargo days, skipped months) hold NaN.

    Raises:
        ValueError: if the hyper-parameter training or validation split is
            empty, or the search yields no valid score.

    Side effects:
        Creates the "modelos_por_mes" directory and writes one joblib pickle
        per processed month into it.
    """
    df = (
        pd.read_csv(csv_path, parse_dates=['Date'])
        .dropna(subset=['Target'])
        .sort_values('Date')
        .reset_index(drop=True)
    )

    # Keep predictions from a previous run if the CSV already carries them.
    if 'proba_compra' not in df.columns:
        df['proba_compra'] = np.nan

    # --- One-off hyper-parameter selection on a fixed temporal split ---------
    train_end = pd.to_datetime(train_end)
    val_end = pd.to_datetime(val_end)
    df_fit = df[df['Date'] <= train_end]
    df_val = df[(df['Date'] > train_end) & (df['Date'] <= val_end)]
    if df_fit.empty or df_val.empty:
        raise ValueError(
            "Hyper-parameter split is empty: need rows on or before "
            f"{train_end.date()} and rows between then and {val_end.date()}."
        )
    best_params = _select_best_params(
        _features(df_fit), df_fit['Target'],
        _features(df_val), df_val['Target'],
    )

    # Created only after the inputs are known to be usable.
    os.makedirs("modelos_por_mes", exist_ok=True)

    # --- Monthly walk-forward with a one-day embargo -------------------------
    end_date = df['Date'].max()
    current_date = pd.to_datetime(rolling_start)
    # Embargo days absent from the data; appended once after the loop so the
    # working frame never receives label-less rows while models are trained.
    missing_embargo_days = []

    while current_date <= end_date:
        month_start = current_date.replace(day=1)
        # Advance right away so every `continue` below moves to the next month.
        current_date += pd.offsets.MonthBegin(1)

        embargo_day = month_start
        test_start = embargo_day + timedelta(days=1)
        test_end = embargo_day + pd.offsets.MonthEnd(0)
        month_label = month_start.strftime('%Y-%m')

        mask_train = df['Date'] <= embargo_day - timedelta(days=1)
        mask_test = (df['Date'] >= test_start) & (df['Date'] <= test_end)
        df_train = df[mask_train]
        df_test = df[mask_test]

        # Check emptiness first: `.all()` on an empty selection is True and
        # would otherwise report a month without data as already processed.
        if df_test.empty:
            continue

        if df_test['proba_compra'].notna().all():
            print(f"Mes {month_label} ya procesado. Saltando.")
            continue

        if df_train.empty:
            continue

        # Train only on rows where every feature and the label are present.
        X_y = pd.concat([_features(df_train), df_train['Target']], axis=1).dropna()
        X_train = X_y.drop(columns=['Target'])
        y_train = X_y['Target'].astype(int)

        if y_train.empty or not (0 in y_train.values and 1 in y_train.values):
            print(f" Mes {month_label} — problema con y_train. Saltando...")
            continue

        X_test = _features(df_test)

        # Both classes are guaranteed present by the check above, so the
        # division is safe.
        num_pos = int((y_train == 1).sum())
        num_neg = int((y_train == 0).sum())
        scale_pos_weight = num_neg / num_pos

        xgb = XGBClassifier(
            objective='binary:logistic',
            eval_metric='logloss',
            random_state=42,
            scale_pos_weight=scale_pos_weight,
            **best_params
        )
        # Forward-chaining folds keep calibration data later than fit data.
        calib = CalibratedClassifierCV(xgb, method='sigmoid', cv=TimeSeriesSplit(n_splits=5))
        calib.fit(X_train, y_train)

        # Persist the calibrated model for this month.
        nombre_modelo = f"modelos_por_mes/modelo_{month_start.strftime('%Y_%m')}.pkl"
        joblib.dump(calib, nombre_modelo)

        # Store the positive-class probability for the month's test rows.
        df.loc[mask_test, 'proba_compra'] = calib.predict_proba(X_test)[:, 1]

        # The embargo day must appear in the output with a NaN probability.
        if not (df['Date'] == embargo_day).any():
            missing_embargo_days.append(embargo_day)

    if missing_embargo_days:
        stub_rows = pd.DataFrame({'Date': missing_embargo_days, 'proba_compra': np.nan})
        df = pd.concat([df, stub_rows], ignore_index=True)

    df = df.sort_values('Date').reset_index(drop=True)

    if guardar_csv:
        df.to_csv("predicciones_actualizadas.csv", index=False)
        try:
            from google.colab import files
        except ImportError:
            # Not running in Colab: the CSV stays on disk, nothing to download.
            pass
        else:
            files.download("predicciones_actualizadas.csv")

    return df[['Date', 'proba_compra']]


In [None]:
# Run the monthly walk-forward pipeline without exporting a CSV; the returned
# Date / proba_compra frame is the cell's last expression, so it is displayed.
generar_probabilidades_compra(csv_path=csv_path, guardar_csv=False)

Unnamed: 0,Date,proba_compra
0,2015-01-20,
1,2015-01-21,
2,2015-01-22,
3,2015-01-23,
4,2015-01-24,
...,...,...
3845,2025-07-31,0.529621
3846,2025-08-01,
3847,2025-08-02,0.551155
3848,2025-08-03,0.532398
