In [2]:
import logging
import pandas as pd
import pickle

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)

from src.constants.constants import FEATURES_PATH, MODELS_PATH


In [106]:
ft_imp = ['upper_shadow',
 'sma_7_20_ratio',
 'gap',
 'returns_skew_14',
 'close_position_30',
 'stoch_d',
 'macd_histogram',
 'price_to_sma_7',
 'returns_kurt_30',
 'returns_skew_7']

In [110]:
X_train = pd.read_parquet(FEATURES_PATH / "X_train_scaled.parquet")[ft_imp]
y_train = pd.read_parquet(FEATURES_PATH / "y_train.parquet")['target']


In [111]:
X_train.head(2)

Unnamed: 0,upper_shadow,sma_7_20_ratio,gap,returns_skew_14,close_position_30,stoch_d,macd_histogram,price_to_sma_7,returns_kurt_30,returns_skew_7
0,0.153353,0.196564,-0.257782,-0.572161,1.296563,0.82823,-0.383146,0.595257,-1.152764,-0.858377
1,-0.644022,0.251756,-0.131187,-0.606809,1.32757,1.145368,-0.08589,1.331612,-1.108648,-1.05321


In [132]:
model = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=4,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    warm_start=True,
    verbose=0,
    subsample=0.5,
    validation_fraction=0.5,
)

In [167]:
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    min_samples_split=15,
    min_samples_leaf=10,
    random_state=42,
    class_weight='balanced',
    verbose=1,
    warm_start=True,
    max_features=None,
    oob_score=True,
    n_jobs=-1  # Usar todos los cores
)

In [181]:
model = LogisticRegression(
    max_iter=200,
    random_state=42,
    n_jobs=1,
    fit_intercept=False,
    class_weight='balanced'  # Para manejar desbalance de clases
)

In [182]:
model.fit(X_train, y_train)

In [183]:
y_train_pred = model.predict(X_train)

In [184]:
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

In [185]:
print(f"âœ… Modelo entrenado")
print(f"ðŸ“Š MÃ©tricas en TRAIN:")
print(f"   Accuracy: {train_accuracy:.4f}")
print(f"   Precision: {train_precision:.4f}")
print(f"   Recall: {train_recall:.4f}")
print(f"   F1-Score: {train_f1:.4f}")

âœ… Modelo entrenado
ðŸ“Š MÃ©tricas en TRAIN:
   Accuracy: 0.5369
   Precision: 0.5363
   Recall: 0.4979
   F1-Score: 0.5164


In [186]:
X_test = pd.read_parquet(FEATURES_PATH / "X_test_scaled.parquet")[ft_imp]
y_test = pd.read_parquet(FEATURES_PATH / "y_test.parquet")['target']


In [219]:
# Predicciones
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

In [220]:
df_preds = pd.DataFrame({"y_test": y_test, "y_pred": y_pred, "y_proba": y_proba})

In [227]:
y_pred = [1 if x > 0.55 else 0 for x in df_preds.y_proba]

In [228]:
# MÃ©tricas
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)

print(f"ðŸ“Š MÃ©tricas en TEST:")
print(f"   Accuracy: {accuracy:.4f}")
print(f"   Precision: {precision:.4f}")
print(f"   Recall: {recall:.4f}")
print(f"   F1-Score: {f1:.4f}")
print(f"   ROC-AUC: {roc_auc:.4f}")
print(f"\nðŸ“Š Confusion Matrix:")
print(f"   TN: {cm[0,0]}  FP: {cm[0,1]}")
print(f"   FN: {cm[1,0]}  TP: {cm[1,1]}")

ðŸ“Š MÃ©tricas en TEST:
   Accuracy: 0.5092
   Precision: 0.4833
   Recall: 0.3718
   F1-Score: 0.4203
   ROC-AUC: 0.4807

ðŸ“Š Confusion Matrix:
   TN: 54  FP: 31
   FN: 49  TP: 29


In [223]:
df_imp = pd.DataFrame({"feature": X_train.columns, "imp": model.feature_importances_}).sort_values("imp", ascending=False)

AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'

In [224]:
df_imp.head(10).feature.tolist()

['upper_shadow',
 'gap',
 'returns_kurt_30',
 'price_to_sma_7',
 'close_position_30',
 'macd_histogram',
 'stoch_d',
 'sma_7_20_ratio',
 'returns_skew_14',
 'returns_skew_7']

In [225]:
df_preds = pd.DataFrame({"y_test": y_test, "y_pred": y_pred, "y_proba": y_proba})

In [226]:
df_preds.head(10)

Unnamed: 0,y_test,y_pred,y_proba
0,1,1,0.468766
1,0,1,0.482546
2,1,1,0.528845
3,1,1,0.456421
4,1,1,0.419615
5,0,1,0.487582
6,0,1,0.461086
7,1,1,0.472183
8,1,1,0.553355
9,0,1,0.450109


In [212]:
df_preds.tail(10)

Unnamed: 0,y_test,y_pred,y_proba
153,1,1,0.658664
154,0,1,0.683066
155,0,0,0.613412
156,0,0,0.563862
157,0,0,0.584442
158,1,1,0.654984
159,1,0,0.539974
160,0,0,0.527157
161,1,0,0.572479
162,0,0,0.570712


In [213]:
df_preds.y_proba.max()

0.7106407139703336

In [214]:
df_preds[df_preds.y_proba > 0.7]

Unnamed: 0,y_test,y_pred,y_proba
44,0,1,0.710641
102,1,1,0.703203
