In [1]:
import pandas as pd

# Load the modelling table and order its rows by the "data" column; the
# positional train/test split further down relies on this row order.
# NOTE(review): "data" is presumably a date column (it is exported as "Date"
# later) — if it is read as text the sort is lexicographic; confirm the format
# sorts chronologically.
# NOTE(review): the path is absolute and machine-specific — consider a path
# relative to the project root so the notebook runs elsewhere.
df = pd.read_csv(
    "C:\\Users\\DELL\\OneDrive\\Área de Trabalho\\Ironhack\\final-project-ironhack-da\\data\\cleaned\\model.csv"
).sort_values(by="data")

In [2]:
# Label vector: the "target" column of the sorted frame.
y = df["target"]

# Feature matrix: every column except the label, "selic_next" and "data".
# NOTE(review): "selic_next" looks like a forward-looking value that would
# leak the label if kept as a feature — confirm that is why it is dropped.
X = df.drop(["target", "selic_next", "data"], axis=1)

In [3]:
# Positional hold-out: the first 80% of rows (in the frame's current order)
# train the models, the remaining rows are the test set. No shuffling.
split = int(len(df) * 0.8)

X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

In [4]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits so no test-set statistics enter the scaling.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [5]:
from imblearn.over_sampling import SMOTE

# Oversample the scaled training split with SMOTE (fixed seed for
# repeatability). Only training data is resampled; the test split is untouched.
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X=X_train_scaled, y=y_train)

In [6]:
from sklearn.linear_model import LogisticRegression

# Logistic regression fitted on the SMOTE-balanced, scaled training data.
logreg = LogisticRegression(
    random_state=42,
    max_iter=500,
)
# Left as a bare expression so the notebook displays the fitted estimator.
logreg.fit(X=X_train_bal, y=y_train_bal)

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Random forest (300 trees, fixed seed) fitted on the SMOTE-balanced, scaled
# training data.
rf = RandomForestClassifier(
    random_state=42,
    n_estimators=300,
)
# Left as a bare expression so the notebook displays the fitted estimator.
rf.fit(X=X_train_bal, y=y_train_bal)

In [8]:
from xgboost import XGBClassifier

# Gradient-boosted trees fitted on the SMOTE-balanced, scaled training data.
xgb = XGBClassifier(
    random_state=42,
    # Ensemble size and step size.
    n_estimators=400,
    learning_rate=0.03,
    # Per-tree complexity.
    max_depth=4,
    # Row / column subsampling applied when building each tree.
    subsample=0.8,
    colsample_bytree=0.8,
)
# Left as a bare expression so the notebook displays the fitted estimator.
xgb.fit(X_train_bal, y_train_bal)

In [9]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Probability cut-off applied to the class-1 score instead of each
# classifier's default decision rule.
threshold = 0.25


def _report_at_threshold(name, model, X_eval, y_true, cutoff):
    """Print threshold-based metrics for one fitted classifier.

    Parameters
    ----------
    name : str
        Label used in the printed section header.
    model : object
        Fitted classifier exposing ``predict_proba``.
    X_eval : array-like
        Feature rows to score.
    y_true : array-like
        True labels for ``X_eval``.
    cutoff : float
        Rows whose class-1 probability is ``>= cutoff`` are labelled 1.

    Returns
    -------
    tuple
        ``(probs, preds)``: the class-1 probabilities and the 0/1 labels
        obtained by applying ``cutoff`` to them.
    """
    probs = model.predict_proba(X_eval)[:, 1]
    preds = (probs >= cutoff).astype(int)

    print("\n=== {} (threshold={}) ===".format(name, cutoff))
    print("Confusion Matrix:\n", confusion_matrix(y_true, preds))
    # zero_division=0 reports the same 0.0 as the default, but without the
    # UndefinedMetricWarning raised when a model predicts no positives.
    print(classification_report(y_true, preds, zero_division=0))
    # ROC AUC is computed on the raw probabilities, so it does not depend on
    # the cut-off.
    print("ROC AUC:", roc_auc_score(y_true, probs))
    return probs, preds


# Logistic Regression
logreg_probs, logreg_pred_thresh = _report_at_threshold(
    "Logistic Regression", logreg, X_test_scaled, y_test, threshold
)

# Random Forest
rf_probs, rf_pred_thresh = _report_at_threshold(
    "Random Forest", rf, X_test_scaled, y_test, threshold
)

# XGBoost
xgb_probs, xgb_pred_thresh = _report_at_threshold(
    "XGBoost", xgb, X_test_scaled, y_test, threshold
)


=== Logistic Regression (threshold=0.25) ===
Confusion Matrix:
 [[12  0]
 [ 9  2]]
              precision    recall  f1-score   support

           0       0.57      1.00      0.73        12
           1       1.00      0.18      0.31        11

    accuracy                           0.61        23
   macro avg       0.79      0.59      0.52        23
weighted avg       0.78      0.61      0.53        23

ROC AUC: 0.6818181818181818

=== Random Forest (threshold=0.25) ===
Confusion Matrix:
 [[10  2]
 [ 6  5]]
              precision    recall  f1-score   support

           0       0.62      0.83      0.71        12
           1       0.71      0.45      0.56        11

    accuracy                           0.65        23
   macro avg       0.67      0.64      0.63        23
weighted avg       0.67      0.65      0.64        23

ROC AUC: 0.7045454545454545

=== XGBoost (threshold=0.25) ===
Confusion Matrix:
 [[12  0]
 [11  0]]
              precision    recall  f1-score   support

 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Predict on the held-out test set with the Random Forest (labels and class-1 probabilities)

In [11]:
# Hard 0/1 labels from the random forest for the scaled test rows.
# NOTE(review): this uses the classifier's default decision rule, not the
# 0.25 threshold applied in the evaluation cell — confirm that is intended.
rf_pred = rf.predict(X=X_test_scaled)

In [12]:
# Second column of predict_proba: the probability of class 1 ("Rise").
rf_proba = rf.predict_proba(X=X_test_scaled)[:, 1]

In [13]:
# Human-readable names for the two classes.
label_map = {0: 'Not Rise', 1: 'Rise'}

# Result table: the unscaled test features plus, in this column order, the
# true label (as text), the predicted class, its text label and the class-1
# probability. `assign` works on a copy, so X_test itself is not modified.
results = X_test.assign(
    True_Label=y_test.map(label_map),
    RF_Pred=rf_pred,
    RF_Pred_Label=lambda frame: frame['RF_Pred'].map(label_map),
    RF_Prob_Rise=rf_proba,
)

In [14]:
# Re-attach the "data" column (dropped from the features) to the test rows,
# looked up by the test split's index labels.
results['Date'] = df['data'].loc[y_test.index]

In [15]:
# Export the result table: semicolon-separated, no index column, UTF-8 with
# a byte-order mark.
results.to_csv(
    'rf_predictions_tableau.csv',
    sep=';',
    index=False,
    encoding='utf-8-sig',
)