In [None]:
# ==============================================================
# 0) SET-UP
# ==============================================================

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV 
from sklearn.metrics import confusion_matrix, classification_report, precision_score, f1_score 


# ==============================================================
# 1) HYPERPARAMETER SEARCH  (time-series CV) - WITH CLASS WEIGHTS
# ==============================================================
# Prefer the custom splitter, which is expected to come from an earlier
# cell; if the name does not exist, fall back to scikit-learn's plain
# TimeSeriesSplit with the same fold count and test-block length.
try:
    tscv = TimeSeriesSplitBoth(n_splits=5, test_size=12, min_pos=1, min_neg=1)
except NameError:
    print("TimeSeriesSplitBoth not defined. Falling back to TimeSeriesSplit. Ensure TimeSeriesSplitBoth is defined and executed in a previous cell.")
    tscv = TimeSeriesSplit(n_splits=5, test_size=12)
else:
    print("Using TimeSeriesSplitBoth for cross-validation.")


# Plain RandomForestClassifier (no SMOTE step); bootstrap + oob_score keep
# the out-of-bag estimates available on the fitted forest.
rf_model = RandomForestClassifier(random_state=42, bootstrap=True, oob_score=True)

# The estimator is used directly (no pipeline), so the parameter names
# carry no step prefix.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[2, 3, 4, 5],
    min_samples_leaf=[10, 15, 20],
    min_samples_split=[20, 30, 40],
    max_features=['sqrt', 0.4, 0.6],
    class_weight=['balanced', 'balanced_subsample', None],
)

# scoring='f1' is the binary F1 of the positive class (label 1).
gcv = GridSearchCV(
    rf_model,
    param_grid,
    scoring='f1',
    cv=tscv,
    n_jobs=-1,
    verbose=1,
)
gcv.fit(X, y)

print("\nBest parameters (with class weights):", gcv.best_params_)
print("Best F1 on time series (with class weights):", np.round(gcv.best_score_, 3))

# ==============================================================
# 1.1) SHOW FEATURE IMPORTANCES (FROM THE BEST MODEL)
# ==============================================================
# best_estimator_ is the grid-search winner; its importances are labelled
# with the column names of X and listed from most to least important.
best_rf_model = gcv.best_estimator_
feature_importances = pd.Series(
    data=best_rf_model.feature_importances_,
    index=X.columns,
)
print(
    "\nFeature Importances (from best RF model):",
    feature_importances.sort_values(ascending=False),
    sep="\n",
)


# ==============================================================
# 2) BEST MODEL (AS REFIT BY THE GRID SEARCH): OOB F1 vs. CV F1
# ==============================================================

best_rf = gcv.best_estimator_

# Always defined, so later code can rely on the name even when no OOB
# estimate is available.
oob_f1_rf = 0.0
oob_pred_proba_rf = getattr(best_rf, 'oob_decision_function_', None)

if oob_pred_proba_rf is None:
    if hasattr(best_rf, 'oob_score_'):
        print(f"OOB Accuracy (Random Forest): {best_rf.oob_score_:.3f} (F1 score for OOB requires oob_decision_function)")
    else:
        print("Random Forest modeli 'oob_decision_function_' veya 'oob_score_' özelliğine sahip değil.")
elif oob_pred_proba_rf.ndim != 2 or oob_pred_proba_rf.shape[1] < 2:
    # A positive-class column is needed; a single-column array means the
    # forest saw only one class.
    print("OOB decision function (Random Forest) uygun formatta değil.")
else:
    oob_pos_proba_rf = oob_pred_proba_rf[:, 1]
    # Rows that were never out-of-bag have NaN probabilities. They are left
    # out of the score instead of being silently counted as class 0.
    has_oob_estimate = ~np.isnan(oob_pos_proba_rf)
    oob_pred_rf = (oob_pos_proba_rf > 0.5).astype(int)

    if has_oob_estimate.any():
        # f1_score gives the class-1 F1 directly; no need to dig it out of a
        # classification_report dict (whose keys are always strings).
        oob_f1_rf = f1_score(
            np.asarray(y)[has_oob_estimate],
            oob_pred_rf[has_oob_estimate],
            pos_label=1,
            zero_division=0,
        )
        print("OOB F1 (Random Forest, threshold 0.5):", round(oob_f1_rf, 3))
        # The comparison promised by the section header.
        print("CV F1 (Random Forest, grid-search mean):", round(gcv.best_score_, 3))
    else:
        print("OOB F1 (Random Forest): no sample has an out-of-bag estimate.")


# ==============================================================
# 3) FORWARD WALK  —  REPORT ONE CV FOLD AS IF IT WERE A REAL TEST
# ==============================================================
# NOTE(review): the evaluated fold also took part in the grid search above,
# so these numbers remain optimistic with respect to truly unseen data.

all_splits = list(tscv.split(X, y))
split_to_test_idx = 1

if not all_splits:
    raise ValueError("No splits available from TimeSeriesSplit. Check CV parameters and data size.")

if len(all_splits) > split_to_test_idx:
    print(f"Test için {split_to_test_idx}. indeksli zaman serisi katmanı kullanılıyor.")
else:
    print(f"Uyarı: İstenen katman ({split_to_test_idx}) bulunamadı. Son mevcut katman kullanılacak.")
    # Keep the index in sync with the fold actually used, so the report
    # headings below name the right fold.
    split_to_test_idx = len(all_splits) - 1
train_idx, test_idx = all_splits[split_to_test_idx]


X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test , y_test  = X.iloc[test_idx],  y.iloc[test_idx]

print(f"Kullanılan Eğitim Seti Boyutu: {X_train.shape[0]}, Test Seti Boyutu: {X_test.shape[0]}")
print("Kullanılan Test Setindeki (y_test) Sınıf Dağılımı:")
print(y_test.value_counts())

if not (y_test == 1).any():
    print("UYARI: Seçilen bu test katmanında '1' sınıfı bulunmuyor. 'split_to_test_idx'yi veya TimeSeriesSplit ayarlarını gözden geçirin.")

# Refit the tuned forest on the training part of the chosen fold only.
best_rf.fit(X_train, y_train)

# --- PROBABILITY THRESHOLD TUNING ---
# The threshold is chosen on the training fit's out-of-bag probabilities,
# NOT on the test fold: picking it with the test labels would leak them into
# the model and inflate the reported test F1 (and hide overfitting in the
# train/test comparison that follows).
# NOTE(review): OOB estimates ignore temporal ordering, so they may still be
# somewhat optimistic for autocorrelated data.
y_pred_proba_test_rf = best_rf.predict_proba(X_test)[:, 1]

optimal_threshold_rf = 0.5
best_f1_class1_rf_tuned = -1.0
potential_thresholds = np.arange(0.05, 1.0, 0.01)

oob_proba_train_rf = getattr(best_rf, 'oob_decision_function_', None)
y_tune_rf = None
proba_tune_rf = None
if (
    oob_proba_train_rf is not None
    and oob_proba_train_rf.ndim == 2
    and oob_proba_train_rf.shape[1] > 1
):
    # Rows that were never out-of-bag carry NaN and cannot be used.
    has_oob_estimate = ~np.isnan(oob_proba_train_rf[:, 1])
    y_tune_rf = y_train.to_numpy()[has_oob_estimate]
    proba_tune_rf = oob_proba_train_rf[has_oob_estimate, 1]

if y_tune_rf is not None and (y_tune_rf == 1).any():
    print("\nTuning probability threshold for Random Forest on training OOB predictions to maximize F1-score for class 1...")
    for current_thresh in potential_thresholds:
        y_pred_temp_rf = (proba_tune_rf >= current_thresh).astype(int)
        current_f1_class1 = f1_score(y_tune_rf, y_pred_temp_rf, pos_label=1, zero_division=0)

        # Strict '>' keeps the lowest threshold among ties.
        if current_f1_class1 > best_f1_class1_rf_tuned:
            best_f1_class1_rf_tuned = current_f1_class1
            optimal_threshold_rf = current_thresh

    print(f"Optimal threshold for RF found: {optimal_threshold_rf:.2f} (OOB F1 Class 1 on training data: {best_f1_class1_rf_tuned:.3f})")
else:
    # Nothing to tune on (no OOB estimates or no positive training sample).
    best_f1_class1_rf_tuned = 0.0
    print(f"\nNo usable OOB predictions for threshold tuning; keeping default threshold {optimal_threshold_rf:.2f}.")

y_pred_rf_tuned = (y_pred_proba_test_rf >= optimal_threshold_rf).astype(int)
# --- END THRESHOLD TUNING ---

# Labels for the confusion matrix / report: take them from the test fold,
# but fall back to the full label set when the fold holds a single class so
# the matrix keeps its 2x2 shape.
unique_labels_in_y = np.unique(y_test)
if len(unique_labels_in_y) < 2 and len(np.unique(y)) == 2:
    unique_labels_in_y = np.unique(y)


cm_rf = confusion_matrix(y_test, y_pred_rf_tuned, labels=unique_labels_in_y)
print(f"\nConfusion Matrix (Random Forest, test katmanı {split_to_test_idx}, tuned threshold):")
print(cm_rf)

print(f"\nClassification Report (Random Forest, test katmanı {split_to_test_idx}, tuned threshold):")
print(classification_report(y_test, y_pred_rf_tuned, digits=3, labels=unique_labels_in_y, zero_division=0))


# ==============================================================
# 4) SIMPLE OVERFITTING INDICATOR
# ==============================================================
# Score the training fold with the same threshold that is applied to the
# test fold, so the two F1 values are directly comparable.
y_pred_proba_train_rf = best_rf.predict_proba(X_train)[:, 1]
y_pred_train_rf_tuned = (y_pred_proba_train_rf >= optimal_threshold_rf).astype(int)

report_train_rf = classification_report(y_train, y_pred_train_rf_tuned, output_dict=True, zero_division=0, labels=unique_labels_in_y)
report_test_rf = classification_report(y_test , y_pred_rf_tuned, output_dict=True, zero_division=0, labels=unique_labels_in_y)

# classification_report(output_dict=True) keys its per-class entries by the
# *string* form of the label, so '1' is the only key that can hold class 1.
# A missing key (class 1 absent from the label set) counts as F1 = 0.
train_f1_rf = report_train_rf.get('1', {}).get('f1-score', 0.0)
test_f1_rf = report_test_rf.get('1', {}).get('f1-score', 0.0)


print(f"\nF1 (Random Forest, train) = {train_f1_rf:.3f} | F1 (Random Forest, test) = {test_f1_rf:.3f} (using tuned threshold {optimal_threshold_rf:.2f})")

# Only a drop from train to test is treated as a sign of overfitting.
if train_f1_rf - test_f1_rf > 0.10:
    print("⚠️ RF >0.10 fark → overfitting; max_depth, min_samples_leaf.")

# ==============================================================
# 5) (Optional) quick comparison against BalancedRandomForest
# ==============================================================

# Reuse the forest size and depth selected for the plain RF; the defaults
# only apply if those keys are missing from the best parameter set.
brf_n_estimators = gcv.best_params_.get('n_estimators', 100)
brf_max_depth = gcv.best_params_.get('max_depth', 3)

# No class_weight here: BalancedRandomForestClassifier balances the classes
# by resampling instead.
brf = BalancedRandomForestClassifier(
    n_estimators=brf_n_estimators,
    max_depth=brf_max_depth,
    random_state=42,
).fit(X_train, y_train)

# Default decision rule (no tuned threshold) on both folds.
y_pred_brf_test = brf.predict(X_test)
y_pred_brf_train = brf.predict(X_train)

print(f"\nBalancedRF — test confusion (n_estimators={brf_n_estimators}, max_depth={brf_max_depth}):")
print(confusion_matrix(y_test, y_pred_brf_test, labels=unique_labels_in_y))
print(f"\nBalancedRF — classification report (n_estimators={brf_n_estimators}, max_depth={brf_max_depth}):")
print(classification_report(y_test, y_pred_brf_test, digits=3, labels=unique_labels_in_y, zero_division=0))

# Overfitting check for BRF
report_train_brf = classification_report(y_train, y_pred_brf_train, output_dict=True, zero_division=0, labels=unique_labels_in_y)
report_test_brf = classification_report(y_test, y_pred_brf_test, output_dict=True, zero_division=0, labels=unique_labels_in_y)

# Per-class entries of the report dict are keyed by the string form of the
# label; a missing '1' entry counts as F1 = 0.
train_f1_brf = report_train_brf.get('1', {}).get('f1-score', 0.0)
test_f1_brf = report_test_brf.get('1', {}).get('f1-score', 0.0)

print(f"F1 (BalancedRF, train) = {train_f1_brf:.3f} | F1 (BalancedRF, test) = {test_f1_brf:.3f}")
# Only a drop from train to test is flagged.
if train_f1_brf - test_f1_brf > 0.10:
    print("⚠️ BRF >0.10 fark → overfitting.")



In [None]:
# ==============================================================
# 1) Imports
# ============================================================== 
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (
    f1_score, confusion_matrix,
    classification_report, make_scorer
)
import numpy as np

# ==============================================================
# 2) Cross-validator: 5 folds, 18-row test blocks.
#    min_pos / min_neg presumably require each test block to hold at least
#    one positive and one negative sample (TimeSeriesSplitBoth is defined
#    in another cell — confirm there).
# ==============================================================
cv = TimeSeriesSplitBoth(
        n_splits = 5,
        test_size = 18,
        min_pos = 1,
        min_neg = 1
)

# ==============================================================
# 3) Model + grid
# ==============================================================
# Negative/positive ratio, used as the base value for scale_pos_weight.
n_pos = int((y == 1).sum())
n_neg = int((y == 0).sum())
if n_pos == 0:
    # Without this guard the division yields inf and the search runs on a
    # meaningless class weight.
    raise ValueError("y contains no positive (1) samples; cannot derive scale_pos_weight.")
ratio = n_neg / n_pos       # class imbalance

xgb = XGBClassifier(
        objective = "binary:logistic",
        eval_metric = "logloss",
        random_state = 42,
        tree_method = "hist",
        # Parallelism is handled by GridSearchCV (n_jobs=-1 below). Letting
        # every worker also spawn one thread per core oversubscribes the CPU
        # and slows the search down.
        n_jobs = 1
)

param_grid = {
    "n_estimators"      : [100, 150, 200],
    "learning_rate"     : [0.05, 0.1],
    "max_depth"         : [2, 3],
    "subsample"         : [0.8, 1.0],
    "colsample_bytree"  : [0.6, 0.8],
    "gamma"             : [0, 0.5],
    "scale_pos_weight"  : [ratio, ratio * 1.5]   # class weight
}

# F1 of class 1; zero_division=0 scores a fold without positive predictions
# as 0 without emitting an undefined-metric warning for every candidate.
scorer = make_scorer(f1_score, pos_label=1, zero_division=0)

gcv = GridSearchCV(
        estimator  = xgb,
        param_grid = param_grid,
        scoring    = scorer,
        cv         = cv,
        n_jobs     = -1,
        verbose    = 2,
        refit      = True
)

print(" Grid-searching XGBoost …")
gcv.fit(X, y)

print("\nBest CV F1 (class 1):", round(gcv.best_score_, 3))
print("Best hyper-parameters:")
for k, v in gcv.best_params_.items():
    print(f"  {k}: {v}")

# ==============================================================
# 4) Evaluation on the last CV fold
# ==============================================================
# NOTE: this fold is one of the folds the grid search scored, so it is NOT
# an untouched hold-out; the figures below are optimistic to that extent.
splits       = list(cv.split(X, y))
train_idx, test_idx = splits[-1]

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test , y_test  = X.iloc[test_idx],  y.iloc[test_idx]

# ===== probability → threshold search (F1 of class 1) =========
# The threshold is picked on a validation tail carved out of the TRAINING
# window (its last len(test_idx) rows), never on the test labels; choosing
# it on the test fold would inflate the reported test F1.
# Assumes train_idx is in chronological order, as scikit-learn's
# TimeSeriesSplit yields it — TODO confirm for TimeSeriesSplitBoth.
ths = np.linspace(0.2, 0.8, 13)
best_t, val_f1 = 0.5, 0.0

val_len = len(test_idx)
fit_idx, val_idx = train_idx[:-val_len], train_idx[-val_len:]

can_tune = (
    len(fit_idx) > 0
    and y.iloc[fit_idx].nunique() == 2      # the model needs both classes
    and (y.iloc[val_idx] == 1).any()        # F1 of class 1 needs a positive
)
if can_tune:
    y_val = y.iloc[val_idx]
    val_model = gcv.best_estimator_.fit(X.iloc[fit_idx], y.iloc[fit_idx])
    val_proba = val_model.predict_proba(X.iloc[val_idx])[:, 1]
    for t in ths:
        f1 = f1_score(y_val, (val_proba > t).astype(int), pos_label=1, zero_division=0)
        # Strict '>' keeps 0.5 when no threshold scores above zero, and the
        # lowest threshold among ties otherwise.
        if f1 > val_f1:
            val_f1, best_t = f1, t
    print(f"\nOptimal threshold: {best_t:.2f}  (validation F1₁ = {val_f1:.3f})")
else:
    print(f"\nThreshold tuning skipped (validation tail unusable); using {best_t:.2f}.")

# Refit on the whole training window, then score the test fold once.
best_xgb = gcv.best_estimator_.fit(X_train, y_train)
proba = best_xgb.predict_proba(X_test)[:, 1]
y_pred = (proba > best_t).astype(int)
best_f1 = f1_score(y_test, y_pred, pos_label=1, zero_division=0)

# ==============================================================
# 5) Reports  –  overfitting check
# ==============================================================
# Explicit labels keep the matrix 2x2 and guarantee the '1' report entry
# even if a class is missing from a fold or from the predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("\nConfusion matrix (last fold):\n", cm)

print("\nClassification report (last fold):")
print(classification_report(y_test, y_pred, labels=[0, 1], digits=3, zero_division=0))

# Train-set report, scored with the SAME threshold as the test fold so the
# two F1 values are comparable (predict() returns class labels, so the
# probabilities are thresholded explicitly).
train_pred = (best_xgb.predict_proba(X_train)[:, 1] > best_t).astype(int)
train_report = classification_report(
        y_train,
        train_pred,
        labels=[0, 1],
        digits=3, output_dict=True,
        zero_division=0
)

train_f1 = train_report['1']['f1-score']
test_f1  = best_f1
print(f"\nF1 (train) = {train_f1:.3f} | F1 (test) = {test_f1:.3f}  (threshold {best_t:.2f})")

if train_f1 - test_f1 > 0.10:
    print("⚠️  >0.10 gap → potential overfitting; reduce max_depth or n_estimators.")
else:
    print("✅  No significant overfitting gap detected.")


In [None]:
# ==============================================================
# 0) SET–UP
# ==============================================================

import pandas as pd, numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report

# -------------------- feature / target ------------------------
# Target = the next row's 'high_vol' flag. The final row has no successor
# and is dropped.
# The guard makes the cell safe to re-run: without it every execution would
# shift the already-truncated frame again and silently drop one more row.
if 'high_vol_next' not in monthly.columns:
    monthly['high_vol_next'] = monthly['high_vol'].shift(-1)
    monthly = monthly.dropna(subset=['high_vol_next'])

y = monthly['high_vol_next'].astype(int)
# Every remaining column is a feature; the current flag and the target are
# both excluded.
X = monthly.drop(columns=['high_vol', 'high_vol_next'])

# Class-imbalance ratio (0 = low, 1 = high), used as scale_pos_weight.
n_high = int((y == 1).sum())
if n_high == 0:
    # The division below would otherwise produce inf.
    raise ValueError("y contains no positive (1) samples; cannot derive scale_pos_weight.")
scale_pos = (y == 0).sum() / n_high

# ==============================================================
# 1) HYPERPARAMETER SEARCH  (time-series CV)
# ==============================================================

# Five expanding-window folds, each tested on a block of 6 rows.
tscv = TimeSeriesSplit(n_splits=5, test_size=6)

param_grid = {
    'n_estimators'     : [300, 500, 800],
    'max_depth'        : [2, 3, 4],
    'learning_rate'    : [0.05, 0.1],
    'subsample'        : [0.7, 0.9],
    'colsample_bytree' : [0.6, 0.8],
    'gamma'            : [0, 1],          # minimum loss reduction
    'min_child_weight' : [1, 5],
    'scale_pos_weight' : [scale_pos]      # class imbalance
}

xgb_base = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    # Parallelism is handled by GridSearchCV (n_jobs=-1 below). Letting every
    # worker also spawn one thread per core oversubscribes the CPU and slows
    # the search down.
    n_jobs=1
)

# scoring='f1' is the binary F1 of the positive class (label 1).
gcv = GridSearchCV(
    estimator=xgb_base,
    param_grid=param_grid,
    scoring='f1',
    cv=tscv,
    verbose=1,
    n_jobs=-1
)
gcv.fit(X, y)

print("En iyi parametreler:", gcv.best_params_)
print("Time-series CV’de en iyi F1:", round(gcv.best_score_, 3))

# ==============================================================
# 2) REFIT THE BEST MODEL ON ONE FOLD,  COMPARE TRAIN vs TEST
# ==============================================================
# NOTE(review): the evaluated fold also took part in the grid search, so
# these numbers are optimistic with respect to truly unseen data.

best_xgb = gcv.best_estimator_

# Known class labels (normally [0, 1]); passed to the reports so both
# classes appear even when one is missing from a fold.
known_labels = sorted(y.unique())

# Fold selection: per the original author's note, the last 6 rows contain no
# class-1 sample, so a fold that does contain one (index 1, the fourth fold
# from the end; said to include the 'high_vol = 1' observation dated
# 2023-06-30) is evaluated instead of the last fold.
all_splits = list(tscv.split(X, y))

chosen_fold_idx = 1
if chosen_fold_idx < len(all_splits):
    print(f"\nDeğerlendirme için {chosen_fold_idx}. indeksli zaman serisi katmanı kullanılıyor.")
else:
    print(f"Uyarı: İstenen katman ({chosen_fold_idx}) bulunamadı. Son mevcut katman kullanılacak.")
    # Keep the index in sync with the fold actually used, so the report
    # headings below name the right fold.
    chosen_fold_idx = len(all_splits) - 1
train_idx, test_idx = all_splits[chosen_fold_idx]

X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test , y_test  = X.iloc[test_idx],  y.iloc[test_idx]

print(f"Kullanılan Eğitim Seti Boyutu: {X_train.shape[0]}, Test Seti Boyutu: {X_test.shape[0]}")
print("Kullanılan Test Setindeki (y_test) Sınıf Dağılımı:")
print(y_test.value_counts())

if not (y_test == 1).any():
    print("UYARI: Seçilen bu test katmanında '1' sınıfı bulunmuyor. Farklı bir 'chosen_fold_idx' deneyin veya TimeSeriesSplit ayarlarını gözden geçirin.")

# Refit on the training part of the chosen fold only.
best_xgb.fit(X_train, y_train)

y_pred_train = best_xgb.predict(X_train)
y_pred_test  = best_xgb.predict(X_test)

cm = confusion_matrix(y_test, y_pred_test, labels=known_labels)
print(f"\nConfusion Matrix (seçilen test katmanı {chosen_fold_idx}):\n", cm)

print(f"\nClassification Report (test katmanı {chosen_fold_idx}):\n",
      classification_report(y_test, y_pred_test, digits=3, labels=known_labels, zero_division=0))

report_train_dict = classification_report(y_train, y_pred_train, output_dict=True, labels=known_labels, zero_division=0)
report_test_dict  = classification_report(y_test,  y_pred_test, output_dict=True, labels=known_labels, zero_division=0)

# F1 of class 1. Per-class report entries are keyed by the string form of
# the label; a missing entry (class 1 absent from y) counts as 0.
train_f1 = report_train_dict.get('1', {}).get('f1-score', 0.0)
test_f1  = report_test_dict.get('1', {}).get('f1-score', 0.0)

print(f"\nF1 (train) = {train_f1:.3f} | F1 (test) = {test_f1:.3f}")

# Only a drop from train to test points to overfitting; a test score above
# the train score must not trigger the warning.
if train_f1 - test_f1 > 0.10:
    print("⚠️  >0.10 fark → olası overfitting; "
          "learning_rate’i düşürüp n_estimators’ı artırmayı, "
          "veya max_depth’i küçültmeyi deneyin.")
