In [1]:
import optuna
import optuna.visualization as ov
from optuna.samplers import TPESampler
import plotly
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = pd.read_csv('cleanedData.csv')
data = data.drop("Order", axis = 'columns')
data = data.drop("PID", axis = 'columns')
print(data.shape)
data.head()

(2930, 74)


Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,3Ssn Porch,Screen Porch,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice,HasFence
0,20,RL,141.0,31770,1,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,NoFence,0,5,2010,WD,Normal,215000,0
1,20,RH,80.0,11622,1,Reg,Lvl,AllPub,Inside,Gtl,...,0,120,MnPrv,0,6,2010,WD,Normal,105000,1
2,20,RL,81.0,14267,1,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,NoFence,12500,6,2010,WD,Normal,172000,0
3,20,RL,93.0,11160,1,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,NoFence,0,4,2010,WD,Normal,244000,0
4,60,RL,74.0,13830,1,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,MnPrv,0,3,2010,WD,Normal,189900,1


In [3]:
X, y = data.drop("SalePrice", axis = 'columns'), data['SalePrice']
X.head()


Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Enclosed Porch,3Ssn Porch,Screen Porch,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,HasFence
0,20,RL,141.0,31770,1,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,NoFence,0,5,2010,WD,Normal,0
1,20,RH,80.0,11622,1,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,120,MnPrv,0,6,2010,WD,Normal,1
2,20,RL,81.0,14267,1,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,NoFence,12500,6,2010,WD,Normal,0
3,20,RL,93.0,11160,1,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,NoFence,0,4,2010,WD,Normal,0
4,60,RL,74.0,13830,1,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,MnPrv,0,3,2010,WD,Normal,1


In [4]:
y_classesI = pd.cut(
    y,
    bins=10, 
    labels=False, 
    include_lowest=True 
)
print(y_classesI.head())

0    2
1    1
2    2
3    3
4    2
Name: SalePrice, dtype: int64


In [5]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encodedI = le.fit_transform(y_classesI.dropna())


In [6]:
X_numeric = pd.get_dummies(X) 
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y_encodedI, test_size=0.1, random_state=42
)

In [7]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }

    model = XGBClassifier(
        **params,
        eval_metric='mlogloss',
        objective='multi:softprob',
        num_class=len(np.unique(y_classesI))
    )

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)

    f1 = f1_score(y_test, y_pred, average='macro')
    return f1


In [8]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=10)


[I 2025-11-16 00:22:32,075] A new study created in memory with name: no-name-96f8e5ca-a6d9-4da7-9d3f-47eaf4095227
[I 2025-11-16 00:22:36,370] Trial 0 finished with value: 0.5123288237677851 and parameters: {'max_depth': 5, 'learning_rate': 0.16912345760180592, 'n_estimators': 814, 'subsample': 0.6948900507204658, 'colsample_bytree': 0.6360253556110614, 'gamma': 2.441543838339353, 'reg_alpha': 0.6691171539116052, 'reg_lambda': 0.15083860162212936}. Best is trial 0 with value: 0.5123288237677851.
[I 2025-11-16 00:22:42,097] Trial 1 finished with value: 0.532362590017347 and parameters: {'max_depth': 12, 'learning_rate': 0.01292936057096134, 'n_estimators': 696, 'subsample': 0.5031174379871343, 'colsample_bytree': 0.9161282964078314, 'gamma': 2.609707442980391, 'reg_alpha': 0.9923933523794357, 'reg_lambda': 0.14041873558334683}. Best is trial 1 with value: 0.532362590017347.
[I 2025-11-16 00:22:45,937] Trial 2 finished with value: 0.5067310380103749 and parameters: {'max_depth': 9, 'learn

In [9]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)


Best Macro F1: 0.5358934620371614
Best hyperparameters: {'max_depth': 10, 'learning_rate': 0.06065426480713356, 'n_estimators': 401, 'subsample': 0.5577663654301614, 'colsample_bytree': 0.5137249856802149, 'gamma': 2.2520756796491965, 'reg_alpha': 0.8998801619405432, 'reg_lambda': 0.20471690218103733}


In [10]:
import nbformat
ov.plot_optimization_history(study).show()
ov.plot_param_importances(study).show()

In [11]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classesI))
)

final_model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5137249856802149
,device,
,early_stopping_rounds,
,enable_categorical,False


In [12]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predI = final_model.predict(X_test)
y_probI = final_model.predict_proba(X_test)

print("\n Accuracy:", accuracy_score(y_test, y_predI))

print("\n f1 score: ", sklearn.metrics.f1_score(y_test, y_predI, average='weighted'))

print("\n Precision:", sklearn.metrics.precision_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predI[:5])
print(" Actual class labels   :", y_test[:5])


 Accuracy: 0.8191126279863481

 f1 score:  0.8135361112871622

 Precision: 0.8091699296726297

 Recall 0.8191126279863481

 Predicted class labels: [1 1 2 1 1]
 Actual class labels   : [1 1 2 1 1]


In [13]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=50)


[I 2025-11-16 00:23:59,228] A new study created in memory with name: no-name-a2a01e95-4c8e-445c-b914-bb21ed8ff6bf
[I 2025-11-16 00:24:00,175] Trial 0 finished with value: 0.5063091743061414 and parameters: {'max_depth': 7, 'learning_rate': 0.270812712784176, 'n_estimators': 177, 'subsample': 0.9549201001986865, 'colsample_bytree': 0.8921788627223057, 'gamma': 4.317858277687739, 'reg_alpha': 0.8385177858265543, 'reg_lambda': 0.6535140088066979}. Best is trial 0 with value: 0.5063091743061414.
[I 2025-11-16 00:24:03,551] Trial 1 finished with value: 0.45593153580475715 and parameters: {'max_depth': 9, 'learning_rate': 0.2973572510875225, 'n_estimators': 715, 'subsample': 0.7579129036071328, 'colsample_bytree': 0.6926691023852661, 'gamma': 2.8997262916116364, 'reg_alpha': 0.4770978705295179, 'reg_lambda': 0.6713917686491746}. Best is trial 0 with value: 0.5063091743061414.
[I 2025-11-16 00:24:04,743] Trial 2 finished with value: 0.5234457213012247 and parameters: {'max_depth': 6, 'learnin

In [14]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)


Best Macro F1: 0.5465051303550937
Best hyperparameters: {'max_depth': 6, 'learning_rate': 0.04371526223842599, 'n_estimators': 659, 'subsample': 0.5538760333245984, 'colsample_bytree': 0.6965714025545047, 'gamma': 3.161027339691143, 'reg_alpha': 0.7820869197670173, 'reg_lambda': 0.6282609582426528}


In [15]:
ov.plot_optimization_history(study).show()
ov.plot_param_importances(study).show()

In [16]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classesI))
)

final_model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6965714025545047
,device,
,early_stopping_rounds,
,enable_categorical,False


In [17]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predI = final_model.predict(X_test)
y_probI = final_model.predict_proba(X_test)

print("\n Accuracy:", accuracy_score(y_test, y_predI))

print("\n f1 score: ", sklearn.metrics.f1_score(y_test, y_predI, average='weighted'))

print("\n Precision:", sklearn.metrics.precision_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predI[:5])
print(" Actual class labels   :", y_test[:5])


 Accuracy: 0.825938566552901

 f1 score:  0.8194817955847506

 Precision: 0.8135982605899484

 Recall 0.825938566552901

 Predicted class labels: [1 1 2 1 1]
 Actual class labels   : [1 1 2 1 1]


In [18]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=30)

[I 2025-11-16 00:27:47,531] A new study created in memory with name: no-name-b03d9dbc-b4ad-4a66-9b7d-e6e86418f8fa
[I 2025-11-16 00:27:50,706] Trial 0 finished with value: 0.5797426995646473 and parameters: {'max_depth': 11, 'learning_rate': 0.2936075702347442, 'n_estimators': 583, 'subsample': 0.7074726731059341, 'colsample_bytree': 0.5608497260544103, 'gamma': 0.730279193415771, 'reg_alpha': 0.7218608889240685, 'reg_lambda': 0.6315867319792452}. Best is trial 0 with value: 0.5797426995646473.
[I 2025-11-16 00:27:51,743] Trial 1 finished with value: 0.4808171807030188 and parameters: {'max_depth': 10, 'learning_rate': 0.12830408023129314, 'n_estimators': 112, 'subsample': 0.8819536615311443, 'colsample_bytree': 0.5434137489746326, 'gamma': 0.9352957648706584, 'reg_alpha': 0.18863231915978917, 'reg_lambda': 0.9452822496221611}. Best is trial 0 with value: 0.5797426995646473.
[I 2025-11-16 00:27:54,085] Trial 2 finished with value: 0.516644922096128 and parameters: {'max_depth': 3, 'lear

In [19]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)

Best Macro F1: 0.5797426995646473
Best hyperparameters: {'max_depth': 11, 'learning_rate': 0.2936075702347442, 'n_estimators': 583, 'subsample': 0.7074726731059341, 'colsample_bytree': 0.5608497260544103, 'gamma': 0.730279193415771, 'reg_alpha': 0.7218608889240685, 'reg_lambda': 0.6315867319792452}


In [20]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classesI))
)

final_model.fit(X_train, y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5608497260544103
,device,
,early_stopping_rounds,
,enable_categorical,False


In [21]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predI = final_model.predict(X_test)
y_probI = final_model.predict_proba(X_test)

print("\n Accuracy:", accuracy_score(y_test, y_predI))

print("\n f1 score: ", sklearn.metrics.f1_score(y_test, y_predI, average='weighted'))

print("\n Precision:", sklearn.metrics.precision_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predI[:5])
print(" Actual class labels   :", y_test[:5])


 Accuracy: 0.8122866894197952

 f1 score:  0.8070408805263756

 Precision: 0.8035643596135534

 Recall 0.8122866894197952

 Predicted class labels: [2 1 2 1 1]
 Actual class labels   : [1 1 2 1 1]


In [24]:
yD = data["SalePrice"]

n_classes = 10
min_val, max_val = yD.min(), yD.max()
internal_edges = np.linspace(min_val, max_val, n_classes - 2)  
internal_edges = internal_edges[1:-1] 
bins = [-np.inf] + internal_edges.tolist() + [np.inf]

y_classD = pd.cut(y, bins=bins, labels=False)
y_classD

0       1
1       0
2       1
3       2
4       1
       ..
2925    1
2926    1
2927    1
2928    1
2929    1
Name: SalePrice, Length: 2930, dtype: int64

In [34]:
Xd = data.drop(columns=["SalePrice"])

X_trainD, X_testD, y_trainD, y_testD = train_test_split(
    Xd, y_classD,                      # <- actual regression target or same y
    stratify=y_classD,          # <- temporary labels used ONLY for stratification
    test_size=0.1,
    random_state=42
)

In [35]:
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(y_classD),
    y=y_classD
)

In [36]:
def objectiveD(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }

    model = XGBClassifier(
        **params,
        eval_metric='mlogloss',
        objective='multi:softprob',
        num_class=len(np.unique(y_classD))
    )

    model.fit(X_trainD, y_trainD)
    y_pred_probaD = model.predict_proba(X_testD)
    y_predD = np.argmax(y_pred_probaD, axis=1)

    f1 = f1_score(y_testD, y_predD, average='macro')
    return f1


In [37]:
studyD = optuna.create_study(direction='maximize', sampler=TPESampler())
studyD.optimize(objectiveD, n_trials=50)

[I 2025-11-16 15:23:12,882] A new study created in memory with name: no-name-05459ff7-0b59-4cac-899c-dfb78ca8f812
[W 2025-11-16 15:23:12,932] Trial 0 failed with parameters: {'max_depth': 11, 'learning_rate': 0.05995268499446944, 'n_estimators': 412, 'subsample': 0.6510705431906121, 'colsample_bytree': 0.7721288256721982, 'gamma': 2.7147906869595544, 'reg_alpha': 0.2834002078163117, 'reg_lambda': 0.43356647300522455} because of the following error: ValueError('DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:MS Zoning: object, Lot Shape: object, Land Contour: object, Utilities: object, Lot Config: object, Land Slope: object, Neighborhood: object, Condition 1: object, Condition 2: object, Bldg Type: object, House Style: object, Roof Style: object, Roof Matl: object, Exterior 1st: object, Exterior 2nd: object, Foundation: object, Bsmt Exposure: 

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:MS Zoning: object, Lot Shape: object, Land Contour: object, Utilities: object, Lot Config: object, Land Slope: object, Neighborhood: object, Condition 1: object, Condition 2: object, Bldg Type: object, House Style: object, Roof Style: object, Roof Matl: object, Exterior 1st: object, Exterior 2nd: object, Foundation: object, Bsmt Exposure: object, BsmtFin Type 1: object, BsmtFin Type 2: object, Heating: object, Electrical: object, Functional: object, Garage Type: object, Garage Finish: object, Paved Drive: object, Fence: object, Sale Type: object, Sale Condition: object