In [75]:
import optuna
import optuna.visualization as ov
from optuna.samplers import TPESampler
import plotly
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [76]:
data = pd.read_csv('cleanedData.csv')
# data = data.drop("Order", axis = 'columns')
# data = data.drop("PID", axis = 'columns')
print(data.shape)
data.head()

(2930, 76)


Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice,HasFence
0,1,526301100,20,RL,141.0,31770,1,IR1,Lvl,AllPub,...,0,0,NoFence,0,5,2010,WD,Normal,215000,0
1,2,526350040,20,RH,80.0,11622,1,Reg,Lvl,AllPub,...,0,120,MnPrv,0,6,2010,WD,Normal,105000,1
2,3,526351010,20,RL,81.0,14267,1,IR1,Lvl,AllPub,...,0,0,NoFence,12500,6,2010,WD,Normal,172000,0
3,4,526353030,20,RL,93.0,11160,1,Reg,Lvl,AllPub,...,0,0,NoFence,0,4,2010,WD,Normal,244000,0
4,5,527105010,60,RL,74.0,13830,1,IR1,Lvl,AllPub,...,0,0,MnPrv,0,3,2010,WD,Normal,189900,1


In [77]:
X, y = data.drop("SalePrice", axis = 'columns'), data['SalePrice']
X.head()


Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,...,Enclosed Porch,3Ssn Porch,Screen Porch,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,HasFence
0,1,526301100,20,RL,141.0,31770,1,IR1,Lvl,AllPub,...,0,0,0,NoFence,0,5,2010,WD,Normal,0
1,2,526350040,20,RH,80.0,11622,1,Reg,Lvl,AllPub,...,0,0,120,MnPrv,0,6,2010,WD,Normal,1
2,3,526351010,20,RL,81.0,14267,1,IR1,Lvl,AllPub,...,0,0,0,NoFence,12500,6,2010,WD,Normal,0
3,4,526353030,20,RL,93.0,11160,1,Reg,Lvl,AllPub,...,0,0,0,NoFence,0,4,2010,WD,Normal,0
4,5,527105010,60,RL,74.0,13830,1,IR1,Lvl,AllPub,...,0,0,0,MnPrv,0,3,2010,WD,Normal,1


In [78]:
y_classesI = pd.cut(
    y,
    bins=10, 
    labels=False, 
    include_lowest=True 
)
print(y_classesI.head())

0    2
1    1
2    2
3    3
4    2
Name: SalePrice, dtype: int64


In [79]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encodedI = le.fit_transform(y_classesI.dropna())


In [80]:
X_numeric = pd.get_dummies(X) 
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y_encodedI, test_size=0.1, random_state=42
)

In [81]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }

    model = XGBClassifier(
        **params,
        eval_metric='mlogloss',
        objective='multi:softprob',
        num_class=len(np.unique(y_classesI))
    )

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)

    f1 = f1_score(y_test, y_pred, average='macro')
    return f1


In [None]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=10)


[I 2025-11-14 11:07:53,695] A new study created in memory with name: no-name-8c7a5223-8d21-4c3b-ba77-e0df2e7aa147
[I 2025-11-14 11:07:57,161] Trial 0 finished with value: 0.5034549975237652 and parameters: {'max_depth': 11, 'learning_rate': 0.1857433224109936, 'n_estimators': 640, 'subsample': 0.9681298982249283, 'colsample_bytree': 0.9389803830398851, 'gamma': 3.7750611581364963, 'reg_alpha': 0.78785285234214, 'reg_lambda': 0.30829077929640636}. Best is trial 0 with value: 0.5034549975237652.
[I 2025-11-14 11:07:59,699] Trial 1 finished with value: 0.5120042953592177 and parameters: {'max_depth': 5, 'learning_rate': 0.2423192169893095, 'n_estimators': 496, 'subsample': 0.9076532650284292, 'colsample_bytree': 0.7495872465005673, 'gamma': 2.283549326729374, 'reg_alpha': 0.0965053810662938, 'reg_lambda': 0.06258177557431288}. Best is trial 1 with value: 0.5120042953592177.
[I 2025-11-14 11:08:01,240] Trial 2 finished with value: 0.5188631756554668 and parameters: {'max_depth': 8, 'learni

In [83]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)


Best Macro F1: 0.5373539124186506
Best hyperparameters: {'max_depth': 3, 'learning_rate': 0.2110828120786334, 'n_estimators': 736, 'subsample': 0.736420152921271, 'colsample_bytree': 0.744651824983795, 'gamma': 3.483708287845774, 'reg_alpha': 0.26228706854846606, 'reg_lambda': 0.48568549736830546}


In [84]:
import nbformat
ov.plot_optimization_history(study).show()
ov.plot_param_importances(study).show()

In [85]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classesI))
)

final_model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.744651824983795
,device,
,early_stopping_rounds,
,enable_categorical,False


In [86]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predI = final_model.predict(X_test)
y_probI = final_model.predict_proba(X_test)

print("\n Accuracy:", accuracy_score(y_test, y_predI))

print("\n f1 score: ", sklearn.metrics.f1_score(y_test, y_predI, average='weighted'))

print("\n Precision:", sklearn.metrics.precision_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predI[:5])
print(" Actual class labels   :", y_test[:5])


 Accuracy: 0.8156996587030717

 f1 score:  0.8106701864230572

 Precision: 0.8069985925148042

 Recall 0.8156996587030717

 Predicted class labels: [1 1 2 1 1]
 Actual class labels   : [1 1 2 1 1]


In [87]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=50)


[I 2025-11-14 11:09:02,796] A new study created in memory with name: no-name-16de7e1c-2a18-4e09-8a64-41d5c6d7b24d
[I 2025-11-14 11:09:04,802] Trial 0 finished with value: 0.5179233333468474 and parameters: {'max_depth': 5, 'learning_rate': 0.24108010786940923, 'n_estimators': 392, 'subsample': 0.5494226465288818, 'colsample_bytree': 0.6100179247360547, 'gamma': 4.409176241862282, 'reg_alpha': 0.8924010305981653, 'reg_lambda': 0.5304844517883497}. Best is trial 0 with value: 0.5179233333468474.
[I 2025-11-14 11:09:07,084] Trial 1 finished with value: 0.5266708131296693 and parameters: {'max_depth': 10, 'learning_rate': 0.2002914047885054, 'n_estimators': 275, 'subsample': 0.576157741061803, 'colsample_bytree': 0.8788400245580086, 'gamma': 0.19014152287449015, 'reg_alpha': 0.648828901086326, 'reg_lambda': 0.22080535620883424}. Best is trial 1 with value: 0.5266708131296693.
[I 2025-11-14 11:09:08,200] Trial 2 finished with value: 0.5420110368686701 and parameters: {'max_depth': 10, 'lear

In [88]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)


Best Macro F1: 0.5594198393898722
Best hyperparameters: {'max_depth': 12, 'learning_rate': 0.15693732300546206, 'n_estimators': 267, 'subsample': 0.655962289692099, 'colsample_bytree': 0.6014955837868312, 'gamma': 0.5097900854044788, 'reg_alpha': 0.9000569593998526, 'reg_lambda': 0.5182264410820546}


In [89]:
ov.plot_optimization_history(study).show()
ov.plot_param_importances(study).show()

In [90]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classesI))
)

final_model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6014955837868312
,device,
,early_stopping_rounds,
,enable_categorical,False


In [91]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predI = final_model.predict(X_test)
y_probI = final_model.predict_proba(X_test)

print("\n Accuracy:", accuracy_score(y_test, y_predI))

print("\n f1 score: ", sklearn.metrics.f1_score(y_test, y_predI, average='weighted'))

print("\n Precision:", sklearn.metrics.precision_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predI[:5])
print(" Actual class labels   :", y_test[:5])


 Accuracy: 0.8293515358361775

 f1 score:  0.8258141302205995

 Precision: 0.8240051609551969

 Recall 0.8293515358361775

 Predicted class labels: [1 1 2 1 1]
 Actual class labels   : [1 1 2 1 1]


In [92]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=30)

[I 2025-11-14 11:12:11,863] A new study created in memory with name: no-name-0bb0cbc5-2cb8-452b-b26d-4808cbb7c27c
[I 2025-11-14 11:12:13,500] Trial 0 finished with value: 0.5160169830283126 and parameters: {'max_depth': 3, 'learning_rate': 0.24825373213211754, 'n_estimators': 282, 'subsample': 0.8159508933330476, 'colsample_bytree': 0.755436224240794, 'gamma': 2.0317626292173423, 'reg_alpha': 0.39371611862546574, 'reg_lambda': 0.8987713715845498}. Best is trial 0 with value: 0.5160169830283126.
[I 2025-11-14 11:12:15,827] Trial 1 finished with value: 0.5286059890876 and parameters: {'max_depth': 4, 'learning_rate': 0.19044231400441916, 'n_estimators': 293, 'subsample': 0.84234822956441, 'colsample_bytree': 0.9850146019931469, 'gamma': 0.08579730882933845, 'reg_alpha': 0.9004605178959418, 'reg_lambda': 0.5609379213040127}. Best is trial 1 with value: 0.5286059890876.
[I 2025-11-14 11:12:19,807] Trial 2 finished with value: 0.5195280362725829 and parameters: {'max_depth': 8, 'learning_ra

In [93]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)

Best Macro F1: 0.543276607815423
Best hyperparameters: {'max_depth': 10, 'learning_rate': 0.06074539384733357, 'n_estimators': 582, 'subsample': 0.5122551409256948, 'colsample_bytree': 0.5495685390872551, 'gamma': 2.3849645736455973, 'reg_alpha': 0.5628745394734677, 'reg_lambda': 0.6286218201890884}


In [94]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classesI))
)

final_model.fit(X_train, y_train)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.5495685390872551
,device,
,early_stopping_rounds,
,enable_categorical,False


In [95]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predI = final_model.predict(X_test)
y_probI = final_model.predict_proba(X_test)

print("\n Accuracy:", accuracy_score(y_test, y_predI))

print("\n f1 score: ", sklearn.metrics.f1_score(y_test, y_predI, average='weighted'))

print("\n Precision:", sklearn.metrics.precision_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predI[:5])
print(" Actual class labels   :", y_test[:5])


 Accuracy: 0.825938566552901

 f1 score:  0.8220431330179043

 Precision: 0.8193713555747822

 Recall 0.825938566552901

 Predicted class labels: [1 1 2 1 1]
 Actual class labels   : [1 1 2 1 1]
