In [4]:
import optuna
import optuna.visualization as ov
from optuna.samplers import TPESampler
import plotly
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [5]:
data = pd.read_csv('cleanedData.csv')
data = data.drop("Order", axis = 'columns')
data = data.drop("PID", axis = 'columns')
print(data.shape)
data.head()

(2930, 74)


Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,3Ssn Porch,Screen Porch,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice,HasFence
0,20,RL,141.0,31770,1,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,NoFence,0,5,2010,WD,Normal,215000,0
1,20,RH,80.0,11622,1,Reg,Lvl,AllPub,Inside,Gtl,...,0,120,MnPrv,0,6,2010,WD,Normal,105000,1
2,20,RL,81.0,14267,1,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,NoFence,12500,6,2010,WD,Normal,172000,0
3,20,RL,93.0,11160,1,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,NoFence,0,4,2010,WD,Normal,244000,0
4,60,RL,74.0,13830,1,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,MnPrv,0,3,2010,WD,Normal,189900,1


In [6]:
X, y = data.drop("SalePrice", axis = 'columns'), data['SalePrice']
X.head()


Unnamed: 0,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,...,Enclosed Porch,3Ssn Porch,Screen Porch,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,HasFence
0,20,RL,141.0,31770,1,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,NoFence,0,5,2010,WD,Normal,0
1,20,RH,80.0,11622,1,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,120,MnPrv,0,6,2010,WD,Normal,1
2,20,RL,81.0,14267,1,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,NoFence,12500,6,2010,WD,Normal,0
3,20,RL,93.0,11160,1,Reg,Lvl,AllPub,Corner,Gtl,...,0,0,0,NoFence,0,4,2010,WD,Normal,0
4,60,RL,74.0,13830,1,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,MnPrv,0,3,2010,WD,Normal,1


In [7]:
y_classesI = pd.cut(
    y,
    bins=30, 
    labels=False, 
    include_lowest=True 
)
print(y_classesI.head())

0    8
1    3
2    6
3    9
4    7
Name: SalePrice, dtype: int64


In [8]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encodedI = le.fit_transform(y_classesI.dropna())


In [9]:
X_numeric = pd.get_dummies(X) 
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y_encodedI, test_size=0.1, random_state=42
)

In [10]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }

    model = XGBClassifier(
        **params,
        eval_metric='mlogloss',
        objective='multi:softprob',
        num_class=len(np.unique(y_classesI))
    )

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)

    f1 = f1_score(y_test, y_pred, average='macro')
    return f1


In [21]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=10)


[I 2025-11-10 11:00:13,805] A new study created in memory with name: no-name-531e9cb1-b124-42a5-bae9-74fb7fe46dfe
[I 2025-11-10 11:00:17,264] Trial 0 finished with value: 0.2715833095367273 and parameters: {'max_depth': 7, 'learning_rate': 0.2523070023601915, 'n_estimators': 114, 'subsample': 0.9593102044107737, 'colsample_bytree': 0.7556706971341328, 'gamma': 2.9434628854504403, 'reg_alpha': 0.6364936343152076, 'reg_lambda': 0.2220311485290145}. Best is trial 0 with value: 0.2715833095367273.
[I 2025-11-10 11:00:33,805] Trial 1 finished with value: 0.2950956080125671 and parameters: {'max_depth': 4, 'learning_rate': 0.05858911464357657, 'n_estimators': 687, 'subsample': 0.8047384929818768, 'colsample_bytree': 0.5178930371861064, 'gamma': 2.8419810882207823, 'reg_alpha': 0.8120538494238488, 'reg_lambda': 0.708156792595479}. Best is trial 1 with value: 0.2950956080125671.
[I 2025-11-10 11:00:37,535] Trial 2 finished with value: 0.3044191710887138 and parameters: {'max_depth': 10, 'learn

In [22]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)


Best Macro F1: 0.3044191710887138
Best hyperparameters: {'max_depth': 10, 'learning_rate': 0.15886973602220342, 'n_estimators': 110, 'subsample': 0.8713770340951208, 'colsample_bytree': 0.9180929011351336, 'gamma': 2.9235950475168733, 'reg_alpha': 0.7235509296385123, 'reg_lambda': 0.0016075573221312478}


In [23]:
ov.plot_optimization_history(study).show()
ov.plot_param_importances(study).show()

In [24]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classesI))
)

final_model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9180929011351336
,device,
,early_stopping_rounds,
,enable_categorical,False


In [25]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predI = final_model.predict(X_test)
y_probI = final_model.predict_proba(X_test)

print("\n Accuracy:", accuracy_score(y_test, y_predI))

print("\n f1 score: ", sklearn.metrics.f1_score(y_test, y_predI, average='weighted'))

print("\n Precision:", sklearn.metrics.precision_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predI[:5])
print(" Actual class labels   :", y_test[:5])


 Accuracy: 0.4778156996587031

 f1 score:  0.4706964215776391

 Precision: 0.47608478634852536

 Recall 0.4778156996587031

 Predicted class labels: [4 4 7 4 4]
 Actual class labels   : [5 4 7 4 4]


In [16]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=50)


[I 2025-11-08 16:51:45,089] A new study created in memory with name: no-name-74ba3f6e-4972-4f3b-ac86-716a3825938e
[I 2025-11-08 16:51:55,006] Trial 0 finished with value: 0.286137266437839 and parameters: {'max_depth': 11, 'learning_rate': 0.06485773363805027, 'n_estimators': 933, 'subsample': 0.830302810112661, 'colsample_bytree': 0.5665798371674127, 'gamma': 3.5743347724202867, 'reg_alpha': 0.7164188816937308, 'reg_lambda': 0.7295458214662205}. Best is trial 0 with value: 0.286137266437839.
[I 2025-11-08 16:52:01,306] Trial 1 finished with value: 0.27031476157706885 and parameters: {'max_depth': 6, 'learning_rate': 0.26695886151386067, 'n_estimators': 629, 'subsample': 0.8563092262840948, 'colsample_bytree': 0.7044095365992605, 'gamma': 4.713650294838698, 'reg_alpha': 0.6648896463194283, 'reg_lambda': 0.2665366158456336}. Best is trial 0 with value: 0.286137266437839.
[I 2025-11-08 16:52:06,630] Trial 2 finished with value: 0.2858122829397984 and parameters: {'max_depth': 4, 'learnin

In [17]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)


Best Macro F1: 0.3299176748782477
Best hyperparameters: {'max_depth': 9, 'learning_rate': 0.13672361110837905, 'n_estimators': 933, 'subsample': 0.7359176395837738, 'colsample_bytree': 0.9484981971533545, 'gamma': 0.3574626568652035, 'reg_alpha': 1.450497664635085e-05, 'reg_lambda': 0.8105321134139173}


In [18]:
ov.plot_optimization_history(study).show()
ov.plot_param_importances(study).show()

In [19]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classesI))
)

final_model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9484981971533545
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predI = final_model.predict(X_test)
y_probI = final_model.predict_proba(X_test)

print("\n Accuracy:", accuracy_score(y_test, y_predI))

print("\n f1 score: ", sklearn.metrics.f1_score(y_test, y_predI, average='weighted'))

print("\n Precision:", sklearn.metrics.precision_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_test, y_predI, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predI[:5])
print(" Actual class labels   :", y_test[:5])


 Accuracy: 0.5187713310580204

 f1 score:  0.5154798254006071

 Precision: 0.5207063495581292

 Recall 0.5187713310580204

 Predicted class labels: [5 4 6 5 4]
 Actual class labels   : [5 4 7 4 4]
