In [3]:
import optuna
from optuna.samplers import TPESampler
import plotly
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [4]:
data = pd.read_csv('cleanedData.csv')
print(data.shape)
data.head()

(2930, 76)


Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,...,3Ssn Porch,Screen Porch,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice,HasFence
0,1,526301100,20,RL,141.0,31770,1,IR1,Lvl,AllPub,...,0,0,NoFence,0,5,2010,WD,Normal,215000,0
1,2,526350040,20,RH,80.0,11622,1,Reg,Lvl,AllPub,...,0,120,MnPrv,0,6,2010,WD,Normal,105000,1
2,3,526351010,20,RL,81.0,14267,1,IR1,Lvl,AllPub,...,0,0,NoFence,12500,6,2010,WD,Normal,172000,0
3,4,526353030,20,RL,93.0,11160,1,Reg,Lvl,AllPub,...,0,0,NoFence,0,4,2010,WD,Normal,244000,0
4,5,527105010,60,RL,74.0,13830,1,IR1,Lvl,AllPub,...,0,0,MnPrv,0,3,2010,WD,Normal,189900,1


In [5]:
X, y = data.drop("SalePrice", axis = 'columns'), data['SalePrice']
X.head()


Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Lot Shape,Land Contour,Utilities,...,Enclosed Porch,3Ssn Porch,Screen Porch,Fence,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,HasFence
0,1,526301100,20,RL,141.0,31770,1,IR1,Lvl,AllPub,...,0,0,0,NoFence,0,5,2010,WD,Normal,0
1,2,526350040,20,RH,80.0,11622,1,Reg,Lvl,AllPub,...,0,0,120,MnPrv,0,6,2010,WD,Normal,1
2,3,526351010,20,RL,81.0,14267,1,IR1,Lvl,AllPub,...,0,0,0,NoFence,12500,6,2010,WD,Normal,0
3,4,526353030,20,RL,93.0,11160,1,Reg,Lvl,AllPub,...,0,0,0,NoFence,0,4,2010,WD,Normal,0
4,5,527105010,60,RL,74.0,13830,1,IR1,Lvl,AllPub,...,0,0,0,MnPrv,0,3,2010,WD,Normal,1


In [6]:
total = 0 
bins = 0
lims = [] #limita superioara = valori pana la ....
frequency = [] #numarul de valori pana la limita superioara corespunzatoare
sorted_values = data['SalePrice'].value_counts().sort_index()
for i, j in sorted_values.items():
    if total + j > 100:
        bins += 1 
        lims.append(str(i))
        frequency.append(total)
        total = j
    else:
        total = total + j
print(bins)
print(lims)
print(frequency)

30
['81500', '95000', '106000', '113000', '119000', '124000', '128000', '131000', '135000', '138000', '141500', '145000', '148800', '154000', '159000', '164990', '170440', '176000', '181000', '187500', '194500', '204000', '214000', '225000', '237000', '252678', '272000', '300000', '338931', '438780']
[100, 98, 99, 98, 90, 98, 99, 97, 76, 92, 100, 77, 99, 99, 100, 100, 100, 89, 96, 98, 100, 97, 98, 96, 98, 100, 100, 98, 100, 100]


In [7]:
lims_numeric = sorted([int(x) for x in lims])
lims_numeric.append(int(data['SalePrice'].max() + 1))
y_classEq = pd.cut(data['SalePrice'], bins=[0]+lims_numeric, labels=False, include_lowest=True).astype(int)


In [8]:
X_numeric = pd.get_dummies(X) 
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric, y_classEq, test_size=0.1, random_state=42
)

In [9]:
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }

    model = XGBClassifier(
        **params,
        eval_metric='mlogloss',
        objective='multi:softprob',
        num_class=len(np.unique(y_classEq))
    )

    model.fit(X_train, y_train)
    y_pred_proba = model.predict_proba(X_test)
    y_pred = np.argmax(y_pred_proba, axis=1)

    f1 = f1_score(y_test, y_pred, average='macro')
    return f1

In [10]:
study = optuna.create_study(direction='maximize', sampler=TPESampler())
study.optimize(objective, n_trials=30)


[I 2025-11-04 20:57:04,181] A new study created in memory with name: no-name-4324aad4-25b3-447c-9742-fc7ffd689e54
[I 2025-11-04 20:57:19,259] Trial 0 finished with value: 0.19959688561961436 and parameters: {'max_depth': 11, 'learning_rate': 0.029327819830763317, 'n_estimators': 883, 'subsample': 0.8304999917568873, 'colsample_bytree': 0.9903926142401551, 'gamma': 4.390795156264044, 'reg_alpha': 0.12518656047763344, 'reg_lambda': 0.04923793175717517}. Best is trial 0 with value: 0.19959688561961436.
[I 2025-11-04 20:57:31,250] Trial 1 finished with value: 0.21456232854664894 and parameters: {'max_depth': 7, 'learning_rate': 0.16598925019849153, 'n_estimators': 801, 'subsample': 0.8201532715170989, 'colsample_bytree': 0.9710140992098035, 'gamma': 1.6219454613498951, 'reg_alpha': 0.007562070147671296, 'reg_lambda': 0.4842217175166309}. Best is trial 1 with value: 0.21456232854664894.
[I 2025-11-04 20:57:43,323] Trial 2 finished with value: 0.1946559270742832 and parameters: {'max_depth':

In [11]:
best_trial = study.best_trial
print("Best Macro F1:", best_trial.value)
print("Best hyperparameters:", best_trial.params)


Best Macro F1: 0.24416658089826448
Best hyperparameters: {'max_depth': 10, 'learning_rate': 0.06667949602323966, 'n_estimators': 422, 'subsample': 0.872347610279336, 'colsample_bytree': 0.8434196411961002, 'gamma': 2.1733233308502053, 'reg_alpha': 0.7039531952299232, 'reg_lambda': 0.740829600990393}


In [12]:
best_params = best_trial.params
final_model = XGBClassifier(
    **best_params,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classEq))
)

final_model.fit(X_train, y_train)


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8434196411961002
,device,
,early_stopping_rounds,
,enable_categorical,False
