In [1]:
import optuna
import optuna.visualization as ov
from optuna.samplers import TPESampler
import plotly
import plotly.express as px
import numpy as np
import pandas as pd
from sklearn.metrics import roc_curve, precision_recall_curve, f1_score, auc
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import pickle
import os
from sklearn.metrics import (
    roc_curve, auc,
    precision_recall_curve, f1_score
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ndata = pd.read_csv('incercare.csv')
print(ndata.shape)
ndata.head()

(2922, 170)


Unnamed: 0,MS SubClass,Lot Frontage,Lot Area,Street,Lot Shape,Utilities,Land Slope,Overall Qual,Overall Cond,Year Built,...,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD,Sale Condition_AdjLand,Sale Condition_Alloca,Sale Condition_Family,Sale Condition_Normal,Sale Condition_Partial
0,0.0,0.410959,0.14242,1.0,0.0,0.0,0.0,0.555556,0.5,0.637681,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.202055,0.048246,1.0,1.0,0.0,0.0,0.444444,0.625,0.644928,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.205479,0.060609,1.0,0.0,0.0,0.0,0.555556,0.625,0.623188,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.246575,0.046087,1.0,1.0,0.0,0.0,0.666667,0.5,0.695652,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.235294,0.181507,0.058566,1.0,0.0,0.0,0.0,0.444444,0.5,0.905797,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [3]:
price = ndata["SalePrice"].copy() #dezechilibru
    
low_thr = price.quantile(0.05)
high_thr = price.quantile(0.95)
    
middle = price[(price >= low_thr) & (price <= high_thr)]
bins = np.linspace(low_thr, high_thr, 9)
    
classes = pd.cut(
    price,
    bins=[-np.inf] + list(bins) + [np.inf],
    labels=list(range(0, 10)),
    include_lowest=True
    )
y_classD = classes.astype(int)
y_classD

0       5
1       1
2       3
3       6
4       4
       ..
2917    2
2918    2
2919    2
2920    3
2921    4
Name: SalePrice, Length: 2922, dtype: int64

In [4]:
class_counts = y_classD.value_counts(normalize=True)
class_weights = 1 / (class_counts ** 0.5)
weights = y_classD.map(class_weights)

In [5]:
Xd = ndata.drop(columns=["SalePrice"])
# Xd = pd.get_dummies(Xd)
X_trainD, X_testD, y_trainD, y_testD = train_test_split(
    Xd, y_classD, 
    stratify=y_classD, 
    test_size=0.1,
    random_state=42
)

In [6]:
weights_train = y_trainD.map(class_weights)

In [7]:
def objectiveD(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0)
    }


    model = XGBClassifier(
        **params,
        eval_metric='mlogloss',
        objective='multi:softprob',
        num_class=len(np.unique(y_classD))
    )

    model.fit(
        X_trainD,
        y_trainD,
        sample_weight=weights_train,
        eval_set=[(X_testD, y_testD)],
        verbose=False
    )

    y_pred_probaD = model.predict_proba(X_testD)
    y_predD = np.argmax(y_pred_probaD, axis=1)

    f1 = f1_score(y_testD, y_predD, average='macro')
    return f1

In [9]:
studyD = optuna.create_study(direction='maximize', sampler=TPESampler())
studyD.optimize(objectiveD, n_trials=50)

[I 2025-11-19 14:30:24,765] A new study created in memory with name: no-name-5f6587b9-d0cc-4267-8795-5c78ff7b71ad
[I 2025-11-19 14:30:29,680] Trial 0 finished with value: 0.5586520307222859 and parameters: {'max_depth': 6, 'learning_rate': 0.13771180783862838, 'n_estimators': 635, 'subsample': 0.5785045487384508, 'colsample_bytree': 0.5183883443359747, 'gamma': 1.120827979010094, 'reg_alpha': 0.7925281341842879, 'reg_lambda': 0.08197718420064792}. Best is trial 0 with value: 0.5586520307222859.
[I 2025-11-19 14:30:38,652] Trial 1 finished with value: 0.5625014828871188 and parameters: {'max_depth': 7, 'learning_rate': 0.014940347800276062, 'n_estimators': 663, 'subsample': 0.6486604732567913, 'colsample_bytree': 0.5234624525390712, 'gamma': 3.279324080226293, 'reg_alpha': 0.41846801963884006, 'reg_lambda': 0.7374911264610869}. Best is trial 1 with value: 0.5625014828871188.
[I 2025-11-19 14:30:41,132] Trial 2 finished with value: 0.5683838143286722 and parameters: {'max_depth': 12, 'le

In [10]:
best_trialD = studyD.best_trial
print("Best Macro F1:", best_trialD.value)
print("Best hyperparameters:", best_trialD.params)

Best Macro F1: 0.6249604066417828
Best hyperparameters: {'max_depth': 7, 'learning_rate': 0.26195398681517956, 'n_estimators': 822, 'subsample': 0.6072381568502935, 'colsample_bytree': 0.554179081331013, 'gamma': 4.301944332457242, 'reg_alpha': 0.5408068480386198, 'reg_lambda': 0.010495172678354231}


In [11]:
best_paramsD = best_trialD.params
final_modelD = XGBClassifier(
    **best_paramsD,
    eval_metric='mlogloss',
    objective='multi:softprob',
    num_class=len(np.unique(y_classD))
)

final_modelD.fit(X_trainD, y_trainD)

0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.554179081331013
,device,
,early_stopping_rounds,
,enable_categorical,False


In [14]:
from sklearn.metrics import accuracy_score, classification_report
import sklearn
y_predD = final_modelD.predict(X_testD)
y_probD = final_modelD.predict_proba(X_testD)

print("\n Accuracy:", accuracy_score(y_testD, y_predD))

print("\n f1 score: ", sklearn.metrics.f1_score(y_testD, y_predD, average='macro'))

print("\n Precision:", sklearn.metrics.precision_score(y_testD, y_predD, average='weighted', zero_division=0))

print("\n Recall", sklearn.metrics.recall_score(y_testD, y_predD, average='weighted', zero_division=0))

print("\n Predicted class labels:", y_predD[:5])
print(" Actual class labels   :", y_testD[:5])


 Accuracy: 0.5699658703071673

 f1 score:  0.5355479458635843

 Precision: 0.5658229891127328

 Recall 0.5699658703071673

 Predicted class labels: [2 3 4 4 3]
 Actual class labels   : 2208    4
1812    2
2427    5
1289    4
386     3
Name: SalePrice, dtype: int64


In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.cluster import KMeans
import xgboost as xgb
import optuna

data = pd.read_csv("incercare.csv")


n_bins = 6
y_values = np.array(data["SalePrice"]).reshape(-1,1)
kmeans = KMeans(n_clusters=n_bins, random_state=42).fit(y_values)
data['SalePrice_cat'] = kmeans.labels_
y = data['SalePrice_cat']

X = data.drop(columns=['SalePrice','SalePrice_cat'])
categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

RARE_THRESHOLD = 15
for col in categorical_cols:
    vc = X[col].value_counts()
    rare_vals = vc[vc < RARE_THRESHOLD].index
    X[col] = X[col].fillna('NA').replace(rare_vals, 'RARE')

X_num = X[numeric_cols].fillna(X[numeric_cols].median())
# X_cat = pd.get_dummies(X[categorical_cols], drop_first=False)
# X = pd.concat([X_num, X_cat], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

classes = np.unique(y_train)
base_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, base_weights))
y_train_series = pd.Series(y_train).reset_index(drop=True)
sample_weight_train_base = y_train_series.map(lambda c: class_weights[int(c)]).to_numpy()

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 6, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.06, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'objective': 'multi:softprob',
        'num_class': len(classes),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'tree_method': 'hist',
        'n_jobs': -1,
        'random_state': 42,
    }
    weight_factor = trial.suggest_float('weight_factor', 0.5, 4.0)
    sample_weight = y_train_series.map(lambda c: class_weights[int(c)] * weight_factor).to_numpy()
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train, sample_weight=sample_weight, eval_set=[(X_valid, y_valid)], verbose=False)
    probs = model.predict_proba(X_valid)
    preds = np.argmax(probs, axis=1)
    return f1_score(y_valid, preds, average='macro')

study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=200, n_jobs=1)




[I 2025-11-19 15:02:47,456] A new study created in memory with name: no-name-75e415ae-bc07-482b-b052-07f447380882
[I 2025-11-19 15:02:56,838] Trial 0 finished with value: 0.7783435712966358 and parameters: {'max_depth': 9, 'learning_rate': 0.05492872600571018, 'n_estimators': 1518, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'reg_alpha': 0.2904180608409973, 'reg_lambda': 4.330880728874676, 'min_child_weight': 13, 'max_delta_step': 7, 'grow_policy': 'lossguide', 'weight_factor': 3.413549242801476}. Best is trial 0 with value: 0.7783435712966358.
[I 2025-11-19 15:03:07,744] Trial 1 finished with value: 0.7959163404167126 and parameters: {'max_depth': 7, 'learning_rate': 0.013851197621057671, 'n_estimators': 530, 'subsample': 0.721696897183815, 'colsample_bytree': 0.762378215816119, 'gamma': 2.1597250932105787, 'reg_alpha': 1.4561457009902097, 'reg_lambda': 3.0592644736118975, 'min_child_weight': 3, 'max_delta_step': 3, 'grow_polic

In [24]:
study.optimize(objective, n_trials=20)

[I 2025-11-19 15:42:17,038] Trial 200 finished with value: 0.797031000643018 and parameters: {'max_depth': 7, 'learning_rate': 0.011572563562977698, 'n_estimators': 337, 'subsample': 0.803099461222028, 'colsample_bytree': 0.5613911706411573, 'gamma': 3.408996840545147, 'reg_alpha': 1.4382924182330497, 'reg_lambda': 0.13344291914496545, 'min_child_weight': 2, 'max_delta_step': 2, 'grow_policy': 'lossguide', 'weight_factor': 3.5411850738157984}. Best is trial 55 with value: 0.8142286375346438.
[I 2025-11-19 15:42:26,166] Trial 201 finished with value: 0.7940867979581067 and parameters: {'max_depth': 7, 'learning_rate': 0.010939478550564301, 'n_estimators': 447, 'subsample': 0.8221991780189147, 'colsample_bytree': 0.5793178434050805, 'gamma': 3.254102357420984, 'reg_alpha': 1.7501934283020333, 'reg_lambda': 0.3374091447940026, 'min_child_weight': 1, 'max_delta_step': 2, 'grow_policy': 'lossguide', 'weight_factor': 3.3671760452408073}. Best is trial 55 with value: 0.8142286375346438.
[I 20

In [23]:
best_params = study.best_trial.params.copy()
best_params.update({'objective':'multi:softprob','num_class':len(classes),'eval_metric':'mlogloss','verbosity':0,'tree_method':'hist','n_jobs':-1,'random_state':42})
weight_factor = best_params.pop('weight_factor',1.0)
valid_keys = {k for k in xgb.XGBClassifier().get_params().keys()}
best_params = {k:v for k,v in best_params.items() if k in valid_keys}

final_model = xgb.XGBClassifier(**best_params)
sample_weight_final = y_train_series.map(lambda c: class_weights[int(c)] * weight_factor).to_numpy()
final_model.fit(X_train, y_train, sample_weight=sample_weight_final, eval_set=[(X_valid, y_valid)], verbose=False)
probs = final_model.predict_proba(X_valid)
preds = np.argmax(probs, axis=1)

f1 = f1_score(y_valid, preds, average='macro')
acc = accuracy_score(y_valid, preds)

print(f"Final Macro F1: {f1:.4f}")
print(f"Final Accuracy: {acc:.4f}")

Final Macro F1: 0.8142
Final Accuracy: 0.8137
