In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb
import optuna

data = pd.read_csv("cleanedData.csv")

data['SalePrice_cat'] = pd.qcut(data['SalePrice'], q=3, labels=['low','medium','high'])

le = LabelEncoder()
data['SalePrice_cat'] = le.fit_transform(data['SalePrice_cat'])

X = data.drop(columns=['SalePrice','SalePrice_cat'])
y = data['SalePrice_cat']

X = X.select_dtypes(include=[np.number]).fillna(0)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'multi:softmax',
        'num_class': len(np.unique(y)),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    
    preds = model.predict(X_valid)
    score = f1_score(y_valid, preds, average='macro')
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best trial:")
trial = study.best_trial
print(f"  F1 Macro: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2025-11-19 00:17:48,205] A new study created in memory with name: no-name-68c2cf4b-8c92-40a6-bde9-e0854348249c
[I 2025-11-19 00:17:51,037] Trial 0 finished with value: 0.8788987679711893 and parameters: {'max_depth': 10, 'learning_rate': 0.016331677839982323, 'n_estimators': 1315, 'subsample': 0.6012932818252686, 'colsample_bytree': 0.9765642359752357, 'gamma': 1.4255833408684815, 'reg_alpha': 2.6552386461305755, 'reg_lambda': 0.47767252579835107, 'min_child_weight': 5}. Best is trial 0 with value: 0.8788987679711893.
[I 2025-11-19 00:17:52,729] Trial 1 finished with value: 0.8736754015313858 and parameters: {'max_depth': 4, 'learning_rate': 0.10557961791099803, 'n_estimators': 1258, 'subsample': 0.8865227393047312, 'colsample_bytree': 0.8175212668212135, 'gamma': 1.9658459789902833, 'reg_alpha': 3.3748515589812103, 'reg_lambda': 2.9911151078410025, 'min_child_weight': 5}. Best is trial 0 with value: 0.8788987679711893.
[I 2025-11-19 00:17:55,252] Trial 2 finished with value: 0.8739

Best trial:
  F1 Macro: 0.8908814000619808
  Params: 
    max_depth: 4
    learning_rate: 0.029344220574932155
    n_estimators: 1474
    subsample: 0.6523967504370884
    colsample_bytree: 0.6507529155778509
    gamma: 1.425995061959762
    reg_alpha: 1.0220987971667634
    reg_lambda: 1.641987590191411
    min_child_weight: 9


In [2]:
from sklearn.metrics import accuracy_score
best_params = trial.params
best_params.update({
    'objective': 'multi:softmax',
    'num_class': len(np.unique(y)),
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'n_jobs': -1
})

final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)
preds = final_model.predict(X_valid)
accuracy = accuracy_score(y_valid, preds)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.8908


In [None]:
# -----------------------------
# Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
import xgboost as xgb
import optuna

# -----------------------------
# Load dataset
# -----------------------------
data = pd.read_csv("cleanedData.csv")

# -----------------------------
# Create 6+ classes from SalePrice
# -----------------------------
# 5th to 95th percentile to avoid extreme outliers
low_thr = data['SalePrice'].quantile(0.05)
high_thr = data['SalePrice'].quantile(0.95)

# 6 bins: [-inf, bin1, bin2, ..., high_thr, inf]
bins = np.linspace(low_thr, high_thr, 5)  # 5 splits → 6 classes total
data['SalePrice_cat'] = pd.cut(
    data['SalePrice'],
    bins=[-np.inf] + list(bins) + [np.inf],
    labels=list(range(6)),
    include_lowest=True
).astype(int)

y = data['SalePrice_cat']

# -----------------------------
# Features
# -----------------------------
X = data.drop(columns=['SalePrice','SalePrice_cat'])

# Separate numeric and categorical
numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object','category']).columns

# Simple preprocessing
X_numeric = X[numeric_cols].fillna(0)
X_categorical = pd.get_dummies(X[categorical_cols].fillna('NA'), drop_first=True)

# Combine
X = pd.concat([X_numeric, X_categorical], axis=1)

# Train/validation split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Optuna objective
# -----------------------------
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'multi:softmax',
        'num_class': len(np.unique(y)),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)
    
    preds = model.predict(X_valid)
    score = f1_score(y_valid, preds, average='macro')
    return score

# -----------------------------
# Run Optuna study
# -----------------------------
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# -----------------------------
# Evaluate best model
# -----------------------------
trial = study.best_trial
best_params = trial.params
best_params.update({
    'objective': 'multi:softmax',
    'num_class': len(np.unique(y)),
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'n_jobs': -1
})

final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)
preds = final_model.predict(X_valid)

f1 = f1_score(y_valid, preds, average='macro')
accuracy = accuracy_score(y_valid, preds)

print(f"Best Macro F1: {f1:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
print("Best params:")
for k,v in best_params.items():
    print(f"{k}: {v}")


[I 2025-11-19 00:29:54,846] A new study created in memory with name: no-name-8fad0b9f-676f-4964-9bbf-eb86e21479c7
[I 2025-11-19 00:30:01,730] Trial 0 finished with value: 0.7265593916821279 and parameters: {'max_depth': 8, 'learning_rate': 0.014956240861427562, 'n_estimators': 1268, 'subsample': 0.963387805698956, 'colsample_bytree': 0.7110826920989637, 'gamma': 3.0797603731922414, 'reg_alpha': 4.087908976264748, 'reg_lambda': 3.362994568964596, 'min_child_weight': 10}. Best is trial 0 with value: 0.7265593916821279.
[I 2025-11-19 00:30:09,661] Trial 1 finished with value: 0.7173819143678227 and parameters: {'max_depth': 8, 'learning_rate': 0.0775721778958637, 'n_estimators': 1877, 'subsample': 0.7015267336720495, 'colsample_bytree': 0.8699860411753922, 'gamma': 4.979541650264624, 'reg_alpha': 4.869607095191318, 'reg_lambda': 2.677653260486266, 'min_child_weight': 4}. Best is trial 0 with value: 0.7265593916821279.
[I 2025-11-19 00:30:16,341] Trial 2 finished with value: 0.749507245516

Validation Macro F1: 0.7512
Validation Accuracy: 0.8089
Best params:
max_depth: 4
learning_rate: 0.038176802201332814
n_estimators: 1095
subsample: 0.8222126602361571
colsample_bytree: 0.8658367967618699
gamma: 0.45671022262508265
reg_alpha: 1.3638497076073994
reg_lambda: 2.9746265793584445
min_child_weight: 4
objective: multi:softmax
num_class: 6
eval_metric: mlogloss
verbosity: 0
n_jobs: -1


In [None]:
# -----------------------------
# Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import optuna

# -----------------------------
# Load dataset
# -----------------------------
data = pd.read_csv("cleanedData.csv")

# -----------------------------
# Create 6 classes from SalePrice
# -----------------------------
low_thr = data['SalePrice'].quantile(0.05)
high_thr = data['SalePrice'].quantile(0.95)

# 6 bins: [-inf, bin1, bin2, ..., high_thr, inf]
bins = np.linspace(low_thr, high_thr, 5)  # 5 splits → 6 classes
data['SalePrice_cat'] = pd.cut(
    data['SalePrice'],
    bins=[-np.inf] + list(bins) + [np.inf],
    labels=list(range(6)),
    include_lowest=True
).astype(int)

y = data['SalePrice_cat']

# -----------------------------
# Features
# -----------------------------
X = data.drop(columns=['SalePrice','SalePrice_cat'])

numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object','category']).columns

X_numeric = X[numeric_cols].fillna(0)
X_categorical = pd.get_dummies(X[categorical_cols].fillna('NA'), drop_first=True)

X = pd.concat([X_numeric, X_categorical], axis=1)

# -----------------------------
# Train/validation split (stratified)
# -----------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# Compute class weights for imbalance
# -----------------------------
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
sample_weight_train = y_train.map(class_weights)
sample_weight_valid = y_valid.map(class_weights)

# -----------------------------
# Optuna objective
# -----------------------------
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'multi:softprob',
        'num_class': len(classes),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBClassifier(**param)
    model.fit(
        X_train, y_train,
        sample_weight=sample_weight_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )

    preds = model.predict(X_valid)
    score = f1_score(y_valid, preds, average='macro')
    return score

# -----------------------------
# Run Optuna study
# -----------------------------
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# -----------------------------
# Train final model with best params
# -----------------------------
best_params = study.best_trial.params
best_params.update({
    'objective': 'multi:softmax',
    'num_class': len(classes),
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'n_jobs': -1
})

final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train, sample_weight=sample_weight_train)

probs = final_model.predict_proba(X_valid)
preds = probs.argmax(axis=1)


f1 = f1_score(y_valid, preds, average='macro')
accuracy = accuracy_score(y_valid, preds)

print(f"Best Macro F1: {f1:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
print("Best params:")
for k,v in best_params.items():
    print(f"{k}: {v}")



[I 2025-11-19 10:27:37,084] A new study created in memory with name: no-name-2e19b39e-3181-4165-b086-6f40288374c2
[I 2025-11-19 10:27:46,495] Trial 0 finished with value: 0.7183433979929105 and parameters: {'max_depth': 11, 'learning_rate': 0.01386718035261599, 'n_estimators': 1843, 'subsample': 0.9250237772252465, 'colsample_bytree': 0.789841914274088, 'gamma': 3.4243131066317916, 'reg_alpha': 4.023219166317466, 'reg_lambda': 1.8944724405674795, 'min_child_weight': 5}. Best is trial 0 with value: 0.7183433979929105.
[I 2025-11-19 10:27:51,734] Trial 1 finished with value: 0.709787797401742 and parameters: {'max_depth': 10, 'learning_rate': 0.03946207896084128, 'n_estimators': 1243, 'subsample': 0.8538183040677659, 'colsample_bytree': 0.7709771816138817, 'gamma': 4.9379762407765355, 'reg_alpha': 3.2420762597764927, 'reg_lambda': 1.8547323164864238, 'min_child_weight': 4}. Best is trial 0 with value: 0.7183433979929105.
[I 2025-11-19 10:27:57,997] Trial 2 finished with value: 0.71026072

AxisError: axis 1 is out of bounds for array of dimension 1

In [9]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import optuna

data = pd.read_csv("cleanedData.csv")

low_thr = data['SalePrice'].quantile(0.05)
high_thr = data['SalePrice'].quantile(0.95)

middle_bins = np.logspace(np.log10(low_thr), np.log10(high_thr), 5)  # 5 splits → 6 classes


data['SalePrice_cat'] = pd.cut(
    data['SalePrice'],
    bins=[-np.inf] + list(middle_bins) + [np.inf],
    # labels=list(range(6)),
    labels = False,
    include_lowest=True
).astype(int)

y = data['SalePrice_cat']

X = data.drop(columns=['SalePrice','SalePrice_cat'])

numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object','category']).columns

X_numeric = X[numeric_cols].fillna(0)
X_categorical = pd.get_dummies(X[categorical_cols].fillna('NA'), drop_first=True)

X = pd.concat([X_numeric, X_categorical], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))
sample_weight_train = y_train.map(class_weights)
sample_weight_valid = y_valid.map(class_weights)

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'objective': 'multi:softprob',
        'num_class': len(classes),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBClassifier(**param)
    model.fit(
        X_train, y_train,
        sample_weight=sample_weight_train,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )

    preds = model.predict(X_valid)
    score = f1_score(y_valid, preds, average='macro')
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_trial.params
best_params.update({
    'objective': 'multi:softmax',
    'num_class': len(classes),
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'n_jobs': -1
})

final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train, sample_weight=sample_weight_train)

probs = final_model.predict_proba(X_valid)
preds = probs.argmax(axis=1)


f1 = f1_score(y_valid, preds, average='macro')
accuracy = accuracy_score(y_valid, preds)

print(f"Best Macro F1: {f1:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
print("Best params:")
for k,v in best_params.items():
    print(f"{k}: {v}")



[I 2025-11-19 10:40:16,244] A new study created in memory with name: no-name-5172fecb-fed6-4506-afbd-211d7481b488
[I 2025-11-19 10:40:21,656] Trial 0 finished with value: 0.6886372931850256 and parameters: {'max_depth': 8, 'learning_rate': 0.10284538487960916, 'n_estimators': 1112, 'subsample': 0.9699872774281526, 'colsample_bytree': 0.719085494399063, 'gamma': 4.694528252684704, 'reg_alpha': 1.9269164826617602, 'reg_lambda': 4.59010454734578, 'min_child_weight': 3}. Best is trial 0 with value: 0.6886372931850256.
[I 2025-11-19 10:40:29,571] Trial 1 finished with value: 0.6680453454987081 and parameters: {'max_depth': 10, 'learning_rate': 0.026996810087829087, 'n_estimators': 1686, 'subsample': 0.7087068045387916, 'colsample_bytree': 0.7623769237356706, 'gamma': 3.7937495084343387, 'reg_alpha': 3.8634479899399605, 'reg_lambda': 4.29608569586166, 'min_child_weight': 5}. Best is trial 0 with value: 0.6886372931850256.
[I 2025-11-19 10:40:35,219] Trial 2 finished with value: 0.68954213167

Best Macro F1: 0.7338
Validation Accuracy: 0.7543
Best params:
max_depth: 10
learning_rate: 0.043671146723953086
n_estimators: 1102
subsample: 0.724773524836148
colsample_bytree: 0.7944967628540945
gamma: 0.49669568574639034
reg_alpha: 0.8427195146571074
reg_lambda: 1.3209460414698326
min_child_weight: 7
objective: multi:softmax
num_class: 6
eval_metric: mlogloss
verbosity: 0
n_jobs: -1


In [13]:
# best_ames_xgb.py
# -----------------------------
# Imports
# -----------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import optuna
import warnings
warnings.filterwarnings("ignore")
# Optional: SMOTE (only if you choose to use it)
try:
    from imblearn.over_sampling import SMOTE
    HAVE_SMOTE = True
except Exception:
    HAVE_SMOTE = False

RANDOM_STATE = 42

# -----------------------------
# Load dataset
# -----------------------------
data = pd.read_csv("cleanedData.csv")

# -----------------------------
# Improved binning (imbalanced with log-spaced middle bins)
# - Keep first & last 5% as extreme bins
# - Middle bins: log-spaced between 5th and 95th percentiles
# -----------------------------
n_classes = 6            # change to 6-10 as teacher allows
tail_pct = 0.05          # tails (first/last) fraction

low_thr = data['SalePrice'].quantile(tail_pct)
high_thr = data['SalePrice'].quantile(1 - tail_pct)

# If low_thr <= 0 (rare for prices), fallback to linear spacing
if low_thr <= 0:
    middle_bins = np.linspace(low_thr, high_thr, n_classes - 1)
else:
    middle_bins = np.logspace(
        np.log10(max(low_thr, 1e-6)),
        np.log10(high_thr),
        n_classes - 1
    )

bins = [-np.inf] + list(middle_bins) + [np.inf]

data['SalePrice_cat'] = pd.cut(
    data['SalePrice'],
    bins=bins,
    labels=False,
    include_lowest=True
).astype(int)

# sanity: ensure labels are 0..n_classes-1
assert data['SalePrice_cat'].nunique() == n_classes, "Number of bins != n_classes; adjust thresholds"

y = data['SalePrice_cat']

# -----------------------------
# Features: rare-grouping + one-hot for categoricals
# -----------------------------
X = data.drop(columns=['SalePrice', 'SalePrice_cat'])

# identify categorical columns (strings / object / category)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Collapse very rare categories
RARE_THRESHOLD = 15   # categories with <15 samples will be grouped into "RARE"
for col in categorical_cols:
    vc = X[col].value_counts()
    rare_vals = vc[vc < RARE_THRESHOLD].index
    if len(rare_vals) > 0:
        X[col] = X[col].fillna('NA').replace(rare_vals, 'RARE')
    else:
        X[col] = X[col].fillna('NA')

# Numeric: fillna with median
X_num = X[numeric_cols].fillna(X[numeric_cols].median())

# One-hot encode categoricals (drop_first=False to keep full info for tree models)
X_cat = pd.get_dummies(X[categorical_cols], drop_first=False)

X = pd.concat([X_num, X_cat], axis=1)

# -----------------------------
# Train/validation split (stratified)
# -----------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)

# -----------------------------
# Optional: SMOTE on training only (comment/uncomment to try)
# -----------------------------
USE_SMOTE = False  # try toggling True/False
if USE_SMOTE and HAVE_SMOTE:
    sm = SMOTE(random_state=RANDOM_STATE)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    print("After SMOTE, class counts:", np.bincount(y_train))

# -----------------------------
# Compute base class weights (inverse-frequency)
# -----------------------------
classes = np.unique(y_train)
base_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, base_weights))

# vectorized sample weights for validation too (for consistent eval metric if needed)
sample_weight_train_base = y_train.map(lambda c: class_weights[int(c)]) if hasattr(y_train, "map") else pd.Series(y_train).map(lambda c: class_weights[int(c)])

# -----------------------------
# Optuna objective: includes tuning of a weight multiplier
# -----------------------------
def objective(trial):
    # hyperparameter search space
    param = {
        'max_depth': trial.suggest_int('max_depth', 6, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.06, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'objective': 'multi:softprob',           # probabilities -> argmax
        'num_class': len(classes),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'tree_method': 'hist',
        'n_jobs': -1,
        'random_state': RANDOM_STATE,
    }

    # allow Optuna to tune a multiplier for class weights (sometimes inverse-frequency isn't enough)
    weight_factor = trial.suggest_float('weight_factor', 0.5, 4.0)
    # compute sample weights for current trial
    sample_weight = None
    try:
        # y_train could be numpy array or pandas Series
        y_train_series = pd.Series(y_train).reset_index(drop=True)
        sample_weight = y_train_series.map(lambda c: class_weights[int(c)] * weight_factor).to_numpy()
    except Exception:
        sample_weight = None

    model = xgb.XGBClassifier(**param)
    # use early stopping on validation with eval metric mlogloss
    model.fit(
        X_train, y_train,
        sample_weight=sample_weight,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )

    probs = model.predict_proba(X_valid)
    preds = np.argmax(probs, axis=1)
    score = f1_score(y_valid, preds, average='macro')

    # use Optuna pruning if you want (not mandatory)
    return score

# -----------------------------
# Run Optuna study
# -----------------------------
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE))
study.optimize(objective, n_trials=60, n_jobs=1)

print("Best trial:")
print(study.best_trial.params)

# -----------------------------
# Train final model with best params
# -----------------------------
best_params = study.best_trial.params.copy()
best_params.update({
    'objective': 'multi:softprob',
    'num_class': len(classes),
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'tree_method': 'hist',
    'n_jobs': -1,
    'random_state': RANDOM_STATE,
})

# extract best weight_factor (if present) and compute final sample_weight
weight_factor = best_params.pop('weight_factor', 1.0)
# ensure keys that are not valid XGBClassifier args are removed (Optuna may store them)
valid_keys = {k for k in xgb.XGBClassifier().get_params().keys()}
# keep only valid args
best_params = {k: v for k, v in best_params.items() if k in valid_keys}
final_model = xgb.XGBClassifier(**best_params)

# sample weights for final training
y_train_series = pd.Series(y_train).reset_index(drop=True)
sample_weight_final = y_train_series.map(lambda c: class_weights[int(c)] * weight_factor).to_numpy()

final_model.fit(
    X_train, y_train,
    sample_weight=sample_weight_final,
    eval_set=[(X_valid, y_valid)],
    verbose=False
)

# predictions & metrics
probs = final_model.predict_proba(X_valid)
preds = np.argmax(probs, axis=1)

f1 = f1_score(y_valid, preds, average='macro')
acc = accuracy_score(y_valid, preds)

print(f"Final Macro F1: {f1:.4f}")
print(f"Final Accuracy: {acc:.4f}")
print("\nClassification report (per-class):\n")
print(classification_report(y_valid, preds, digits=4))

# save model if desired
# final_model.save_model("best_ames_xgb.json")


[I 2025-11-19 10:49:08,619] A new study created in memory with name: no-name-d4d5c9a3-2440-4b20-a7d0-881dd21ff610
[I 2025-11-19 10:49:17,627] Trial 0 finished with value: 0.7189271267267126 and parameters: {'max_depth': 9, 'learning_rate': 0.05492872600571018, 'n_estimators': 1518, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'reg_alpha': 0.2904180608409973, 'reg_lambda': 4.330880728874676, 'min_child_weight': 13, 'max_delta_step': 7, 'grow_policy': 'lossguide', 'weight_factor': 3.413549242801476}. Best is trial 0 with value: 0.7189271267267126.
[I 2025-11-19 10:49:25,921] Trial 1 finished with value: 0.721143198637157 and parameters: {'max_depth': 7, 'learning_rate': 0.013851197621057671, 'n_estimators': 530, 'subsample': 0.7216968971838151, 'colsample_bytree': 0.762378215816119, 'gamma': 2.1597250932105787, 'reg_alpha': 1.4561457009902097, 'reg_lambda': 3.0592644736118975, 'min_child_weight': 3, 'max_delta_step': 3, 'grow_polic

Best trial:
{'max_depth': 12, 'learning_rate': 0.053120016520649654, 'n_estimators': 214, 'subsample': 0.7500351271418565, 'colsample_bytree': 0.5619197489978544, 'gamma': 0.9437325276687885, 'reg_alpha': 4.97019409520938, 'reg_lambda': 0.30072887829763095, 'min_child_weight': 20, 'max_delta_step': 9, 'grow_policy': 'lossguide', 'weight_factor': 2.098402420089419}
Final Macro F1: 0.7362
Final Accuracy: 0.7611

Classification report (per-class):

              precision    recall  f1-score   support

           0     0.6562    0.7241    0.6885        29
           1     0.6591    0.6988    0.6784        83
           2     0.8186    0.7840    0.8010       213
           3     0.7785    0.7885    0.7834       156
           4     0.7467    0.7368    0.7417        76
           5     0.7241    0.7241    0.7241        29

    accuracy                         0.7611       586
   macro avg     0.7305    0.7427    0.7362       586
weighted avg     0.7633    0.7611    0.7619       586



In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.cluster import KMeans
import xgboost as xgb
import optuna

data = pd.read_csv("cleanedData.csv")

n_bins = 6
y_values = np.array(data["SalePrice"]).reshape(-1,1)
kmeans = KMeans(n_clusters=n_bins, random_state=42).fit(y_values)
data['SalePrice_cat'] = kmeans.labels_
y = data['SalePrice_cat']

X = data.drop(columns=['SalePrice','SalePrice_cat'])
categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

RARE_THRESHOLD = 15
for col in categorical_cols:
    vc = X[col].value_counts()
    rare_vals = vc[vc < RARE_THRESHOLD].index
    X[col] = X[col].fillna('NA').replace(rare_vals, 'RARE')

X_num = X[numeric_cols].fillna(X[numeric_cols].median())
X_cat = pd.get_dummies(X[categorical_cols], drop_first=False)
X = pd.concat([X_num, X_cat], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

classes = np.unique(y_train)
base_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, base_weights))
y_train_series = pd.Series(y_train).reset_index(drop=True)
sample_weight_train_base = y_train_series.map(lambda c: class_weights[int(c)]).to_numpy()

def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 6, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.06, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'objective': 'multi:softprob',
        'num_class': len(classes),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'tree_method': 'hist',
        'n_jobs': -1,
        'random_state': 42,
    }
    weight_factor = trial.suggest_float('weight_factor', 0.5, 4.0)
    sample_weight = y_train_series.map(lambda c: class_weights[int(c)] * weight_factor).to_numpy()
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train, sample_weight=sample_weight, eval_set=[(X_valid, y_valid)], verbose=False)
    probs = model.predict_proba(X_valid)
    preds = np.argmax(probs, axis=1)
    return f1_score(y_valid, preds, average='macro')

study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100, n_jobs=1)

best_params = study.best_trial.params.copy()
best_params.update({'objective':'multi:softprob','num_class':len(classes),'eval_metric':'mlogloss','verbosity':0,'tree_method':'hist','n_jobs':-1,'random_state':42})
weight_factor = best_params.pop('weight_factor',1.0)
valid_keys = {k for k in xgb.XGBClassifier().get_params().keys()}
best_params = {k:v for k,v in best_params.items() if k in valid_keys}

final_model = xgb.XGBClassifier(**best_params)
sample_weight_final = y_train_series.map(lambda c: class_weights[int(c)] * weight_factor).to_numpy()
final_model.fit(X_train, y_train, sample_weight=sample_weight_final, eval_set=[(X_valid, y_valid)], verbose=False)
probs = final_model.predict_proba(X_valid)
preds = np.argmax(probs, axis=1)

f1 = f1_score(y_valid, preds, average='macro')
acc = accuracy_score(y_valid, preds)

print(f"Final Macro F1: {f1:.4f}")
print(f"Final Accuracy: {acc:.4f}")


[I 2025-11-19 11:12:17,993] A new study created in memory with name: no-name-cb492364-16bd-4beb-bae3-d0b7b8df2b75
[I 2025-11-19 11:12:28,628] Trial 0 finished with value: 0.7535998628729174 and parameters: {'max_depth': 9, 'learning_rate': 0.05492872600571018, 'n_estimators': 1518, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'reg_alpha': 0.2904180608409973, 'reg_lambda': 4.330880728874676, 'min_child_weight': 13, 'max_delta_step': 7, 'grow_policy': 'lossguide', 'weight_factor': 3.413549242801476}. Best is trial 0 with value: 0.7535998628729174.
[I 2025-11-19 11:12:38,479] Trial 1 finished with value: 0.7633768519013887 and parameters: {'max_depth': 7, 'learning_rate': 0.013851197621057671, 'n_estimators': 530, 'subsample': 0.7216968971838151, 'colsample_bytree': 0.762378215816119, 'gamma': 2.1597250932105787, 'reg_alpha': 1.4561457009902097, 'reg_lambda': 3.0592644736118975, 'min_child_weight': 3, 'max_delta_step': 3, 'grow_poli

Final Macro F1: 0.7831
Final Accuracy: 0.8038


In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.cluster import KMeans
import xgboost as xgb
import optuna

# Load dataset
data = pd.read_csv("cleanedData.csv")

# KMeans binning
n_bins = 6
y_values = data["SalePrice"].values.reshape(-1,1)
kmeans = KMeans(n_clusters=n_bins, random_state=42).fit(y_values)
data['SalePrice_cat'] = kmeans.labels_
y = data['SalePrice_cat']

# Features
X = data.drop(columns=['SalePrice','SalePrice_cat'])
categorical_cols = X.select_dtypes(include=['object','category']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Rare category handling
RARE_THRESHOLD = 15
for col in categorical_cols:
    vc = X[col].value_counts()
    rare_vals = vc[vc < RARE_THRESHOLD].index
    X[col] = X[col].fillna('NA').replace(rare_vals, 'RARE')

# Log-transform skewed numeric features
skewed = ['LotArea', 'GrLivArea', 'TotalBsmtSF']  # expand if needed
for col in skewed:
    if col in numeric_cols:
        X[col] = np.log1p(X[col])

X_num = X[numeric_cols].fillna(X[numeric_cols].median())
X_cat = pd.get_dummies(X[categorical_cols], drop_first=False)
X = pd.concat([X_num, X_cat], axis=1)

# Train/validation split (stratified)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Class weights
classes = np.unique(y_train)
base_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, base_weights))
y_train_series = pd.Series(y_train).reset_index(drop=True)

# Optuna objective
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 6, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.06, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 200, 2000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 10),
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'objective': 'multi:softprob',
        'num_class': len(classes),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'tree_method': 'hist',
        'n_jobs': -1,
        'random_state': 42,
    }
    weight_factor = trial.suggest_float('weight_factor', 0.5, 4.0)
    sample_weight = y_train_series.map(lambda c: class_weights[int(c)] * weight_factor).to_numpy()
    model = xgb.XGBClassifier(**param)
    model.fit(X_train, y_train, sample_weight=sample_weight, eval_set=[(X_valid, y_valid)], verbose=False)
    probs = model.predict_proba(X_valid)
    
    # Per-class threshold tuning (simplified, single iteration)
    preds = np.argmax(probs, axis=1)
    return f1_score(y_valid, preds, average='macro')

# Optuna study
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100, n_jobs=1)

# Best params
best_params = study.best_trial.params.copy()
best_params.update({'objective':'multi:softprob','num_class':len(classes),'eval_metric':'mlogloss','verbosity':0,'tree_method':'hist','n_jobs':-1,'random_state':42})
weight_factor = best_params.pop('weight_factor',1.0)
valid_keys = {k for k in xgb.XGBClassifier().get_params().keys()}
best_params = {k:v for k,v in best_params.items() if k in valid_keys}

# Train final model
final_model = xgb.XGBClassifier(**best_params)
sample_weight_final = y_train_series.map(lambda c: class_weights[int(c)] * weight_factor).to_numpy()
final_model.fit(X_train, y_train, sample_weight=sample_weight_final, eval_set=[(X_valid, y_valid)], verbose=False)
probs = final_model.predict_proba(X_valid)

# Optional: fine-tune thresholds per class for macro-F1
preds = np.argmax(probs, axis=1)
f1 = f1_score(y_valid, preds, average='macro')
acc = accuracy_score(y_valid, preds)

print(f"Final Macro F1: {f1:.4f}")
print(f"Final Accuracy: {acc:.4f}")


[I 2025-11-19 11:57:12,795] A new study created in memory with name: no-name-2ab42d4f-674a-4193-a4b8-91e049bfebf1
[I 2025-11-19 11:57:23,614] Trial 0 finished with value: 0.7535998628729174 and parameters: {'max_depth': 9, 'learning_rate': 0.05492872600571018, 'n_estimators': 1518, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.5780093202212182, 'gamma': 0.7799726016810132, 'reg_alpha': 0.2904180608409973, 'reg_lambda': 4.330880728874676, 'min_child_weight': 13, 'max_delta_step': 7, 'grow_policy': 'lossguide', 'weight_factor': 3.413549242801476}. Best is trial 0 with value: 0.7535998628729174.
[I 2025-11-19 11:57:34,136] Trial 1 finished with value: 0.7633768519013887 and parameters: {'max_depth': 7, 'learning_rate': 0.013851197621057671, 'n_estimators': 530, 'subsample': 0.7216968971838151, 'colsample_bytree': 0.762378215816119, 'gamma': 2.1597250932105787, 'reg_alpha': 1.4561457009902097, 'reg_lambda': 3.0592644736118975, 'min_child_weight': 3, 'max_delta_step': 3, 'grow_poli

Final Macro F1: 0.7831
Final Accuracy: 0.8038
