In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb
import optuna

# Load the pre-cleaned dataset from the working directory.
data = pd.read_csv("cleanedData.csv")

# Bucket SalePrice into three quantile-based classes and encode them as ints.
# NOTE(review): LabelEncoder is expected to number classes in sorted label
# order, so the integer codes probably do not follow low < medium < high --
# confirm if the ordinal meaning of the codes matters downstream.
le = LabelEncoder()
data['SalePrice_cat'] = le.fit_transform(
    pd.qcut(data['SalePrice'], q=3, labels=['low','medium','high'])
)

# Target: the encoded price class.
y = data['SalePrice_cat']

# Features: every numeric column except the price and its derived class,
# with missing values replaced by 0.
X = (
    data
    .drop(columns=['SalePrice','SalePrice_cat'])
    .select_dtypes(include=[np.number])
    .fillna(0)
)

# Stratified 80/20 split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

def objective(trial):
    """Optuna objective: fit an XGBoost classifier and score it on the validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters in ``param``.

    Returns
    -------
    float
        Macro-averaged F1 of the predictions on ``X_valid`` (higher is better).

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid``,
    ``y_valid`` and ``y``.
    """
    param = {
        # Search space.
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        # Fixed settings shared by every trial.
        'objective': 'multi:softmax',
        'num_class': len(np.unique(y)),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBClassifier(**param)
    # No eval_set is passed: no early stopping is configured, so a per-round
    # validation log-loss would be computed and then discarded, which only
    # slows each trial down. The fitted trees are the same either way.
    model.fit(X_train, y_train)

    preds = model.predict(X_valid)
    score = f1_score(y_valid, preds, average='macro')
    return score

# Seed the sampler explicitly so that a Restart & Run All proposes the same
# sequence of hyperparameters; without a seed each run explores different
# trials and reports different "best" parameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Report the best trial's score and the hyperparameters that produced it.
print("Best trial:")
trial = study.best_trial
print(f"  F1 Macro: {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


[I 2025-11-19 00:17:48,205] A new study created in memory with name: no-name-68c2cf4b-8c92-40a6-bde9-e0854348249c
[I 2025-11-19 00:17:51,037] Trial 0 finished with value: 0.8788987679711893 and parameters: {'max_depth': 10, 'learning_rate': 0.016331677839982323, 'n_estimators': 1315, 'subsample': 0.6012932818252686, 'colsample_bytree': 0.9765642359752357, 'gamma': 1.4255833408684815, 'reg_alpha': 2.6552386461305755, 'reg_lambda': 0.47767252579835107, 'min_child_weight': 5}. Best is trial 0 with value: 0.8788987679711893.
[I 2025-11-19 00:17:52,729] Trial 1 finished with value: 0.8736754015313858 and parameters: {'max_depth': 4, 'learning_rate': 0.10557961791099803, 'n_estimators': 1258, 'subsample': 0.8865227393047312, 'colsample_bytree': 0.8175212668212135, 'gamma': 1.9658459789902833, 'reg_alpha': 3.3748515589812103, 'reg_lambda': 2.9911151078410025, 'min_child_weight': 5}. Best is trial 0 with value: 0.8788987679711893.
[I 2025-11-19 00:17:55,252] Trial 2 finished with value: 0.8739

Best trial:
  F1 Macro: 0.8908814000619808
  Params: 
    max_depth: 4
    learning_rate: 0.029344220574932155
    n_estimators: 1474
    subsample: 0.6523967504370884
    colsample_bytree: 0.6507529155778509
    gamma: 1.425995061959762
    reg_alpha: 1.0220987971667634
    reg_lambda: 1.641987590191411
    min_child_weight: 9


In [2]:
from sklearn.metrics import accuracy_score
# Build the final parameter set as a new dict rather than calling .update()
# on trial.params, which would modify the trial object's own parameter dict.
best_params = {
    **trial.params,
    'objective': 'multi:softmax',
    'num_class': len(np.unique(y)),
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'n_jobs': -1,
}

# Refit on the training split with the tuned hyperparameters.
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)
preds = final_model.predict(X_valid)

# NOTE(review): X_valid / y_valid were also used to select the
# hyperparameters, so this accuracy is likely optimistic; a separate held-out
# test split would give an unbiased estimate.
accuracy = accuracy_score(y_valid, preds)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.8908


In [None]:
# -----------------------------
# Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
import xgboost as xgb
import optuna

# -----------------------------
# Load dataset
# -----------------------------
data = pd.read_csv("cleanedData.csv")

# -----------------------------
# Bin SalePrice into 6 classes
# -----------------------------
# Five evenly spaced edges between the 5th and 95th percentiles, plus two
# open-ended outer bins that absorb everything outside that range.
low_thr, high_thr = (
    data['SalePrice'].quantile(0.05),
    data['SalePrice'].quantile(0.95),
)
bins = np.linspace(low_thr, high_thr, 5)  # 5 edges + (-inf, inf) -> 6 bins

# NOTE(review): .astype(int) presumably fails if SalePrice contains NaN --
# confirm the cleaned data has no missing prices.
data['SalePrice_cat'] = pd.cut(
    data['SalePrice'],
    bins=[-np.inf, *bins, np.inf],
    labels=list(range(6)),
    include_lowest=True,
).astype(int)

y = data['SalePrice_cat']

# -----------------------------
# Features
# -----------------------------
X = data.drop(columns=['SalePrice','SalePrice_cat'])

# Column names split by dtype.
numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object','category']).columns

# Numeric: missing -> 0. Categorical: missing -> 'NA', then one-hot encoded
# with the first level of each column dropped.
X_numeric = X.loc[:, numeric_cols].fillna(0)
X_categorical = pd.get_dummies(
    X.loc[:, categorical_cols].fillna('NA'),
    drop_first=True,
)

# Re-assemble the design matrix column-wise.
X = pd.concat([X_numeric, X_categorical], axis=1)

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# -----------------------------
# Optuna objective
# -----------------------------
def objective(trial):
    """Optuna objective: fit an XGBoost classifier and score it on the validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters in ``param``.

    Returns
    -------
    float
        Macro-averaged F1 of the predictions on ``X_valid`` (higher is better).

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid``,
    ``y_valid`` and ``y``.
    """
    param = {
        # Search space.
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        # Fixed settings shared by every trial.
        'objective': 'multi:softmax',
        'num_class': len(np.unique(y)),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBClassifier(**param)
    # No eval_set is passed: no early stopping is configured, so a per-round
    # validation log-loss would be computed and then discarded, which only
    # slows each trial down. The fitted trees are the same either way.
    model.fit(X_train, y_train)

    preds = model.predict(X_valid)
    score = f1_score(y_valid, preds, average='macro')
    return score

# -----------------------------
# Run Optuna study
# -----------------------------
# Seed the sampler explicitly so that a Restart & Run All proposes the same
# sequence of hyperparameters; without a seed each run explores different
# trials and reports different "best" parameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# -----------------------------
# Evaluate best model
# -----------------------------
trial = study.best_trial

# Build the final parameter set as a new dict rather than calling .update()
# on trial.params, which would modify the trial object's own parameter dict.
best_params = {
    **trial.params,
    'objective': 'multi:softmax',
    'num_class': len(np.unique(y)),
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'n_jobs': -1,
}

# Refit on the training split with the tuned hyperparameters.
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train)
preds = final_model.predict(X_valid)

# NOTE(review): X_valid / y_valid were also used to select the
# hyperparameters, so these scores are likely optimistic; a separate held-out
# test split would give an unbiased estimate.
f1 = f1_score(y_valid, preds, average='macro')
accuracy = accuracy_score(y_valid, preds)

print(f"Best Macro F1: {f1:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
print("Best params:")
for k,v in best_params.items():
    print(f"{k}: {v}")


[I 2025-11-19 00:29:54,846] A new study created in memory with name: no-name-8fad0b9f-676f-4964-9bbf-eb86e21479c7
[I 2025-11-19 00:30:01,730] Trial 0 finished with value: 0.7265593916821279 and parameters: {'max_depth': 8, 'learning_rate': 0.014956240861427562, 'n_estimators': 1268, 'subsample': 0.963387805698956, 'colsample_bytree': 0.7110826920989637, 'gamma': 3.0797603731922414, 'reg_alpha': 4.087908976264748, 'reg_lambda': 3.362994568964596, 'min_child_weight': 10}. Best is trial 0 with value: 0.7265593916821279.
[I 2025-11-19 00:30:09,661] Trial 1 finished with value: 0.7173819143678227 and parameters: {'max_depth': 8, 'learning_rate': 0.0775721778958637, 'n_estimators': 1877, 'subsample': 0.7015267336720495, 'colsample_bytree': 0.8699860411753922, 'gamma': 4.979541650264624, 'reg_alpha': 4.869607095191318, 'reg_lambda': 2.677653260486266, 'min_child_weight': 4}. Best is trial 0 with value: 0.7265593916821279.
[I 2025-11-19 00:30:16,341] Trial 2 finished with value: 0.749507245516

Validation Macro F1: 0.7512
Validation Accuracy: 0.8089
Best params:
max_depth: 4
learning_rate: 0.038176802201332814
n_estimators: 1095
subsample: 0.8222126602361571
colsample_bytree: 0.8658367967618699
gamma: 0.45671022262508265
reg_alpha: 1.3638497076073994
reg_lambda: 2.9746265793584445
min_child_weight: 4
objective: multi:softmax
num_class: 6
eval_metric: mlogloss
verbosity: 0
n_jobs: -1


In [5]:
# -----------------------------
# Imports
# -----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb
import optuna

# -----------------------------
# Load dataset
# -----------------------------
data = pd.read_csv("cleanedData.csv")

# -----------------------------
# Bin SalePrice into 6 classes
# -----------------------------
# Five evenly spaced edges between the 5th and 95th percentiles, plus two
# open-ended outer bins that absorb everything outside that range.
low_thr, high_thr = (
    data['SalePrice'].quantile(0.05),
    data['SalePrice'].quantile(0.95),
)
bins = np.linspace(low_thr, high_thr, 5)  # 5 edges + (-inf, inf) -> 6 bins

# NOTE(review): .astype(int) presumably fails if SalePrice contains NaN --
# confirm the cleaned data has no missing prices.
data['SalePrice_cat'] = pd.cut(
    data['SalePrice'],
    bins=[-np.inf, *bins, np.inf],
    labels=list(range(6)),
    include_lowest=True,
).astype(int)

y = data['SalePrice_cat']

# -----------------------------
# Features
# -----------------------------
X = data.drop(columns=['SalePrice','SalePrice_cat'])

# Column names split by dtype.
numeric_cols = X.select_dtypes(include=[np.number]).columns
categorical_cols = X.select_dtypes(include=['object','category']).columns

# Numeric: missing -> 0. Categorical: missing -> 'NA', then one-hot encoded
# with the first level of each column dropped.
X_numeric = X.loc[:, numeric_cols].fillna(0)
X_categorical = pd.get_dummies(
    X.loc[:, categorical_cols].fillna('NA'),
    drop_first=True,
)

X = pd.concat([X_numeric, X_categorical], axis=1)

# -----------------------------
# Train/validation split (stratified)
# -----------------------------
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# -----------------------------
# Per-sample weights to counter class imbalance
# -----------------------------
# 'balanced' class weights are derived from the training labels only and then
# looked up for every row of each split.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {label: weight for label, weight in zip(classes, weights)}
sample_weight_train = y_train.map(class_weights)
# Not referenced by the other code visible in this cell.
sample_weight_valid = y_valid.map(class_weights)

# -----------------------------
# Optuna objective
# -----------------------------
def objective(trial):
    """Optuna objective: fit a class-weighted XGBoost classifier and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters in ``param``.

    Returns
    -------
    float
        Macro-averaged F1 of the predictions on ``X_valid`` (higher is better).

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid``,
    ``y_valid``, ``classes`` and ``sample_weight_train``.
    """
    param = {
        # Search space.
        'max_depth': trial.suggest_int('max_depth', 4, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 5),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        # Fixed settings shared by every trial.
        'objective': 'multi:softmax',
        'num_class': len(classes),
        'eval_metric': 'mlogloss',
        'verbosity': 0,
        'n_jobs': -1
    }

    model = xgb.XGBClassifier(**param)
    # No eval_set is passed: no early stopping is configured, so a per-round
    # validation log-loss would be computed and then discarded, which only
    # slows each trial down. The fitted trees are the same either way.
    model.fit(
        X_train, y_train,
        sample_weight=sample_weight_train,
    )

    preds = model.predict(X_valid)
    score = f1_score(y_valid, preds, average='macro')
    return score

# -----------------------------
# Run Optuna study
# -----------------------------
# Seed the sampler explicitly so that a Restart & Run All proposes the same
# sequence of hyperparameters; without a seed each run explores different
# trials and reports different "best" parameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# -----------------------------
# Train final model with best params
# -----------------------------
# Tuned hyperparameters plus the fixed settings used during the search.
best_params = {
    **study.best_trial.params,
    'objective': 'multi:softmax',
    'num_class': len(classes),
    'eval_metric': 'mlogloss',
    'verbosity': 0,
    'n_jobs': -1,
}

# Refit on the training split with the same class-balancing sample weights
# that were used inside the objective.
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train, y_train, sample_weight=sample_weight_train)

# NOTE(review): X_valid / y_valid were also used to select the
# hyperparameters, so these scores are likely optimistic; a separate held-out
# test split would give an unbiased estimate.
preds = final_model.predict(X_valid)
f1 = f1_score(y_valid, preds, average='macro')
accuracy = accuracy_score(y_valid, preds)

print(f"Best Macro F1: {f1:.4f}")
print(f"Validation Accuracy: {accuracy:.4f}")
print("Best params:")
for k,v in best_params.items():
    print(f"{k}: {v}")


[I 2025-11-19 00:49:02,499] A new study created in memory with name: no-name-45cfdbe2-518a-414b-b7f0-27fcb9b53f27
[I 2025-11-19 00:49:09,225] Trial 0 finished with value: 0.7199315918919859 and parameters: {'max_depth': 4, 'learning_rate': 0.048490067420842116, 'n_estimators': 1361, 'subsample': 0.7637631331697291, 'colsample_bytree': 0.9622802180632417, 'gamma': 2.6897265650306226, 'reg_alpha': 2.674030322961447, 'reg_lambda': 3.1050806141031617, 'min_child_weight': 5}. Best is trial 0 with value: 0.7199315918919859.
[I 2025-11-19 00:49:15,956] Trial 1 finished with value: 0.7170792009500012 and parameters: {'max_depth': 10, 'learning_rate': 0.04309523348786515, 'n_estimators': 1467, 'subsample': 0.9289337139234339, 'colsample_bytree': 0.8619369726479871, 'gamma': 1.5297391615952494, 'reg_alpha': 4.122276452105427, 'reg_lambda': 4.863213490031834, 'min_child_weight': 9}. Best is trial 0 with value: 0.7199315918919859.
[I 2025-11-19 00:49:21,084] Trial 2 finished with value: 0.70509476

Best Macro F1: 0.7662
Validation Accuracy: 0.8020
Best params:
max_depth: 6
learning_rate: 0.08894941189312122
n_estimators: 1014
subsample: 0.9149061944128936
colsample_bytree: 0.905038464632803
gamma: 0.06364527964376628
reg_alpha: 1.4827641953812911
reg_lambda: 0.9288061117489679
min_child_weight: 7
objective: multi:softmax
num_class: 6
eval_metric: mlogloss
verbosity: 0
n_jobs: -1
