# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# === Tuned XGBoost hyperparameters ===
# Unpacked verbatim into the XGBClassifier constructor further down.
# The np.float64 wrappers are kept exactly as given (presumably pasted from
# the output of a hyperparameter search -- confirm if it matters).
best_params = dict(
    subsample=0.8,                       # row subsampling ratio per tree
    reg_lambda=3.07,                     # L2 regularisation weight
    reg_alpha=np.float64(0.738),         # L1 regularisation weight
    n_estimators=352,                    # number of boosting rounds
    min_child_weight=4.5,
    max_depth=3,
    learning_rate=np.float64(0.1805),
    gamma=np.float64(0.5025),            # minimum loss reduction to split
    colsample_bytree=0.6,                # column subsampling ratio per tree
)

# === Read the train / test CSVs from the working directory ===
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

# === Separate the target from the predictors ===
# `id` is left out of the feature matrices; test['id'] is reused later when
# the submission frame is built.
y_train = train['WeightCategory']
X_train = train.drop(columns=['id', 'WeightCategory'])
X_test = test.drop(columns=['id'])

# === One-hot encode categorical features (drop_first like your reference) ===
# Train and test must share one category set per column *before* encoding.
# With drop_first=True, get_dummies drops the first level it sees in each
# frame; if test happens to lack train's first level, a different level is
# dropped there, and the reindex below would then silently encode those test
# rows as train's baseline category.
cat_cols = X_train.select_dtypes(exclude=['number', 'bool']).columns
X_train = X_train.astype({col: 'category' for col in cat_cols})
# Reuse train's CategoricalDtype so test gets identical levels in identical
# order; values never seen in train become NaN and encode as all-zero dummies.
X_test = X_test.astype(
    {col: X_train[col].dtype for col in cat_cols if col in X_test.columns}
)

X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# === Align columns: ensures test has same columns as train ===
# Kept as a safety net: drops test-only columns and zero-fills any column
# that exists only in train.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# === Encode target labels as integer class codes ===
# `le` is kept so predictions can be mapped back to the original labels.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

# === Standardise features ===
# Scaling statistics are learned from the training matrix only and then
# applied unchanged to both train and test.
scaler = StandardScaler().fit(X_train)
X_train_s, X_test_s = scaler.transform(X_train), scaler.transform(X_test)

# === Build model with your best params and fixed options ===
# `use_label_encoder` is deliberately not passed: the target is already
# integer-encoded with scikit-learn's LabelEncoder before fitting, and the
# flag is deprecated in XGBoost (newer releases no longer use it and warn
# about it when it is supplied).
model = XGBClassifier(
    objective='multi:softprob',   # per-class probabilities for multiclass
    eval_metric='mlogloss',
    random_state=42,              # fixed seed for reproducible runs
    n_jobs=-1,                    # use all available cores
    tree_method='hist',
    **best_params,
)

# === Fit on the complete (scaled) training set ===
print("Training model with best_params...")
model.fit(X_train_s, y_train_enc)

# === Predict encoded classes for the test set, then decode to labels ===
y_pred = model.predict(X_test_s)
y_pred_labels = le.inverse_transform(y_pred)

# === Assemble the submission (id + predicted label) and write it to CSV ===
filename = 'submission_best_params.csv'
submission = pd.DataFrame({'id': test['id'], 'WeightCategory': y_pred_labels})
submission.to_csv(filename, index=False)  # index=False: only the two columns are written
print(f"Saved submission file: {filename}")
