In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Load the training and test data
train = pd.read_csv("train.csv")
test  = pd.read_csv("test.csv")

# =============================================
# Separate features and target
# =============================================
X_train = train.drop(['id', 'WeightCategory'], axis=1)
y_train = train['WeightCategory']

X_test = test.drop(['id'], axis=1)  # Only features, no target

# One-hot encode categorical features
X_train = pd.get_dummies(X_train, drop_first=True)
X_test  = pd.get_dummies(X_test,  drop_first=True)

# Align columns
X_test = X_test.reindex(columns = X_train.columns, fill_value=0)

# Scale numeric features
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# Encode target
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

# Base model
xgb = XGBClassifier(
    objective='multi:softprob',
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42,
    n_jobs=-1,
    tree_method='hist'
)

# Focused Grid Search Space based on your top performers
# More focused grid search (fewer combinations)
param_grid = {
    'n_estimators': [340, 350],
    'learning_rate': [ 0.15 ],
    'max_depth': [3],
    'min_child_weight': [5],
    'subsample': [0.8],
    'colsample_bytree': [0.6, 0.7],
    'gamma': [0.2,0.3, 0.4],
    'reg_alpha': [ 0.6, 0.8],
    'reg_lambda': [  2.7, 3.0],
}
print(f"Total combinations: {np.prod([len(v) for v in param_grid.values()])}")

# Grid search
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
    return_train_score=False
)

grid_search.fit(X_train_s, y_train_enc)

print("✅ Best parameters:", grid_search.best_params_)
print("✅ Best CV Accuracy:", grid_search.best_score_ * 100)

# ====== Extract top 10 parameter sets ======
import pandas as pd
results_df = pd.DataFrame(grid_search.cv_results_)

# Sort by mean_test_score descending and take top 10
top10 = results_df.sort_values('mean_test_score', ascending=False).head(10)

print(f"\nTop 10 parameter sets:")
for idx, (_, row) in enumerate(top10.iterrows(), 1):
    print(f"{idx}: {row['mean_test_score']:.4f} => params={row['params']}")

# ====== Loop through top 10 and train/predict & save CSVs ======
for i, (_, row) in enumerate(top10.iterrows(), 1):
    params_i = row['params']
    print(f"\nTraining model #{i} with score {row['mean_test_score']:.4f}")
    print(f"Params: {params_i}")
    
    model_i = XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42,
        n_jobs=-1,
        tree_method='hist',
        **params_i
    )
    
    # Train on full training data
    model_i.fit(X_train_s, y_train_enc)
    
    # Predict on test data
    y_pred_i = model_i.predict(X_test_s)
    y_pred_labels_i = le.inverse_transform(y_pred_i)
    
    # Create submission file
    submission_i = pd.DataFrame({'id': test['id'], 'WeightCategory': y_pred_labels_i})
    filename = f"model{i}_score_{row['mean_test_score']:.4f}.csv"
    submission_i.to_csv(filename, index=False)
    print(f"✅ Saved {filename}")

print(f"\nAll done. Generated 10 submission files: submission_top10_model1.csv … submission_top10_model10.csv")


Total combinations: 48
Fitting 5 folds for each of 48 candidates, totalling 240 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


✅ Best parameters: {'colsample_bytree': 0.7, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 350, 'reg_alpha': 0.8, 'reg_lambda': 2.7, 'subsample': 0.8}
✅ Best CV Accuracy: 90.83245754399171

Top 10 parameter sets:
1: 0.9083 => params={'colsample_bytree': 0.7, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 350, 'reg_alpha': 0.8, 'reg_lambda': 2.7, 'subsample': 0.8}
2: 0.9083 => params={'colsample_bytree': 0.6, 'gamma': 0.4, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 350, 'reg_alpha': 0.8, 'reg_lambda': 2.7, 'subsample': 0.8}
3: 0.9082 => params={'colsample_bytree': 0.6, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 340, 'reg_alpha': 0.8, 'reg_lambda': 3.0, 'subsample': 0.8}
4: 0.9081 => params={'colsample_bytree': 0.6, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 340, 'reg

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model1_score_0.9083.csv

Training model #2 with score 0.9083
Params: {'colsample_bytree': 0.6, 'gamma': 0.4, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 350, 'reg_alpha': 0.8, 'reg_lambda': 2.7, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model2_score_0.9083.csv

Training model #3 with score 0.9082
Params: {'colsample_bytree': 0.6, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 340, 'reg_alpha': 0.8, 'reg_lambda': 3.0, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model3_score_0.9082.csv

Training model #4 with score 0.9081
Params: {'colsample_bytree': 0.6, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 340, 'reg_alpha': 0.6, 'reg_lambda': 2.7, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model4_score_0.9081.csv

Training model #5 with score 0.9081
Params: {'colsample_bytree': 0.6, 'gamma': 0.4, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 350, 'reg_alpha': 0.6, 'reg_lambda': 2.7, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model5_score_0.9081.csv

Training model #6 with score 0.9081
Params: {'colsample_bytree': 0.6, 'gamma': 0.4, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 350, 'reg_alpha': 0.6, 'reg_lambda': 3.0, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model6_score_0.9081.csv

Training model #7 with score 0.9079
Params: {'colsample_bytree': 0.6, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 340, 'reg_alpha': 0.8, 'reg_lambda': 2.7, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model7_score_0.9079.csv

Training model #8 with score 0.9079
Params: {'colsample_bytree': 0.6, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 350, 'reg_alpha': 0.6, 'reg_lambda': 2.7, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model8_score_0.9079.csv

Training model #9 with score 0.9079
Params: {'colsample_bytree': 0.7, 'gamma': 0.3, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 340, 'reg_alpha': 0.8, 'reg_lambda': 2.7, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model9_score_0.9079.csv

Training model #10 with score 0.9078
Params: {'colsample_bytree': 0.6, 'gamma': 0.4, 'learning_rate': 0.15, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 340, 'reg_alpha': 0.8, 'reg_lambda': 2.7, 'subsample': 0.8}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Saved model10_score_0.9078.csv

All done. Generated 10 submission files: submission_top10_model1.csv … submission_top10_model10.csv
