In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier

# Load the training and test data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# =============================================
# Separate features and target
# =============================================
# The 'id' column and the target are not used as model inputs.
X_train = train.drop(columns=['id', 'WeightCategory'])
y_train = train['WeightCategory']

X_test = test.drop(columns=['id'])  # Only features, no target

# One-hot encode categorical features.
# drop_first=True removes the first level of every categorical column, and
# that level is chosen from the data in the frame being encoded. Encoding the
# test frame with drop_first=True as well could therefore drop a *different*
# level than train did (whenever test lacks train's first level), and the
# reindex below would then zero-fill that column, silently turning those rows
# into the train baseline category. To avoid this, train alone decides which
# level is the baseline; test keeps every level and is projected onto the
# train columns afterwards.
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test)

# Align columns: keep exactly the train columns, in train order. Dummy columns
# that test never produced are filled with 0; test-only columns (train's
# baseline levels, or levels unseen in train) are discarded.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Scale features: statistics are fitted on train only and reused for test.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Encode target labels; `le` is kept so predictions can be mapped back to the
# original label values with inverse_transform.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)

# Base estimator: these settings stay fixed for every candidate; the search
# below supplies the remaining hyperparameters.
# NOTE(review): both this estimator and the search use n_jobs=-1, which may
# oversubscribe CPU cores -- confirm this is intended.
xgb = XGBClassifier(
    tree_method='hist',
    objective='multi:softprob',
    eval_metric='mlogloss',
    use_label_encoder=False,
    n_jobs=-1,
    random_state=42,
)

# Candidate values the randomized search samples from
param_dist = dict(
    n_estimators=np.arange(300, 600, 50),
    learning_rate=np.linspace(0.01, 0.2, 10),
    max_depth=[3, 5, 7, 9],
    min_child_weight=[1, 3, 5, 7],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=np.linspace(0, 0.4, 5),
    reg_alpha=[0, 0.1, 0.4, 0.5, 0.6, 0.8],
    reg_lambda=[1, 1.5, 2, 2.5],
)

# Randomized search: 60 sampled candidates, cv=5, scored by accuracy
rand_search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=60,
    cv=5,
    scoring='accuracy',
    return_train_score=False,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

rand_search.fit(X_train_s, y_train_enc)

# Report the single best candidate; best_score_ is multiplied by 100 for display
print("✅ Best parameters (single best):", rand_search.best_params_)
print("✅ Best CV Accuracy:", rand_search.best_score_ * 100)

# ====== New part: extract top 5 parameter sets ======
# cv_results_ is loaded into a DataFrame so candidates can be sorted by score.
# (pandas is already imported as `pd` at the top of the file.)
results_df = pd.DataFrame(rand_search.cv_results_)

# Sort by mean_test_score descending and keep the five best rows.
# No de-duplication is performed here.
top5 = results_df.sort_values('mean_test_score', ascending=False).head(5)

print("\nTop 5 parameter sets:")
# iterrows() yields each row's original index label from results_df, which is
# the candidate's position in cv_results_, not its rank. Number the sorted
# rows explicitly so "Rank 1" is the best candidate.
for rank, (_, row) in enumerate(top5.iterrows(), start=1):
    print(f"Rank {rank}: mean_test_score={row['mean_test_score']:.4f}, params={row['params']}")

# ====== For each of the top 5 candidates: refit on the full training set,
# ====== predict the test set, and write one submission CSV ======
for rank, candidate in enumerate(top5['params'], start=1):
    print(f"\nTraining model #{rank} with params: {candidate}")

    # Same fixed settings as the base estimator, plus this candidate's
    # sampled hyperparameters.
    model = XGBClassifier(
        **candidate,
        tree_method='hist',
        objective='multi:softprob',
        eval_metric='mlogloss',
        use_label_encoder=False,
        n_jobs=-1,
        random_state=42,
    )
    model.fit(X_train_s, y_train_enc)

    # Predictions come back as encoded classes; map them to original labels.
    encoded_pred = model.predict(X_test_s)
    submission = pd.DataFrame(
        {
            'id': test['id'],
            'WeightCategory': le.inverse_transform(encoded_pred),
        }
    )

    out_name = f"submission_top5_model{rank}.csv"
    submission.to_csv(out_name, index=False)
    print(f"✅ Saved {out_name}")

print("\nAll done. Generated 5 submission files: submission_top5_model1.csv … submission_top5_model5.csv")
