In [35]:
# importing libraries

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from catboost import CatBoostClassifier
import numpy as np
from scipy.stats import uniform, randint
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# loading dataset
df = pd.read_csv("df_salary.csv")

In [4]:
# binning the target variable - salary
bins = [-1, 30000, 60000, 90000, 150000, float('inf')]
labels = ['0-30k', '30k-60k', '60k-90k', '90k-150k', 'Unclassified']

df['Salary_Group'] = pd.cut(df['SALARY'], bins=bins, labels=labels)

print("\nTarget Class Distribution (New Bins):")
print(df['Salary_Group'].value_counts())


Target Class Distribution (New Bins):
Salary_Group
30k-60k         151078
60k-90k         100257
90k-150k         81008
Unclassified     78782
0-30k            66233
Name: count, dtype: int64


In [28]:
TARGET = "Salary_Group"

X_full = df.drop(["SALARY", TARGET], axis=1)
y_full = df[TARGET]

# One-Hot Encode
X_full = pd.get_dummies(X_full, drop_first=True)

# Scaling
scaler = StandardScaler()
X_full_scaled = scaler.fit_transform(X_full)

In [29]:
df_sampled = df.sample(5000, random_state=42)

X_sample = df_sampled.drop(["SALARY", TARGET], axis=1)
y_sample = df_sampled[TARGET]

X_sample = pd.get_dummies(X_sample, drop_first=True)

# match columns with full dataset
X_sample = X_sample.reindex(columns=X_full.columns, fill_value=0)

# scale
X_sample_scaled = scaler.transform(X_sample)

In [30]:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_sample_scaled, y_sample, test_size=0.2, random_state=42, stratify=y_sample
)

In [32]:
cat_model = CatBoostClassifier(
    loss_function="MultiClass",
    eval_metric="Accuracy",
    verbose=False
)

param_dist = {
    "learning_rate": [0.01, 0.05, 0.1],
    "depth": [4, 6, 8, 10],
    "l2_leaf_reg": [1, 3, 5, 7, 9],
    "iterations": [300, 500, 800],
    "bagging_temperature": [0, 1, 3]
}

random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=15,
    scoring="accuracy",
    cv=3,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train_s, y_train_s)

print("Best Parameters:", random_search.best_params_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
Best Parameters: {'learning_rate': 0.01, 'l2_leaf_reg': 1, 'iterations': 800, 'depth': 6, 'bagging_temperature': 0}


In [33]:
best_params = random_search.best_params_

final_cat = CatBoostClassifier(
    **best_params,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    verbose=False
)

final_cat.fit(X_full_scaled, y_full)

<catboost.core.CatBoostClassifier at 0x7a3355104950>

In [42]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Predict
y_pred_full = final_cat.predict(X_full_scaled)

# Convert both to 1D NumPy arrays of strings
y_true = np.array(y_full).ravel().astype(str)
y_pred = np.array(y_pred_full).ravel().astype(str)

# Now compute metrics
print("\nFinal Model Accuracy (Full Data):", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred))



Final Model Accuracy (Full Data): 0.7028295266246158


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Classification Report:
               precision    recall  f1-score   support

       0-30k       0.76      0.61      0.68     66233
     30k-60k       0.65      0.76      0.70    151078
     60k-90k       0.59      0.46      0.52    100257
    90k-150k       0.61      0.68      0.64     81008
Unclassified       1.00      1.00      1.00     78782
         nan       0.00      0.00      0.00         1

    accuracy                           0.70    477359
   macro avg       0.60      0.59      0.59    477359
weighted avg       0.70      0.70      0.70    477359



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
