In [21]:
# importing libraries

import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier

In [20]:
# loading dataset
df = pd.read_csv("df_salary.csv")

In [22]:
# binning the target variable - salary
bins = [-1, 30000, 60000, 90000, 150000, float('inf')]
labels = ['0-30k', '30k-60k', '60k-90k', '90k-150k', 'Unclassified']

df['Salary_Group'] = pd.cut(df['SALARY'], bins=bins, labels=labels)

print("\nTarget Class Distribution (New Bins):")
print(df['Salary_Group'].value_counts())


Target Class Distribution (New Bins):
Salary_Group
30k-60k         167156
60k-90k         115678
90k-150k         98431
Unclassified     89670
0-30k            75778
Name: count, dtype: int64


In [23]:
target_col = 'Salary_Group'

# Drop raw salary + target
X = df.drop(['SALARY', 'Salary_Group'], axis=1)
y = df[target_col]

# Encode target bins (0–30k, 30–60k, 60–90k, 90–150k, Unclassified)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encode all categorical columns
X = pd.get_dummies(X, drop_first=True)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [24]:
# splitting the data into train and test (80-20)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [29]:
# searching the best parameters using only a sample of data

X_sample = pd.DataFrame(X_train).sample(n=5000, random_state=42)
y_sample = pd.Series(y_train).loc[X_sample.index]

X_train_s, X_val_s, y_train_s, y_val_s = train_test_split(
    X_sample, y_sample, test_size=0.2, random_state=42
)

lgb_model = LGBMClassifier(
    objective="multiclass",
    metric="multi_logloss",
    num_class=len(le.classes_),
    verbosity=-1,
    random_state=42
)

param_dist = {
    'num_leaves': [15, 31, 63, 127],
    'max_depth': [-1, 3, 5, 7, 9],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 400, 600],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_samples': [5, 10, 20, 30],
}

search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=25,
    scoring='accuracy',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Run search on small sample
search.fit(X_train_s, y_train_s)

print("\nBest Hyperparameters Found:")
print(search.best_params_)
print("Best sample accuracy:", search.best_score_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits

Best Hyperparameters Found:
{'subsample': 0.8, 'num_leaves': 127, 'n_estimators': 400, 'min_child_samples': 10, 'max_depth': 5, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
Best sample accuracy: 0.6662497708385117


In [32]:
# training on the full model

best_params = search.best_params_

best_model = LGBMClassifier(
    objective="multiclass",
    metric="multi_logloss",
    num_class=len(le.classes_),
    verbosity=-1,
    random_state=42,
    **best_params
)

best_model.fit(X_train, y_train)

In [33]:
y_pred = best_model.predict(X_test)

print(f"\nTest Set Accuracy: {accuracy_score(y_test, y_pred):.4f}\n")

print("Classification Report:")
print(classification_report(
    y_test, y_pred,
    target_names=le.classes_
))




Test Set Accuracy: 0.6961

Classification Report:
              precision    recall  f1-score   support

       0-30k       0.75      0.63      0.68     15156
     30k-60k       0.64      0.75      0.69     33431
     60k-90k       0.59      0.43      0.50     23136
    90k-150k       0.61      0.70      0.65     19686
Unclassified       1.00      1.00      1.00     17934

    accuracy                           0.70    109343
   macro avg       0.72      0.70      0.70    109343
weighted avg       0.70      0.70      0.69    109343

