In [1]:
# importing libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

In [2]:
# Read the salary dataset from a CSV file in the working directory into a
# DataFrame; every later cell works off `df`.
df = pd.read_csv(filepath_or_buffer="df_salary.csv")

In [3]:
# Sanity check on the loaded frame: displays its (rows, columns) tuple.
df.shape

(546713, 81)

In [4]:
# Discretise the continuous SALARY column into the categorical target.
#
# Intervals are closed on the right (pd.cut's default, spelled out below), so
# each bin is (lower, upper]: a salary of exactly 30000 lands in '0-30k'.
# The first edge is -1 so that a salary of 0 is still captured by the first
# bin; values <= -1 or missing fall outside every bin and become NaN.
# NOTE(review): the open-ended top bin (> 150000) is labelled 'Unclassified'
# - confirm that this name is intentional.
bins = [-1, 30_000, 60_000, 90_000, 150_000, float("inf")]
labels = ['0-30k', '30k-60k', '60k-90k', '90k-150k', 'Unclassified']

df['Salary_Group'] = pd.cut(
    df['SALARY'],
    bins=bins,
    labels=labels,
    right=True,
)

In [5]:
# Row count per salary bin (value_counts orders them most frequent first),
# printed under a heading.
salary_group_counts = df['Salary_Group'].value_counts()
print("\nTarget Class Distribution (New Bins):", salary_group_counts, sep="\n")


Target Class Distribution (New Bins):
Salary_Group
30k-60k         167156
60k-90k         115678
90k-150k         98431
Unclassified     89670
0-30k            75778
Name: count, dtype: int64


In [6]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Preprocessing ---
# Produces the model inputs used by the following cells:
#   y / y_encoded : the binned salary label and its integer encoding
#   X             : one-hot encoded feature frame
#   X_scaled      : standardised copy of X (NumPy array)
target_col = 'Salary_Group'

# Target: map the salary-group labels to integer codes. `le` is kept so that
# predictions can be mapped back to readable labels later.
y = df[target_col]
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# Features: everything except the raw salary and its binned version (both
# would leak the target). Categorical columns are expanded into dummy
# columns, dropping the first level of each.
X = pd.get_dummies(
    df.drop(columns=['SALARY', target_col]),
    drop_first=True,
)

# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split done in a later cell, so test rows contribute to the scaling
# statistics. Consider fitting on the training portion only (or using a
# Pipeline) - needs a coordinated change with the split cell.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [7]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# encoded target so both parts keep the same class proportions, and seeded so
# it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

In [8]:
# Hyper-parameter search space for the decision tree:
# 4 depths x 4 split sizes x 4 leaf sizes x 2 criteria = 128 combinations.
params = dict(
    max_depth=[5, 10, 20, None],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 5, 10],
    criterion=["gini", "entropy"],
)

# Seeded base estimator so that every candidate fit is reproducible.
dt = DecisionTreeClassifier(random_state=42)

# Exhaustive search scored by accuracy with 5-fold cross-validation on the
# training data only, using all available cores. fit() returns the search
# object itself, so it can be chained onto the constructor.
grid = GridSearchCV(
    estimator=dt,
    param_grid=params,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

print(f"Best Parameters: {grid.best_params_}")

Fitting 5 folds for each of 128 candidates, totalling 640 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}


In [9]:
# retrieving the model trained with the best parameters
# The grid search above was created with the default refit=True, so
# best_estimator_ has already been re-fitted on the whole of
# (X_train, y_train) using best_params_. Calling .fit() on it again would only
# repeat that training (and, with the fixed random_state, rebuild the same
# tree), so the extra fit is omitted.
best_dt = grid.best_estimator_
best_dt

In [11]:
# evaluating model on the held-out test set
y_pred = best_dt.predict(X_test)

# Map the integer codes back to the salary-group names so that the metrics
# are reported with readable labels.
y_test_labels = le.inverse_transform(y_test)
y_pred_labels = le.inverse_transform(y_pred)

# Compute accuracy
acc = accuracy_score(y_test_labels, y_pred_labels)

print(f"Test Set Accuracy: {acc:.4f}\n")

# `labels` is passed explicitly so that each report row is tied to the
# encoder's class list. Without it, `target_names` is matched by position
# against whichever classes happen to occur in y_test / y_pred, which breaks
# if a class is ever missing from both.
print("Classification Report:")
print(classification_report(
    y_test_labels,
    y_pred_labels,
    labels=le.classes_,
    target_names=le.classes_,
    digits=4
))


Test Set Accuracy: 0.6752

Classification Report:
              precision    recall  f1-score   support

       0-30k     0.7477    0.5910    0.6602     15156
     30k-60k     0.6284    0.7136    0.6683     33431
     60k-90k     0.5465    0.3997    0.4617     23136
    90k-150k     0.5637    0.7032    0.6258     19686
Unclassified     1.0000    0.9993    0.9996     17934

    accuracy                         0.6752    109343
   macro avg     0.6973    0.6814    0.6831    109343
weighted avg     0.6769    0.6752    0.6702    109343

