In [73]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import cudf
import cuml
from cuml.neighbors import KNeighborsClassifier as cuKNeighbors

In [74]:
df = pd.read_csv('df_salary.csv')
df.shape

(546713, 80)

In [75]:
# --- Binning ---

# These yield .65 accuracy
bins = [-1, 30000, 60000, 90000, 150000, float('inf')]
labels = ['0-30k', '30k-60k', '60k-90k', '90k-150k', 'Unclassified']

# Using salary_mi bins (.38 accuracy)
# bins = [-1]
# labels = []
# for i in range(10000,170000,10000):
#   bins.append(i)
#   labels.append(str(i))
# bins.append(float('inf'))
# labels.append("Unclassified")

df['Salary_Group'] = pd.cut(df['SALARY'], bins=bins, labels=labels)

In [76]:
print("\nTarget Class Distribution (New Bins):")
print(df['Salary_Group'].value_counts())


Target Class Distribution (New Bins):
Salary_Group
30k-60k         167156
60k-90k         115678
90k-150k         98431
Unclassified     89670
0-30k            75778
Name: count, dtype: int64


In [77]:
# --- Preprocessing ---
# features (X) and target (y).
target_col = 'Salary_Group'
X = df.drop(['SALARY', 'Salary_Group'], axis=1)
y = df[target_col]

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encode
X = pd.get_dummies(X, drop_first=True)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [78]:
# ---Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [79]:
# ---Hyperparameter Tuning (Fast Subsampling Method) ---
sample_size = 10000
X_tune, _, y_tune, _ = train_test_split(X_train, y_train, train_size=sample_size, random_state=42, stratify=y_train)

In [80]:
param_grid = {'n_neighbors': range(1, 31, 2)} # Check odd numbers
knn = KNeighborsClassifier(n_jobs=-1)

grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_tune, y_tune)

best_k = grid_search.best_params_['n_neighbors']
print(f"Best K found: {best_k}")

Best K found: 25


In [81]:
# ---Evaluation ---
print(f"\nTraining final model with K={best_k} on full training set")
final_model = KNeighborsClassifier(n_neighbors=best_k, n_jobs=-1)
final_model.fit(X_train, y_train)

print("Predicting on test set")
y_pred = final_model.predict(X_test)


Training final model with K=25 on full training set
Predicting on test set


In [82]:
# --- Results ---
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {accuracy:.4f}")
print("\nReport:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

Test Set Accuracy: 0.6569

Classification Report:
              precision    recall  f1-score   support

       0-30k       0.69      0.56      0.62     15156
     30k-60k       0.62      0.68      0.65     33431
     60k-90k       0.52      0.43      0.47     23136
    90k-150k       0.55      0.64      0.59     19686
Unclassified       1.00      1.00      1.00     17934

    accuracy                           0.66    109343
   macro avg       0.67      0.66      0.67    109343
weighted avg       0.66      0.66      0.65    109343

