In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, make_scorer, recall_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import seaborn as sns
import pickle
import matplotlib.pyplot as plt
import shap
from tqdm import tqdm
import pickle

In [2]:
data = pd.read_csv('../data/LLCP_agg_cleaned.csv')
# top 3 relevant features (from SHAP analysis of XGBoost model)
relevant_features = ['_AGEG5YR', 'Heart Attack', 'GenHealth', 'Sex', 'CVDCRHD4']
data = data[relevant_features]
# drop rows with missing values
data = data.dropna()

In [3]:
std_scaler = StandardScaler()
X = data.drop('CVDCRHD4', axis=1)
y = data['CVDCRHD4']
print(X.columns, X.shape)

X = std_scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# split train set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

Index(['_AGEG5YR', 'Heart Attack', 'GenHealth', 'Sex'], dtype='object') (2074811, 4)


In [18]:
# Apply a vanilla kNN model to the data and see how it performs. This is a baseline model to compare against.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [19]:
y_pred = knn.predict(X_val)

In [20]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97    313252
         1.0       0.53      0.20      0.29     18718

    accuracy                           0.95    331970
   macro avg       0.74      0.60      0.63    331970
weighted avg       0.93      0.95      0.93    331970



Yep, the recall of the minority class is very low. We will now try to improve it

In [21]:
knn_weighted = KNeighborsClassifier(n_neighbors=5, weights='distance')
knn_weighted.fit(X_train, y_train)

In [22]:
y_pred = knn_weighted.predict(X_val)

In [23]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97    313252
         1.0       0.53      0.20      0.29     18718

    accuracy                           0.95    331970
   macro avg       0.74      0.60      0.63    331970
weighted avg       0.93      0.95      0.93    331970



didn't change the result by one bit... Now we try to change the class weights. Sklearn doesn't have a built-in way to do this, so we will have to do it manually

In [74]:
vc = y_train.value_counts()
class_weight = {0: vc[1] / vc[0], 1: vc[1]}

# perform kNN with class weights
# convert X_train and X_val to numpy arrays for faster computation, use 10% of validation data and train data
k = 5
X_train_np = X_train[:int(len(X_train) * 0.1)]
y_train_np = np.array(y_train.iloc[:int(len(y_train) * 0.1)])
X_val_np = X_val[:int(len(X_val) * 0.1)]
y_val_np = np.array(y_val.iloc[:int(len(y_val) * 0.1)])
y_pred = np.zeros(len(y_val_np))
for i in tqdm(range(len(y_val_np))):
    # find k nearest neighbors
    dist = np.sum((X_train_np - X_val_np[i])**2, axis=1)
    idx = np.argsort(dist)[:k]
    # find class weights
    w = np.array([class_weight[y_train_np[j]] for j in idx])
    # predict class
    y_pred[i] = 1 if np.sum(w[y_train_np[idx] == 1]) > np.sum(w[y_train_np[idx] == 0]) else 0

100%|██████████| 33197/33197 [03:57<00:00, 139.82it/s]


In [77]:
print(classification_report(np.array(y_val_np), y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.81      0.88     31357
         1.0       0.16      0.63      0.26      1840

    accuracy                           0.80     33197
   macro avg       0.57      0.72      0.57     33197
weighted avg       0.93      0.80      0.85     33197



That works. The recall is much higher now.  Let's try to multiply the minority class weight by 2

In [80]:
class_weight = {0: vc[1] / vc[0], 1: 2 * vc[1]}
y_pred_2 = np.zeros(len(y_val_np))
for i in tqdm(range(len(y_val_np))):
    # find k nearest neighbors
    dist = np.sum((X_train_np - X_val_np[i])**2, axis=1)
    idx = np.argsort(dist)[:k]
    # find class weights
    w = np.array([class_weight[y_train_np[j]] for j in idx])
    # predict class
    y_pred_2[i] = 1 if np.sum(w[y_train_np[idx] == 1]) > np.sum(w[y_train_np[idx] == 0]) else 0

100%|██████████| 33197/33197 [04:03<00:00, 136.42it/s]


In [81]:
print(classification_report(np.array(y_val_np), y_pred_2))

              precision    recall  f1-score   support

         0.0       0.97      0.81      0.88     31357
         1.0       0.16      0.63      0.26      1840

    accuracy                           0.80     33197
   macro avg       0.57      0.72      0.57     33197
weighted avg       0.93      0.80      0.85     33197



Hmm, the result didn't change. But if we increase the minority class weight further, the recall will increase, but the precision will decrease. Now let's try to find the optimal k.

In [19]:
# use randomized search to find best hyperparameters (just k)
# use recall of minority class as scoring metric
# samplle 10% of training data
idx = np.random.randint(0, len(X_train), int(len(X_train) * 0.1))
X_train_sample = X_train[idx]
y_train_sample = y_train.iloc[idx]
scorer = make_scorer(recall_score, pos_label=1)
param_grid = {'n_neighbors': np.arange(1, int(np.sqrt(len(X_train))), 2)}
knn = KNeighborsClassifier()
knn_cv = RandomizedSearchCV(knn, param_grid, cv=3, n_iter=10, verbose=3, scoring=scorer)
knn_cv.fit(X_train_sample, y_train_sample)

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV 1/3] END ...................n_neighbors=221;, score=0.273 total time=   2.2s
[CV 2/3] END ...................n_neighbors=221;, score=0.265 total time=   2.2s
[CV 3/3] END ...................n_neighbors=221;, score=0.272 total time=   2.2s
[CV 1/3] END ...................n_neighbors=399;, score=0.298 total time=   3.8s
[CV 2/3] END ...................n_neighbors=399;, score=0.302 total time=   3.8s
[CV 3/3] END ...................n_neighbors=399;, score=0.298 total time=   3.8s
[CV 1/3] END ...................n_neighbors=597;, score=0.291 total time=   6.1s
[CV 2/3] END ...................n_neighbors=597;, score=0.257 total time=   6.1s
[CV 3/3] END ...................n_neighbors=597;, score=0.320 total time=   6.1s
[CV 1/3] END ...................n_neighbors=915;, score=0.292 total time=  10.2s
[CV 2/3] END ...................n_neighbors=915;, score=0.288 total time=  10.2s
[CV 3/3] END ...................n_neighbors=915;

In [20]:
print(knn_cv.best_params_)
print(knn_cv.best_score_)

{'n_neighbors': 399}
0.29944709821488946


In [21]:
# since we missed small k values, we manually do some more searches
param_grid = {'n_neighbors': np.arange(1, 10, 2)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv=3, verbose=3, scoring=scorer)
knn_cv.fit(X_train_sample, y_train_sample)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV 1/3] END .....................n_neighbors=1;, score=0.645 total time=   1.1s
[CV 2/3] END .....................n_neighbors=1;, score=0.300 total time=   1.1s
[CV 3/3] END .....................n_neighbors=1;, score=0.303 total time=   1.1s
[CV 1/3] END .....................n_neighbors=3;, score=0.512 total time=   1.1s
[CV 2/3] END .....................n_neighbors=3;, score=0.321 total time=   1.1s
[CV 3/3] END .....................n_neighbors=3;, score=0.294 total time=   1.1s
[CV 1/3] END .....................n_neighbors=5;, score=0.444 total time=   1.1s
[CV 2/3] END .....................n_neighbors=5;, score=0.325 total time=   1.1s
[CV 3/3] END .....................n_neighbors=5;, score=0.312 total time=   1.1s
[CV 1/3] END .....................n_neighbors=7;, score=0.336 total time=   1.1s
[CV 2/3] END .....................n_neighbors=7;, score=0.280 total time=   1.1s
[CV 3/3] END .....................n_neighbors=7;,

In [22]:
print(knn_cv.best_params_)
print(knn_cv.best_score_)

{'n_neighbors': 1}
0.41582445084844716


These results are not very good when comparing to the results of the XGBoost model. Perhaps the reason is that we only used 10% of the data. Let's try to use the whole dataset and use weighted kNN and use k = 1.

In [23]:
# now predict on test set
knn = KNeighborsClassifier(n_neighbors=1, weights='distance')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

In [24]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96    391681
         1.0       0.31      0.30      0.30     23282

    accuracy                           0.92    414963
   macro avg       0.63      0.63      0.63    414963
weighted avg       0.92      0.92      0.92    414963



In [None]:
# save model
pickle.dump(knn, open('../models/knn_model.pkl', 'wb'))