In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.neighbors import NearestNeighbors
import numpy as np

In [2]:
# Location of the diabetes CSV that the rest of the notebook works on.
data_url = "https://storage.googleapis.com/ai-experts/diabetes.csv"

# Read the CSV, then discard every row where BMI is below 15 or where
# Insulin or SkinThickness is exactly 0.
# NOTE(review): no other column is filtered; the outlier table further down
# contains a row with Glucose == 0 - confirm whether that is intended.
df = pd.read_csv(data_url)
invalid_rows = (df['BMI'] < 15) | (df['Insulin'] == 0) | (df['SkinThickness'] == 0)
df = df.drop(index=df.index[invalid_rows])

In [6]:


# Build a 5-neighbour index directly on df. df has not been standardised
# here, so the distances below are measured in raw feature units.
knn = NearestNeighbors(n_neighbors=5).fit(df)
knn




In [7]:
# Query the index with the same rows it was fitted on and keep only the
# distance matrix (one row per sample, five columns).
# NOTE(review): column 0 of the output is 0 for every row, i.e. each row is
# returned as its own nearest neighbour, leaving four other neighbours.
distances, _ = knn.kneighbors(X=df, return_distance=True)
distances

array([[ 0.        ,  7.90526565, 12.00103679, 13.11784437, 14.15678244],
       [ 0.        , 18.33373459, 19.84435124, 24.75432859, 24.92642205],
       [ 0.        , 16.99642315, 20.52731488, 20.63104932, 20.84485838],
       ...,
       [ 0.        ,  9.94961808, 15.07248407, 21.05149201, 25.6154071 ],
       [ 0.        , 21.8875172 , 29.49149167, 30.84033011, 32.79141779],
       [ 0.        ,  8.34329527, 10.76145162, 12.825286  , 13.49181396]],
      shape=(393, 5))

In [8]:
# Anomaly score for each row = average of its five returned distances.
# The zero self-distance in column 0 is part of that average.
anomaly_scores = np.mean(distances, axis=1)
anomaly_scores

array([  9.43618585,  17.57176729,  15.79992915,  40.7978267 ,
       161.99569464,  17.55788053,  12.08310184,  23.21186469,
        10.29164656,  14.86841151,  16.02793747,  10.21387315,
        16.79980101,  21.39003594,  20.75184376,   8.81665907,
        15.12195651,  18.92308486,  30.41402355,  30.16715373,
        12.54948113,  14.15514944,  14.89902616,  26.12429274,
        21.95288274,  22.74497441,  23.89663216,  11.25264523,
        12.36981728,   8.89880975,  17.66982774,   9.52465275,
        11.84263239,  24.44018742,  11.40734469,  11.74706801,
        11.10024937,  15.54628848,  14.4342872 ,  12.7746011 ,
        17.19775584,  20.72931618,  19.17193548,  14.24941863,
        14.99213571,  10.91120303,  14.6726056 ,  13.43119081,
        15.18132496,  13.37180008,  16.88958561,  28.32935584,
        11.76017069,  19.66093857,   8.82559007,  30.26660052,
        10.96605419,  26.94597109,  13.19797535,  11.98405188,
        17.99044699,  17.53085749,  23.325941  ,   6.99

In [9]:

# Cut-off used to flag outliers: the 95th percentile of the anomaly scores,
# so roughly the highest-scoring 5% of rows lie above it.
threshold = np.percentile(a=anomaly_scores, q=95)
threshold

np.float64(33.01465330991552)

In [None]:
# Boolean mask, one entry per row: True where the score is strictly above the cut-off.
outliers = threshold < anomaly_scores

In [5]:
# Show the full boolean mask; True marks a row whose anomaly score exceeds the threshold.
outliers

array([False, False, False,  True,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True, False,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False,  True, False, False, False,  True, False,
       False, False,

In [10]:
# Rows of df flagged as outliers by the distances computed on unscaled data.
df.loc[outliers]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
8,2,197,70,45,543,30.5,0.158,53,1
13,1,189,60,23,846,30.1,0.398,59,1
139,5,105,72,29,325,36.9,0.159,28,0
177,0,129,110,46,130,67.1,0.319,26,1
182,1,0,74,20,23,27.7,0.299,21,0
186,8,181,68,36,495,30.1,0.615,60,1
228,4,197,70,39,744,36.7,2.329,31,0
247,0,165,90,33,680,52.3,0.427,23,0
258,1,193,50,16,375,25.9,0.655,24,0
286,5,155,84,44,545,38.7,0.619,34,0


# Scaling: repeat the k-NN outlier detection on standardized features

In [13]:
# Standardise df with StandardScaler's default settings: learn the
# per-column statistics from df, then apply them to the same rows.
# NOTE(review): df is passed whole, so the Outcome column shown in the
# tables is scaled and used as a feature too - confirm that is intended.
scaler = StandardScaler()
df_scaled = scaler.fit(df).transform(df)

In [16]:
# Same 5-neighbour index as before, this time built on the standardised values.
knn_scaled = NearestNeighbors(n_neighbors=5).fit(df_scaled)
knn_scaled

In [17]:
# Distances from every standardised row to its five returned neighbours.
# As the output shows, column 0 is the row's own zero distance.
distances_scaled, _ = knn_scaled.kneighbors(X=df_scaled, return_distance=True)
distances_scaled

array([[0.        , 0.91270832, 0.93691439, 0.97044397, 0.98786463],
       [0.        , 4.25100219, 4.25399254, 4.2919353 , 4.30128499],
       [0.        , 1.98991292, 2.05377073, 2.12345329, 2.13628177],
       ...,
       [0.        , 1.09511741, 1.14963924, 1.15886443, 1.22596806],
       [0.        , 2.22237694, 2.29681571, 2.32150557, 2.53309419],
       [0.        , 0.85770653, 1.18361855, 1.18589889, 1.23763429]],
      shape=(393, 5))

In [18]:
# Per-row anomaly score on standardised data: average of the five returned
# distances, with the zero self-distance in column 0 included.
anomaly_scores_scaled = np.mean(distances_scaled, axis=1)
anomaly_scores_scaled

array([0.76158626, 3.41964301, 1.66068374, 2.46760092, 3.13420958,
       1.54985044, 1.18644311, 1.91842003, 0.94659093, 1.21534473,
       1.55225151, 1.26259235, 0.73371536, 2.00361   , 1.16277932,
       0.81694621, 1.25372023, 2.2908502 , 1.39992523, 2.25168591,
       0.95756271, 0.85347259, 0.83118986, 1.40519294, 1.59255943,
       1.5009052 , 1.81613058, 0.86947593, 1.07651271, 0.63214312,
       1.17667394, 1.21356177, 1.06371173, 1.29149039, 1.31082372,
       0.67797961, 0.755106  , 1.57379564, 1.09663107, 1.55236124,
       1.23196776, 1.16418792, 1.08167544, 1.07097344, 1.42546236,
       0.71666749, 0.82011752, 1.1252582 , 0.96123056, 1.64101512,
       1.19237668, 1.78272149, 0.81627303, 1.57824022, 0.81821978,
       2.06008889, 0.73956767, 2.64148678, 1.07209102, 0.86583917,
       1.20468377, 0.91993912, 1.07829232, 0.68148272, 0.96251434,
       0.78660304, 0.66922204, 1.40251731, 0.8285224 , 1.32844939,
       1.47290479, 1.14704296, 1.38001331, 1.56050038, 0.69136

In [19]:
# 95th-percentile cut-off of the scores computed on standardised data.
threshold_scaled = np.percentile(a=anomaly_scores_scaled, q=95)
threshold_scaled

np.float64(2.1270472448424322)

In [21]:
# Boolean mask, one entry per row: True where the standardised-data score
# is strictly above its cut-off.
outliers_scaled = threshold_scaled < anomaly_scores_scaled

In [22]:
# Rows flagged on standardised data, shown with their original (unscaled) values from df.
df.loc[outliers_scaled]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
4,0,137,40,35,168,43.1,2.288,33,1
8,2,197,70,45,543,30.5,0.158,53,1
13,1,189,60,23,846,30.1,0.398,59,1
39,4,111,72,47,207,37.1,1.39,56,1
43,9,171,110,24,240,45.4,0.721,54,1
125,1,88,30,42,99,55.0,0.496,26,1
177,0,129,110,46,130,67.1,0.319,26,1
220,0,177,60,29,478,34.6,1.072,21,1
228,4,197,70,39,744,36.7,2.329,31,0
247,0,165,90,33,680,52.3,0.427,23,0
