In [63]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
data=pd.read_csv('diabetes.csv')
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [64]:
print(data.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [65]:
print(max(data.Age))
min(data.Age)

81


21

In [66]:
bins=[20,25,45,60,85]
age_cat=pd.cut(data.Age,bins=bins,labels=range(1,5))
data['Age']=age_cat
# View distribution of age categories
print(data['Age'].value_counts().sort_index())

Age
1    267
2    383
3     91
4     27
Name: count, dtype: int64


In [67]:
mean=data['BloodPressure'].mean()
diff=data['BloodPressure']-mean
scaled_bp = (diff - np.min(diff)) / (np.max(diff) - np.min(diff))
data['BloodPressure'] = scaled_bp
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,0.590164,35,0,33.6,0.627,3,1
1,1,85,0.540984,29,0,26.6,0.351,2,0
2,8,183,0.524590,0,0,23.3,0.672,2,1
3,1,89,0.540984,23,94,28.1,0.167,1,0
4,0,137,0.327869,35,168,43.1,2.288,2,1
...,...,...,...,...,...,...,...,...,...
763,10,101,0.622951,48,180,32.9,0.171,4,0
764,2,122,0.573770,27,0,36.8,0.340,2,0
765,5,121,0.590164,23,112,26.2,0.245,2,0
766,1,126,0.491803,0,0,30.1,0.349,3,1


In [68]:
data.columns
del data['Pregnancies']
del data['SkinThickness']

In [69]:
def calculate_MLI(glucose, insulin, blood_pressure):
    # First normalize each parameter to account for different scales
    # Using clinical reference ranges:
    # Glucose normal range: 70-99 mg/dL
    # Insulin normal range: 3-25 µU/mL
    # BP normal range: 90-120 mmHg
    
    normalized_glucose = glucose / 99  # normalize by upper normal limit
    normalized_insulin = insulin / 25  # normalize by upper normal limit
    normalized_bp = blood_pressure / 120  # normalize by upper normal limit
    
    # Weight factors based on their relative importance in diabetes:
    # Glucose has highest weight as it's most direct indicator
    # Insulin second as it's directly related to diabetes
    # BP has lower weight as it's a secondary factor
    w_glucose = 0.5
    w_insulin = 0.3
    w_bp = 0.2
    
    # Calculate the combined index
    MLI = (w_glucose * normalized_glucose) + \
          (w_insulin * normalized_insulin) + \
          (w_bp * normalized_bp)
    
    return MLI

In [70]:
data['MLI']=calculate_MLI(data['Glucose'],data['Insulin'],data['BloodPressure'])
del data['BloodPressure']
del data['Glucose']
del data['Insulin']


In [71]:
scaler=MinMaxScaler()
data['MLI']=scaler.fit_transform(data[['MLI']])

In [72]:
data

Unnamed: 0,BMI,DiabetesPedigreeFunction,Age,Outcome,MLI
0,33.6,0.627,3,1,0.067329
1,26.6,0.351,2,0,0.038674
2,23.3,0.672,2,1,0.083234
3,28.1,0.167,1,0,0.142053
4,43.1,2.288,2,1,0.243799
...,...,...,...,...,...
763,32.9,0.171,4,0,0.240439
764,36.8,0.340,2,0,0.055504
765,26.2,0.245,2,0,0.176059
766,30.1,0.349,3,1,0.057310


In [73]:
data.columns

Index(['BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'MLI'], dtype='object')

In [74]:
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler


In [75]:
standardizer=StandardScaler()
features=standardizer.fit_transform(data)
nearest_neighbors = NearestNeighbors(n_neighbors=2).fit(features)
new_observation=[25,0.045,3,0,0.5]
distances,indices=nearest_neighbors.kneighbors([new_observation])

Nearest Neighbors viewed below:

In [76]:
features[indices]

array([[[ 4.45580749, -0.46170458,  0.20831006,  1.36589591,
          0.44604551],
        [ 3.21199889,  1.232582  , -1.10301883, -0.73212021,
          1.3393537 ]]])

Find two nearest neighbors based on Euclidean distance:

In [77]:
nearestneighbors_euclidean = NearestNeighbors(
 n_neighbors=2, metric='euclidean').fit(features)
distances

array([[20.78419377, 22.23066849]])

Manhattan distance:

In [78]:
nearestneighbors_manhattan = NearestNeighbors(
 n_neighbors=2, metric='manhattan').fit(features)
distances

array([[20.78419377, 22.23066849]])

Minkowski Disance:

In [79]:
nearestneighbors_manhattan = NearestNeighbors(
 n_neighbors=2, metric='minkowski').fit(features)
distances

array([[20.78419377, 22.23066849]])

Listing all observations' 3 nearest neighbors:

In [80]:
nearest_neighbors_with_self = nearestneighbors_euclidean.kneighbors_graph(
 features).toarray()
 # Remove 1's marking an observation is a nearest neighbor to itself
for i, x in enumerate(nearest_neighbors_with_self):
 x[i] = 0
 # View first observation's two nearest neighbors

In [81]:
nearest_neighbors_with_self[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [82]:
# Using train_test_split
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.4, random_state=42)


In [83]:
from sklearn.neighbors import KNeighborsClassifier
train_std=standardizer.fit_transform(train)
knn=KNeighborsClassifier(n_neighbors=5)