In [2]:
import tensorflow as tf
import pandas as pd


In [22]:
# Define the path to the diabetics.csv file
file_path = 'E:\\GOOGLE DOWNLOAD\\diabetics.csv'

# Read the CSV file using pandas
data = pd.read_csv(file_path)



In [24]:
#Check for missing values
missing_values = data.isnull().sum()
print(missing_values)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [26]:
data_cleaned = data.dropna()

# Check if missing values still exist
missing_values_after = data_cleaned.isnull().sum()
print("Missing values after cleaning:")
print(missing_values_after)

Missing values after cleaning:
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [28]:
data_cleaned.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [30]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

categorical_columns = ['gender', 'smoking_history']

# Use .loc[] to avoid the warning and properly modify the DataFrame
for column in categorical_columns:
    data_cleaned.loc[:, column] = label_encoder.fit_transform(data_cleaned[column])

# Check the first few rows of the cleaned data
data_cleaned.head()


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,4,25.19,6.6,140,0
1,0,54.0,0,0,0,27.32,6.6,80,0
2,1,28.0,0,0,4,27.32,5.7,158,0
3,0,36.0,0,0,1,23.45,5.0,155,0
4,1,76.0,1,1,1,20.14,4.8,155,0


In [34]:
## No external libraries needed for this part

# Define features (X) and target (y)
X = data.drop('diabetes', axis=1)  # Features (all columns except the target 'diabetes')
y = data['diabetes']  # Target (label)


In [36]:
# Import necessary library for train-test split
from sklearn.model_selection import train_test_split

# Split the dataset into 70% training and 30% testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [40]:
# Import necessary library for feature scaling
# Check if there are any non-numeric columns left in your dataset
print(X_train.dtypes)

# If non-numeric columns exist, they should either be dropped or encoded
# If you find any, use LabelEncoder or OneHotEncoder before applying StandardScaler


gender                  object
age                    float64
hypertension             int64
heart_disease            int64
smoking_history         object
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
dtype: object


In [42]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the 'gender' column
X_train['gender'] = label_encoder.fit_transform(X_train['gender'])
X_test['gender'] = label_encoder.transform(X_test['gender'])  # Apply the same transformation to test data

# Encode the 'smoking_history' column
X_train['smoking_history'] = label_encoder.fit_transform(X_train['smoking_history'])
X_test['smoking_history'] = label_encoder.transform(X_test['smoking_history'])  # Apply the same transformation to test data

# Check if encoding is successful
print(X_train.dtypes)


gender                   int32
age                    float64
hypertension             int64
heart_disease            int64
smoking_history          int32
bmi                    float64
HbA1c_level            float64
blood_glucose_level      int64
dtype: object


In [44]:
from sklearn.preprocessing import StandardScaler

# Initialize StandardScaler
scaler = StandardScaler()

# Fit and transform the training data
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
X_test_scaled = scaler.transform(X_test)


In [48]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Try different values of k (neighbors)
k_values = range(1, 21)  # Test k values from 1 to 20
cv_scores = []

# Perform 10-fold cross-validation for each k
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X_train_scaled, y_train, cv=10, scoring='accuracy')  # 10-fold CV
    cv_scores.append(scores.mean())

# Find the best k (the one with the highest cross-validation score)
best_k = k_values[cv_scores.index(max(cv_scores))]
print(f"Best k value: {best_k}")



Best k value: 19


In [62]:
# Import necessary libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the KNN model with specified hyperparameters
# Setting n_neighbors=7, weights='distance' to give more influence to closer neighbors
knn = KNeighborsClassifier(n_neighbors=7, weights='uniform', metric='euclidean')

# Train the KNN model on the scaled training data
knn.fit(X_train_scaled, y_train)

# Make predictions on the test data
y_pred = knn.predict(X_test_scaled)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print the detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Print the confusion matrix to see how well the model classified each class
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 96.11%

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.98     27453
           1       0.92      0.60      0.72      2547

    accuracy                           0.96     30000
   macro avg       0.94      0.80      0.85     30000
weighted avg       0.96      0.96      0.96     30000

Confusion Matrix:
 [[27312   141]
 [ 1026  1521]]
