# Import libraries

In [28]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.inspection import permutation_importance
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix

# Import dataset

In [29]:
df = pd.read_csv('C:\\Users\\Nithya\\Desktop\\Major project\\Dataset\\diabetes_012_health_indicators_BRFSS2015.csv')

# Data preprocessing

## Check data imbalance

In [30]:
print('Number of unique values present to identify diabetes')
print(df['Diabetes_012'].value_counts())

Number of unique values present to identify diabetes
Diabetes_012
0.0    213703
2.0     35346
1.0      4631
Name: count, dtype: int64


## Data split

In [31]:
dfs = {}  # Dictionary to hold DataFrames

for i in range(3):
    dfs[f'df_{i}'] = df[df['Diabetes_012'] == i]

In [32]:
from sklearn.model_selection import train_test_split
train_df_0, test_df_0 = train_test_split(dfs['df_0'], test_size=0.2)
train_df_1, test_df_1 = train_test_split(dfs['df_1'], test_size=0.2)
train_df_2, test_df_2 = train_test_split(dfs['df_2'], test_size=0.2)

In [33]:
train_combined = pd.concat([train_df_0, train_df_1, train_df_2], ignore_index=True)
test_combined = pd.concat([test_df_0, test_df_1, test_df_2], ignore_index=True)

In [34]:
print(train_combined['Diabetes_012'].value_counts())

Diabetes_012
0.0    170962
2.0     28276
1.0      3704
Name: count, dtype: int64


In [35]:
print(test_combined['Diabetes_012'].value_counts())

Diabetes_012
0.0    42741
2.0     7070
1.0      927
Name: count, dtype: int64


# Smote oversampling

In [38]:
from imblearn.over_sampling import SMOTE
target_variable = 'Diabetes_012'


class_distribution = train_combined[target_variable].value_counts()
print("Class Distribution:")
print(class_distribution)



X = train_combined.drop(columns=[target_variable])
y = train_combined[target_variable]

desired_samples = {0: 170962, 1: 170962, 2: 170962}
smote = SMOTE(sampling_strategy=desired_samples, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


df_oversampled = pd.concat([pd.DataFrame(X_resampled), pd.DataFrame(y_resampled, columns=[target_variable])], axis=1)

print("Oversampled Class Distribution:")
print(df_oversampled[target_variable].value_counts())

Class Distribution:
Diabetes_012
0.0    170962
2.0     28276
1.0      3704
Name: count, dtype: int64
Oversampled Class Distribution:
Diabetes_012
0.0    170962
1.0    170962
2.0    170962
Name: count, dtype: int64


# ANN

In [39]:
#classifier = 
#lassifier.fit(X_resampled, y_resampled)

model = Sequential()
model.add(Dense(64, input_dim=X_resampled.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

X_test = test_combined.drop(columns=[target_variable])
y_test = test_combined[target_variable]


model.fit(X_resampled, y_resampled, epochs=10, batch_size=32, validation_data=(X_test, y_test))



accuracy = model.evaluate(X_test, y_test)[1]
print(f"Accuracy on the test set: {accuracy:.4f}")

y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print("Accuracy:", accuracy * 100, "%")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", classification_rep)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy on the test set: 0.0263
Accuracy: 2.6252512909456422 %
Confusion Matrix:
 [[  405 42336     0]
 [    0   927     0]
 [    1  7069     0]]
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.01      0.02     42741
         1.0       0.02      1.00      0.04       927
         2.0       0.00      0.00      0.00      7070

    accuracy                           0.03     50738
   macro avg       0.34      0.34      0.02     50738
weighted avg       0.84      0.03      0.02     50738



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Predict test result

In [40]:
X_test = test_combined.drop(columns=[target_variable])
y_test = test_combined[target_variable]

In [42]:
y_pred = model.predict(X_test)



# Confusion matrix

In [43]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print("Classification Report:\n", classification_rep)

[[  405 42336     0]
 [    0   927     0]
 [    1  7069     0]]
Classification Report:
               precision    recall  f1-score   support

         0.0       1.00      0.01      0.02     42741
         1.0       0.02      1.00      0.04       927
         2.0       0.00      0.00      0.00      7070

    accuracy                           0.03     50738
   macro avg       0.34      0.34      0.02     50738
weighted avg       0.84      0.03      0.02     50738



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Data visualisation

In [None]:
custom_labels = ['No diabetes', 'Prediabetes', 'Diabetes']
import seaborn as sns
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", cbar=False, xticklabels=custom_labels, yticklabels=custom_labels)
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")

# Feature ranking

In [None]:
weights, biases = model.layers[0].get_weights()

# Calculate weight magnitudes
weight_magnitudes = np.sum(np.abs(weights), axis=1)  # Sum of absolute weights along the features

# Sort indices based on weight magnitudes
sorted_idx = np.argsort(weight_magnitudes)

# Map indices to feature names
sorted_feature_names = [column_names[i] for i in sorted_idx]

# Plot the feature importance with feature names
plt.figure(figsize=(6, 6))
plt.barh(range(len(sorted_idx)), weight_magnitudes[sorted_idx])
plt.yticks(range(len(sorted_idx)), sorted_feature_names)
plt.xlabel('Weight Magnitude')
plt.ylabel('Feature')
plt.title('ANN Feature Importance (Weight Magnitudes)')
plt.show()