# Diabetes Prediction and Anomaly Detection

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import tensorflow as tf
import plotly.graph_objs as go
import plotly.offline as pyo
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from scipy import stats
from pylab import rcParams
from keras.models import Model, load_model
from keras.layers import Input, Dense
from keras.callbacks import ModelCheckpoint, TensorBoard
from keras import regularizers
from tensorflow.keras import layers, models

In [None]:
# Read the diabetes dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [None]:
# Preview the first rows. Printed explicitly because a notebook cell only
# auto-displays the value of its final expression, so a bare df.head() here
# would be computed and then thrown away.
print(df.head())

# Column dtypes and non-null counts (info() writes its report itself)
df.info()

# Summary statistics, transposed so that each feature occupies one row
df.describe().T

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()

In [None]:
# missingno bar chart of the DataFrame's nullity; the returned object is kept in `p`
p = msno.bar(df)

In [None]:
# Frequency of each distinct full row (all columns taken together)
df.value_counts()

In [None]:
# How many samples fall into each value of the target column
df.loc[:, 'Outcome'].value_counts()

In [None]:
# Bar chart of the target variable to see whether the classes are balanced

# Occurrences of each outcome value
outcome_counts = df['Outcome'].value_counts()

# Small square canvas holding a single axes for the bars
fig, ax = plt.subplots(figsize=(4, 4))
outcome_counts.plot(kind='bar', color=['#6f8ea5', '#dfd9e1'], ax=ax)

# Title and axis labels
ax.set_title('Distribution of Target Variable (Outcome)', fontsize=14)
ax.set_xlabel('Outcome', fontsize=10)
ax.set_ylabel('Frequency', fontsize=10)

# Write each count just above its bar (bars sit at positions 0, 1, ...)
for bar_position, count in enumerate(outcome_counts):
    ax.text(bar_position, count, str(count), ha='center', va='bottom', fontsize=9)

# Keep the class labels horizontal
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()


In [None]:
# Separate the features from the target column
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Target share of normal samples (class 0) after oversampling, so that the
# remaining samples form a small "anomaly" fraction
ratio = 0.95
curr_ratio = y.value_counts(normalize=True)[0]

# Growth factor for class 0: with all other samples left untouched, class 0
# must reach n0 * factor = n_other * ratio / (1 - ratio) for its share to
# equal `ratio`.
sampling_strategy_value = (ratio * (1 - curr_ratio)) / (curr_ratio * (1 - ratio))

# The dict value passed to RandomOverSampler is the desired TOTAL number of
# class-0 samples, i.e. n0 * factor. Using n0 * (1 + factor) would add the
# original n0 a second time and overshoot the target share.
# round() avoids dropping a sample to floating-point truncation; max() keeps
# the request from falling below the existing count (an oversampler can only add).
n_normal = int(y.value_counts()[0])
sampling_strategy = {0: max(n_normal, int(round(n_normal * sampling_strategy_value)))}

# Create RandomOverSampler object
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

# Apply oversampling
X_resampled, y_resampled = ros.fit_resample(X, y)
y_resampled_series = pd.Series(y_resampled)

# Convert X_resampled to a pandas DataFrame
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)

# Add small Gaussian noise to the BMI and DiabetesPedigreeFunction columns
# (presumably so duplicated rows are not exact copies - confirm intent).
# A seeded generator keeps the noise reproducible, in line with the
# random_state=42 used by the sampler.
rng = np.random.default_rng(42)
noise_bmi = rng.normal(loc=0, scale=0.01, size=X_resampled.shape[0])
noise_dpf = rng.normal(loc=0, scale=0.01, size=X_resampled.shape[0])

X_resampled_df['BMI'] += noise_bmi
X_resampled_df['DiabetesPedigreeFunction'] += noise_dpf

# Print the results
print("Data size after oversampling:", X_resampled_df.shape)
print("Class distribution after oversampling:\n", y_resampled_series.value_counts())

In [None]:
# Separate the features from the target column
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Set the target ratio for the majority class (normal data, class 0) to 95%
target_ratio = 0.95
current_ratio = y.value_counts(normalize=True)[0]

# Growth factor for the MAJORITY class (class 0): with all other samples left
# untouched, class 0 must reach n0 * factor = n_other * target / (1 - target)
# for its share to equal `target_ratio`.
sampling_strategy_value = (target_ratio * (1 - current_ratio)) / (current_ratio * (1 - target_ratio))

# The dict value passed to RandomOverSampler is the desired TOTAL number of
# class-0 samples, i.e. n0 * factor. Using n0 * (1 + factor) would add the
# original n0 a second time and overshoot the target share.
# round() avoids dropping a sample to floating-point truncation; max() keeps
# the request from falling below the existing count (an oversampler can only add).
n_normal = int(y.value_counts()[0])
sampling_strategy = {0: max(n_normal, int(round(n_normal * sampling_strategy_value)))}

# Create a RandomOverSampler object with the calculated sampling strategy
ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)

# Apply RandomOverSampler to resample the dataset
X_resampled, y_resampled = ros.fit_resample(X, y)

# Convert the resampled target y_resampled to a pandas Series for easier manipulation
y_resampled_series = pd.Series(y_resampled)

# Convert X_resampled back to a pandas DataFrame with the original feature names
X_resampled_df = pd.DataFrame(X_resampled, columns=X.columns)

# Add small Gaussian noise to the 'BMI' and 'DiabetesPedigreeFunction' columns
# (presumably so duplicated rows are not exact copies - confirm intent).
# A seeded generator keeps the noise reproducible, in line with the
# random_state=42 used by the sampler.
rng = np.random.default_rng(42)
noise_bmi = rng.normal(loc=0, scale=0.01, size=X_resampled.shape[0])
noise_dpf = rng.normal(loc=0, scale=0.01, size=X_resampled.shape[0])

X_resampled_df['BMI'] += noise_bmi
X_resampled_df['DiabetesPedigreeFunction'] += noise_dpf

# Print the resampled data shape and class distribution
print("Resampled data shape:", X_resampled_df.shape)
print("Resampled class distribution:\n", y_resampled_series.value_counts())

# Plot the resampled class distribution
plt.figure(figsize=(6, 4))
sns.countplot(x=y_resampled_series)
plt.title('Resampled Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

In [None]:
# Use seaborn's white-grid look for the plots below
sns.set(style='whitegrid')


def plot_distribution(original_data, resampled_data, feature_name):
    """Draw side-by-side histograms of one feature before and after resampling.

    Args:
        original_data: Table holding the feature before resampling.
        resampled_data: Table holding the same feature after resampling and
            noise injection.
        feature_name: Column to plot; looked up in both tables.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # (data, bar colour, panel title) for the left and right panel respectively
    panels = (
        (original_data, 'blue', f'Original {feature_name} Distribution'),
        (resampled_data, 'orange', f'Resampled {feature_name} Distribution with Noise'),
    )

    plt.figure(figsize=(12, 6))
    for position, (frame, colour, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        sns.histplot(frame[feature_name], kde=True, bins=20, color=colour, alpha=0.6)
        plt.title(heading, fontsize=14)
        plt.xlabel(feature_name, fontsize=12)
        plt.ylabel('Frequency', fontsize=12)

    plt.tight_layout()
    plt.show()


# Compare original vs. resampled data for the two columns that received noise
for noisy_feature in ('BMI', 'DiabetesPedigreeFunction'):
    plot_distribution(X, X_resampled_df, noisy_feature)


In [None]:
# Standardise every feature of the resampled data.
# NOTE(review): the scaler is fitted on all resampled rows, i.e. before any
# train/test split, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(X_resampled_df),
    columns=X_resampled_df.columns,
)

# Feature-only copy, taken before the target column is attached
x_df_scaled = df_scaled.copy()

# Attach the labels; the index is reset so rows pair up by position
df_scaled['target'] = y_resampled_series.reset_index(drop=True)

In [None]:
# Pairwise correlations between all scaled features and the target
correlation_matrix = df_scaled.corr()

# Correlation of every column with the target, strongest positive first
target_correlation = correlation_matrix.loc[:, 'target'].sort_values(ascending=False)
print(target_correlation)

# Annotated heatmap of the full matrix on a 10x8 canvas
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Standardise the features before fitting: a logistic-regression coefficient
# scales inversely with its feature's units, so coefficients are only
# comparable as "importance" when every feature is on the same scale.
X_importance = StandardScaler().fit_transform(X_resampled)

# Generous iteration cap so the solver converges
log_reg = LogisticRegression(max_iter=10000)

# Train the logistic regression model on the standardised features
log_reg.fit(X_importance, y_resampled)

# Signed coefficients of the single fitted decision function, labelled by feature
importance = log_reg.coef_[0]
feature_importance = pd.Series(importance, index=X.columns).sort_values(ascending=False)

# Plot feature importance
plt.figure(figsize=(12, 6))
ax = feature_importance.plot(kind='bar', color='skyblue')
plt.title('Feature Importance using Logistic Regression', fontsize=16)
plt.xlabel('Features', fontsize=12)
plt.ylabel('Importance', fontsize=12)

# Annotate each bar with its coefficient; the label goes above positive bars
# and below negative ones so it never overlaps the bar itself
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.annotate(f'{bar_height:.2f}',
                (bar.get_x() + bar.get_width() / 2., bar_height),
                ha='center', va='bottom' if bar_height >= 0 else 'top', fontsize=10)

plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()
plt.show()

In [None]:
# Fit a random forest on the resampled data (fixed seed for reproducibility)
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_resampled, y_resampled)

# Importances reported by the fitted forest, labelled by feature, highest first
importance = rf_model.feature_importances_
feature_importance = pd.Series(importance, index=X.columns).sort_values(ascending=False)

# Bar chart of the importances
fig, ax = plt.subplots(figsize=(12, 6))
feature_importance.plot(kind='bar', color='lightgreen', ax=ax)
ax.set_title('Feature Importance using Random Forest', fontsize=16)
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Importance', fontsize=12)

# Label every bar with its value, centred on the bar's top edge
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{bar_height:.2f}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=10)

# Slanted feature names are easier to read
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Hold out 20% of the resampled (unscaled) data as a test set, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled_df,
    y_resampled_series,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Check class distribution in y_train
unique, counts = np.unique(y_train, return_counts=True)
class_distribution = dict(zip(unique, counts))

print("Class distribution in y_train:")
print(class_distribution)

# Anomaly-to-normal ratio, derived from the counts above rather than typed in
# by hand, so it stays correct whenever the data or the split changes
print(class_distribution.get(1, 0) / class_distribution[0])

In [None]:
# Check class distribution in y_test
unique, counts = np.unique(y_test, return_counts=True)
class_distribution = dict(zip(unique, counts))

print("Class distribution in y_test:")
print(class_distribution)

# Anomaly-to-normal ratio, derived from the counts above rather than typed in
# by hand, so it stays correct whenever the data or the split changes
print(class_distribution.get(1, 0) / class_distribution[0])

In [None]:
from sklearn.ensemble import IsolationForest

# Restrict both splits to the two features the detector works on
iso_features = ['Glucose', 'DiabetesPedigreeFunction']
X_train_subset = X_train[iso_features]
X_test_subset = X_test[iso_features]

# Fit the Isolation Forest; `contamination` is the assumed share of outliers
iso_forest = IsolationForest(contamination=0.095, random_state=42)
iso_forest.fit(X_train_subset)

# The detector labels anomalies as -1; class 1 in the labels is the anomaly class.
# Both are turned into boolean masks so they can be compared directly.
y_pred_test = iso_forest.predict(X_test_subset)
y_true_binary = y_test.eq(1)
y_pred_binary = np.equal(y_pred_test, -1)

In [None]:
# Confusion matrix of true vs. predicted anomaly masks
cm = confusion_matrix(y_true_binary, y_pred_binary)
print("Confusion Matrix:", cm, sep="\n")

# Standard binary metrics with "anomaly" as the positive class
accuracy = accuracy_score(y_true_binary, y_pred_binary)
precision = precision_score(y_true_binary, y_pred_binary)
recall = recall_score(y_true_binary, y_pred_binary)
f1 = f1_score(y_true_binary, y_pred_binary)

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

In [None]:
# One-Class SVM for Anomaly Detection
from sklearn.svm import OneClassSVM
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer

# Restrict both splits to the two features the detector works on
svm_features = ['Glucose', 'DiabetesPedigreeFunction']
X_train_subset = X_train.loc[:, svm_features]
X_test_subset = X_test.loc[:, svm_features]

# Base estimator; its hyperparameters are filled in by the random search
oc_svm = OneClassSVM()

# Candidate hyperparameter values sampled by the random search
param_dist = dict(
    nu=[0.01, 0.05, 0.1, 0.2, 0.5],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto', 0.01, 0.1, 1, 10],
)

# Scoring function for the hyperparameter search
def custom_scorer(y_true, y_pred):
    """Return the F1 score with anomalies treated as the positive class.

    Args:
        y_true: Ground-truth labels; entries equal to 1 count as anomalies.
        y_pred: Detector output; entries equal to -1 count as anomalies.

    Returns:
        The result of ``f1_score`` on the two derived boolean masks.
    """
    return f1_score(y_true == 1, y_pred == -1)


# Wrap it so the search maximises the score
scorer = make_scorer(custom_scorer, greater_is_better=True)

# Randomised search: 50 sampled settings, 5-fold CV, refit on the best one
random_search = RandomizedSearchCV(
    estimator=oc_svm,
    param_distributions=param_dist,
    n_iter=50,
    scoring=scorer,
    refit=True,
    cv=5,
    random_state=42,
)

# The labels are handed over so the custom scorer can use them
random_search.fit(X_train_subset, y_train)

print("Best Parameters: ", random_search.best_params_)

# Evaluate the refitted best estimator on the held-out data.
# The detector labels anomalies as -1; class 1 in the labels is the anomaly class.
best_oc_svm = random_search.best_estimator_
y_pred_test = best_oc_svm.predict(X_test_subset)
y_true_binary = y_test.eq(1)
y_pred_binary = np.equal(y_pred_test, -1)

In [None]:
# Create confusion matrix of true vs. predicted anomaly masks
cm = confusion_matrix(y_true_binary, y_pred_binary)
print("Confusion Matrix:")
print(cm)

# Calculate performance metrics with "anomaly" as the positive class
accuracy = accuracy_score(y_true_binary, y_pred_binary)
precision = precision_score(y_true_binary, y_pred_binary)
recall = recall_score(y_true_binary, y_pred_binary)
f1 = f1_score(y_true_binary, y_pred_binary)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
# F1 is computed above and is reported here like in the other evaluation cells
print("F1 Score:", f1)

In [None]:
# Diabetes Prediction Using Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Restrict both splits to the two features the classifier is trained on
rfc_features = ['Glucose', 'DiabetesPedigreeFunction']
X_train_RFC = X_train.loc[:, rfc_features]
X_test_RFC = X_test.loc[:, rfc_features]

# Base estimator with a fixed seed; the search fills in the rest
rf_model = RandomForestClassifier(random_state=42)

# Candidate hyperparameter values sampled by the random search
param_dist = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
)

# Randomised search: 100 sampled settings, 5-fold CV scored by accuracy,
# using all CPU cores and verbose progress output
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=100,
    scoring='accuracy',
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train_RFC, y_train)

print("Best Parameters: ", random_search.best_params_)

# Predict the held-out data with the best estimator found by the search
best_rf_model = random_search.best_estimator_
y_pred_test_RFC = best_rf_model.predict(X_test_RFC)

In [None]:
# Confusion matrix of true vs. predicted class labels
cm_RFC = confusion_matrix(y_test, y_pred_test_RFC)
print("Confusion Matrix:", cm_RFC, sep="\n")

# Precision, recall and F1 are macro-averaged, i.e. the unweighted mean over the classes
macro_avg = {'average': 'macro'}
accuracy_RFC = accuracy_score(y_test, y_pred_test_RFC)
precision_RFC = precision_score(y_test, y_pred_test_RFC, **macro_avg)
recall_RFC = recall_score(y_test, y_pred_test_RFC, **macro_avg)
f1_RFC = f1_score(y_test, y_pred_test_RFC, **macro_avg)

for metric_label, metric_value in (
    ("Accuracy:", accuracy_RFC),
    ("Precision:", precision_RFC),
    ("Recall:", recall_RFC),
    ("F1 Score:", f1_RFC),
):
    print(metric_label, metric_value)

In [None]:
# Hold out 20% of the standardised features (same seed as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(
    x_df_scaled,
    y_resampled_series,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.covariance import EllipticEnvelope

# Use only the Glucose feature for training. It is selected by name (as the
# other model cells do) rather than by column position, so the choice does
# not silently change if the column order ever does.
# NOTE: this narrows X_train / X_test in place.
X_train = X_train[['Glucose']]
X_test = X_test[['Glucose']]

# Initialize Elliptic Envelope model; contamination is the assumed ratio of
# anomalies, and the fixed seed makes the covariance fit reproducible like
# the other seeded models in this notebook
elliptic_env = EllipticEnvelope(contamination=0.04, random_state=42)

# Train the Elliptic Envelope model
elliptic_env.fit(X_train)

# Predict (1: normal, -1: anomaly)
predictions = elliptic_env.predict(X_test)

# Convert predictions to 0 (normal) and 1 (anomaly) to match the label encoding
predictions = np.where(predictions == 1, 0, 1)

# Keep the predictions next to the test feature in a separate copy
elliptic_X_test = X_test.copy()
elliptic_X_test['anomaly'] = predictions

In [None]:
# Ground truth and the 0/1 anomaly flags produced by the Elliptic Envelope
actual = y_test
predicted = elliptic_X_test['anomaly']

# Confusion matrix of actual vs. predicted labels
cm = confusion_matrix(actual, predicted)
print("Confusion Matrix:", cm, sep="\n")

# Standard binary metrics
accuracy = accuracy_score(actual, predicted)
precision = precision_score(actual, predicted)
recall = recall_score(actual, predicted)
f1 = f1_score(actual, predicted)

for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

In [None]:
# Rebuild the full-feature split of the standardised data (the same names
# were narrowed to a single column earlier in the notebook)
X_train, X_test, y_train, y_test = train_test_split(
    x_df_scaled,
    y_resampled_series,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Select only normal data (class 0) for training
X_train_normal = X_train[y_train == 0]

# Output results
print("Size of training normal data:", X_train_normal.shape)
# y_train.value_counts() covers the whole training split (both classes),
# so the label describes it as such instead of calling it "normal data"
print("Class distribution of training data:\n", y_train.value_counts())

In [None]:
# Set visualization libraries
%matplotlib inline

sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 14, 8

# Set RANDOM_SEED and LABELS
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

In [None]:
# Autoencoder layout: input -> 15 -> 7 -> 7 -> input
input_dim = X_train.shape[1]   # one input unit per feature
encoding_dim = 15              # width of the first encoding layer
bottleneck_dim = encoding_dim // 2

input_layer = Input(shape=(input_dim,))

# Encoder: L1 activity regularisation on the first layer, then the bottleneck
encoder = Dense(
    encoding_dim,
    activation="relu",
    activity_regularizer=regularizers.l1(1e-5),
)(input_layer)
encoder = Dense(bottleneck_dim, activation="relu")(encoder)

# Decoder: one hidden layer, then a linear layer back to the input width
decoder = Dense(bottleneck_dim, activation='relu')(encoder)
decoder = Dense(input_dim, activation='linear')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.summary()

In [None]:
# Train the autoencoder model
nb_epoch = 500
batch_size = 64

# Compile once, before training.
# NOTE(review): 'accuracy' is kept as a metric, but for a reconstruction
# (regression) objective the loss curves are the meaningful signal.
autoencoder.compile(optimizer='adam',
                    loss='mean_squared_error',
                    metrics=['accuracy'])

# Checkpoint that keeps only the best model seen so far (by the callback's
# default monitored quantity, the validation loss). It has to exist before
# fit() and be passed via `callbacks`, otherwise nothing is ever saved.
cp = ModelCheckpoint(filepath="/kaggle/working/autoencoder_classifier.keras",
                     save_best_only=True,
                     verbose=0)

# Fit the model using only normal data for training, reconstructing the input.
# NOTE(review): validation uses the test split (normal and abnormal rows), so
# the checkpoint choice is informed by test data.
history = autoencoder.fit(X_train_normal, X_train_normal,
                          epochs=nb_epoch,
                          batch_size=batch_size,
                          shuffle=True,
                          validation_data=(X_test, X_test),  # Validate with both normal and abnormal data
                          callbacks=[cp],
                          verbose=1).history

In [None]:
# `history` is the dict of per-epoch values recorded during training;
# the x-axis gets one entry per recorded epoch
epochs = list(range(len(history['loss'])))

# One line trace per loss curve
train_loss = go.Scatter(
    x=epochs,
    y=history['loss'],
    mode='lines',
    name='Train Loss',
)
val_loss = go.Scatter(
    x=epochs,
    y=history['val_loss'],
    mode='lines',
    name='Validation Loss',
)

# Figure title and axis captions
layout = go.Layout(
    title='Model Loss',
    xaxis={'title': 'Epoch'},
    yaxis={'title': 'Loss'},
)

# Assemble and render the interactive figure
fig = go.Figure(data=[train_loss, val_loss], layout=layout)
pyo.iplot(fig)

In [None]:
# Per-row reconstruction error on the test data: mean squared difference
# between each input row and the autoencoder's output for it
reconstructions = autoencoder.predict(X_test)
reconstruction_errors = ((X_test - reconstructions) ** 2).mean(axis=1)

# Rows whose error exceeds the fixed threshold are flagged as anomalies (1)
threshold = 0.18
predicted_anomalies = (reconstruction_errors > threshold).astype(int)

# Compare the flags with the true labels
conf_matrix = confusion_matrix(y_test, predicted_anomalies)
accuracy = accuracy_score(y_test, predicted_anomalies)
precision = precision_score(y_test, predicted_anomalies)
recall = recall_score(y_test, predicted_anomalies)
f1 = f1_score(y_test, predicted_anomalies)

# Report
print("Confusion Matrix:", conf_matrix, sep="\n")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

In [None]:
# Pair every test row's reconstruction error with its true class
error_df = pd.DataFrame({'reconstruction_error': reconstruction_errors,
                         'true_class': y_test})

groups = error_df.groupby('true_class')

# One marker trace per class. Class 1 is the diabetes outcome in this
# dataset, so the legend says "Diabetic" rather than "Fraud".
traces = []
for name, group in groups:
    trace = go.Scatter(
        x=group.index,
        y=group.reconstruction_error,
        mode='markers',
        marker=dict(size=3.5),
        name="Diabetic" if name == 1 else "Normal"
    )
    traces.append(trace)

# Horizontal dashed line at the anomaly threshold, spanning the index range
threshold_line = go.Scatter(
    x=[error_df.index.min(), error_df.index.max()],
    y=[threshold, threshold],
    mode='lines',
    line=dict(color='red', dash='dash'),
    name='Threshold'
)

# Layout; the y-axis is limited to [0, 1.5], so larger errors fall outside the view
layout = go.Layout(
    title="Reconstruction Error for Different Classes",
    xaxis=dict(title="Data Point Index"),
    yaxis=dict(title="Reconstruction Error"),
    yaxis_range=[0, 1.5],
)

# Create the figure
fig = go.Figure(data=traces + [threshold_line], layout=layout)

# Show the plot
pyo.iplot(fig)