In [1]:
import pandas as pd

# Load the dataset. The path is relative to the notebook's working directory,
# so 'diabetes.csv' must sit next to the notebook (or the kernel must be
# started from the folder that contains it).
df = pd.read_csv('diabetes.csv')

# Display properties
print(df.head())
print(df.shape)
print(df.columns)
print(df.dtypes)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
# Class balance of the target column
print(df['Outcome'].value_counts())


FileNotFoundError: [Errno 2] No such file or directory: 'diabetes.csv'

In [2]:
#Step 2: Identify relationships between features (Heatmap)
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the pairwise column correlations of df as an annotated heatmap
plt.figure(figsize=(10, 8))
corr_matrix = df.corr()
sns.heatmap(
    corr_matrix,
    annot=True,       # write each coefficient inside its cell
    cmap='coolwarm',
)
plt.show()


NameError: name 'df' is not defined

<Figure size 1000x800 with 0 Axes>

In [None]:
#Step 3: Prediction using one feature (Logistic Regression)
from sklearn.linear_model import LogisticRegression
from scipy.special import expit

# Fit a one-feature logistic regression: Outcome as a function of Age
logreg = LogisticRegression()
X_age = df[['Age']]
y = df['Outcome']  # target vector; the later modelling cells reuse this name
logreg.fit(X_age, y)

# Query for a 60-year-old person.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column name; a bare [[60]] makes scikit-learn warn that X has
# no valid feature names.
query_age_60 = pd.DataFrame({'Age': [60]})
# Column 1 of predict_proba belongs to logreg.classes_[1]
# (presumably Outcome == 1, i.e. diabetic -- confirm against the data).
age_60_pred = logreg.predict_proba(query_age_60)[:, 1]
# age_60_pred is a one-element array; compare its scalar entry explicitly
# instead of relying on the truth value of an array.
result = 'YES' if age_60_pred[0] > 0.5 else 'NO'
print(f"For a person aged 60, will he be diabetic? {result}")


In [None]:
#Step 4: Prediction using multiple features (Logistic Regression)
# Fit a logistic regression on three predictors; y is the Outcome column
# defined in the previous step.
X_multi = df[['Glucose', 'BMI', 'Age']]
logreg_multi = LogisticRegression()
logreg_multi.fit(X_multi, y)

# Query for a person with specific values (Glucose=150, BMI=30, Age=40).
# The query reuses the training column names so the values are matched to the
# right features and scikit-learn does not warn about missing feature names.
person = pd.DataFrame([[150, 30, 40]], columns=X_multi.columns)
# Column 1 of predict_proba belongs to logreg_multi.classes_[1]
# (presumably Outcome == 1 -- confirm against the data).
person_pred_prob = logreg_multi.predict_proba(person)[:, 1]
# person_pred_prob is a one-element array; print the scalar, not the array repr
print(f"Probability of being diabetic: {person_pred_prob[0]}")


In [None]:
#Step 5: Build Logistic Regression model with all features
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Use every column except the target as a predictor
X_all = df.drop(columns='Outcome')
logreg_all = LogisticRegression().fit(X_all, y)

# Score the same rows the model was fitted on, so the AUC printed below is an
# in-sample figure.
y_pred_all = logreg_all.predict_proba(X_all)[:, 1]
auc_all = roc_auc_score(y_true=y, y_score=y_pred_all)
print(f"AUC for Logistic Regression with all features: {auc_all}")


In [None]:
#Step 6: Forward Selection Procedure
# Define a function to calculate AUC given a certain set of variables
def get_auc(variables):
    """Return the AUC of a logistic regression built on the given columns.

    The model is fitted on ``df[variables]`` against the notebook-level
    target ``y`` and then scored on those same rows, so the result is an
    in-sample AUC.

    Parameters
    ----------
    variables : list
        Column labels of ``df`` to use as predictors.

    Returns
    -------
    The value of ``roc_auc_score`` for ``y`` versus the predicted
    probability taken from column 1 of ``predict_proba``.
    """
    features = df[variables]
    model = LogisticRegression().fit(features, y)
    scores = model.predict_proba(features)[:, 1]
    return roc_auc_score(y, scores)

# Define a function to return the next best variable in combination with current variables
def best_next(current_variables, remaining_variables):
    """Pick the candidate that yields the highest AUC when added.

    Each entry of ``remaining_variables`` is appended in turn to
    ``current_variables`` and scored with ``get_auc``.

    Parameters
    ----------
    current_variables : list
        Variables already chosen; not modified.
    remaining_variables : iterable
        Candidate variables to try.

    Returns
    -------
    The candidate with the strictly highest AUC (the first one wins on
    ties), or ``None`` when there are no candidates.
    """
    champion, champion_auc = None, -1
    for candidate in remaining_variables:
        candidate_auc = get_auc(current_variables + [candidate])
        if candidate_auc > champion_auc:
            champion, champion_auc = candidate, candidate_auc
    return champion

# Greedy forward selection: each round moves the variable chosen by best_next
# from remaining_variables to selected_variables.
selected_variables = []
remaining_variables = list(X_all.columns)

desired_num_variables = 4  # Change this to your desired number
# Never run more rounds than there are candidate columns: once
# remaining_variables is empty, best_next returns None and
# remaining_variables.remove(None) would raise ValueError.
num_rounds = min(desired_num_variables, len(remaining_variables))
for _ in range(num_rounds):
    next_variable = best_next(selected_variables, remaining_variables)
    selected_variables.append(next_variable)
    remaining_variables.remove(next_variable)

print(f"Selected variables: {selected_variables}")


In [None]:
#Step 7: Plot Line graph of AUC values and select cut-off
# Split the dataset equally for training and testing.
# NOTE: get_auc fits and scores on the full df, so this split does not
# influence the AUC values plotted below; X_test / y_test are consumed by the
# gain and lift charts in the next step.
X_train, X_test, y_train, y_test = train_test_split(X_all, y, test_size=0.5, random_state=42)

# AUC of the forward-selected set extended by each variable in turn
auc_values = []
for variable in X_all.columns:
    if variable in selected_variables:
        # Already part of the selected set: appending it again would feed the
        # model a duplicated column, so score the selected set as it is.
        candidate_set = selected_variables
    else:
        candidate_set = selected_variables + [variable]
    auc_values.append(get_auc(candidate_set))

plt.figure(figsize=(10, 6))
plt.plot(X_all.columns, auc_values, marker='o')
plt.xlabel('Variables')
plt.ylabel('AUC Score')
plt.title('AUC Values for Each Variable')
plt.show()


In [None]:
#Step 8: Draw Cumulative Gain Chart and Lift Chart
#pip install scikit-plot
import scikitplot as skplt

# Predict class probabilities on the test set with the all-features model.
# scikit-plot's gain and lift helpers index y_probas[:, 0] and y_probas[:, 1],
# so they need the full two-column predict_proba output; a 1-D slice raises
# IndexError.
y_probas_test = logreg_all.predict_proba(X_test)
y_pred_prob = y_probas_test[:, 1]  # second-class column, kept under its original name
# NOTE(review): logreg_all was fitted on all of X_all, which contains the
# X_test rows, so these charts are not a true hold-out evaluation. Consider
# fitting a model on X_train / y_train for this step.

# Cumulative Gain Chart
skplt.metrics.plot_cumulative_gain(y_test, y_probas_test)
plt.title('Cumulative Gain Chart')
plt.show()

# Lift Chart
skplt.metrics.plot_lift_curve(y_test, y_probas_test)
plt.title('Lift Chart')
plt.show()
