# Diabetes Detection App

## IMPORT LIBRARIES

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st  # must be installed first; run "streamlit hello" in a command prompt to check that it works
from PIL import Image
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix
from mlxtend.plotting import plot_confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier  # tested as the algorithm with the highest accuracy

sns.set(style='white')  # use seaborn's 'white' style for every plot drawn below

In [None]:
# Write the title of the app; the text is passed to Streamlit as a
# Markdown string whose '#' line is a top-level heading.
st.write("""
# Diabetes Detection
""")

In [None]:
# Open the image file with PIL (path is relative to the working directory)
# and display it in the app with the caption 'TEST'.
image = Image.open('diabetes_img_1.jpg')
st.image(image, caption='TEST', use_column_width=True)

## LOADING DATASET

In [None]:
# Read the CSV dataset into a dataframe (path is relative to the working directory)
df = pd.read_csv('diabetes_2.csv')
st.subheader('Data Information:')  # Heading for the dataframe summary displayed further down

### Overview of the dataset

In [None]:
df.shape  # (number of rows, number of columns) of the dataframe

In [None]:
df.info()  # Print each column's position, non-null count and dtype

In [None]:
df.describe()  # Count, mean, std, min, quartiles and max of each numeric column

Some features in the dataset have a value of 0, which denotes missing data, so these zeros are replaced with NaN.


## CLEANING DATASET

In [None]:
# In these measurement columns a value of 0 denotes a missing reading, so mark it as NaN.
# `np.nan` is used because the `np.NaN` alias no longer exists in NumPy 2.x.
zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_means_missing] = df[zero_means_missing].replace(0, np.nan)

In [None]:
# Count the missing (null) values in each column of the dataframe
df.isnull().sum()

Glucose, blood pressure, skin thickness, insulin, and BMI all have missing values. For each of these features, the missing values are replaced with the feature's mean within the same ‘Outcome’ group.

In [None]:
# Function to find the per-Outcome mean used to replace missing data
def median_target(var, data=None):
    """Return the mean of column ``var`` for each ``Outcome`` group.

    Despite its historical name, this computes the *mean* (not the median)
    of the non-null values of ``var``, grouped by the ``Outcome`` column and
    rounded to one decimal place.

    Args:
        var: Name of the column to average.
        data: Dataframe holding ``var`` and ``Outcome``. When omitted, the
            module-level ``df`` is used, as before.

    Returns:
        A dataframe with one row per ``Outcome`` value and two columns:
        ``Outcome`` and ``var``.
    """
    frame = df if data is None else data
    # Keep only the rows where the value is actually known.
    known = frame.loc[frame[var].notnull(), [var, 'Outcome']]
    return known.groupby('Outcome')[[var]].mean().round(1).reset_index()

### GLUCOSE

In [None]:
# Fill each missing Glucose value with the mean Glucose of the rows sharing its Outcome.
# The means are taken from median_target() rather than typed in by hand, so they
# always match the dataset that was actually loaded.
glucose_means = median_target("Glucose").set_index('Outcome')['Glucose']
df['Glucose'] = df['Glucose'].fillna(df['Outcome'].map(glucose_means))

### BLOOD PRESSURE

In [None]:
# Fill each missing BloodPressure value with the mean BloodPressure of the rows sharing its Outcome.
# The means are taken from median_target() rather than typed in by hand, so they
# always match the dataset that was actually loaded.
blood_pressure_means = median_target("BloodPressure").set_index('Outcome')['BloodPressure']
df['BloodPressure'] = df['BloodPressure'].fillna(df['Outcome'].map(blood_pressure_means))

### SKIN THICKNESS

In [None]:
# Fill each missing SkinThickness value with the mean SkinThickness of the rows sharing its Outcome.
# The means are taken from median_target() rather than typed in by hand, so they
# always match the dataset that was actually loaded.
skin_thickness_means = median_target("SkinThickness").set_index('Outcome')['SkinThickness']
df['SkinThickness'] = df['SkinThickness'].fillna(df['Outcome'].map(skin_thickness_means))

### INSULIN

In [None]:
# Fill each missing Insulin value with the mean Insulin of the rows sharing its Outcome.
# The means are taken from median_target() rather than typed in by hand, so they
# always match the dataset that was actually loaded.
insulin_means = median_target("Insulin").set_index('Outcome')['Insulin']
df['Insulin'] = df['Insulin'].fillna(df['Outcome'].map(insulin_means))

### BMI

In [None]:
# Fill each missing BMI value with the mean BMI of the rows sharing its Outcome.
# The means are taken from median_target() rather than typed in by hand, so they
# always match the dataset that was actually loaded.
bmi_means = median_target("BMI").set_index('Outcome')['BMI']
df['BMI'] = df['BMI'].fillna(df['Outcome'].map(bmi_means))

### Displaying the describe function of the dataframe in the App

In [None]:
# Show the summary statistics of the cleaned dataframe in the app
st.write(df.describe())

## EXPLORATORY DATA ANALYSIS

In [None]:
# One box per dataframe column, all drawn on a single axes, then saved to disk
f, ax = plt.subplots(figsize=(15, 10))
ax.set_xlim(-.05, 768)
ax.set_ylabel('Variables')
ax.set_title("Dataframe Overview")
ax = sns.boxplot(data=df, orient='v', palette='Set2', ax=ax)
f.savefig('boxplot')

### Viewing the distribution of the target variable (Data Imbalance)

In [None]:
# Bar chart counting how many rows have Outcome 0 and how many have Outcome 1
g = sns.countplot(data=df, x='Outcome', palette='pastel')
g.set_title('Count of Outcome Variable')
g.set_xlabel('Outcome')
g.set_ylabel('Count')
# Write each bar's height just above the bar, centred horizontally
for bar in g.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    g.annotate(f"{bar_height}", (bar_centre, bar_height),
               ha='center', va='center',
               xytext=(0, 7), textcoords='offset points')

The bar chart above shows that the data is unbalanced, with 500 non-diabetics and 268 people with diabetes (far from reality, which should be closer to a ratio of 500 non-diabetics to 2 diabetics).

In [None]:
# Correlation heatmap; the upper triangle (diagonal included) is masked out
f, ax = plt.subplots(figsize=(11, 9))
corr_matrix = df.corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix,
    mask=mask,
    vmax=.3,
    center=0,
    annot=True,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
)

Features having the highest correlation with the target variable are glucose, insulin, BMI, skin thickness, and age

### Distribution of each variable

In [None]:
# Histogram of every column, drawn without grid lines and half-transparent
plt.rcParams.update({"figure.figsize": (20, 10)})
df.hist(alpha=0.5, grid=False)

### Scatter Plots

In [None]:
# Glucose against blood pressure, points coloured by Outcome
plt.rcParams.update({"figure.figsize": (10, 8)})
sns.scatterplot(
    data=df, x='Glucose', y='BloodPressure', hue='Outcome', s=60, alpha=0.8
).set_title('Glucose vs Blood Pressure')

There is a difference between diabetics and non-diabetics: most persons with glucose levels within the normal range do not have diabetes.

In [None]:
# Insulin against blood pressure, points coloured by Outcome
plt.rcParams.update({"figure.figsize": (10, 8)})
scatter_ax = sns.scatterplot(
    data=df, x='Insulin', y='BloodPressure', hue='Outcome', s=60, alpha=0.8
)
scatter_ax.set_xticks([0, 166, 200, 400, 600])
scatter_ax.set_title('Insulin vs Blood Pressure')

The graph above also illustrates that most persons with insulin levels in the normal range do not have diabetes

In [None]:
# Glucose against age, points coloured by Outcome
plt.rcParams.update({"figure.figsize": (10, 8)})
sns.scatterplot(
    data=df, x='Glucose', y='Age', hue='Outcome', s=60, alpha=0.8
).set_title('Glucose vs Age')

Young people (≈21 – 40), and people with an average glucose concentration are less likely to have diabetes.

In [None]:
# BMI against age, points coloured by Outcome
plt.rcParams.update({"figure.figsize": (10, 8)})
sns.scatterplot(
    data=df, x='BMI', y='Age', hue='Outcome', s=60, alpha=0.8
).set_title('BMI vs Age')

Most persons between the ages of 21 and 40 with a BMI less than 30 do not have diabetes

In [None]:
# Skin thickness against the diabetes pedigree function, points coloured by Outcome
plt.rcParams.update({"figure.figsize": (10, 8)})
sns.scatterplot(
    data=df, x='SkinThickness', y='DiabetesPedigreeFunction', hue='Outcome', s=60, alpha=0.8
).set_title('Skin Thickness vs DPF')

Most people with a skin thickness of less than 30 and a DPF of less than one do not have diabetes.

## MODEL BUILDING

In [None]:
# Function to split the data into independent 'X' and dependent 'y' variables (predictor and target variables)
@st.cache(persist=True)
def split(df):
    """Standardize the predictors and split the data into train and test sets.

    Args:
        df: Dataframe containing the predictor columns and an 'Outcome'
            target column.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` holding 80% of the
        rows for training and 20% for testing (``random_state=0``).

    Note:
        The scaler is fitted on all rows before the split, so the test rows
        influence the scaling statistics.
    """
    X = df.drop(columns='Outcome')
    y = df['Outcome']
    # Put all variables in X on the same scale. The column names and index are
    # taken from X itself, so the labels always match the actual column order.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    # Split the dataset into 80% Training set and 20% Testing set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)
    return X_train, X_test, y_train, y_test

# Standardized train/test partitions produced by split()
X_train, X_test, y_train, y_test = split(df)

# Create and train the model
# NOTE(review): the next line rebinds the imported class name to a model
# instance. Later code reads the fitted model through this name, so it cannot
# be renamed here alone; executing this statement a second time without
# re-running the import would call the instance instead of the class.
GradientBoostingClassifier = GradientBoostingClassifier()
GradientBoostingClassifier.fit(X_train, y_train)

In [None]:
# Function for the input from the user
def get_user_input():
    """Collect the eight feature values from sidebar number inputs.

    Returns:
        A one-row dataframe (index 0) whose columns are the widget labels:
        'pregnancies', 'glucose', 'blood_pressure', 'skin_thickness',
        'insulin', 'BMI', 'DPF' and 'age'.
    """
    # (label, minimum value, default value, step) for each widget, in display order
    field_specs = (
        ('pregnancies', 0, 0, 1),
        ('glucose', 0.00, 121.69, 0.01),
        ('blood_pressure', 0.00, 72.42, 0.01),
        ('skin_thickness', 0.00, 29.24, 0.01),
        ('insulin', 0.00, 159.99, 0.01),
        ('BMI', 0.00, 32.44, 0.01),
        ('DPF', 0.00, 0.47, 0.01),
        ('age', 0, 33, 1),
    )

    # Each label serves both as the widget caption and as the column name
    user_data = {
        label: st.sidebar.number_input(
            label, min_value=lowest, max_value=None, value=default, step=step
        )
        for label, lowest, default, step in field_specs
    }

    return pd.DataFrame(user_data, index=[0])

# Read the sidebar widgets and keep the resulting one-row dataframe
user_input = get_user_input()

In [None]:
st.subheader('User Input :')  # Heading for the user input table
st.write(user_input)  # Display the one-row dataframe of user-entered values

### Boxplots - SHOW SUBPLOTS in APP

In [None]:
# PREGNANCIES

# Set figure size
fig1, ax1 = plt.subplots(figsize=(10,2))
# Box plot of the dataset's 'Pregnancies' column. The column is passed as `x=`
# so the box lies horizontally on every seaborn version (newer releases treat
# a positional Series as `data` and draw it vertically).
sns.boxplot(x=df['Pregnancies'], color="plum", width=.5, ax=ax1)
# Overlay the user's 'pregnancies' value as a single point
sns.stripplot(data = user_input['pregnancies'], orient = 'h', color = "purple", ax=ax1)

# Title and x-label
ax1.set_title('PREGNANCIES Distribution', fontsize=14)
ax1.set_xlabel('Values')

# Show full range of possible values on x-axis
ax1.set(xlim=(0,18))

# Set the box color / style for the text input on the chart
props = dict(boxstyle='round', facecolor='plum', alpha=0.2)
# Add text input on the chart
ax1.text(0, 0.4, "Purple Dot = User Input", fontsize=8, bbox=props)

# Help to fit everything in the figure
fig1.tight_layout()
# Display the figure in streamlit
st.pyplot(fig1)

In [None]:
# GLUCOSE
fig2, ax2 = plt.subplots(figsize=(10,2))
# Pass the column as `x=` so the box lies horizontally on every seaborn version
# (newer releases treat a positional Series as `data` and draw it vertically).
sns.boxplot(x=df['Glucose'], color="red", width=.5, ax=ax2)
# Overlay the user's 'glucose' value as a single point
sns.stripplot(data = user_input['glucose'], orient = 'h', color = "black", ax=ax2)

ax2.set_title('GLUCOSE Distribution', fontsize=14)
ax2.set_xlabel('Values in mg/dL')

ax2.set(xlim=(0,200))

# Text boxes placed along the x-axis at fixed glucose values
props = dict(boxstyle='round', facecolor='red', alpha=0.2)
ax2.text(72, 0.4, " Acceptable ", fontsize=10, bbox=props)
ax2.text(99, 0.4, " Good Concentration", fontsize=10, bbox=props)
ax2.text(140, 0.4, "OK if 2h post eating", fontsize=10, bbox=props)
ax2.text(170, 0.4, "Too much", fontsize=10, bbox=props)

fig2.tight_layout()
st.pyplot(fig2)

In [None]:
# BLOOD PRESSURE
fig3, ax3 = plt.subplots(figsize=(10,2))
# Pass the column as `x=` so the box lies horizontally on every seaborn version
# (newer releases treat a positional Series as `data` and draw it vertically).
sns.boxplot(x=df['BloodPressure'], color="grey", width=.5, ax=ax3)
# Overlay the user's 'blood_pressure' value as a single point
sns.stripplot(data = user_input['blood_pressure'], orient = 'h', color = "red", ax=ax3)

ax3.set_title('BLOOD PRESSURE Distribution', fontsize=14)
ax3.set_xlabel('Values in mmHg')

ax3.set(xlim=(0,250))

# Text boxes placed along the x-axis at fixed blood pressure values
props = dict(boxstyle='round', facecolor='black', alpha=0.2)
ax3.text(80, 0.4, " Normal Tension ", fontsize=10, bbox=props)
ax3.text(120, 0.4, " Tension too high ", fontsize=10, bbox=props)

fig3.tight_layout()
st.pyplot(fig3)

In [None]:
# SKIN THICKNESS
fig4, ax4 = plt.subplots(figsize=(10,2))
# Pass the column as `x=` so the box lies horizontally on every seaborn version
# (newer releases treat a positional Series as `data` and draw it vertically).
sns.boxplot(x=df['SkinThickness'], color="plum", width=.5, ax=ax4)
# Overlay the user's 'skin_thickness' value as a single point
sns.stripplot(data = user_input['skin_thickness'], orient = 'h', color = "purple", ax=ax4)

ax4.set_title('SKIN THICKNESS Distribution', fontsize=14)
ax4.set_xlabel('Values in mm')

ax4.set(xlim=(0,100))

# Text boxes placed along the x-axis at fixed skin thickness values
props = dict(boxstyle='round', facecolor='plum', alpha=0.2)
ax4.text(2.5, 0.4, "   Normal for men   ", fontsize=10, bbox=props)
ax4.text(18, 0.4, "   Normal for women   ", fontsize=10, bbox=props)

fig4.tight_layout()
st.pyplot(fig4)

In [None]:
# INSULIN
fig5, ax5 = plt.subplots(figsize=(10,2))
# Pass the column as `x=` so the box lies horizontally on every seaborn version
# (newer releases treat a positional Series as `data` and draw it vertically).
sns.boxplot(x=df['Insulin'], color="red", width=.5, ax=ax5)
# Overlay the user's 'insulin' value as a single point
sns.stripplot(data = user_input['insulin'], orient = 'h', color = "black", ax=ax5)

ax5.set_title('INSULIN Distribution', fontsize=14)
ax5.set_xlabel('Values in mlU/L')

ax5.set(xlim=(0,999))

# Text boxes placed along the x-axis at fixed insulin values
props = dict(boxstyle='round', facecolor='red', alpha=0.2)
ax5.text(20, 0.4, "Acceptable Insulin Level", fontsize=10, bbox=props)
ax5.text(160, 0.4, "High Insulin Level", fontsize=10, bbox=props)

fig5.tight_layout()
st.pyplot(fig5)

In [None]:
# BMI
fig6, ax6 = plt.subplots(figsize=(10,2))
# Pass the column as `x=` so the box lies horizontally on every seaborn version
# (newer releases treat a positional Series as `data` and draw it vertically).
sns.boxplot(x=df['BMI'], color="grey", width=.5, ax=ax6)
# Overlay the user's 'BMI' value as a single point
sns.stripplot(data = user_input['BMI'], orient = 'h', color = "red", ax=ax6)

ax6.set_title('BODY MASS INDEX Distribution', fontsize=14)
ax6.set_xlabel('Values')

ax6.set(xlim=(0,70))

# Text boxes placed along the x-axis at fixed BMI values
props = dict(boxstyle='round', facecolor='black', alpha=0.2)
ax6.text(18, 0.4, "Thin", fontsize=10, bbox=props)
ax6.text(24.9, 0.4, "Normal", fontsize=10, bbox=props)
ax6.text(29.9, 0.4, "Thick", fontsize=10, bbox=props)

fig6.tight_layout()
st.pyplot(fig6)

In [None]:
# Diabetes Pedigree Function
fig7, ax7 = plt.subplots(figsize=(10,2))
# Pass the column as `x=` so the box lies horizontally on every seaborn version
# (newer releases treat a positional Series as `data` and draw it vertically).
sns.boxplot(x=df['DiabetesPedigreeFunction'], color="plum", width=.5, ax=ax7)
# Overlay the user's 'DPF' value as a single point
sns.stripplot(data = user_input['DPF'], orient = 'h', color = "purple", ax=ax7)

ax7.set_title('DIABETES PEDIGREE F° Distribution', fontsize=14)
ax7.set_xlabel('Values in %')

ax7.set(xlim=(0,2.500))

# Text boxes placed along the x-axis at fixed DPF values
props = dict(boxstyle='round', facecolor='plum', alpha=0.2)
ax7.text(.050, 0.4, "50% diabete risk", fontsize=10, bbox=props)
ax7.text(.100, 0.4, "100% diabete risk", fontsize=10, bbox=props)

fig7.tight_layout()
st.pyplot(fig7)

In [None]:
# Age
fig8, ax8 = plt.subplots(figsize=(10,2))
# Pass the column as `x=` so the box lies horizontally on every seaborn version
# (newer releases treat a positional Series as `data` and draw it vertically).
sns.boxplot(x=df['Age'], color="red", width=.5, ax=ax8)
# Overlay the user's 'age' value as a single point
sns.stripplot(data = user_input['age'], orient = 'h', color = "black", ax=ax8)

ax8.set_title('AGE Distribution', fontsize=14)
ax8.set_xlabel('Values')

ax8.set(xlim=(0,110))

# Text-box style kept for parity with the other charts; no annotation is drawn on this one
props = dict(boxstyle='round', facecolor='red', alpha=0.2)

fig8.tight_layout()
st.pyplot(fig8)

### Show the MODEL METRICS

### Evaluation Metrics: we need to know what False Positives and False Negatives are, and the difference between the two

In [None]:
st.subheader('Accessing Performance:')
gb_model = GradientBoostingClassifier
# The model was fitted on standardized features labelled with the dataset's
# column names, so the raw user input must be put on the same scale and given
# the same column labels before predicting. A scaler refitted on the same
# predictor columns reproduces the statistics used inside split().
# Assumes the user input columns are in the same order as the dataset's
# predictor columns -- confirm if either side is reordered.
input_scaler = StandardScaler().fit(df.drop(columns='Outcome').to_numpy())
scaled_user_input = pd.DataFrame(input_scaler.transform(user_input.to_numpy()),
                                 columns=X_train.columns)
# Store the user inputs predictions in a variable
prediction = gb_model.predict(scaled_user_input)

In [None]:
# Function to plot a confusion matrix of the model
def confusion_matrix_plot(y_test, prediction2):
    """Plot the confusion matrix of the predictions and return the figure.

    Args:
        y_test: True class labels.
        prediction2: Predicted class labels for the same samples.

    Returns:
        The matplotlib figure created by ``plot_confusion_matrix``, so the
        caller can render this exact figure rather than relying on pyplot's
        global state.
    """
    cm = confusion_matrix(y_test, prediction2)
    classes = ['0', '1']
    figure, _ = plot_confusion_matrix(conf_mat=cm,
                                      class_names=classes,
                                      show_absolute=True,
                                      show_normed=False,
                                      colorbar=True)
    return figure
# Store the model predictions in a variable
prediction2 = gb_model.predict(X_test)

fig = confusion_matrix_plot(y_test, prediction2)
# Always hand Streamlit an explicit figure: fall back to the current pyplot
# figure if the helper did not return one.
if fig is None:
    fig = plt.gcf()
# Display the confusion matrix in streamlit app
st.pyplot(fig)
# Display the accuracy score of the model (in %). Formula = (TP+TN)/(TP+TN+FP+FN)
# Model Accuracy ==> (diabetics classified as diabetics + non-diabetics classified
# as non-diabetics) / total number of diabetics and non-diabetics
# An effective measure when there are as many diabetics as non-diabetics (balanced data)
# Formatting with '.1%' avoids float artefacts such as '81.80000000000001%'.
st.write("Model Accuracy: ", f"{accuracy_score(y_test, prediction2):.1%}")


## FEATURE IMPORTANCES

In [None]:
# Section heading shown in the app above the feature importance chart
st.subheader('Feature Importances')

In [None]:
# Create a function to plot feature importances
# (Deliberately not cached: the function works by drawing a figure, and a cache
# hit would skip the drawing and leave nothing to display.)
def plot_feature_importance(importance, names, model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Importance value of each feature.
        names: Feature names, in the same order as ``importance``.
        model_type: Text prepended to the chart title.

    Returns:
        The matplotlib figure holding the chart.
    """
    # Pair every feature name with its importance
    fi_df = pd.DataFrame({'feature_names': np.array(names),
                          'feature_importance': np.array(importance)})

    # Sort the DataFrame in order of decreasing feature importance
    fi_df = fi_df.sort_values(by='feature_importance', ascending=False)

    # Define size of bar plot
    fig, ax = plt.subplots(figsize=(10, 8))
    # Plot Seaborn bar chart
    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'], ax=ax)
    # Add chart labels
    ax.set_title(model_type + ' FEATURE IMPORTANCE')
    ax.set_xlabel('FEATURE IMPORTANCE')
    ax.set_ylabel('FEATURE NAMES')
    return fig
    
# Plot the fitted model's feature importances against the training column names
fig9 = plot_feature_importance(gb_model.feature_importances_,X_train.columns,'GB Classifier ')
# Switch off the 'deprecation.showPyplotGlobalUse' option; presumably this hides
# the warning Streamlit shows when st.pyplot is called without an explicit
# figure -- confirm against the installed Streamlit version.
st.set_option('deprecation.showPyplotGlobalUse', False)
# Render the chart in the app
st.pyplot(fig9)

## CLASSIFICATION + Probability Diabetic Detection

In [None]:
# Display the classification
st.subheader('Classification: ')
st.write(prediction)
# Display the precision score of the model in %. Formula = TP / (TP + FP)
# Precision Score ==> correctly classified diabetics / (correctly classified diabetics
# + non-diabetics incorrectly classified as diabetics)
# A metric suited to imbalanced data
# Formatting with '.1%' avoids float artefacts such as '74.50000000000001%'.
st.write("Precision Score: ", f"{precision_score(y_test, gb_model.predict(X_test)):.1%}")