In [5]:
import pickle
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

# Load the dataset
data = pd.read_csv('processed_diabetes.csv')

# Display the first few rows of the dataset
print(data.head())

# Preprocessing
# Check for missing values
print(data.isnull().sum())

# Define the feature variables (X) and the target variable (y).
# The target column is 'Diabetes_012'; every other column is used as a feature.
TARGET_COLUMN = 'Diabetes_012'
X = data.drop(columns=[TARGET_COLUMN])

# Collapse the target to binary: 0 stays 0, while 1 and 2 are both treated as
# the positive class (1).
BINARY_LABELS = {0: 0, 1: 1, 2: 1}
y = data[TARGET_COLUMN].map(BINARY_LABELS)

# Series.map() turns every label that is missing from the dict into NaN.
# Fail fast and report the offending labels instead of handing NaN targets
# to the estimators below.
unmapped = y.isnull()
if unmapped.any():
    raise ValueError(
        f"Unexpected values in '{TARGET_COLUMN}': "
        f"{data.loc[unmapped, TARGET_COLUMN].unique().tolist()}"
    )

# Split the dataset into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Optional: fit the same model with statsmodels to get coefficient-level statistics
X_train_sm = sm.add_constant(X_train)  # Add a constant column for the intercept
logit_model = sm.Logit(y_train, X_train_sm)
result = logit_model.fit()

# Print the summary of the logistic regression model
print(result.summary())

#

   Diabetes_012  HighBP  HighChol   BMI  Smoker  Stroke  HeartDiseaseorAttack  \
0           0.0     1.0       1.0  40.0     1.0     0.0                   0.0   
1           0.0     0.0       0.0  25.0     1.0     0.0                   0.0   
2           0.0     1.0       1.0  28.0     0.0     0.0                   0.0   
3           0.0     1.0       0.0  27.0     0.0     0.0                   0.0   
4           0.0     1.0       1.0  24.0     0.0     0.0                   0.0   

   PhysActivity  Fruits  Veggies  HvyAlcoholConsump  AnyHealthcare  GenHlth  \
0           0.0     0.0      1.0                0.0            1.0      5.0   
1           1.0     0.0      0.0                0.0            0.0      3.0   
2           0.0     1.0      0.0                0.0            1.0      5.0   
3           1.0     1.0      1.0                0.0            1.0      2.0   
4           1.0     1.0      1.0                0.0            1.0      2.0   

   MentHlth  PhysHlth  DiffWalk  Sex  

In [7]:

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler


# NOTE(review): this cell reuses `data` loaded by the earlier cell; it is not
# loaded here, so this code cannot run on its own (e.g. via `streamlit run`)
# until the CSV is read first.
print(data.isnull().sum())

# Define the feature variables (X) and the target variable (y).
# The target column is 'Diabetes_012'; every other column is used as a feature.
TARGET_COLUMN = 'Diabetes_012'
X = data.drop(columns=[TARGET_COLUMN])

# Collapse the target to binary: 0 stays 0, while 1 and 2 are both treated as
# the positive class (1).
y = data[TARGET_COLUMN].map({0: 0, 1: 1, 2: 1})

# Series.map() turns every label that is missing from the dict into NaN.
# Fail fast and report the offending labels instead of training on NaN targets.
unmapped = y.isnull()
if unmapped.any():
    raise ValueError(
        f"Unexpected values in '{TARGET_COLUMN}': "
        f"{data.loc[unmapped, TARGET_COLUMN].unique().tolist()}"
    )

# Split the dataset into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred = log_reg.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Streamlit app
st.title('Diabetes Prediction App')

# Input features
st.header('Enter Patient Details:')

# Allowed range for each known feature's input widget. Entries for columns that
# are absent from the training data (e.g. 'CholCheck', 'NoDocbcCost' in the
# current CSV) are simply not used.
# NOTE(review): verify that the lower bound of 18 for 'Age' matches how Age is
# encoded in the training data.
BINARY = {'min_value': 0, 'max_value': 1}
WIDGET_BOUNDS = {
    'HighBP': BINARY,
    'HighChol': BINARY,
    'CholCheck': BINARY,
    'BMI': {'min_value': 0.0},
    'Smoker': BINARY,
    'Stroke': BINARY,
    'HeartDiseaseorAttack': BINARY,
    'PhysActivity': BINARY,
    'Fruits': BINARY,
    'Veggies': BINARY,
    'HvyAlcoholConsump': BINARY,
    'AnyHealthcare': BINARY,
    'NoDocbcCost': BINARY,
    'GenHlth': {'min_value': 1, 'max_value': 5},
    'MentHlth': {'min_value': 0},
    'PhysHlth': {'min_value': 0},
    'DiffWalk': BINARY,
    'Sex': BINARY,
    'Age': {'min_value': 18},
    'Education': {'min_value': 1, 'max_value': 6},
    'Income': {'min_value': 1, 'max_value': 8},
}

# Render one widget per column the model was actually trained on, in training
# order. Driving the form from X.columns keeps the prediction input aligned
# with the fitted model: the previous hard-coded form also sent 'CholCheck'
# and 'NoDocbcCost', which the model never saw, so predict() was given more
# columns than it was fitted on.
patient_inputs = {
    feature: st.number_input(feature, **WIDGET_BOUNDS.get(feature, {}))
    for feature in X.columns
}

# Create a button to predict
if st.button('Predict'):
    # Single-row frame with exactly the training columns, in the training order
    input_data = pd.DataFrame([patient_inputs], columns=X.columns)

    # Make the prediction using the trained model
    prediction = log_reg.predict(input_data)

    # Display the prediction
    if prediction[0] == 1:
        st.write('Prediction: The patient is likely to have diabetes.')
    else:
        st.write('Prediction: The patient is likely not to have diabetes.')


# Display model performance metrics
st.header('Model Performance')
st.write(f'Accuracy: {accuracy:.2f}')
st.write('Confusion Matrix:')
st.write(conf_matrix)
st.write('Classification Report:')
# classification_report() returns a whitespace-aligned plain-text table;
# st.text keeps it fixed-width, whereas st.write would render it as Markdown
# and collapse the alignment.
st.text(class_report)

Diabetes_012            0
HighBP                  0
HighChol                0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64


2024-11-18 13:54:18.185 
  command:

    streamlit run /Users/eseyhaile/Library/Python/3.12/lib/python/site-packages/ipykernel_launcher.py [ARGUMENTS]
2024-11-18 13:54:18.197 Session state does not function when running a script without `streamlit run`
