In [1]:
%pip install -r requirements.txt

Collecting brotli==1.0.9 (from -r requirements.txt (line 1))
  Using cached Brotli-1.0.9-cp310-cp310-macosx_10_9_x86_64.whl.metadata (1.4 kB)
Collecting cached-property==1.5.2 (from -r requirements.txt (line 2))
  Using cached cached_property-1.5.2-py2.py3-none-any.whl.metadata (11 kB)
Collecting fqdn==1.5.1 (from -r requirements.txt (line 3))
  Using cached fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting h2==4.1.0 (from -r requirements.txt (line 4))
  Using cached h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting imblearn==0.0 (from -r requirements.txt (line 5))
  Using cached imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting importlib-resources==6.4.5 (from -r requirements.txt (line 7))
  Using cached importlib_resources-6.4.5-py3-none-any.whl.metadata (4.0 kB)
Collecting isoduration==20.11.0 (from -r requirements.txt (line 8))
  Using cached isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting jaraco.collections==5.1.0 (from -r requirements

In [None]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
cdc_diabetes_health_indicators = fetch_ucirepo(id=891) 
  
# data (as pandas dataframes) 
X = cdc_diabetes_health_indicators.data.features 
y = cdc_diabetes_health_indicators.data.targets 
  
# metadata 
print(cdc_diabetes_health_indicators.metadata) 
  
# variable information 
print(cdc_diabetes_health_indicators.variables) 


In [None]:
# what's the size of the dataset?
print(X.shape)
print(y.shape)

In [None]:
# create a dataframe
import pandas as pd
df = pd.DataFrame(X)
df['Diabetes_binary'] = y
df.head()

In [5]:
# save as csv
df.to_csv('./data/cdc_diabetes_health_indicators.csv', index=False)

# save as pickle
df.to_pickle('cdc_diabetes_health_indicators.pkl')

In [2]:
# no class weights or SMOTE
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib

file_path = './data/cdc_diabetes_health_indicators.csv'
diabetes_data = pd.read_csv(file_path)

X = diabetes_data.drop('Diabetes_binary', axis=1)
y = diabetes_data['Diabetes_binary']

# the simple imputer fills in missing values with the mean
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# prepare the training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train_scaled, y_train)

# make predictions
y_pred = logistic_model.predict(X_test_scaled)

# model evaluation
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)



Accuracy: 0.865874329864396

Confusion Matrix:
 [[42773   966]
 [ 5839  1158]]

Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.98      0.93     43739
           1       0.55      0.17      0.25      6997

    accuracy                           0.87     50736
   macro avg       0.71      0.57      0.59     50736
weighted avg       0.83      0.87      0.83     50736



In [3]:
joblib.dump(logistic_model, './models/logistic_regression_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

print("Model and scaler saved successfully.")


Model and scaler saved successfully.


In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import ADASYN
import joblib

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Applying ADASYN to balance the classes in the training set
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train_scaled, y_train)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=42
)
rf_model.fit(X_resampled, y_resampled)

# save the model and scaler
joblib.dump(rf_model, './models/random_forest_diabetes_model.pkl')
joblib.dump(scaler, './models/scaler.pkl')

print("Model and scaler saved successfully.")


Model and scaler saved successfully.


In [43]:
# Read database for testing
import sqlite3
import pandas as pd

conn = sqlite3.connect('user_predictions.db')
# cursor = conn.cursor()
# cursor.execute('''
# UPDATE predictions 
# SET date = DATE('now');
# ''')

conn.commit()
def display_table_data(table_name):
    query = f"SELECT * FROM {table_name}"
    df = pd.read_sql_query(query, conn)
    pd.set_option('display.max_columns', None)
    if not df.empty:
        print(f"Contents of table '{table_name}':")
        print(df)
    else:
        print(f"Table '{table_name}' is empty.")

tables = ['users', 'predictions']
for table in tables:
    display_table_data(table)
conn.close()

Contents of table 'users':
   id       name                 email       unique_id
0   1      test         test@email.com       test_test
1   3       test                 email           test1
2   4          f           f@gmail.com              f1
3   5  Test User  testuser@example.com  generated_user
Contents of table 'predictions':
       id         user_id HighBP HighChol CholCheck        BMI Smoker Stroke  \
0       1            None      0        0         0  22.857143      0      0   
1       2            None      0        0         0  22.857143      0      0   
2       3            None      0        0         0  22.857143      0      0   
3       4           test1      0        0         0  22.857143      0      0   
4       5               5     No       No        No  25.000000     No     No   
..    ...             ...    ...      ...       ...        ...    ...    ...   
730  5110  generated_user     No       No        No  30.000000     No     No   
731  5111  generated_user

In [42]:
# Generate data for a test user
import random
from datetime import datetime, timedelta

conn = sqlite3.connect('user_predictions.db')
cursor = conn.cursor()

def insert_user_data(user_id, user_data, current_date):

    cursor.execute('''
        SELECT id FROM users WHERE unique_id = ?
    ''', (user_id,))
    existing_user = cursor.fetchone()

    if not existing_user:
        cursor.execute('''
        INSERT INTO users (name, email, unique_id) VALUES (?, ?, ?)
        ''', (user_data['name'], user_data['email'], user_id))
        user_id = cursor.lastrowid  

    prediction_result = "No Diabetes Present" if random.random() > 0.5 else "Diabetes Present"
    diabetes_prob = random.random()

    cursor.execute('''
    INSERT INTO predictions (
        user_id, HighBP, HighChol, CholCheck, BMI, Smoker, Stroke,
        HeartDiseaseorAttack, PhysActivity, Fruits, Veggies, 
        HvyAlcoholConsump, AnyHealthcare, NoDocbcCost, GenHlth, 
        MentHlth, PhysHlth, DiffWalk, Sex, Age, Education, Income, Prediction, Probability, date
    ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', (
        user_id,
        user_data['HighBP'], user_data['HighChol'], user_data['CholCheck'], user_data['BMI'],
        user_data['Smoker'], user_data['Stroke'], user_data['HeartDiseaseorAttack'],
        user_data['PhysActivity'], user_data['Fruits'], user_data['Veggies'],
        user_data['HvyAlcoholConsump'], user_data['AnyHealthcare'], user_data['NoDocbcCost'],
        user_data['GenHlth'], user_data['MentHlth'], user_data['PhysHlth'],
        user_data['DiffWalk'], user_data['Sex'], user_data['Age'],
        user_data['Education'], user_data['Income'], prediction_result, diabetes_prob, current_date
    ))

def generate_data_for_user(user_id, start_date, num_days=730):
    user_data = {
        'name': 'Test User',
        'email': 'testuser@example.com',
        'HighBP': 'No',
        'HighChol': 'No',
        'CholCheck': 'No',
        'BMI': 25,
        'Smoker': 'No',
        'Stroke': 'No',
        'HeartDiseaseorAttack': 'No',
        'PhysActivity': 'Yes',
        'Fruits': 'Yes',
        'Veggies': 'Yes',
        'HvyAlcoholConsump': 'No',
        'AnyHealthcare': 'Yes',
        'NoDocbcCost': 'No',
        'GenHlth': 3,
        'MentHlth': 0,
        'PhysHlth': 0,
        'DiffWalk': 'No',
        'Sex': 1,
        'Age': 3,
        'Education': 4,
        'Income': 6
    }

    start_date = datetime.strptime(start_date, '%Y-%m-%d').date()
    print(start_date)
    start_date = start_date.replace(year=start_date.year - 2)

    current_date = start_date

    for i in range(num_days):
        if i > 365 and i <= 548:  
            user_data['BMI'] = 30  

        insert_user_data(user_id, user_data, current_date)

        current_date += timedelta(days=1)
        today_date = current_date.strftime('%Y-%m-%d')

user_id = "generated_user"  
start_date = "2022-01-01" 
generate_data_for_user(user_id, start_date)

conn.commit()
conn.close()


2022-01-01


In [41]:
conn = sqlite3.connect('user_predictions.db')
cursor = conn.cursor()

cursor.execute('''
    DELETE FROM predictions
    WHERE user_id = 'generated_user'
''')
conn.commit()
conn.close()