In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVR
from sklearn.compose import make_column_transformer
import pickle
import firebase_admin as fb_a
from firebase_admin import credentials, db

In [2]:
def read_models():
    hd_df = pd.read_csv("Datasets/hd_dataset.csv")
    hd_X, hd_y = hd_df.drop(columns='HeartDisease'), hd_df['HeartDisease']

    lc_df = pd.read_csv("Datasets/lc_dataset.csv")
    lc_X, lc_y = lc_df.drop(columns=['index', 'Patient Id', 'Level']), lc_df['Level']

    cc_df = pd.read_csv("Datasets/cc_dataset.csv")
    cc_X, cc_y = cc_df.drop(columns='hasColonCancer'), cc_df['hasColonCancer']
    
    return hd_X, hd_y, lc_X, lc_y, cc_X, cc_y

hd_X, hd_y, lc_X, lc_y, cc_X, cc_y = read_models()

In [3]:
yn_map = {'Yes': 1, 'No': 0}
yn_cols = ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']

hd_X[yn_cols] = hd_X[yn_cols].replace(yn_map)
hd_X['Sex'] = hd_X['Sex'].replace({'Male': 0, 'Female': 1})

In [4]:
def split_categories(features):
    cat_features = features.select_dtypes(include=['object', 'bool'])
    num_features = features.select_dtypes(include=['int64', 'float64'])
    return cat_features, num_features

hd_cat, hd_num = split_categories(hd_X)
lc_cat, lc_num = split_categories(lc_X)
cc_cat, cc_num = split_categories(cc_X)

In [8]:
def create_scalers(hd_num, lc_num, cc_num):
    hd_mms = MinMaxScaler()

    hd_mms.fit(hd_num)
    num_portion = hd_mms.transform(hd_num)
    num_feature_names = hd_mms.get_feature_names_out(hd_num.columns)

    lc_mms = MinMaxScaler()

    lc_mms.fit(lc_num)
    num_portion = lc_mms.transform(lc_num)
    num_feature_names = lc_mms.get_feature_names_out(lc_num.columns)

    cc_mms = MinMaxScaler()

    cc_mms.fit(cc_num)
    num_portion = cc_mms.transform(cc_num)
    num_feature_names = cc_mms.get_feature_names_out(cc_num.columns)
    
    return hd_mms, lc_mms, cc_mms

save_scalers(*create_scalers(hd_num, lc_num, cc_num))

In [9]:
def transform_df(cat_cols, num_cols):
    ohe = OneHotEncoder()
    mms = MinMaxScaler()
    
    ohe.fit(cat_cols)
    cat_portion = ohe.transform(cat_cols).toarray()
    cat_feature_names = ohe.get_feature_names_out(cat_cols.columns)
    
    mms.fit(num_cols)
    num_portion = mms.transform(num_cols)
    num_feature_names = mms.get_feature_names_out(num_cols.columns)
    
    result = pd.concat([pd.DataFrame(cat_portion,columns=cat_feature_names).astype(int), 
               pd.DataFrame(num_portion,columns=num_feature_names).astype(float)], axis=1)
    
    return result

hd_X = transform_df(hd_cat, hd_num)
lc_X = transform_df(lc_cat, lc_num)
cc_X = transform_df(cc_cat, cc_num)

In [11]:
def train_perceptron(X, y, n_splits=10, shuffle=True):
    kf = KFold(n_splits = n_splits, shuffle = shuffle)
    model = Perceptron()
    accuracy_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        model.fit(X_train, y_train)
    
        y_pred = model.predict(X_test)
    
        accuracy = accuracy_score(y_test, y_pred)
        accuracy_scores.append(accuracy)
        
    mean_scores = np.mean(accuracy_scores)
    
    return model, mean_scores
    
hd_model, hd_mean_acc = train_perceptron(hd_X, hd_y)
lc_model, lc_mean_acc = train_perceptron(lc_X, lc_y)
cc_model, cc_mean_acc = train_perceptron(cc_X, cc_y)

In [12]:
def find_best_n_splits(X, y):
    best_accuracy = 0
    best_n_splits = None
    best_model = None
    for i in range(2, 10):
        current_model, current_accuracy = train_perceptron(X, y, n_splits=i)
        if current_accuracy > best_accuracy:
            best_n_splits = i
            best_accuracy = current_accuracy
            best_model = current_model
            
    return best_accuracy, best_model
   
    print(f'Best number of splits for current model: {best_n_splits}')

hd_best = None
hd_acc = 0
lc_best = None
lc_acc = 0
cc_best = None
cc_acc = 0

for i in range(10):
    print(f'=== Heart Disease ===')
    hd_new_acc, hd_curr = find_best_n_splits(hd_X, hd_y)
    if hd_new_acc > hd_acc:
        hd_best = hd_curr
        hd_acc = hd_new_acc
        
    print(f'=== Lung Cancer ===')
    lc_new_acc, lc_curr = find_best_n_splits(lc_X, lc_y)
    if lc_new_acc > lc_acc:
        lc_best = lc_curr
        lc_acc = lc_new_acc
        
    print(f'=== Colon Cancer ===')
    cc_new_acc, cc_curr = find_best_n_splits(cc_X, cc_y)
    if cc_new_acc > cc_acc:
        cc_best = cc_curr
        cc_acc = cc_new_acc

=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===
=== Heart Disease ===
=== Lung Cancer ===
=== Colon Cancer ===


In [13]:
hd_acc, lc_acc, cc_acc

(0.913750997963485, 0.9750138277661544, 0.8480000000000001)

In [14]:
def save_models(hd_model, lc_model, cc_model):
    hd_filename = 'hd_model.sav'
    lc_filename = 'lc_model.sav'
    cc_filename = 'cc_model.sav'

    pickle.dump(hd_model, open(hd_filename, 'wb'))
    pickle.dump(lc_model, open(lc_filename, 'wb'))
    pickle.dump(cc_model, open(cc_filename, 'wb'))
    
save_models(hd_best, lc_best, cc_best)

In [16]:
hd_X.columns, lc_X.columns, cc_X.columns

(Index(['AgeCategory_18-24', 'AgeCategory_25-29', 'AgeCategory_30-34',
        'AgeCategory_35-39', 'AgeCategory_40-44', 'AgeCategory_45-49',
        'AgeCategory_50-54', 'AgeCategory_55-59', 'AgeCategory_60-64',
        'AgeCategory_65-69', 'AgeCategory_70-74', 'AgeCategory_75-79',
        'AgeCategory_80 or older', 'Race_American Indian/Alaskan Native',
        'Race_Asian', 'Race_Black', 'Race_Hispanic', 'Race_Other', 'Race_White',
        'Diabetic_No', 'Diabetic_No, borderline diabetes', 'Diabetic_Yes',
        'Diabetic_Yes (during pregnancy)', 'GenHealth_Excellent',
        'GenHealth_Fair', 'GenHealth_Good', 'GenHealth_Poor',
        'GenHealth_Very good', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
        'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex',
        'PhysicalActivity', 'SleepTime', 'Asthma', 'KidneyDisease',
        'SkinCancer'],
       dtype='object'),
 Index(['Age', 'Gender', 'Air Pollution', 'Alcohol use', 'Dust Allergy',
        'OccuPational Hazard

In [6]:
def save_scalers(hd_scaler, lc_scaler, cc_scaler):
    hd_filename = 'hd_scaler.sav'
    lc_filename = 'lc_scaler.sav'
    cc_filename = 'cc_scaler.sav'
    
    pickle.dump(hd_scaler, open(hd_filename, 'wb'))
    pickle.dump(lc_scaler, open(lc_filename, 'wb'))
    pickle.dump(cc_scaler, open(cc_filename, 'wb'))

In [None]:
def load_models():
    hd_filename = 'hd_model.sav'
    lc_filename = 'lc_model.sav'
    cc_filename = 'cc_model.sav'
    
    hd_model = pickle.load(open(hd_filename, 'rb'))
    lc_model = pickle.load(open(lc_filename, 'rb'))
    cc_model = pickle.load(open(cc_filename, 'rb'))
    
    return hd_model, lc_model, cc_model

In [None]:
def load_scaler():
    scaler_filename = 'scaler.sav'
    scaler = pickle.load(open(scaler_filename, 'rb'))
    
    return scaler

In [23]:
def load_input(reference):
    cred = credentials.Certificate("dfos-healthai-firebase-adminsdk-aiz5j-a013f16cbc.json")
    fb_a.initialize_app(cred, {'databaseURL': 'https://dfos-healthai-default-rtdb.firebaseio.com/'})
    
    ref = db.reference(reference)
    data = ref.get()

    fb_a.delete_app(fb_a.get_app())
    
    return data
    
user_form = load_input('/Users/MWt7bidcbHTC1Mrj6Qp7nBxmaJH2/forms/-NkV5Lw7eexTNWPrfLfs')
user_form

{'age': 21,
 'airPollution': 1,
 'alcoholUse': 1,
 'balancedDiet': 1,
 'bmi': 123,
 'chestPain': 1,
 'chronicLungDisease': 1,
 'clubbingFingerNails': 1,
 'coughing': 1,
 'diabetic': 'Yes',
 'difficultyWalking': True,
 'doesDrinkAlcohol': True,
 'dryCough': 1,
 'dustAllergies': 1,
 'exercisesRegularly': True,
 'fatigue': 1,
 'frequentColds': 1,
 'gender': True,
 'generalHealth': 'Poor',
 'geneticRisk': 0,
 'hadKidneyDisease': True,
 'hadPreviousCancer': True,
 'hadPreviousColonCancer': True,
 'hadPreviousSkinCancer': True,
 'hasAsthma': True,
 'hasFamilyHistory': True,
 'hasHadStroke': True,
 'hasHighFatDiet': True,
 'hasIDB': True,
 'hasObesity': True,
 'obesity': 1,
 'occupationalHazards': 1,
 'oldAge': False,
 'passiveSmoker': 1,
 'physicalActivity': True,
 'physicalHealth': 25,
 'race': 'White',
 'sex': 'M',
 'shortnessOfBreath': 1,
 'sleepTime': 23,
 'smoker': True,
 'smoking': 1,
 'snoring': 1,
 'stability': 0,
 'swallowingDifficulty': 1,
 'weightLoss': 1,
 'wheezing': 1}

In [62]:
hd_X.columns, lc_X.columns, cc_X.columns

(Index(['Smoking_No', 'Smoking_Yes', 'AlcoholDrinking_No',
        'AlcoholDrinking_Yes', 'Stroke_No', 'Stroke_Yes', 'DiffWalking_No',
        'DiffWalking_Yes', 'Sex_Female', 'Sex_Male', 'AgeCategory_18-24',
        'AgeCategory_25-29', 'AgeCategory_30-34', 'AgeCategory_35-39',
        'AgeCategory_40-44', 'AgeCategory_45-49', 'AgeCategory_50-54',
        'AgeCategory_55-59', 'AgeCategory_60-64', 'AgeCategory_65-69',
        'AgeCategory_70-74', 'AgeCategory_75-79', 'AgeCategory_80 or older',
        'Race_American Indian/Alaskan Native', 'Race_Asian', 'Race_Black',
        'Race_Hispanic', 'Race_Other', 'Race_White', 'Diabetic_No',
        'Diabetic_No, borderline diabetes', 'Diabetic_Yes',
        'Diabetic_Yes (during pregnancy)', 'PhysicalActivity_No',
        'PhysicalActivity_Yes', 'GenHealth_Excellent', 'GenHealth_Fair',
        'GenHealth_Good', 'GenHealth_Poor', 'GenHealth_Very good', 'Asthma_No',
        'Asthma_Yes', 'KidneyDisease_No', 'KidneyDisease_Yes', 'SkinCancer_No',

In [22]:
hd_X.loc[0, :]

AgeCategory_18-24                      0.000000
AgeCategory_25-29                      0.000000
AgeCategory_30-34                      0.000000
AgeCategory_35-39                      0.000000
AgeCategory_40-44                      0.000000
AgeCategory_45-49                      0.000000
AgeCategory_50-54                      0.000000
AgeCategory_55-59                      1.000000
AgeCategory_60-64                      0.000000
AgeCategory_65-69                      0.000000
AgeCategory_70-74                      0.000000
AgeCategory_75-79                      0.000000
AgeCategory_80 or older                0.000000
Race_American Indian/Alaskan Native    0.000000
Race_Asian                             0.000000
Race_Black                             0.000000
Race_Hispanic                          0.000000
Race_Other                             0.000000
Race_White                             1.000000
Diabetic_No                            0.000000
Diabetic_No, borderline diabetes       0

In [21]:
def make_predictables(form):
    hd_scaler, lc_scaler, cc_scaler = load_scalers()
    
    # Heart Disease
    hd_predictable = [0] * 41
    
    ## AgeCategory
    age = form['age']
    if age < 25:
        hd_predictable[0] = 1
    elif age < 30:
        hd_predictable[1] = 1
    elif age < 35:
        hd_predictable[2] = 1
    elif age < 40:
        hd_predictable[3] = 1
    elif age < 45:
        hd_predictable[4] = 1
    elif age < 50:
        hd_predictable[5] = 1
    elif age < 55:
        hd_predictable[6] = 1
    elif age < 60:
        hd_predictable[7] = 1
    elif age < 65:
        hd_predictable[8] = 1
    elif age < 70:
        hd_predictable[9] = 1
    elif age < 75:
        hd_predictable[10] = 1
    elif age < 80:
        hd_predictable[11] = 1
    else: # 80+
        hd_predictable[12] = 1
        
    ## Race
    race = form['race']
    if race == 'American Indian/Alaskan Native':
        hd_predictable[13] = 1
    elif race == 'Asian':
        hd_predictable[14] = 1
    elif race == 'Black':
        hd_predictable[15] = 1
    elif race == 'Hispanic:'
        hd_predictable[16] = 1
    elif race == 'Other':
        hd_predictable[17] = 1
    else: #race == 'White'
        hd_predictable[18] = 1
    
    ## Diabetic
    diabetic = form['diabetic']
    if diabetic == 'No':
        hd_predictable[19] = 1
    elif diabetic == 'No, borderline diabetes':
        hd_predictable[20] = 1
    elif diabetic == 'Yes':
        hd_predictable[21] = 1
    else: #diabetic == 'Yes (during pregnancy)'
        hd_predictable[22] = 1
        
    ## Gen Health
    generalHealth = form['generalHealth']
    if generalHealth == 'Excellent':
        hd_predictable[23] = 1
    elif generalHealth == 'Fair':
        hd_predictable[24] = 1
    elif generalHealth == 'Good':
        hd_predictable[25] = 1
    elif generalHealth == 'Poor':
        hd_predictable[26] = 1
    else: #generalHealth == 'Very Good'
        hd_predictable[27] = 1
    
    ## BMI, 
    bmi = form['bmi']
    smoking = form['smoking']
    alcoholDrinking = form['alcoholUse']
    stroke = form['hasHadStroke']
    physicalHealth = form['physicalHealth']
    mentalHealth = form['']
    diffWalking = form['']
    sex = form['']
    physicalActivity = form['physicalActivity']
    sleepTime = form['']
    asthma = form['']
    kidneyDisease = form['']
    skinCancer = form['']
    
    BMI                                    0.055294
Smoking                                1.000000
AlcoholDrinking                        0.000000
Stroke                                 0.000000
PhysicalHealth                         0.100000
MentalHealth                           1.000000
DiffWalking                            0.000000
Sex                                    1.000000
PhysicalActivity                       1.000000
SleepTime                              0.173913
Asthma                                 1.000000
KidneyDisease                          0.000000
SkinCancer                             1.000000

user_form['age']

21

In [27]:
hd_num.columns, hd_cat.columns, lc_num.columns, lc_cat.columns, cc_num.columns, cc_cat.columns

(Index(['BMI', 'Smoking', 'AlcoholDrinking', 'Stroke', 'PhysicalHealth',
        'MentalHealth', 'DiffWalking', 'Sex', 'PhysicalActivity', 'SleepTime',
        'Asthma', 'KidneyDisease', 'SkinCancer'],
       dtype='object'),
 Index(['AgeCategory', 'Race', 'Diabetic', 'GenHealth'], dtype='object'),
 Index(['Age', 'Gender', 'Air Pollution', 'Alcohol use', 'Dust Allergy',
        'OccuPational Hazards', 'Genetic Risk', 'chronic Lung Disease',
        'Balanced Diet', 'Obesity', 'Smoking', 'Passive Smoker', 'Chest Pain',
        'Coughing of Blood', 'Fatigue', 'Weight Loss', 'Shortness of Breath',
        'Wheezing', 'Swallowing Difficulty', 'Clubbing of Finger Nails',
        'Frequent Cold', 'Dry Cough', 'Snoring'],
       dtype='object'),
 Index([], dtype='object'),
 Index(['hadPreviousCancer', 'hadPreviousColonCancer', 'hasFamilyHistory',
        'hadRadiationTherapy', 'isOldAge', 'hasIBD', 'hasObesity', 'isSmoker',
        'isDrinker', 'exercisesRegularly', 'hasHighFatDiet'],
       