In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

In [2]:
# Load BRFSS_cleaned2.csv, using the first CSV column as the row index.
df = pd.read_csv(filepath_or_buffer='BRFSS_cleaned2.csv', index_col=0)

Impute NaN in `GeneralHealth` and `LastCheckup` with the placeholder category `'Unsure'`

In [3]:
# Replace missing values in both columns with the placeholder category 'Unsure'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the df[...] selection: that form is chained
# assignment, which raises a FutureWarning in pandas 2.x and no longer
# updates df once Copy-on-Write is enabled.
df['GeneralHealth'] = df['GeneralHealth'].fillna('Unsure')
df['LastCheckup'] = df['LastCheckup'].fillna('Unsure')

Remove all rows where `RaceEthnicityGroup` is NaN

In [4]:
# Keep only the rows that have a RaceEthnicityGroup value.
df = df.dropna(axis='index', how='any', subset=['RaceEthnicityGroup'])

## Encode Categorical Data

In [5]:
# Binary encoding for Sex: Male -> 1, Female -> 0.
# Series.map leaves NaN for any label that is not a key of the mapping.
gender = dict(Male=1, Female=0)
df['Sex'] = df['Sex'].map(gender)

In [6]:
# Integer codes for GeneralHealth: the position in the tuple is the code, so
# 'Unsure' (the placeholder used for missing answers) is 0 and 'Excellent' is 5.
# NOTE(review): keys are matched case-sensitively; confirm the CSV really spells
# the label 'Very Good' (not 'Very good'), otherwise those rows map to NaN.
genhealth = {
    label: code
    for code, label in enumerate(
        ('Unsure', 'Poor', 'Fair', 'Good', 'Very Good', 'Excellent')
    )
}

df['GeneralHealth'] = df['GeneralHealth'].map(genhealth)

In [7]:
# Integer codes for LastCheckup: the position in the tuple is the code
# ('Never' = 0 ... 'Unsure' = 5, where 'Unsure' is the missing-answer placeholder).
checkup = {
    label: code
    for code, label in enumerate((
        'Never',
        'Within past year (anytime less than 12 months ago)',
        'Within past 2 years (1 year but less than 2 years ago)',
        'Within past 5 years (2 years but less than 5 years ago)',
        '5 or more years ago',
        'Unsure',
    ))
}

df['LastCheckup'] = df['LastCheckup'].map(checkup)

In [8]:
# Ordinal codes for the 5-year age bands: youngest band = 0, oldest band = 12.
age = {
    band: rank
    for rank, band in enumerate((
        'Age 18 to 24',
        'Age 25 to 29',
        'Age 30 to 34',
        'Age 35 to 39',
        'Age 40 to 44',
        'Age 45 to 49',
        'Age 50 to 54',
        'Age 55 to 59',
        'Age 60 to 64',
        'Age 65 to 69',
        'Age 70 to 74',
        'Age 75 to 79',
        'Age 80 or older',
    ))
}

df['AgeGroup5yrs'] = df['AgeGroup5yrs'].map(age)

In [9]:
# Binary encoding for Smoked: 'Never smoked' -> 0, 'Smoked' -> 1.
smoked = dict([('Never smoked', 0), ('Smoked', 1)])

df['Smoked'] = df['Smoked'].map(smoked)

In [10]:
# Shared binary encoding for every plain Yes/No column: 'Yes' -> 1, 'No' -> 0.
yes_no = dict(Yes=1, No=0)

# Columns whose raw values are exactly 'Yes' / 'No'; each is encoded in turn,
# in this order.
yes_no_columns = [
    'ExerciseLast30days',
    'HadHeartAttack',
    'HadAngina',
    'HadStroke',
    'HadAsthma',
    'HadSkinCancer',
    'HadCOPD',
    'HadDepressiveDisorder',
    'HadKidneyDisease',
    'HadArthritis',
    'Deaf',
    'Blind',
    'DifficultyConcentrating',
    'DifficultyDressingOrBathing',
    'DifficultyAloneErrands',
    'HeartDisease',
    'HadChestScan',
    'FluShotLast12Months',
    'HadPneumoniaShot',
    'HIVRisk',
    'AlcoholLast30days',
    'TestedForHIV',
    'DifficultyWalking',
]

for column in yes_no_columns:
    df[column] = df[column].map(yes_no)


In [11]:
# Collapse the four HadDiabetes answers to a binary flag: every answer that
# starts with 'Yes' becomes 1, every answer that starts with 'No' becomes 0.
# NOTE(review): this folds pre-/borderline diabetes into 0 and pregnancy-only
# diabetes into 1 -- confirm that grouping is intended.
diabete = {
    answer: int(answer.startswith('Yes'))
    for answer in (
        'No',
        'Yes',
        'No, pre-diabetes or borderline diabetes',
        'Yes, but female told only during pregnancy',
    )
}

df['HadDiabetes'] = df['HadDiabetes'].map(diabete)

In [12]:
# Collapse EcigUsage to a binary flag: not currently using -> 0, current use -> 1.
ecig = {
    **dict.fromkeys(
        ['Never used e-cigarettes in your entire life', 'Not at all (right now)'],
        0,
    ),
    **dict.fromkeys(['Use them every day', 'Use them some days'], 1),
}

df['EcigUsage'] = df['EcigUsage'].map(ecig)

In [13]:
# Collapse HadTetanusShot to a binary flag: any 'Yes, ...' answer -> 1,
# the single 'No, ...' answer -> 0.
tetshot = {
    answer: int(answer.startswith('Yes'))
    for answer in (
        'No, did not receive any tetanus shot in the past 10 years',
        'Yes, received tetanus shot but not sure what type',
        'Yes, received Tdap',
        'Yes, received tetanus shot, but not Tdap',
    )
}

df['HadTetanusShot'] = df['HadTetanusShot'].map(tetshot)

In [14]:
# Collapse HadCovid to a binary flag: a positive home test counts the same as 'Yes'.
covid = {
    **dict.fromkeys(
        ['Tested positive using home test without health professional', 'Yes'],
        1,
    ),
    'No': 0,
}

df['HadCovid'] = df['HadCovid'].map(covid)

In [15]:
# Integer label for each RaceEthnicityGroup category: the position in the
# tuple is the code (0-6).
race = {
    group: code
    for code, group in enumerate((
        'White only, non-Hispanic',
        'Black only, non-Hispanic',
        'Hispanic',
        'Asian only, non-Hispanic',
        'American Indian or Alaskan Native only, Non-Hispanic',
        'Native Hawaiian or other Pacific Islander only, Non-Hispanic',
        'Multiracial, non-Hispanic',
    ))
}

df['RaceEthnicityGroup'] = df['RaceEthnicityGroup'].map(race)



In [16]:
# Integer label for each State value, numbered from 1 in the order listed:
# the 50 states plus District of Columbia (1-51), then Guam, Puerto Rico and
# Virgin Islands (52-54).
state = {
    name: code
    for code, name in enumerate(
        (
            "Alabama", "Alaska", "Arizona", "Arkansas", "California",
            "Colorado", "Connecticut", "Delaware", "District of Columbia",
            "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana",
            "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland",
            "Massachusetts", "Michigan", "Minnesota", "Mississippi",
            "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
            "New Jersey", "New Mexico", "New York", "North Carolina",
            "North Dakota", "Ohio", "Oklahoma", "Oregon", "Pennsylvania",
            "Rhode Island", "South Carolina", "South Dakota", "Tennessee",
            "Texas", "Utah", "Vermont", "Virginia", "Washington",
            "West Virginia", "Wisconsin", "Wyoming",
            "Guam", "Puerto Rico", "Virgin Islands",
        ),
        start=1,
    )
}

df['State'] = df['State'].map(state)

Drop columns based on feature importance

In [17]:
# Columns to remove from df before saving the modeling dataset.
columns_drop = [
    'HIVRisk',
    'isObese',
    'DifficultyConcentrating',
    'HadAsthma',
    'Blind',
    'HadDepressiveDisorder',
    'TestedForHIV',
    'AlcoholLast30days',
    'HadSkinCancer',
    'DifficultyDressingOrBathing',
    'FluShotLast12Months',
    'DifficultyAloneErrands',
    'LevelOfSmoker',
    'HadChestScan',
    'HadPneumoniaShot',
    'HadTetanusShot',
    'HadHeartAttack',
]

In [18]:
# Remove every column named in columns_drop (raises KeyError if one is absent).
df = df.drop(labels=columns_drop, axis='columns')

In [None]:
# Persist the encoded frame; the row index is written as the first CSV column.
df.to_csv(path_or_buf='modelingdf.csv', index=True)