In [1]:
# Import packages
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fcluster

In [2]:
# Load the medical dataset from disk, then display its column labels
csv_path = 'medical_clean.csv'
df = pd.read_csv(csv_path)
df.columns

Index(['CaseOrder', 'Customer_id', 'Interaction', 'UID', 'City', 'State',
       'County', 'Zip', 'Lat', 'Lng', 'Population', 'Area', 'TimeZone', 'Job',
       'Children', 'Age', 'Income', 'Marital', 'Gender', 'ReAdmis',
       'VitD_levels', 'Doc_visits', 'Full_meals_eaten', 'vitD_supp',
       'Soft_drink', 'Initial_admin', 'HighBlood', 'Stroke',
       'Complication_risk', 'Overweight', 'Arthritis', 'Diabetes',
       'Hyperlipidemia', 'BackPain', 'Anxiety', 'Allergic_rhinitis',
       'Reflux_esophagitis', 'Asthma', 'Services', 'Initial_days',
       'TotalCharge', 'Additional_charges', 'Item1', 'Item2', 'Item3', 'Item4',
       'Item5', 'Item6', 'Item7', 'Item8'],
      dtype='object')

In [4]:
# One-hot encode the categorical location features with pd.get_dummies.
# `columns` has to be list-like: a bare string such as 'Area' makes
# pd.get_dummies raise "TypeError: Input must be a list-like for parameter
# `columns`". 'TimeZone' is listed as well because the encoded frame shown in
# this notebook contains TimeZone_* dummy columns alongside the Area_* ones.
categorical_features = ['Area', 'TimeZone']

# dtype=int pins the dummy columns to 0/1 integers (as displayed below)
# instead of relying on the pandas-version-dependent default dtype.
df_encoded = pd.get_dummies(df, columns=categorical_features, dtype=int)
df_encoded.head()

Unnamed: 0,CaseOrder,Customer_id,Interaction,UID,City,State,County,Zip,Lat,Lng,...,TimeZone_America/New_York,TimeZone_America/Nome,TimeZone_America/North_Dakota/Beulah,TimeZone_America/North_Dakota/New_Salem,TimeZone_America/Phoenix,TimeZone_America/Puerto_Rico,TimeZone_America/Sitka,TimeZone_America/Toronto,TimeZone_America/Yakutat,TimeZone_Pacific/Honolulu
0,1,C412403,8cd49b13-f45a-4b47-a2bd-173ffa932c2f,3a83ddb66e2ae73798bdf1d705dc0932,Eva,AL,Morgan,35621,34.3496,-86.72508,...,0,0,0,0,0,0,0,0,0,0
1,2,Z919181,d2450b70-0337-4406-bdbb-bc1037f1734c,176354c5eef714957d486009feabf195,Marianna,FL,Jackson,32446,30.84513,-85.22907,...,0,0,0,0,0,0,0,0,0,0
2,3,F995323,a2057123-abf5-4a2c-abad-8ffe33512562,e19a0fa00aeda885b8a436757e889bc9,Sioux Falls,SD,Minnehaha,57110,43.54321,-96.63772,...,0,0,0,0,0,0,0,0,0,0
3,4,A879973,1dec528d-eb34-4079-adce-0d7a40e82205,cd17d7b6d152cb6f23957346d11c3f07,New Richland,MN,Waseca,56072,43.89744,-93.51479,...,0,0,0,0,0,0,0,0,0,0
4,5,C544523,5885f56b-d6da-43a3-8760-83583af94266,d2f0425877b10ed6bb381f3e2579424a,West Point,VA,King William,23181,37.59894,-76.88958,...,1,0,0,0,0,0,0,0,0,0


In [11]:
# Standardize the numeric columns with StandardScaler and write the scaled
# values back into df_encoded, replacing the raw values.
numeric_features = ['Population', 'Lat', 'Lng']
scaler = StandardScaler()
numeric_block = df_encoded[numeric_features]
scaled_values = scaler.fit(numeric_block).transform(numeric_block)
df_encoded[numeric_features] = scaled_values
# Preview the first rows of the scaled columns
df_encoded[numeric_features].head()

Unnamed: 0,Population,Lat,Lng
0,-0.473168,-0.814668,0.297134
1,0.090242,-1.463305,0.395522
2,0.482983,0.886966,-0.354788
3,-0.526393,0.95253,-0.149403
4,-0.315586,-0.213252,0.943984


In [12]:
# Display the column labels of the encoded frame (includes the generated dummy columns)
df_encoded.columns

Index(['CaseOrder', 'Customer_id', 'Interaction', 'UID', 'City', 'State',
       'County', 'Zip', 'Lat', 'Lng', 'Population', 'Job', 'Children', 'Age',
       'Income', 'Marital', 'Gender', 'ReAdmis', 'VitD_levels', 'Doc_visits',
       'Full_meals_eaten', 'vitD_supp', 'Soft_drink', 'Initial_admin',
       'HighBlood', 'Stroke', 'Complication_risk', 'Overweight', 'Arthritis',
       'Diabetes', 'Hyperlipidemia', 'BackPain', 'Anxiety',
       'Allergic_rhinitis', 'Reflux_esophagitis', 'Asthma', 'Services',
       'Initial_days', 'TotalCharge', 'Additional_charges', 'Item1', 'Item2',
       'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8', 'Area_Rural',
       'Area_Suburban', 'Area_Urban', 'TimeZone_America/Adak',
       'TimeZone_America/Anchorage', 'TimeZone_America/Boise',
       'TimeZone_America/Chicago', 'TimeZone_America/Denver',
       'TimeZone_America/Detroit', 'TimeZone_America/Indiana/Indianapolis',
       'TimeZone_America/Indiana/Knox', 'TimeZone_America/Indiana/Mareng

In [15]:
# Assemble the selected feature columns: the three scaled numeric columns
# followed by the one-hot Area indicator columns.
scaled_columns = ['Population', 'Lat', 'Lng']
area_columns = ['Area_Rural', 'Area_Suburban', 'Area_Urban']
features = scaled_columns + area_columns
print(features)

['Population', 'Lat', 'Lng', 'Area_Rural', 'Area_Suburban', 'Area_Urban']
