Pre-Processing Data

In this step, I prepare the features that will be used for machine learning modeling.

I will achieve this by completing the following:
1. Create dummy variables for categorical variables
2. Use a scaler to standardize the magnitude of numeric features
3. Split the data into testing and training datasets

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read the county-level health dataset from the relative data directory.
health_data = pd.read_csv("../data/health_data.csv")

In [3]:
# Preview the first five rows to sanity-check the load.
health_data.head()

Unnamed: 0,State,County,Life Expectancy,Population,% Rural,Premature Deaths,% Smoking,% Obesity,% Physical Inactivy,% Excessive Drinking,% Uninsured,PCP Number,PCP Rate,MHP Number,MHP Rate,Preventable Hospital Rate,% Mammogram,% Flu Vaccine,% Unemployed,Median Household Income
0,Alabama,Autauga,77.162581,55869,22921,787.0,20,33,31,14,10.0,26.0,47.0,16.0,29.0,6650.0,39.0,42.0,27,58233
1,Alabama,Baldwin,78.213405,223234,77060,3147.0,19,30,25,19,13.0,153.0,70.0,220.0,99.0,3471.0,43.0,46.0,27,59871
2,Alabama,Barbour,74.054741,24686,18613,515.0,26,41,28,12,14.0,8.0,32.0,3.0,12.0,5314.0,44.0,39.0,38,35972
3,Alabama,Bibb,73.408784,22394,15663,476.0,23,37,33,15,11.0,12.0,54.0,6.0,27.0,6690.0,33.0,40.0,31,47918
4,Alabama,Blount,74.370874,57826,51562,1100.0,23,33,33,16,14.0,12.0,21.0,10.0,17.0,4440.0,37.0,40.0,27,52902


In [4]:
# Inspect column dtypes and non-null counts before any cleaning.
health_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2908 entries, 0 to 2907
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      2908 non-null   object 
 1   County                     2908 non-null   object 
 2   Life Expectancy            2908 non-null   float64
 3   Population                 2908 non-null   int64  
 4   % Rural                    2908 non-null   int64  
 5   Premature Deaths           2908 non-null   float64
 6   % Smoking                  2908 non-null   int64  
 7   % Obesity                  2908 non-null   int64  
 8   % Physical Inactivy        2908 non-null   int64  
 9   % Excessive Drinking       2908 non-null   int64  
 10  % Uninsured                2908 non-null   float64
 11  PCP Number                 2908 non-null   float64
 12  PCP Rate                   2908 non-null   float64
 13  MHP Number                 2908 non-null   float

In [5]:
# Convert '% Unemployed' to float. The column is presumably read as text that
# uses a comma as the decimal separator (TODO confirm against the raw CSV).
# The dtype guard makes this cell safe to re-run: once the column is numeric,
# the `.str` accessor would raise AttributeError, so the text fix-up is skipped.
unemployed = health_data['% Unemployed']
if not pd.api.types.is_numeric_dtype(unemployed):
    # Literal (non-regex) swap of the decimal comma for a decimal point.
    unemployed = unemployed.str.replace(',', '.', regex=False)
health_data['% Unemployed'] = unemployed.astype(float)

In [7]:
# Re-inspect the dtypes after the '% Unemployed' conversion.
health_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2908 entries, 0 to 2907
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   State                      2908 non-null   object 
 1   County                     2908 non-null   object 
 2   Life Expectancy            2908 non-null   float64
 3   Population                 2908 non-null   int64  
 4   % Rural                    2908 non-null   int64  
 5   Premature Deaths           2908 non-null   float64
 6   % Smoking                  2908 non-null   int64  
 7   % Obesity                  2908 non-null   int64  
 8   % Physical Inactivy        2908 non-null   int64  
 9   % Excessive Drinking       2908 non-null   int64  
 10  % Uninsured                2908 non-null   float64
 11  PCP Number                 2908 non-null   float64
 12  PCP Rate                   2908 non-null   float64
 13  MHP Number                 2908 non-null   float

In [8]:
# Collect the object-dtype (text) columns; these are the ones to be
# turned into dummy variables.
hdo = health_data.select_dtypes(include='object')

In [9]:
# Swap the text columns for their one-hot (dummy) encoding:
# drop the originals by label, then append the dummy columns side by side.
dummy_columns = pd.get_dummies(hdo)
non_text_columns = health_data.drop(columns=list(hdo.columns))
health_data = pd.concat([non_text_columns, dummy_columns], axis=1)

In [10]:
# List the column labels to confirm the dummy columns were added.
health_data.columns

Index(['Life Expectancy', 'Population', '% Rural', 'Premature Deaths',
       '% Smoking', '% Obesity', '% Physical Inactivy', '% Excessive Drinking',
       '% Uninsured', 'PCP Number',
       ...
       'County_Yellowstone', 'County_Yoakum', 'County_Yolo', 'County_York',
       'County_Young', 'County_Yuba', 'County_Yukon-Koyukuk', 'County_Yuma',
       'County_Zapata', 'County_Zavala'],
      dtype='object', length=1787)

In [11]:
# Preview the first five rows of the encoded frame.
health_data.head()

Unnamed: 0,Life Expectancy,Population,% Rural,Premature Deaths,% Smoking,% Obesity,% Physical Inactivy,% Excessive Drinking,% Uninsured,PCP Number,...,County_Yellowstone,County_Yoakum,County_Yolo,County_York,County_Young,County_Yuba,County_Yukon-Koyukuk,County_Yuma,County_Zapata,County_Zavala
0,77.162581,55869,22921,787.0,20,33,31,14,10.0,26.0,...,0,0,0,0,0,0,0,0,0,0
1,78.213405,223234,77060,3147.0,19,30,25,19,13.0,153.0,...,0,0,0,0,0,0,0,0,0,0
2,74.054741,24686,18613,515.0,26,41,28,12,14.0,8.0,...,0,0,0,0,0,0,0,0,0,0
3,73.408784,22394,15663,476.0,23,37,33,15,11.0,12.0,...,0,0,0,0,0,0,0,0,0,0
4,74.370874,57826,51562,1100.0,23,33,33,16,14.0,12.0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# 'Life Expectancy' is the target; every remaining column is a feature.
X = health_data.drop(columns='Life Expectancy')
y = health_data['Life Expectancy']

# Hold out 25% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1234,
)

In [14]:
# Standardize the features. The scaler is fitted on the training split only,
# and the training-set mean/std are then reused to transform the test split.
# Re-fitting on the test split (fit_transform) would scale the two splits with
# different statistics and leak test-set information into preprocessing.
SS_scaler = StandardScaler()
X_train = SS_scaler.fit_transform(X_train)

# transform only: apply the statistics learned from X_train.
X_test = SS_scaler.transform(X_test)