<a href="https://colab.research.google.com/github/dmburns1729/Class-Files/blob/main/Stack_2_SimpleImputer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Simple Imputer

SimpleImputer fills missing values using one of the following `strategy` options:

- 'mean', fill missing values with the mean of the column they are in (numeric columns only).
- 'median', fill missing values with the median of the column they are in (numeric columns only).
- 'most_frequent', fill missing values with the most frequent value (the mode) of the column they are in. This works for both numeric and categorical columns. Note that SimpleImputer has no separate 'mode' strategy; to impute with the mode, use 'most_frequent'.
- 'constant', provide a constant value (via `fill_value`) to use to fill missing values. A common choice for categorical data is 'missing'.

1. Import necessary libraries.
2. Load and examine the data.
3. Identify which columns have missing values and decide what imputation strategy to use to fill them.
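(We will take a slight detour here to impute the numeric columns manually with a single SimpleImputer, then redo the train/test split and complete the steps below with a ColumnTransformer.)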
4. Instantiate numeric and categorical column selectors.
5. Instantiate SimpleImputer objects with the imputation strategies we want to use.
6. Use ColumnTransformer to apply each different SimpleImputer object to the appropriate columns.
7. Examine the data to ensure all missing data has been filled.

In [None]:
# Data handling
import pandas as pd
# Train/test splitting, dtype-based column selection, column-wise transformation, and imputation
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
# Render estimators as diagrams when they are displayed as a cell output
from sklearn import set_config
set_config(display='diagram')

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive
# NOTE(review): the data-loading cell reads from /content/, not /content/drive — confirm this mount is still needed
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Path to the Excel workbook that holds the dataset
Filename = '/content/medical_data.xlsx'
# Read the workbook into a DataFrame, then preview the first five rows
df=pd.read_excel(Filename)
df.head()


Unnamed: 0,State,Lat,Lng,Area,Children,Age,Income,Marital,Gender,ReAdmis,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge,Additional_charges
0,AL,34.3496,-86.72508,Suburban,1.0,53.0,86575.93,Divorced,Male,0.0,...,0.0,1.0,1.0,1.0,0.0,1.0,Blood Work,10.58577,3726.70286,17939.40342
1,FL,30.84513,-85.22907,Urban,3.0,51.0,46805.99,Married,Female,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,Intravenous,15.129562,4193.190458,17612.99812
2,SD,43.54321,-96.63772,Suburban,3.0,53.0,14370.14,Widowed,Female,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,Blood Work,4.772177,2434.234222,17505.19246
3,MN,43.89744,-93.51479,Suburban,0.0,78.0,39741.49,Married,Male,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,Blood Work,1.714879,2127.830423,12993.43735
4,VA,37.59894,-76.88958,Rural,1.0,22.0,1209.56,Widowed,Female,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,CT Scan,1.254807,2113.073274,3716.525786


In [None]:
# Summarize dtypes and non-null counts; a non-null count below the row count means missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   State               995 non-null    object 
 1   Lat                 1000 non-null   float64
 2   Lng                 1000 non-null   float64
 3   Area                995 non-null    object 
 4   Children            993 non-null    float64
 5   Age                 1000 non-null   float64
 6   Income              1000 non-null   float64
 7   Marital             995 non-null    object 
 8   Gender              995 non-null    object 
 9   ReAdmis             1000 non-null   float64
 10  VitD_levels         1000 non-null   float64
 11  Doc_visits          1000 non-null   float64
 12  Full_meals_eaten    1000 non-null   float64
 13  vitD_supp           1000 non-null   float64
 14  Soft_drink          1000 non-null   float64
 15  Initial_admin       995 non-null    object 
 16  HighBlo

In [None]:
# Total number of missing cells in the whole frame:
# build the NaN mask, flatten it to a boolean array, and count the True entries
print(df.isna().to_numpy().sum(), 'missing values')

72 missing values


In [None]:
# Shape of the subset of rows that contain at least one missing value.
# 70 of the 1000 rows are affected, so df.dropna() would discard 7% of the rows.
df.loc[df.isna().any(axis='columns')].shape

(70, 32)

In [None]:
# Separate the target column from the feature columns
y = df['Additional_charges']
X = df.drop(columns='Additional_charges')
# Split into training and test sets; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Build selectors that pick columns by dtype: one for numeric, one for object (categorical)
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

# Call each selector on the training features to get the matching column names as lists
num_columns, cat_columns = num_selector(X_train), cat_selector(X_train)

# Sanity-check both lists
print('numeric columns are', num_columns)
print('categorical columns are', cat_columns)

numeric columns are ['Lat', 'Lng', 'Children', 'Age', 'Income', 'ReAdmis', 'VitD_levels', 'Doc_visits', 'Full_meals_eaten', 'vitD_supp', 'Soft_drink', 'HighBlood', 'Stroke', 'Overweight', 'Arthritis', 'Diabetes', 'Hyperlipidemia', 'BackPain', 'Anxiety', 'Allergic_rhinitis', 'Reflux_esophagitis', 'Asthma', 'Initial_days', 'TotalCharge']
categorical columns are ['State', 'Area', 'Marital', 'Gender', 'Initial_admin', 'Complication_risk', 'Services']


In [None]:
# isolate the numeric columns
df_num = df[num_columns]
# isolate the columns with missing data
df_num.loc[:, df_num.isna().any()]

Unnamed: 0,Children,Arthritis,Diabetes,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis
0,1.0,1.0,1.0,0.0,1.0,1.0,1.0
1,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.0,0.0,,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...
995,3.0,0.0,1.0,1.0,0.0,0.0,0.0
996,2.0,1.0,0.0,,1.0,1.0,1.0
997,0.0,1.0,0.0,1.0,1.0,0.0,0.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Per-column flag: True where the training features contain at least one missing value
X_train.isna().any()

State                  True
Lat                   False
Lng                   False
Area                   True
Children               True
Age                   False
Income                False
Marital                True
Gender                 True
ReAdmis               False
VitD_levels           False
Doc_visits            False
Full_meals_eaten      False
vitD_supp             False
Soft_drink            False
Initial_admin          True
HighBlood             False
Stroke                False
Complication_risk      True
Overweight            False
Arthritis              True
Diabetes               True
Hyperlipidemia         True
BackPain               True
Anxiety                True
Allergic_rhinitis      True
Reflux_esophagitis    False
Asthma                False
Services               True
Initial_days          False
TotalCharge           False
dtype: bool

In [None]:
#Instantiate the imputer object from the SimpleImputer class with strategy 'median'
median_imputer = SimpleImputer(strategy='median')
#Fit the imputer object on the numeric training data with .fit()
#calculates the medians of the columns in the training set
median_imputer.fit(X_train[num_columns])
#Use the median from the training data to fill the missing values in
#the numeric columns of both the training and testing sets with .transform()
X_train.loc[:, num_columns] = median_imputer.transform(X_train[num_columns])
X_test.loc[:, num_columns] = median_imputer.transform(X_test[num_columns])

In [None]:
# Re-check after median imputation: numeric columns should be filled, categorical columns still have gaps
X_train.isna().any()

State                  True
Lat                   False
Lng                   False
Area                   True
Children              False
Age                   False
Income                False
Marital                True
Gender                 True
ReAdmis               False
VitD_levels           False
Doc_visits            False
Full_meals_eaten      False
vitD_supp             False
Soft_drink            False
Initial_admin          True
HighBlood             False
Stroke                False
Complication_risk      True
Overweight            False
Arthritis             False
Diabetes              False
Hyperlipidemia        False
BackPain              False
Anxiety               False
Allergic_rhinitis     False
Reflux_esophagitis    False
Asthma                False
Services               True
Initial_days          False
TotalCharge           False
dtype: bool

In [None]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.isna().any()

State                  True
Lat                   False
Lng                   False
Area                   True
Children               True
Age                   False
Income                False
Marital                True
Gender                 True
ReAdmis               False
VitD_levels           False
Doc_visits            False
Full_meals_eaten      False
vitD_supp             False
Soft_drink            False
Initial_admin          True
HighBlood             False
Stroke                False
Complication_risk      True
Overweight            False
Arthritis              True
Diabetes               True
Hyperlipidemia         True
BackPain               True
Anxiety                True
Allergic_rhinitis      True
Reflux_esophagitis    False
Asthma                False
Services               True
Initial_days          False
TotalCharge           False
dtype: bool

In [None]:
#instantiate the selectors to for numeric and categorical data types
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')

In [None]:
#instantiate SimpleImputers with most_frequent and median strategies
freq_imputer = SimpleImputer(strategy='most_frequent')
median_imputer = SimpleImputer(strategy='median')

In [None]:
# create tuples of (imputer, selector) for each datatype
num_tuple = (median_imputer, num_selector)
cat_tuple = (freq_imputer, cat_selector)
# instantiate ColumnTransformer
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder='passthrough')
col_transformer

In [None]:
# fit ColumnTransformer on the training data
col_transformer.fit(X_train)
# transform both the training and testing data (this will output a NumPy array)
X_train_imputed = col_transformer.transform(X_train)
X_test_imputed = col_transformer.transform(X_test)
# change the result back to a dataframe
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_train_imputed.isna().any()

State                 False
Lat                   False
Lng                   False
Area                  False
Children              False
Age                   False
Income                False
Marital               False
Gender                False
ReAdmis               False
VitD_levels           False
Doc_visits            False
Full_meals_eaten      False
vitD_supp             False
Soft_drink            False
Initial_admin         False
HighBlood             False
Stroke                False
Complication_risk     False
Overweight            False
Arthritis             False
Diabetes              False
Hyperlipidemia        False
BackPain              False
Anxiety               False
Allergic_rhinitis     False
Reflux_esophagitis    False
Asthma                False
Services              False
Initial_days          False
TotalCharge           False
dtype: bool