<a href="https://colab.research.google.com/github/eckoecho/CodingDojo/blob/Model/Model_Pipelines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
#Importing necessary libraries
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import set_config
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer, make_column_selector
# Have scikit-learn transformers return pandas DataFrames instead of NumPy arrays
set_config(transform_output="pandas")

In [28]:
# Read the medical dataset into a DataFrame and preview its first rows
# NOTE(review): absolute Google Drive path; presumably requires Drive to be
# mounted at /content/drive before this cell runs (confirm)
fpath = "/content/drive/MyDrive/CodingDojo/02-MachineLearning/Week06/Data/medical_data.csv"
df = pd.read_csv(filepath_or_buffer=fpath)
df.head(n=5)

Unnamed: 0,State,Lat,Lng,Area,Children,Age,Income,Marital,Gender,ReAdmis,...,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge,Additional_charges
0,AL,34.3496,-86.72508,Suburban,1.0,53,86575.93,Divorced,Male,0,...,0.0,1.0,1.0,1.0,0,1,Blood Work,10.58577,3726.70286,17939.40342
1,FL,30.84513,-85.22907,Urban,3.0,51,46805.99,Married,Female,0,...,0.0,0.0,0.0,0.0,1,0,Intravenous,15.129562,4193.190458,17612.99812
2,SD,43.54321,-96.63772,Suburban,3.0,53,14370.14,Widowed,Female,0,...,0.0,0.0,0.0,0.0,0,0,Blood Work,4.772177,2434.234222,17505.19246
3,MN,43.89744,-93.51479,Suburban,0.0,78,39741.49,Married,Male,0,...,0.0,0.0,0.0,0.0,1,1,Blood Work,1.714879,2127.830423,12993.43735
4,VA,37.59894,-76.88958,Rural,1.0,22,1209.56,Widowed,Female,0,...,1.0,0.0,0.0,1.0,0,0,CT Scan,1.254807,2113.073274,3716.525786


# Explore Data

In [29]:
#Checking for duplicated data: count the rows that exactly repeat an earlier row
df.duplicated().sum()

0

In [30]:
# Summarize column dtypes and non-null counts to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   State               995 non-null    object 
 1   Lat                 1000 non-null   float64
 2   Lng                 1000 non-null   float64
 3   Area                995 non-null    object 
 4   Children            993 non-null    float64
 5   Age                 1000 non-null   int64  
 6   Income              1000 non-null   float64
 7   Marital             995 non-null    object 
 8   Gender              995 non-null    object 
 9   ReAdmis             1000 non-null   int64  
 10  VitD_levels         1000 non-null   float64
 11  Doc_visits          1000 non-null   int64  
 12  Full_meals_eaten    1000 non-null   int64  
 13  vitD_supp           1000 non-null   int64  
 14  Soft_drink          1000 non-null   int64  
 15  Initial_admin       995 non-null    object 
 16  HighBlo

To preprocess our data, we will:
1. Impute missing values

2. Scale numeric data

3. One-hot encode categorical data.

# Perform a Validation Split

In [31]:
# Separate the features from the target column
X = df.drop(columns="Additional_charges")
y = df["Additional_charges"]
# Hold out a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.head()

Unnamed: 0,State,Lat,Lng,Area,Children,Age,Income,Marital,Gender,ReAdmis,...,Diabetes,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge
82,TN,36.16307,-86.6651,Urban,2.0,60,8459.99,Never Married,Female,0,...,0.0,1.0,1.0,0.0,0.0,0,0,Intravenous,6.714754,3097.337588
991,AL,34.96594,-87.12179,Urban,5.0,78,22669.31,Married,Male,0,...,0.0,1.0,1.0,0.0,0.0,1,0,Blood Work,5.694359,3073.408768
789,TN,36.24648,-83.51232,Urban,1.0,60,25536.25,Married,Nonbinary,0,...,0.0,0.0,1.0,0.0,0.0,1,0,Intravenous,7.336514,3199.418504
894,SD,45.42189,-97.91165,Rural,7.0,82,94863.57,Never Married,Male,0,...,1.0,0.0,1.0,0.0,0.0,0,0,Blood Work,13.172367,3693.118743
398,MI,42.33661,-83.28292,Suburban,0.0,37,30898.36,Widowed,Female,0,...,0.0,1.0,0.0,0.0,0.0,0,0,Blood Work,7.257809,2616.316061


# Create a ColumnTransformer for Preprocessing

Note that we are only preparing our preprocessing object here; we are not preprocessing the data yet!


## Categorical

In [32]:
# Instead of DataFrame.select_dtypes, you can use make_column_selector:
# it builds a callable that returns the matching column names of a frame
cat_selector = make_column_selector(dtype_include="object")
# Calling the selector on the training features lists the object-dtype columns
cat_selector(X_train)

['State',
 'Area',
 'Marital',
 'Gender',
 'Initial_admin',
 'Complication_risk',
 'Services']

In [33]:
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode.
# - sparse_output=False: dense output, needed because this notebook sets
#   transform_output="pandas", which cannot hold sparse matrices
# - handle_unknown="ignore": categories unseen during fit encode as all zeros
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
freq_imputer = SimpleImputer(strategy="most_frequent")
cat_pipe = make_pipeline(freq_imputer, ohe)
# Selector that picks the object-dtype columns at fit time
cat_selector = make_column_selector(dtype_include="object")
# Make a (name, transformer, columns) tuple for the ColumnTransformer
cat_tuple = ("categorical", cat_pipe, cat_selector)
cat_tuple

('categorical',
 Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='most_frequent')),
                 ('onehotencoder',
                  OneHotEncoder(handle_unknown='ignore', sparse_output=False))]),
 <sklearn.compose._column_transformer.make_column_selector at 0x7fb53da39cc0>)

## Numerical

In [34]:
# Numeric preprocessing: replace missing values with the column mean,
# then standardize each column
scaler = StandardScaler()
num_imputer = SimpleImputer(strategy="mean")
num_pipe = make_pipeline(num_imputer, scaler)
# Selector that picks the numeric-dtype columns at fit time
num_selector = make_column_selector(dtype_include="number")
# Make a (name, transformer, columns) tuple for the ColumnTransformer
num_tuple = ("numeric", num_pipe, num_selector)
num_tuple

('numeric',
 Pipeline(steps=[('simpleimputer', SimpleImputer()),
                 ('standardscaler', StandardScaler())]),
 <sklearn.compose._column_transformer.make_column_selector at 0x7fb53da39a80>)

## Create the preprocessing ColumnTransformer


In [35]:
# Route each column group through its own pipeline.
# verbose_feature_names_out=False keeps the output column names free of the
# "categorical__" / "numeric__" transformer-name prefixes.
preprocessor = ColumnTransformer(
    transformers=[cat_tuple, num_tuple],
    verbose_feature_names_out=False,
)
preprocessor

# Create a Model Pipeline

In [36]:
# Model step: ordinary least-squares linear regression
linreg = LinearRegression()
# Chain preprocessing and the model so both are fit and applied as one estimator
linreg_pipe = make_pipeline(
    preprocessor,
    linreg,
)

# Fit and Predict the Model Pipeline on the Training Data

In [37]:
# Inspect dtypes and non-null counts of the training features before fitting
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 750 entries, 82 to 102
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   State               748 non-null    object 
 1   Lat                 750 non-null    float64
 2   Lng                 750 non-null    float64
 3   Area                747 non-null    object 
 4   Children            746 non-null    float64
 5   Age                 750 non-null    int64  
 6   Income              750 non-null    float64
 7   Marital             747 non-null    object 
 8   Gender              746 non-null    object 
 9   ReAdmis             750 non-null    int64  
 10  VitD_levels         750 non-null    float64
 11  Doc_visits          750 non-null    int64  
 12  Full_meals_eaten    750 non-null    int64  
 13  vitD_supp           750 non-null    int64  
 14  Soft_drink          750 non-null    int64  
 15  Initial_admin       747 non-null    object 
 16  HighBlo

In [38]:
# Fit the whole pipeline (preprocessing + regression) on the training split only
linreg_pipe.fit(X_train, y_train)
# Generate predictions for both the test and the training split
test_pred = linreg_pipe.predict(X_test)
training_predictions = linreg_pipe.predict(X_train)
# Preview the first ten training predictions
training_predictions[:10]

array([11302., 14992., 20376., 24426.,  6003.,  4564.,  7265., 22307.,
       16768., 14085.])