<a href="https://colab.research.google.com/github/eckoecho/CodingDojo/blob/EDA/Pipelines_(Core).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Libraries

In [595]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn import set_config
from sklearn.pipeline import make_pipeline
# Make sklearn transformers return pandas DataFrames from transform().
set_config(transform_output="pandas")
# Allow up to 100 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 100)

### Load Data

In [596]:
# Absolute Google Drive path to the cereal CSV.
# NOTE(review): presumably requires Drive to be mounted at /content/drive before this cell runs — confirm.
fpath="/content/drive/MyDrive/CodingDojo/02-MachineLearning/Week05/Data/cereal-kaggle-crawford-modified - sheet 1.csv"
# Read the CSV into a DataFrame
df = pd.read_csv(fpath)
# Preview the first rows
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,,4,1.0,130,10.0,5.0,6.0,280,25,top,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120.0,3,5.0,15,2.0,8.0,8.0,135,0,top,1.0,1.0,33.983679
2,All-Bran,K,C,70.0,4,1.0,260,9.0,7.0,5.0,320,25,top,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50.0,4,0.0,140,14.0,8.0,0.0,330,25,top,1.0,0.5,93.704912
4,Almond Delight,R,C,,2,2.0,200,1.0,14.0,8.0,-1,25,,1.0,0.75,34.384843


# Pre-processing

### Define target and features and train-test-split:



In [597]:
# Feature matrix: every column except the target ("rating") and the
# columns this analysis leaves out of the model inputs.
X = df.drop(
    columns=[
        "rating",
        "name",
        "sodium",
        "carbo",
        "potass",
        "vitamins",
        "weight",
        "cups",
    ]
)
# Target vector: the cereal rating
y = df.loc[:, "rating"]

#### Train test split the data to prepare for machine learning

In [598]:
# Split the data into training and test sets; random_state=42 keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)
# Preview the training features
X_train.head()

Unnamed: 0,mfr,type,calories,protein,fat,fiber,sugars,shelf
30,P,C,100.0,2,0.0,0.0,15.0,bottom
40,G,C,110.0,2,1.0,0.0,3.0,middle
39,K,C,140.0,3,1.0,2.0,9.0,top
16,K,C,100.0,2,0.0,1.0,2.0,bottom
65,N,C,90.0,3,0.0,3.0,0.0,bottom


#### Identify each feature as numerical, ordinal, or nominal

In [599]:
# Show each training column's dtype and non-null count, used below to
# classify features and spot missing values.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 30 to 51
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   mfr       55 non-null     object 
 1   type      57 non-null     object 
 2   calories  57 non-null     float64
 3   protein   57 non-null     int64  
 4   fat       52 non-null     float64
 5   fiber     52 non-null     float64
 6   sugars    52 non-null     float64
 7   shelf     57 non-null     object 
dtypes: float64(4), int64(1), object(3)
memory usage: 4.0+ KB


**numeric**:

    * "calories"
    * "protein"
    * "fat"
    * "fiber"
    * "sugars"
**ordinal**:

    * "shelf"
**nominal**:

    * "mfr"
    * "type"

### Create 3 pipelines (one for numeric, ordinal, and categorical features)

#### For the numeric features/pipeline:

In [630]:
# Names of every numeric-dtype column in the training features
num_cols = X_train.select_dtypes(include="number").columns
num_cols

Index(['calories', 'protein', 'fat', 'fiber', 'sugars'], dtype='object')

In [631]:
# Count missing values in each numeric training column
X_train[num_cols].isna().sum()

calories    0
protein     0
fat         5
fiber       5
sugars      5
dtype: int64

In [632]:
# Impute null values with SimpleImputer using the "mean" strategy.
impute_mean = SimpleImputer(strategy="mean")
# Scaler instance for the numeric features
scaler = StandardScaler()

In [633]:
# Numeric pipeline: mean imputation first, then scaling
num_pipe = make_pipeline(impute_mean, scaler)
# Display the pipeline
num_pipe

In [634]:
# Fit the numeric pipeline on the training data only (the test set is never used for fitting)
num_pipe.fit(X_train[num_cols])

#### For categorical (nominal) pipeline:

In [605]:
# Nominal (unordered categorical) columns to be one-hot encoded
ohe_cols = ["mfr", "type"]

In [606]:
# Fill missing nominal values with the constant placeholder "MISSING"
# (SimpleImputer 'constant' strategy).
impute_ohe_na = SimpleImputer(strategy="constant", fill_value="MISSING")

In [607]:
# Instantiate the one-hot encoder for the nominal features.
# sparse_output=False requests a dense (non-sparse) result;
# handle_unknown='ignore' is set so categories not seen during fit do not
# raise an error at transform time.
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

In [608]:
# Nominal pipeline: fill missing values with "MISSING", then one-hot encode.
# Uses impute_ohe_na, the constant-fill imputer created for the nominal
# columns; the previously referenced name `impute_na` is not defined
# anywhere in this notebook and raises NameError on a fresh kernel.
ohe_pipe = make_pipeline(impute_ohe_na, ohe_encoder)
# Display the pipeline
ohe_pipe

In [609]:
# Fit the nominal pipeline on the nominal columns of the training data
ohe_pipe.fit(X_train[ohe_cols])

#### For the ordinal pipeline:

shelf: display shelf (1, 2, or 3, counting from the floor)

In [610]:
# Ordinal column(s): "shelf" is the only feature treated as ordered
ord_cols = ["shelf"]

In [611]:
# Count how often each shelf value occurs.
# Note: this is computed on the full df, not only on the training split.
df[ord_cols].value_counts()

shelf 
top       35
bottom    20
middle    20
dtype: int64

In [612]:
# Count missing shelf values (again on the full df rather than X_train)
df[ord_cols].isna().sum()

shelf    2
dtype: int64

In [613]:
# Order of the shelf categories, from lowest to highest
shelf_order = ["bottom","middle", "top"]

In [614]:
# List of per-column category orders for OrdinalEncoder
# (a single entry here because "shelf" is the only ordinal column)
ordinal_category_orders = [shelf_order]

In [615]:
# Imputer for the ordinal column: fill nulls using the "most_frequent" strategy
impute_na_ord = SimpleImputer(strategy="most_frequent")

In [616]:
# Fit the ordinal imputer on the training data only
impute_na_ord.fit(X_train[ord_cols])

In [617]:
# Apply the fitted imputer to the ordinal training column
X_train_ord_imputed = impute_na_ord.transform(X_train[ord_cols])

In [618]:
# Apply the same training-fitted imputer to the ordinal test column
X_test_ord_imputed = impute_na_ord.transform(X_test[ord_cols])

In [619]:
# Ordinal encoder for "shelf", given the explicit category order
# through the ordinal_category_orders list defined earlier.
ord_encoder = OrdinalEncoder(categories=ordinal_category_orders)
ord_encoder

In [620]:
# Fit the ordinal encoder on the imputed training data
ord_encoder.fit(X_train_ord_imputed[ord_cols])

In [621]:
# Fit the imputer on the training data
# NOTE(review): impute_na_ord was already fit on this same X_train[ord_cols]
# earlier in the notebook, so this refit looks redundant — confirm and remove.
impute_na_ord.fit(X_train[ord_cols])

In [622]:
# Encode the imputed training shelf values with the fitted ordinal encoder
X_train_ordinal_enc = ord_encoder.transform(X_train_ord_imputed[ord_cols])

In [623]:
# Instantiate a separate scaler for the ordinal feature
scaler_ord = StandardScaler()

In [624]:
# Scale the ordinal feature using StandardScaler: fit on the encoded training data.
# This must be scaler_ord, not `scaler`: `scaler` is the very object that was
# passed to make_pipeline for num_pipe, so refitting it here would overwrite
# the statistics num_pipe learned from the numeric columns and break the
# later num_pipe.transform calls when the notebook is run top to bottom.
scaler_ord.fit(X_train_ordinal_enc)

## Transform the Features:



#### Transform Ordinal

In [625]:
# Ordinal pipeline: impute -> ordinal-encode -> scale
ord_pipe = make_pipeline(impute_na_ord, ord_encoder, scaler_ord)
# Display the pipeline
ord_pipe

In [626]:
# Fit the ordinal pipeline on the ordinal column of the training data
ord_pipe.fit(X_train[ord_cols])

In [627]:
# Transform the ordinal column of both splits with the training-fitted pipeline
X_train_ord_tf = ord_pipe.transform(X_train[ord_cols])
X_test_ord_tf = ord_pipe.transform(X_test[ord_cols])
# Preview the transformed training column
X_train_ord_tf.head()

Unnamed: 0,shelf
30,-1.355719
40,-0.184871
39,0.985978
16,-1.355719
65,-1.355719


#### Transform Nominal

In [628]:
# Transform the nominal columns of both splits with the training-fitted pipeline
X_train_ohe_tf = ohe_pipe.transform(X_train[ohe_cols])
X_test_ohe_tf = ohe_pipe.transform(X_test[ohe_cols])
# Preview the one-hot encoded training columns
X_train_ohe_tf.head()

Unnamed: 0,mfr_A,mfr_G,mfr_K,mfr_MISSING,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
30,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
40,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
65,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


#### Transform Numeric

In [639]:
# Transform the numeric columns of both splits with the training-fitted pipeline
X_train_num_tf = num_pipe.transform(X_train[num_cols])
X_test_num_tf = num_pipe.transform(X_test[num_cols])
# Only the last expression of a cell is rendered, so a bare
# X_train_num_tf.head() in the middle of the cell shows nothing.
# Display both previews explicitly instead.
display(X_train_num_tf.head(), X_test_num_tf.head())

Unnamed: 0,calories,protein,fat,fiber,sugars
4,0.0,-0.524507,1.088047,-0.444692,0.366247
35,0.0,-1.403826,1.088047,-0.444692,1.063009
10,0.0,-1.403826,1.088047,-0.871334,1.295263
0,0.0,1.234133,0.040298,3.395084,-0.098261
45,0.0,1.234133,2.135796,0.408592,1.063009


## Finally, combine the data back together:

In [640]:
# Join the numeric, ordinal and one-hot training frames side by side
X_train_tf = pd.concat(
    [X_train_num_tf, X_train_ord_tf, X_train_ohe_tf],
    axis="columns",
)
# Check dtypes / non-null counts, then preview the combined frame
X_train_tf.info()
X_train_tf.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 30 to 51
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   calories     57 non-null     float64
 1   protein      57 non-null     float64
 2   fat          57 non-null     float64
 3   fiber        57 non-null     float64
 4   sugars       57 non-null     float64
 5   shelf        57 non-null     float64
 6   mfr_A        57 non-null     float64
 7   mfr_G        57 non-null     float64
 8   mfr_K        57 non-null     float64
 9   mfr_MISSING  57 non-null     float64
 10  mfr_N        57 non-null     float64
 11  mfr_P        57 non-null     float64
 12  mfr_Q        57 non-null     float64
 13  mfr_R        57 non-null     float64
 14  type_C       57 non-null     float64
 15  type_H       57 non-null     float64
dtypes: float64(16)
memory usage: 7.6 KB


Unnamed: 0,calories,protein,fat,fiber,sugars,shelf,mfr_A,mfr_G,mfr_K,mfr_MISSING,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
30,-0.319703,-0.524507,-1.007451,-0.871334,1.992024,-1.355719,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
40,0.172812,-0.524507,0.040298,-0.871334,-0.795023,-0.184871,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39,1.650358,0.354813,0.040298,-0.01805,0.598501,0.985978,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16,-0.319703,-0.524507,-1.007451,-0.444692,-1.027277,-1.355719,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
65,-0.812218,0.354813,-1.007451,0.408592,-1.491785,-1.355719,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [641]:
# Join the numeric, ordinal and one-hot test frames side by side
X_test_tf = pd.concat(
    [X_test_num_tf, X_test_ord_tf, X_test_ohe_tf],
    axis="columns",
)
# Check dtypes / non-null counts, then preview the combined frame
X_test_tf.info()
X_test_tf.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 4 to 22
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   calories     20 non-null     float64
 1   protein      20 non-null     float64
 2   fat          20 non-null     float64
 3   fiber        20 non-null     float64
 4   sugars       20 non-null     float64
 5   shelf        20 non-null     float64
 6   mfr_A        20 non-null     float64
 7   mfr_G        20 non-null     float64
 8   mfr_K        20 non-null     float64
 9   mfr_MISSING  20 non-null     float64
 10  mfr_N        20 non-null     float64
 11  mfr_P        20 non-null     float64
 12  mfr_Q        20 non-null     float64
 13  mfr_R        20 non-null     float64
 14  type_C       20 non-null     float64
 15  type_H       20 non-null     float64
dtypes: float64(16)
memory usage: 2.7 KB


Unnamed: 0,calories,protein,fat,fiber,sugars,shelf,mfr_A,mfr_G,mfr_K,mfr_MISSING,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
4,0.0,-0.524507,1.088047,-0.444692,0.366247,0.985978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
35,0.0,-1.403826,1.088047,-0.444692,1.063009,0.985978,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10,0.0,-1.403826,1.088047,-0.871334,1.295263,-0.184871,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
0,0.0,1.234133,0.040298,3.395084,-0.098261,0.985978,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
45,0.0,1.234133,2.135796,0.408592,1.063009,0.985978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


We can see from the output above that all of our features are now numeric: every column's `Dtype` is `float64`.