<a href="https://colab.research.google.com/github/erick-huang15/Python_Exercises/blob/main/Pipelines_Activity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd # this is to import all the libraries
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

In [31]:
# this is to load and read the data
data_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vR5VchYrWqVdeK_jmP0Qy6jUL18YENAwAaT0OJktV48onkXTAhB-7Bh47qqGYjrmQ/pub?output=xlsx'
data = pd.read_excel(data_url)
data.head()

Unnamed: 0,name,Manufacturer,type,calories per serving,grams of protein,grams of fat,milligrams of sodium,grams of dietary fiber,grams of complex carbohydrates,grams of sugars,milligrams of potassium,vitamins and minerals (% of FDA recommendation),Display shelf,Weight in ounces per one serving,Number of cups in one serving,Rating of cereal
0,Apple Cinnamon Cheerios,General Mills,Cold,110.0,2.0,2.0,180.0,1.5,10.5,10.0,70.0,25.0,1.0,1.0,0.75,29.509541
1,Basic 4,General Mills,Cold,130.0,3.0,2.0,,2.0,18.0,,100.0,25.0,3.0,1.33,0.75,37.038562
2,Cheerios,General Mills,Cold,,6.0,2.0,290.0,2.0,17.0,1.0,105.0,25.0,1.0,1.0,1.25,50.764999
3,Cinnamon Toast Crunch,General Mills,Cold,120.0,1.0,3.0,210.0,0.0,13.0,9.0,45.0,25.0,2.0,1.0,0.75,19.823573
4,Clusters,General Mills,Cold,110.0,3.0,2.0,140.0,2.0,13.0,7.0,105.0,25.0,3.0,1.0,0.5,40.400208


In [32]:
data.info() # this is to inspect th dtype of each column

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   name                                             77 non-null     object 
 1   Manufacturer                                     77 non-null     object 
 2   type                                             68 non-null     object 
 3   calories per serving                             70 non-null     float64
 4   grams of protein                                 77 non-null     float64
 5   grams of fat                                     69 non-null     float64
 6   milligrams of sodium                             76 non-null     float64
 7   grams of dietary fiber                           77 non-null     float64
 8   grams of complex carbohydrates                   77 non-null     float64
 9   grams of sugars                   

#### Based on our finding "type, calories per serving, grams of fat, milligram of sodium, grams of sugars, vitamins and minerals" all need to be imputed

In [33]:
print(data.isna().sum()) # this is to insepct the missing values

name                                               0
Manufacturer                                       0
type                                               9
calories per serving                               7
grams of protein                                   0
grams of fat                                       8
milligrams of sodium                               1
grams of dietary fiber                             0
grams of complex carbohydrates                     0
grams of sugars                                    9
milligrams of potassium                            0
vitamins and minerals (% of FDA recommendation)    1
Display shelf                                      0
Weight in ounces per one serving                   0
Number of cups in one serving                      0
Rating of cereal                                   0
dtype: int64


In [34]:
X = data.drop(columns='calories per serving') # Train Test Split
y = data['calories per serving']
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

#### Based on the inspection of the data, within this dataset there are numeric features and ordinal features after we converted type

In [35]:
data.info() # this is to take a look at the data again to identify each feature

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   name                                             77 non-null     object 
 1   Manufacturer                                     77 non-null     object 
 2   type                                             68 non-null     object 
 3   calories per serving                             70 non-null     float64
 4   grams of protein                                 77 non-null     float64
 5   grams of fat                                     69 non-null     float64
 6   milligrams of sodium                             76 non-null     float64
 7   grams of dietary fiber                           77 non-null     float64
 8   grams of complex carbohydrates                   77 non-null     float64
 9   grams of sugars                   

#### Pipelines and column transformers to comoplete: Impute missing values, One-hot encode the nomial features, scale the numeric columsn

In [25]:
cat_selector = make_column_selector(dtype_include='object')
num_selector = make_column_selector(dtype_include='number')

In [36]:
# Creating imputers
most_freg_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')

scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [38]:
# Instantiate Pipelines
numeric_pipeline = make_pipeline(mean_imputer, scaler) # Numeric pipeline
categoric_pipeline = make_pipeline(most_freg_imputer, ohe) # Cat pipeline

In [41]:
# Instantiate column_transformer
numeric_tuple = (numeric_pipeline, num_selector)
categoric_tuple = (categoric_pipeline, cat_selector)

preprocessor = make_column_transformer(numeric_tuple, categoric_tuple)

In [43]:
preprocessor.fit(X_train)
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [51]:
print(np.isnan(X_train_processed).sum().sum())
print(np.isnan(X_test_processed).sum().sum())
print(X_train_processed.dtype)
print(X_test_processed.dtype)
X_train_processed

0
0
float64
float64


array([[-1.30301442, -0.97467943,  0.56162348, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.40438378,  0.        ,  0.68120871, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.40438378, -0.97467943,  1.99664622, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [ 1.25808288,  1.94935887, -0.03630266, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.40438378,  0.97467943, -0.15588789, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.40438378,  0.        ,  0.08328257, ...,  0.        ,
         1.        ,  0.        ]])