<a href="https://colab.research.google.com/github/abunchoftigers/Prediction-of-Product-Sales/blob/main/Column_Transformer_Core.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

How well can the "rating" of cereal be predicted using the following features?

mfr, type, calories, protein, fat, fiber, sugars, shelf

- Author: David Dyer

# Create a Column Transformer

- Define 3 tuples (one for each pipeline that includes the name, the pipeline object, and the list of columns to apply it to.)
- Create one column transformer object that includes the 3 preprocessing pipelines you created in the previous assignment.
- Fit the column transformer object to the training data.
- Save the transformed training data as X_train_processed
- Display the .head() of X_train_processed
- Save the transformed testing data as X_ test_processed
- Display the .head() of the X_test_processed

In [77]:
import pandas as pd
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, ColumnTransformer

from sklearn import set_config
set_config(transform_output='pandas')

from google.colab import drive
import warnings

warnings.simplefilter('ignore')

In [78]:
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [79]:
fpath = '/content/drive/MyDrive/Coding Dojo - Data Science/02 - Intro to Machine Learning/Week 1/data/cereal.csv'
df = pd.read_csv(fpath)
df.head()

Unnamed: 0,name,mfr,type,calories,protein,fat,sodium,fiber,carbo,sugars,potass,vitamins,shelf,weight,cups,rating
0,100% Bran,N,C,70,4,1,130,10.0,5.0,6,280,25,3,1.0,0.33,68.402973
1,100% Natural Bran,Q,C,120,3,5,15,2.0,8.0,8,135,0,3,1.0,1.0,33.983679
2,All-Bran,K,C,70,4,1,260,9.0,7.0,5,320,25,3,1.0,0.33,59.425505
3,All-Bran with Extra Fiber,K,C,50,4,0,140,14.0,8.0,0,330,25,3,1.0,0.5,93.704912
4,Almond Delight,R,C,110,2,2,200,1.0,14.0,8,-1,25,3,1.0,0.75,34.384843


In [81]:
X = df[['mfr', 'type', 'calories', 'protein', 'fat', 'fiber', 'sugars', 'shelf']]
y = df['rating']

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

In [83]:
ord_cols = ['shelf']
num_cols = X_train.select_dtypes('number').columns
obj_cols = X_train[['mfr', 'type']].columns

In [84]:
ord_cols

['shelf']

## Define 3 tuples (one for each pipeline that includes the name, the pipeline object, and the list of columns to apply it to.)

In [85]:
# Ordinal pipeline
impute_common = SimpleImputer(strategy='most_frequent')
shelf_order = ['bottom','middle','top']
ord_encoder = OrdinalEncoder(categories='auto')
scaler = StandardScaler()

ord_pipe = make_pipeline(impute_common, ord_encoder, scaler)
ord_pipe

In [86]:
# Number pipeline
mean_imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()

numeric_pipe = make_pipeline(mean_imputer, scaler)
numeric_pipe

In [87]:
# Prepare Categorical pipeline

impute_missing = SimpleImputer(strategy='constant',fill_value='MISSING')
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

ohe_encoder.fit(X_train)

ohe_pipe = make_pipeline(impute_missing, ohe_encoder)
ohe_pipe

In [88]:
ord_tuple = ('ordinal', ord_pipe, ord_cols)
num_tuple = ('numeric', numeric_pipe, num_cols)
ohe_tuple = ('categorical', ohe_pipe, obj_cols)
# ord_tuple
# num_tuple
# ohe_tuple

## Create one column transformer object that includes the 3 preprocessing pipelines you created in the previous assignment.

In [89]:
col_transformer = ColumnTransformer([num_tuple, ord_tuple, ohe_tuple], verbose_feature_names_out=False)

## Fit the column transformer object to the training data.

In [90]:
col_transformer.fit(X_train)

## Save the transformed training data as X_train_processed

Display the .head() of X_train_processed

In [91]:
X_train_processed = col_transformer.transform(X_train)
X_train_processed.head()

Unnamed: 0,calories,protein,fat,fiber,sugars,shelf,shelf.1,mfr_A,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
30,-0.319703,-0.524507,-0.965827,-0.837874,1.867936,-1.355719,-1.355719,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
40,0.172812,-0.524507,0.053657,-0.837874,-0.782828,-0.184871,-0.184871,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
39,1.650358,0.354813,0.053657,0.005921,0.542554,0.985978,0.985978,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
16,-0.319703,-0.524507,-0.965827,-0.415976,-1.003725,-1.355719,-1.355719,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
65,-0.812218,0.354813,-0.965827,0.427819,-1.445519,-1.355719,-1.355719,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


## Save the transformed testing data as X_ test_processed## Display the .head() of the X_test_processed

In [92]:
X_test_processed = col_transformer.transform(X_test)
X_test_processed.head()

Unnamed: 0,calories,protein,fat,fiber,sugars,shelf,shelf.1,mfr_A,mfr_G,mfr_K,mfr_N,mfr_P,mfr_Q,mfr_R,type_C,type_H
4,0.172812,-0.524507,1.073141,-0.415976,0.321657,0.985978,0.985978,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
35,0.665328,-1.403826,1.073141,-0.415976,0.984348,-0.184871,-0.184871,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
10,0.665328,-1.403826,1.073141,-0.837874,1.205245,-0.184871,-0.184871,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
0,-1.797249,1.234133,0.053657,3.381104,-0.120137,0.985978,0.985978,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
45,2.142873,1.234133,2.092625,0.427819,0.984348,0.985978,0.985978,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
