# Module 03

## Session 05 Data Preparation & Feature Engineering

# Feature Engineering: Ridge

In this chapter, will making machine learning model on:
* data: tips
* target: tip
* preprocess:
    1. one hot encoding on: sex, smoker, time
    2. binary encoding on: day
    3. robust scaler on: total_bill
    4. no treatment on: size
* random state=10, data splitting 70:30, model ridge default

## Library

In [2]:
# dataframe
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

# preprocessing
from sklearn.preprocessing import RobustScaler, OneHotEncoder
import category_encoders as ce
from sklearn.compose import ColumnTransformer

# model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

# validation
from sklearn.metrics import mean_squared_error


## Data

In [3]:
tips = sns.load_dataset('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


## One Hot Encoding

In [13]:
onehot = OneHotEncoder()

transformer = ColumnTransformer([
    ('one hot', onehot,['sex', 'smoker', 'time'])
])

In [14]:
tips_encoded = pd.DataFrame(transformer.fit_transform(tips))
tips_encoded.columns = transformer.get_feature_names()
tips_encoded

Unnamed: 0,one hot__x0_Female,one hot__x0_Male,one hot__x1_No,one hot__x1_Yes,one hot__x2_Dinner,one hot__x2_Lunch
0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,1.0,0.0
2,0.0,1.0,1.0,0.0,1.0,0.0
3,0.0,1.0,1.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...
239,0.0,1.0,1.0,0.0,1.0,0.0
240,1.0,0.0,0.0,1.0,1.0,0.0
241,0.0,1.0,0.0,1.0,1.0,0.0
242,0.0,1.0,1.0,0.0,1.0,0.0


## Binary Encoding

In [15]:
binary_encoder = ce.BinaryEncoder()

transformer = ColumnTransformer([
    ('binary encoding', binary_encoder, ['day'])
])

In [18]:
day_encoded = pd.DataFrame(transformer.fit_transform(tips))
day_encoded['day'] = tips['day']
day_encoded.drop_duplicates()

  elif pd.api.types.is_categorical(cols):


Unnamed: 0,0,1,2,day
0,0,0,1,Sun
19,0,1,0,Sat
77,0,1,1,Thur
90,1,0,0,Fri


## Preproccessing Scheme

In [37]:
# onehot = OneHotEncoder(drop=first) --> k-1 dummy: for linear model: regression, logistic regression
# onehot = OneHotEncoder() --> k dummy: for tree, random forest, boosting, knn

transformer = ColumnTransformer([
    ('one hot', OneHotEncoder(drop='first'), ['sex', 'smoker', 'time']),
    ('binary encoding', ce.BinaryEncoder(), ['day']),
    ('robust scaler', RobustScaler(), ['total_bill'])
], remainder='passthrough')

## Data Splitting

In [38]:
var = ['sex', 'smoker', 'time', 'day', 'total_bill', 'size']

X = tips[var]
y = tips['tip']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=10
)

## Preprocess Fitting

In [40]:
X_train_preprocess = transformer.fit_transform(X_train)
X_test_preprocess = transformer.transform(X_test)

  elif pd.api.types.is_categorical(cols):


In [41]:
X_train_preprocess = pd.DataFrame(X_train_preprocess)
X_test_preprocess = pd.DataFrame(X_test_preprocess)

In [42]:
X_train_preprocess

Unnamed: 0,0,1,2,3,4,5,6,7
0,1.0,1.0,0.0,0.0,0.0,1.0,-0.566396,2.0
1,1.0,0.0,0.0,0.0,1.0,0.0,-0.647696,3.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.316170,3.0
3,1.0,0.0,0.0,0.0,0.0,1.0,0.245709,2.0
4,1.0,1.0,0.0,0.0,1.0,0.0,2.081301,2.0
...,...,...,...,...,...,...,...,...
178,1.0,0.0,0.0,0.0,0.0,1.0,0.007227,3.0
179,1.0,0.0,0.0,0.0,1.0,0.0,0.367660,2.0
180,1.0,0.0,0.0,0.0,0.0,1.0,-0.382114,2.0
181,0.0,0.0,1.0,0.0,1.0,1.0,1.110208,6.0


In [43]:
# feature name for one hot encoding
transformer.transformers_[0][1].get_feature_names()

array(['x0_Male', 'x1_Yes', 'x2_Lunch'], dtype=object)

In [44]:

# feature name for binary encoding
transformer.transformers_[1][1].get_feature_names()

['day_0', 'day_1', 'day_2']

In [45]:
feature_names = list(transformer.transformers_[0][1].get_feature_names()) + list(transformer.transformers_[1][1].get_feature_names()) + ['total_bill', 'size']
X_train_preprocess.columns = feature_names
X_test_preprocess.columns = feature_names

In [46]:
X_test_preprocess

Unnamed: 0,x0_Male,x1_Yes,x2_Lunch,day_0,day_1,day_2,total_bill,size
0,0.0,0.0,0.0,0.0,1.0,0.0,-0.117435,3.0
1,1.0,1.0,0.0,0.0,0.0,1.0,0.251129,2.0
2,1.0,1.0,0.0,0.0,0.0,1.0,-0.334237,2.0
3,1.0,1.0,0.0,0.0,0.0,1.0,0.070461,4.0
4,1.0,1.0,0.0,0.0,0.0,1.0,-0.225836,2.0
...,...,...,...,...,...,...,...,...
56,0.0,1.0,1.0,0.0,1.0,1.0,-0.430894,2.0
57,1.0,0.0,1.0,0.0,1.0,1.0,-0.903342,2.0
58,1.0,1.0,0.0,0.0,1.0,0.0,1.390244,2.0
59,0.0,1.0,1.0,1.0,0.0,0.0,-0.670280,2.0


<b>note</b>:<br>
k category --> k-1

## Modeling

In [47]:
model = Ridge()
model.fit(X_train_preprocess, y_train)

Ridge()

In [48]:
y_pred = model.predict(X_test_preprocess)
mse = mean_squared_error(y_test, y_pred)
mse

1.0577456219830776

## Coef

In [49]:
model.coef_

array([-0.21994306, -0.07107905,  0.11693755,  0.03169306, -0.00246008,
       -0.0613861 ,  1.08614386,  0.1728903 ])

In [50]:
pd.DataFrame({
    'var':feature_names,
    'coef':model.coef_
})

Unnamed: 0,var,coef
0,x0_Male,-0.219943
1,x1_Yes,-0.071079
2,x2_Lunch,0.116938
3,day_0,0.031693
4,day_1,-0.00246
5,day_2,-0.061386
6,total_bill,1.086144
7,size,0.17289
