# OneHotEncoder vs. get_dummies

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
# Load seaborn's 'tips' dataset and keep only the four columns used below.
df = sns.load_dataset('tips')[['total_bill', 'tip', 'day', 'size']]

In [3]:
df.head(5)

Unnamed: 0,total_bill,tip,day,size
0,16.99,1.01,Sun,2
1,10.34,1.66,Sun,3
2,21.01,3.5,Sun,3
3,23.68,3.31,Sun,2
4,24.59,3.61,Sun,4


## Scikit-learn OneHotEncoder

In [4]:
# Target is the tip amount; every other column is a feature.
y = df['tip']
X = df.drop(columns='tip')

# Hold out 20% of the rows for testing, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Encoder for the 'day' column, fitted on the training split only.
# handle_unknown='ignore' lets categories that were not seen during fit be
# encoded as all zeros; sparse=False with dtype='int' requests a dense
# integer array.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` — confirm which spelling the installed version accepts.
ohe = OneHotEncoder(handle_unknown='ignore', sparse=False, dtype='int')
ohe.fit(X_train[['day']])

def get_ohe(df):
    """Return a new frame with ``day`` replaced by its one-hot columns.

    Uses the module-level, already fitted ``ohe`` encoder to transform the
    ``day`` column, then appends the encoded columns to the remaining
    columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``day`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The non-``day`` columns of ``df`` followed by the encoder's output
        columns, on a fresh 0..n-1 index.
    """
    encoded = pd.DataFrame(
        data=ohe.transform(df[['day']]),
        columns=ohe.get_feature_names_out(),
    )
    # Drop without inplace=True so the caller's DataFrame keeps its 'day'
    # column; the original mutated its argument as a hidden side effect.
    # Resetting the index lines the rows up positionally with `encoded`,
    # which was built with a default RangeIndex.
    remaining = df.drop(columns=['day']).reset_index(drop=True)
    return pd.concat([remaining, encoded], axis=1)

In [6]:
X_train = get_ohe(X_train)
X_test = get_ohe(X_test)

In [7]:
X_train.head()

Unnamed: 0,total_bill,size,day_Fri,day_Sat,day_Sun,day_Thur
0,26.88,4,0,0,1,0
1,32.68,2,0,0,0,1
2,17.89,2,0,0,1,0
3,20.49,2,0,1,0,0
4,48.17,6,0,0,1,0


In [8]:
# Two new rows; 'Mon' is not one of the day columns produced above, so this
# shows how the fitted encoder handles a category it has not seen.
X_test_new = pd.DataFrame( {'total_bill': [25, 45], 'day': ['Sun', 'Mon'], 'size': [2, 4]} )
get_ohe(X_test_new)

Unnamed: 0,total_bill,size,day_Fri,day_Sat,day_Sun,day_Thur
0,25,2,0,0,1,0
1,45,4,0,0,0,0


### One-hot encoding in a pipeline

In [9]:
# Start again from the raw frame: 'tip' is the target, the rest are features.
y = df['tip']
X = df.drop(columns='tip')

# Same 80/20 split and seed as before.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
# Min-max scale the numeric features.
numeric_preprocessor = Pipeline(steps=[
    ("scaler", MinMaxScaler()),
])

# One-hot encode the categorical feature, tolerating categories that were
# not present when the encoder was fitted.
categorical_preprocessor = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own preprocessing pipeline.
preprocessor = ColumnTransformer([
    ("categorical", categorical_preprocessor, ["day"]),
    ("numerical", numeric_preprocessor, ["total_bill", "size"]),
])

# The final estimator is a LinearRegression, so the step is named
# "regressor" rather than "classifier".
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
])

In [11]:
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.5706168878130049

In [12]:
X_test_new = pd.DataFrame( {'total_bill': [25, 45], 'day': ['Sun', 'Mon'], 'size': [2, 4]} )
X_test_new

Unnamed: 0,total_bill,day,size
0,25,Sun,2
1,45,Mon,4


In [13]:
pipe.predict(X_test_new)

array([3.27325822, 5.436413  ])

## Pandas get_dummies 

In [14]:
# Rebuild features and target from the raw frame for the get_dummies section.
y = df['tip']
X = df.drop(columns='tip')

# Same 80/20 split and seed as in the OneHotEncoder section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# One-hot encode 'day' on the train and test splits independently.
# NOTE(review): each call derives its columns from the data it is given, so
# the two frames only line up if both expose the same set of day values —
# verify before relying on matching columns.
X_train = pd.get_dummies(X_train, columns=['day'])
X_test = pd.get_dummies(X_test, columns=['day'])

In [16]:
X_train.head()

Unnamed: 0,total_bill,size,day_Thur,day_Fri,day_Sat,day_Sun
7,26.88,4,0,0,0,1
83,32.68,2,1,0,0,0
176,17.89,2,0,0,0,1
106,20.49,2,0,0,1,0
156,48.17,6,0,0,0,1


In [17]:
# Reference column layout used to align new data. Take it from the training
# split: the feature space is defined by the data a model is fitted on, not
# by the test split.
cols = X_train.columns.tolist()

In [18]:
X_test_new = pd.DataFrame( {'total_bill': [25, 45], 'day': ['Sun', 'Mon'], 'size': [2, 4]} )
pd.get_dummies(X_test_new, columns=['day']).head()

Unnamed: 0,total_bill,size,day_Mon,day_Sun
0,25,2,0,1
1,45,4,1,0


In [19]:
X_test_new

Unnamed: 0,total_bill,day,size
0,25,Sun,2
1,45,Mon,4


In [20]:
# Encode the new rows, then align them to the reference column list `cols`:
# reindex adds any reference column the new data lacks and drops columns not
# in `cols` (here day_Mon); fillna(0) turns the added all-NaN columns into 0.
X_test_new = pd.get_dummies(X_test_new, columns=['day'])
# The aligned frame is only displayed; X_test_new itself is not reassigned.
X_test_new.reindex(columns=cols).fillna(0) 

Unnamed: 0,total_bill,size,day_Thur,day_Fri,day_Sat,day_Sun
0,25,2,0.0,0.0,0.0,1
1,45,4,0.0,0.0,0.0,0


### get_dummies in Pipeline

In [21]:
# Fresh features/target from the raw frame for the get_dummies pipeline.
y = df['tip']
X = df.drop(columns='tip')

# Same 80/20 split and seed as in the earlier sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [22]:
# Fixed output schema handed to the custom transformer below: the two numeric
# features followed by one column per day value, in the same order as the
# OneHotEncoder output shown earlier in this notebook.
cols = ["total_bill", "size", "day_Fri", "day_Sat", "day_Sun", "day_Thur"]

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin

class PreprocessorTransformer(BaseEstimator, TransformerMixin):
    """Pipeline-compatible wrapper around ``pd.get_dummies`` for ``day``.

    The transformer learns nothing from the data. It one-hot encodes the
    ``day`` column and forces the result onto a fixed column layout, so the
    output has the same columns whatever day values appear in the input.

    Parameters
    ----------
    cols : list of str
        Output column names, in the order they should be returned.
    """

    def __init__(self, cols):
        self.cols = cols

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no state to fit."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with ``day`` one-hot encoded and aligned to ``self.cols``.

        Columns listed in ``self.cols`` but missing after encoding are added,
        columns not listed are dropped, and every NaN in the aligned frame
        (including those in the newly added columns) is replaced by 0.
        """
        encoded = pd.get_dummies(X, columns=['day'])
        aligned = encoded.reindex(columns=self.cols)
        aligned = aligned.fillna(0)
        return aligned[self.cols]

In [24]:
# PreprocessorTransformer is already a transformer, so it can be used as a
# pipeline step directly; wrapping it in its own single-step Pipeline under
# the same step name added a redundant level of nesting.
preprocessor = PreprocessorTransformer(cols)

# The final estimator is a LinearRegression, so the step is named
# "regressor" rather than "classifier".
pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
])

In [25]:
pipe.fit(X_train, y_train);

In [26]:
pipe.score(X_test, y_test)

0.5706168878130053

In [27]:
X_test_new = pd.DataFrame( {'total_bill': [25, 45], 'day': ['Sun', 'Mon'], 'size': [2, 4]} )
X_test_new

Unnamed: 0,total_bill,day,size
0,25,Sun,2
1,45,Mon,4


In [28]:
pipe.predict(X_test_new)

array([3.27325822, 5.436413  ])

## Example

In [29]:
# Minimal single-column frame for comparing the two encoders side by side.
# Note: this rebinds `df`, which previously held the tips data.
data = {"Airline": ["American Airlines", "Delta Air Lines", "United Airlines"]}
df = pd.DataFrame(data=data)

In [30]:
pd.get_dummies(df)

Unnamed: 0,Airline_American Airlines,Airline_Delta Air Lines,Airline_United Airlines
0,1,0,0
1,0,1,0
2,0,0,1


In [31]:
# Dense one-hot encoding of the single 'Airline' column.
# NOTE(review): newer scikit-learn releases renamed `sparse` to
# `sparse_output` — confirm which spelling the installed version accepts.
ohe = OneHotEncoder(sparse=False)
# categories_ holds one array per input column. Passing the whole list as
# `columns` makes pandas build a MultiIndex from it, so select the array for
# the only encoded column to get plain, flat column labels.
pd.DataFrame(ohe.fit_transform(df), columns=ohe.categories_[0], dtype='int8')

Unnamed: 0,American Airlines,Delta Air Lines,United Airlines
0,1,0,0
1,0,1,0
2,0,0,1
