# Nominal Encoding with One-Hot Encoding (OHE)

In [2]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [3]:
# Toy example: a single nominal (unordered) feature with three distinct values.
colors = ['red', 'blue', 'green', 'green', 'red', 'blue']
df = pd.DataFrame(colors, columns=['color'])
del colors  # keep the notebook namespace identical to the original cell

In [4]:
# Preview the toy frame; head() shows only the first 5 of its 6 rows.
df.head()

Unnamed: 0,color
0,red
1,blue
2,green
3,green
4,red


In [5]:
# Learn the categories of 'color', then expand the column into one indicator
# column per category. The encoder's result is converted with .toarray() so it
# can be wrapped in a DataFrame afterwards.
encoder = OneHotEncoder()
encoded = (
    encoder
    .fit(df[['color']])
    .transform(df[['color']])
    .toarray()
)

In [6]:
# Wrap the encoded array in a DataFrame, labelling each indicator column with
# the name the encoder generated for it (color_blue, color_green, color_red).
encoder_df = pd.DataFrame(
    data=encoded,
    columns=encoder.get_feature_names_out(),
)

In [7]:
# Display the encoded frame: exactly one indicator is 1.0 in every row.
encoder_df

Unnamed: 0,color_blue,color_green,color_red
0,0.0,0.0,1.0
1,1.0,0.0,0.0
2,0.0,1.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0


In [8]:
# Encode a single new observation with the already-fitted encoder.
# The encoder was fitted on a DataFrame with a 'color' column, so the new
# sample is passed as a DataFrame with the same column name; a bare nested
# list makes scikit-learn warn that the input has no valid feature names.
encoder.transform(pd.DataFrame({'color': ['blue']})).toarray()



array([[1., 0., 0.]])

In [9]:
# Put the original column next to its indicator columns to check the mapping.
pd.concat(
    [df, encoder_df],
    axis='columns',
)

Unnamed: 0,color,color_blue,color_green,color_red
0,red,0.0,0.0,1.0
1,blue,1.0,0.0,0.0
2,green,0.0,1.0,0.0
3,green,0.0,1.0,0.0
4,red,0.0,0.0,1.0
5,blue,1.0,0.0,0.0


# Practice: One-Hot Encoding the `tips` Dataset

In [10]:
# Practice: load seaborn's 'tips' example dataset; its categorical columns
# (sex, smoker, day, time) are one-hot encoded in the cells below.
data = sns.load_dataset('tips')

In [11]:
# info() prints the column summary itself and returns None, so call it
# directly; wrapping it in type() only added a stray `NoneType` cell output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


NoneType

In [12]:
# Count rows per category of 'sex' to see which levels exist before encoding.
data['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [13]:
# Count rows per category of 'smoker' to see which levels exist before encoding.
data['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [14]:
# Count rows per category of 'day' to see which levels exist before encoding.
data['day'].value_counts()

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [15]:
# Count rows per category of 'time' to see which levels exist before encoding.
data['time'].value_counts()

time
Dinner    176
Lunch      68
Name: count, dtype: int64

In [16]:
# One independent encoder per categorical column, so each can later report
# its own feature names and be reused to transform new values of that column.
sex_encoder, smoker_encoder, day_encoder, time_encoder = (
    OneHotEncoder() for _ in range(4)
)

In [23]:
# Fit each encoder on its own column (selected with double brackets so the
# input stays two-dimensional) and convert the result with .toarray().
# The pairs are processed in the order sex, smoker, day, time.
sex_encoded, smoker_encoded, day_encoded, time_encoded = (
    column_encoder.fit_transform(data[[column]]).toarray()
    for column_encoder, column in (
        (sex_encoder, 'sex'),
        (smoker_encoder, 'smoker'),
        (day_encoder, 'day'),
        (time_encoder, 'time'),
    )
)

In [26]:
# Wrap each encoded array in a DataFrame labelled with the encoder's generated
# feature names. The frames reuse data.index explicitly: pd.concat(axis=1)
# aligns rows by index label, so a default 0..n-1 index would silently
# misalign (and introduce NaNs) if `data` were ever filtered or re-indexed.
sex_df = pd.DataFrame(
    sex_encoded, columns=sex_encoder.get_feature_names_out(), index=data.index
)
smoker_df = pd.DataFrame(
    smoker_encoded, columns=smoker_encoder.get_feature_names_out(), index=data.index
)
day_df = pd.DataFrame(
    day_encoded, columns=day_encoder.get_feature_names_out(), index=data.index
)
time_df = pd.DataFrame(
    time_encoded, columns=time_encoder.get_feature_names_out(), index=data.index
)

In [31]:
# Replace the four categorical columns with their one-hot indicator frames;
# the numeric columns (total_bill, tip, size) are carried over unchanged.
encoded_df = pd.concat(
    [
        data.drop(columns=['sex', 'smoker', 'day', 'time']),
        sex_df,
        smoker_df,
        day_df,
        time_df,
    ],
    axis=1,
)
encoded_df

Unnamed: 0,total_bill,tip,size,sex_Female,sex_Male,smoker_No,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Dinner,time_Lunch
0,16.99,1.01,2,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,10.34,1.66,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,21.01,3.50,3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,23.68,3.31,2,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,24.59,3.61,4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
240,27.18,2.00,2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
241,22.67,2.00,2,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
242,17.82,1.75,2,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
