# Data Preprocessing: One Hot Encoding

## Import Modules

In [1]:
import pandas as pd

## Prepare Data

In [31]:
# Raw input: four consecutive calendar days, kept as plain ISO-formatted strings for now.
data = {'date': ['2023-10-15', '2023-10-16', '2023-10-17', '2023-10-18']}
df = pd.DataFrame.from_dict(data)

# Print the schema summary first, then render the frame as the cell's output.
df.info()
df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    4 non-null      object
dtypes: object(1)
memory usage: 164.0+ bytes


Unnamed: 0,date
0,2023-10-15
1,2023-10-16
2,2023-10-17
3,2023-10-18


## Convert the "date" column to datetime format

In [32]:
# Parse the date strings into pandas datetimes, overwriting the text column,
# then print the schema so the new dtype of "date" is visible.
df['date'] = df['date'].pipe(pd.to_datetime)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    4 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 164.0 bytes


## Add additional information/columns

In [33]:
# Derive two weekday descriptors from the parsed "date" column.
date_accessor = df['date'].dt
df['weekday_index'] = date_accessor.dayofweek  # integer code (0=Monday, 6=Sunday)
df['weekday_name'] = date_accessor.day_name()  # day name as text, e.g. "Sunday"
df

Unnamed: 0,date,weekday_index,weekday_name
0,2023-10-15,6,Sunday
1,2023-10-16,0,Monday
2,2023-10-17,1,Tuesday
3,2023-10-18,2,Wednesday


## One-Hot-Encoding with Pandas
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.get_dummies.html

### Encode by "weekday_name"

In [34]:
# One indicator column per distinct weekday name; each row is True only in the
# column matching its own weekday.
weekday_names = df['weekday_name']
ohe_by_name = pd.get_dummies(weekday_names)
ohe_by_name

Unnamed: 0,Monday,Sunday,Tuesday,Wednesday
0,False,True,False,False
1,True,False,False,False
2,False,False,True,False
3,False,False,False,True


In [35]:
# Concatenate the encoded columns onto the original DataFrame, column-wise (axis=1).
# The encoded frame is `ohe_by_name`, built from "weekday_name" with pd.get_dummies;
# it shares the same index as `df`, so rows line up one-to-one.
df_with_encoding = pd.concat([df, ohe_by_name], axis=1)
df_with_encoding

Unnamed: 0,date,weekday_index,weekday_name,Monday,Sunday,Tuesday,Wednesday
0,2023-10-15,6,Sunday,False,True,False,False
1,2023-10-16,0,Monday,True,False,False,False
2,2023-10-17,1,Tuesday,False,False,True,False
3,2023-10-18,2,Wednesday,False,False,False,True


### Encode by "weekday_index"

In [29]:
# One indicator column per distinct weekday index; the prefix turns the bare
# integer labels into column names such as "weekday_0".
ohe_by_index = pd.get_dummies(
    data=df['weekday_index'],
    prefix="weekday",
)
ohe_by_index

Unnamed: 0,weekday_0,weekday_1,weekday_2,weekday_6
0,False,False,False,True
1,True,False,False,False
2,False,True,False,False
3,False,False,True,False


In [37]:
# Append both sets of encoded columns (index-based first, then name-based)
# to the original DataFrame, side by side.
frames_to_join = [df, ohe_by_index, ohe_by_name]
df_with_encoding = pd.concat(frames_to_join, axis='columns')
df_with_encoding

Unnamed: 0,date,weekday_index,weekday_name,weekday_0,weekday_1,weekday_2,weekday_6,Monday,Sunday,Tuesday,Wednesday
0,2023-10-15,6,Sunday,False,False,False,True,False,True,False,False
1,2023-10-16,0,Monday,True,False,False,False,True,False,False,False
2,2023-10-17,1,Tuesday,False,True,False,False,False,False,True,False
3,2023-10-18,2,Wednesday,False,False,True,False,False,False,False,True
