# Week 10 - One Hot Encoding

In [1]:
# Load the 'penguins' example dataset through seaborn and preview the first rows
import seaborn as sns

df = sns.load_dataset(name='penguins')
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [2]:
# Number of missing values in each column
df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [3]:
# Discard every row that contains a missing value, then confirm none remain
df = df.dropna()
df.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [4]:
# Rows per species, most frequent first
df['species'].value_counts(sort=True, ascending=False)

Adelie       146
Gentoo       119
Chinstrap     68
Name: species, dtype: int64

In [5]:
# Rows per island, most frequent first
df['island'].value_counts(sort=True, ascending=False)

Biscoe       163
Dream        123
Torgersen     47
Name: island, dtype: int64

In [6]:
# Preview the frame again now that rows with missing values are gone
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


## Dependent Variable = species

In [7]:
# Hold out 20% of the rows for testing; 'species' is the target column
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['species']),
    df['species'],
    test_size=0.2,
    random_state=42,
)

# One shape per line: training features first, then test features
print(X_train.shape, X_test.shape, sep='\n')

(266, 6)
(67, 6)


## One Hot Encoding For Features

### For features with more than 2 unique labels

In [8]:
# use sklearn one hot encoder
import pandas as pd
from sklearn.preprocessing import OneHotEncoder


def one_hot_frame(encoder, frame, columns, fit=False):
    """Encode `columns` of `frame` and return the encoded columns as a DataFrame.

    Parameters
    ----------
    encoder : OneHotEncoder
        Encoder configured for dense output. It is fitted on `frame` when
        `fit` is True; otherwise it must already be fitted.
    frame : pandas.DataFrame
        Data that contains `columns`.
    columns : list of str
        Names of the categorical columns to encode.
    fit : bool, default False
        Call `fit_transform` instead of `transform`.

    Returns
    -------
    pandas.DataFrame
        The encoded columns, named by `encoder.get_feature_names_out(columns)`
        and indexed like `frame` so they can be joined back onto it.
    """
    values = frame[columns]
    encoded = encoder.fit_transform(values) if fit else encoder.transform(values)
    return pd.DataFrame(encoded,
                        columns=encoder.get_feature_names_out(columns),
                        index=frame.index)


# NOTE(review): with drop='first' and handle_unknown='ignore', a category that
# was not seen during fit is encoded as all zeros, i.e. the same row the
# dropped first category gets -- confirm this is acceptable for this data.
ohe_options = dict(categories='auto', drop='first', handle_unknown='ignore')
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`
    ohe = OneHotEncoder(sparse_output=False, **ohe_options)
except TypeError:
    # older scikit-learn only knows `sparse` (removed in 1.4)
    ohe = OneHotEncoder(sparse=False, **ohe_options)

cat_features = ['island']

# Fit on the training split only, then reuse the fitted encoder on the test
# split. After this cell neither frame has the raw categorical columns, so
# re-running it requires re-running the train/test split first.
ohe_train = one_hot_frame(ohe, X_train, cat_features, fit=True)
X_train = ohe_train.join(X_train.drop(columns=cat_features))

ohe_test = one_hot_frame(ohe, X_test, cat_features)
X_test = ohe_test.join(X_test.drop(columns=cat_features))

In [9]:
# Spot-check ten random encoded rows; the fixed seed keeps the displayed
# rows the same on every run
X_train.sample(n=10, random_state=42)

Unnamed: 0,island_Dream,island_Torgersen,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
177,1.0,0.0,52.0,19.0,197.0,4150.0,Male
111,0.0,0.0,45.6,20.3,191.0,4600.0,Male
278,0.0,0.0,43.2,14.5,208.0,4450.0,Female
104,0.0,0.0,37.9,18.6,193.0,2925.0,Female
56,0.0,0.0,39.0,17.5,186.0,3550.0,Female
49,1.0,0.0,42.3,21.2,191.0,4150.0,Male
284,0.0,0.0,45.8,14.2,219.0,4700.0,Female
271,0.0,0.0,48.5,14.1,220.0,5300.0,Male
197,1.0,0.0,50.8,18.5,201.0,4450.0,Male
252,0.0,0.0,45.1,14.5,207.0,5050.0,Female


In [10]:
import numpy as np

# np.unique returns the distinct labels already sorted
print(np.unique(df['island']))

['Biscoe' 'Dream' 'Torgersen']


## Binary Label Mapping for Features

### For features with only two unique labels

In [11]:
# Binary feature: encode it as a single 0/1 column
sex_codes = {'Female': 0, 'Male': 1}

# Series.map turns every label that is missing from the dict into NaN without
# complaint -- including the 0/1 values left behind if this cell is run a
# second time -- so check the labels before overwriting the column.
unmapped_sex = (set(X_train['sex'].dropna()) | set(X_test['sex'].dropna())) - set(sex_codes)
if unmapped_sex:
    raise ValueError(
        f"'sex' contains labels with no code: {sorted(unmapped_sex, key=str)}; "
        "re-run the notebook from the train/test split"
    )

X_train['sex'] = X_train['sex'].map(sex_codes)
X_test['sex'] = X_test['sex'].map(sex_codes)

## Map the Dependent Variable to Integer Codes

In [12]:
import numpy as np

# Distinct target labels in the training split, in sorted order
print(np.unique(y_train))

['Adelie' 'Chinstrap' 'Gentoo']


In [13]:
# Integer code for each species label
species_codes = {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2}

# Series.map turns every label that is missing from the dict into NaN without
# complaint -- including the integer codes left behind if this cell is run a
# second time -- so check the labels before overwriting the targets.
unmapped_species = (set(y_train.dropna()) | set(y_test.dropna())) - set(species_codes)
if unmapped_species:
    raise ValueError(
        f"'species' contains labels with no code: {sorted(unmapped_species, key=str)}; "
        "re-run the notebook from the train/test split"
    )

y_train = y_train.map(species_codes)
y_test = y_test.map(species_codes)

In [14]:
# First rows of the fully encoded training features
X_train.head(n=5)

Unnamed: 0,island_Dream,island_Torgersen,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
230,0.0,0.0,40.9,13.7,214.0,4650.0,0
84,1.0,0.0,37.3,17.8,191.0,3350.0,0
303,0.0,0.0,50.0,15.9,224.0,5350.0,1
22,0.0,0.0,35.9,19.2,189.0,3800.0,0
29,0.0,0.0,40.5,18.9,180.0,3950.0,1


In [15]:
# First rows of the encoded training target
y_train.head(n=5)

230    2
84     0
303    2
22     0
29     0
Name: species, dtype: int64