# Feature Encoding
## Content
- Label Encoding
- One-Hot Encoding


## 1.0 Imports

In [1]:
import os
import pandas as pd

## 2.0 Load Data

In [2]:
# Load the raw insurance dataset (relative path, resolved against the
# kernel's working directory).
df = pd.read_csv('../data/raw/insurance.csv')

# Summarise columns, non-null counts and dtypes to see which features
# are non-numeric and therefore need encoding.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## 3.0 Encoding

### 3.1 Label Encoding

In [3]:
# Binary categorical columns and the integer code assigned to each category.
label_maps = {
    'smoker': {'yes': 1, 'no': 0},
    'sex': {'female': 1, 'male': 0},
}

for column, mapping in label_maps.items():
    encoded = df[column].map(mapping)
    # Series.map() silently yields NaN for any value missing from the mapping
    # (an unexpected category, or this cell being run a second time on the
    # already-encoded column). Fail loudly instead of writing NaNs into df.
    # Values that were already missing in the input are passed through.
    unmapped = df.loc[encoded.isna() & df[column].notna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Cannot label-encode '{column}': unmapped values {list(unmapped)}"
        )
    df[column] = encoded

print("Label Encoding done (smoker, sex).")
df.head()

Label Encoding done (smoker, sex).


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,1,27.9,0,1,southwest,16884.924
1,18,0,33.77,1,0,southeast,1725.5523
2,28,0,33.0,3,0,southeast,4449.462
3,33,0,22.705,0,0,northwest,21984.47061
4,32,0,28.88,0,0,northwest,3866.8552


### 3.2 One-Hot Encoding

In [4]:
# One-hot encode 'region'. drop_first=True omits the first category's column
# (rows whose dummies are all zero belong to it); dtype=int yields 0/1 integers.
region_dummies = pd.get_dummies(df['region'], prefix='region', drop_first=True, dtype=int)

# Swap the original text column for its dummy columns, building a new frame
# rather than mutating one in place.
df = pd.concat([df.drop(columns='region'), region_dummies], axis=1)

print("One-Hot Encoding done (region).")
df.head()

One-Hot Encoding done (region).


Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,1,27.9,0,1,16884.924,0,0,1
1,18,0,33.77,1,0,1725.5523,0,1,0
2,28,0,33.0,3,0,4449.462,0,1,0
3,33,0,22.705,0,0,21984.47061,1,0,0
4,32,0,28.88,0,0,3866.8552,1,0,0


## 4.0 Result Check

In [5]:
# Show the encoded frame's columns, dtypes and non-null counts for inspection.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1338 non-null   int64  
 1   sex               1338 non-null   int64  
 2   bmi               1338 non-null   float64
 3   children          1338 non-null   int64  
 4   smoker            1338 non-null   int64  
 5   charges           1338 non-null   float64
 6   region_northwest  1338 non-null   int64  
 7   region_southeast  1338 non-null   int64  
 8   region_southwest  1338 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 94.2 KB


## 5.0 Save to CSV

In [6]:
# Destination of the encoded dataset (relative path, resolved against the
# kernel's working directory).
file_path = '../data/processed/insurance_encoded.csv'

directory = os.path.dirname(file_path)

# os.path.dirname() returns '' for a bare filename and os.makedirs('') raises,
# so only create a directory when the path actually has a directory component.
if directory and not os.path.exists(directory):
    # exist_ok=True keeps this safe if the directory appears between the
    # existence check above and this call.
    os.makedirs(directory, exist_ok=True)
    print(f"Directory created: {directory}")

# index=False: do not write the RangeIndex as an extra column.
df.to_csv(file_path, index=False)
print(f"File saved succesfully: {file_path}")

File saved succesfully: ../data/processed/insurance_encoded.csv
