## In this practice session, we will learn how to encode categorical features with higher cardinality

## Some of the techniques covered are as follows:

  * **Label Encoder**
  * **Ordinal Encoder**
  * **Label Binarizer**
  * **Count Encoding**

In [None]:
!pip install pip --upgrade --user
!pip install numpy pandas seaborn matplotlib scipy statsmodels sklearn --user

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Download the dataset from Google Drive with the gdown command-line tool.
# The file is saved as melb_data.csv, which the next cell reads.
!gdown https://drive.google.com/uc?id=1PAV9l8_yVqMS9tmr4GWUVhEa3-AL-ZHL

Downloading...
From: https://drive.google.com/uc?id=1PAV9l8_yVqMS9tmr4GWUVhEa3-AL-ZHL
To: c:\Users\miniv\Desktop\AIM\Downloads\Practice\2_General_ML_AI-20211019T065435Z-001\2_General_ML_AI\1_Explorartory_Data_Analysis\melb_data.csv

  0%|          | 0.00/2.78M [00:00<?, ?B/s]
 19%|█▉        | 524k/2.78M [00:00<00:01, 1.43MB/s]
 57%|█████▋    | 1.57M/2.78M [00:00<00:00, 2.46MB/s]
 75%|███████▌  | 2.10M/2.78M [00:00<00:00, 2.84MB/s]
 94%|█████████▍| 2.62M/2.78M [00:01<00:00, 2.83MB/s]
100%|██████████| 2.78M/2.78M [00:01<00:00, 2.69MB/s]


In [3]:
# Read the downloaded CSV into a DataFrame and preview its first five rows.
DATA_FILE = 'melb_data.csv'
df = pd.read_csv(DATA_FILE)
df.head(5)

Unnamed: 0.1,Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(18396, 22)

In [5]:
# Per-column dtype and non-null count; shows which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18396 entries, 0 to 18395
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     18396 non-null  int64  
 1   Suburb         18396 non-null  object 
 2   Address        18396 non-null  object 
 3   Rooms          18396 non-null  int64  
 4   Type           18396 non-null  object 
 5   Price          18396 non-null  float64
 6   Method         18396 non-null  object 
 7   SellerG        18396 non-null  object 
 8   Date           18396 non-null  object 
 9   Distance       18395 non-null  float64
 10  Postcode       18395 non-null  float64
 11  Bedroom2       14927 non-null  float64
 12  Bathroom       14925 non-null  float64
 13  Car            14820 non-null  float64
 14  Landsize       13603 non-null  float64
 15  BuildingArea   7762 non-null   float64
 16  YearBuilt      8958 non-null   float64
 17  CouncilArea    12233 non-null  object 
 18  Lattit

In [6]:
# Collect the names of every column whose dtype is `object`; these are
# treated as the categorical variables in the rest of the notebook.
is_object = df.dtypes == 'object'
object_cols = is_object[is_object].index.tolist()

print("Categorical variables:", object_cols, sep="\n")

Categorical variables:
['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']


In [7]:
# Cardinality check: number of distinct values in each categorical column.
df.loc[:, object_cols].nunique()

Suburb           330
Address        18134
Type               3
Method             5
SellerG          305
Date              58
CouncilArea       33
Regionname         8
dtype: int64

In [8]:
# Keep three of the categorical columns for the encoding demonstrations.
# Take an explicit copy: later cells add and overwrite columns on `features`,
# and doing that on a slice of `df` raises pandas' SettingWithCopyWarning.
features = df[['Type', 'Method', 'Regionname']].copy()
features.head()

Unnamed: 0,Type,Method,Regionname
0,h,S,Northern Metropolitan
1,h,S,Northern Metropolitan
2,h,SP,Northern Metropolitan
3,h,PI,Northern Metropolitan
4,h,VB,Northern Metropolitan


In [9]:
# Frequency of each distinct value in the Type column.
features['Type'].value_counts()

h    12095
u     4296
t     2005
Name: Type, dtype: int64

In [10]:
# Frequency of each distinct value in the Method column.
features['Method'].value_counts()

S     12034
SP     2349
PI     2189
VB     1696
SA      128
Name: Method, dtype: int64

In [11]:
# Frequency of each distinct value in the Regionname column.
features['Regionname'].value_counts()

Southern Metropolitan         6343
Northern Metropolitan         5307
Western Metropolitan          3887
Eastern Metropolitan          1995
South-Eastern Metropolitan     680
Eastern Victoria                78
Northern Victoria               63
Western Victoria                42
Name: Regionname, dtype: int64

In [None]:
# Manual encoding: map each Type label to a hand-picked integer code.
# The codes are arbitrary identifiers chosen here, not a ranking.
mapping = {
    'h': 1,
    'u': 2,
    't': 3,
}
# Series.map yields NaN for any label that is missing from `mapping`.
# `assign` returns a new frame with the extra 'type' column, which avoids the
# SettingWithCopyWarning raised by assigning a column on a slice of `df`.
features = features.assign(type=features['Type'].map(mapping))

In [None]:
# Check the manual encoding: number of rows per integer code.
features['type'].value_counts()

## Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder replaces each distinct label with an integer code between
# 0 and n_classes - 1.
le = LabelEncoder()
# Copy the single-column selection so that adding 'Region' below writes to an
# independent frame rather than a slice of `features` (SettingWithCopyWarning).
df1 = features[['Regionname']].copy()
# astype(str) turns missing values into the literal string 'nan', so the
# encoder receives a uniform list of strings.
df1['Region'] = le.fit_transform(features['Regionname'].astype(str).tolist())

In [None]:
# Number of rows assigned to each integer region code.
df1['Region'].value_counts()

In [None]:
# One-hot encoding: build one indicator column per distinct Method value.
method_labels = features['Method']
df2 = pd.get_dummies(method_labels)

In [None]:
# Display the one-hot encoded frame built in the previous cell.
df2

## Ordinal Encoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# OrdinalEncoder expects a 2-D input (hence the double brackets) and returns
# a 2-D array with one encoded column per input column.
oe = OrdinalEncoder()
type_codes_2d = oe.fit_transform(features[['Type']])
# Flatten the (n_rows, 1) result and attach it positionally. Wrapping it in a
# pd.DataFrame would give it a fresh 0..n-1 index and rely on index alignment,
# which silently produces NaN if `features` does not have a default index.
# `assign` returns a new frame, which also avoids SettingWithCopyWarning.
features = features.assign(Type_ord=type_codes_2d.ravel())


In [None]:
# Check the ordinal encoding: number of rows per encoded value.
features['Type_ord'].value_counts()

In [None]:
# Preview `features`, including the encoded columns added by the cells above.
features.head()

## Label Binarizer

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Fit a LabelBinarizer on the Type column and keep the binarized matrix;
# the fitted encoder is kept too, so its classes_ can label the columns later.
type_labels = features["Type"]
lb_style = LabelBinarizer()
lb_results = lb_style.fit_transform(type_labels)

In [None]:
# Label the binarized columns with the fitted class names, then count the
# distinct values that occur in each column.
lb_frame = pd.DataFrame(data=lb_results, columns=lb_style.classes_)
lb_frame.nunique()

## Count Encoding

In [None]:
# Count (frequency) encoding: replace each Type label with the number of rows
# in which that label occurs.
df_frequency_map = features['Type'].value_counts().to_dict()
# This overwrites the original Type labels in `features`. `assign` returns a
# new frame, which avoids the SettingWithCopyWarning raised by attribute
# assignment (`features.Type = ...`) on a slice of `df`.
features = features.assign(Type=features['Type'].map(df_frequency_map))

In [None]:
# Spot-check three rows (positions 20, 21 and 22) of the count-encoded column.
features['Type'].iloc[20:23]

In [None]:
# Preview `features` after count encoding: Type now holds row counts instead
# of the original labels.
features.head()