 Encode strings as numeric values. This is helpful for turning categorical data into numerical data.

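`factorize` works on one-dimensional data such as a single Series (one column at a time); to encode several DataFrame columns, apply it to each column separately, as shown below.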

In [1]:
import pandas as pd
import numpy as np

# Build a small example DataFrame whose three columns all hold string labels.
df = pd.DataFrame(
    data={
        'conf': ['West', 'West', 'East', 'East'],
        'team': ['A', 'B', 'C', 'D'],
        'position': ['Guard', 'Forward', 'Guard', 'Center'],
    }
)


In [2]:
df

Unnamed: 0,conf,team,position
0,West,A,Guard
1,West,B,Forward
2,East,C,Guard
3,East,D,Center


In [3]:
# Factorize a single column. The integer codes go into a new column instead of
# overwriting 'conf', so the original labels stay available for comparison.
df['conf_factorized'] = df['conf'].factorize()[0]
df

Unnamed: 0,conf,team,position,conf_factorized
0,West,A,Guard,0
1,West,B,Forward,0
2,East,C,Guard,1
3,East,D,Center,1


In [4]:
# Factorize only the 'conf' and 'team' columns: each selected column is encoded
# independently and its codes are stored in a matching '*_factorized' column.
df[['conf_factorized', 'team_factorized']] = (
    df[['conf', 'team']].apply(lambda column: column.factorize()[0])
)
df

Unnamed: 0,conf,team,position,conf_factorized,team_factorized
0,West,A,Guard,0,0
1,West,B,Forward,0,1
2,East,C,Guard,1,2
3,East,D,Center,1,3


In [5]:
# Factorize every column of df, one column at a time. This also re-encodes the
# '*_factorized' helper columns added earlier, since they are part of df now.
df_factorized = df.apply(lambda column: column.factorize()[0])
df_factorized

Unnamed: 0,conf,team,position,conf_factorized,team_factorized
0,0,0,0,0,0
1,0,1,1,0,1
2,1,2,0,1,2
3,1,3,2,1,3


 ## Pandas Factorize dissection

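`factorize()` returns a tuple `(codes, uniques)`: an integer array holding one code per row, and the unique values that those codes refer to.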

In [7]:
type(pd.factorize(df['conf']))

tuple

In [8]:
# Show the full (codes, uniques) result: one integer code per row, plus the
# unique labels listed in order of first appearance ('West' before 'East').
pd.factorize(df['conf'])

(array([0, 0, 1, 1], dtype=int64), Index(['West', 'East'], dtype='object'))

In [9]:
# Index [0] keeps only the integer codes and discards the unique labels.
pd.factorize(df['conf'])[0]

array([0, 0, 1, 1], dtype=int64)

### Reshaping factorized array values

In [12]:
# Unpack both parts of the result: the integer codes and the unique labels.
conf_encoded, conf_categories = pd.factorize(df['conf'])

In [13]:
conf_encoded

array([0, 0, 1, 1], dtype=int64)

In [14]:
conf_categories

Index(['West', 'East'], dtype='object')

In [15]:
# Turn the 1-D codes into a single-column 2-D array (one row per value);
# this is the shape handed to OneHotEncoder in the next section.
conf_encoded.reshape(-1,1)

array([[0],
       [0],
       [1],
       [1]], dtype=int64)

### One-Hot Encoding

In [19]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the raw string column. The column is selected with double
# brackets so that the encoder receives a 2-D, single-column DataFrame.
encoder = OneHotEncoder().fit(df[['conf']])

# Convert the transformed matrix to a dense array and wrap it in a DataFrame.
# Note the column order differs from factorize: in the output below the 'West'
# rows are flagged in column 1, not column 0.
encoder_df = pd.DataFrame(encoder.transform(df[['conf']]).toarray())

In [20]:
encoder_df

Unnamed: 0,0,1
0,0.0,1.0
1,0.0,1.0
2,1.0,0.0
3,1.0,0.0


In [23]:
# One-hot encode the factorized integer codes instead of the raw strings.
encoder = OneHotEncoder()

# conf_encoded is 1-D, so add a trailing axis to get the (n_rows, 1) layout.
# NOTE: despite its name, encoder_df holds the encoder's matrix output here,
# not a DataFrame.
encoder_df = encoder.fit_transform(conf_encoded[:, np.newaxis])

In [26]:
# encoder_df is the matrix returned by fit_transform (not a DataFrame here);
# .toarray() converts it to a dense NumPy array so the values can be inspected.
encoder_df.toarray()

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [27]:
# The encoder was fitted on the integer codes, so the categories it learned
# are the codes 0 and 1 rather than the original 'West'/'East' strings.
encoder.categories_

[array([0, 1], dtype=int64)]