In [1]:
import pandas as pd
import numpy as np

In [2]:
# To convert a categorical variable into a “dummy” or “indicator” DataFrame, for example a column in a DataFrame 

In [5]:
df = pd.DataFrame({"key":list("bbacba"),"data1":range(6)})
df

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,b,4
5,a,5


In [6]:
pd.get_dummies(df["key"])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,0,1,0
5,1,0,0


In [8]:
# Sometimes it’s useful to prefix the column names, for example when merging the result with the original DataFrame

In [9]:
dummies = pd.get_dummies(df["key"],prefix = "key")
dummies

Unnamed: 0,key_a,key_b,key_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,0,1,0
5,1,0,0


In [10]:
# This function is often used along with discretization functions like cut:

In [11]:
values = np.random.randn(10)
values

array([ 0.14413885,  1.36183727,  0.50313193,  0.14343767, -0.21721738,
       -0.03213196, -0.23535661, -0.62068748, -1.8785985 ,  0.49639342])

In [12]:
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,1,0,0,0,0
1,0,0,0,0,0
2,0,0,1,0,0
3,1,0,0,0,0
4,0,0,0,0,0
5,0,0,0,0,0
6,0,0,0,0,0
7,0,0,0,0,0
8,0,0,0,0,0
9,0,0,1,0,0


In [13]:
# get_dummies() also accepts a DataFrame. By default all categorical variables

In [14]:
df = pd.DataFrame({"A":["a","b","a"], "B":["c","c","b"],"C":[1,2,3]})
df

Unnamed: 0,A,B,C
0,a,c,1
1,b,c,2
2,a,b,3


In [15]:
pd.get_dummies(df)

Unnamed: 0,C,A_a,A_b,B_b,B_c
0,1,1,0,0,1
1,2,0,1,0,1
2,3,1,0,1,0


In [16]:
# You can control the columns that are encoded with the columns keyword.

In [18]:
pd.get_dummies(df,columns = ["A"])

Unnamed: 0,B,C,A_a,A_b
0,c,1,1,0
1,c,2,0,1
2,b,3,1,0


As with the Series version, you can pass values for the prefix and prefix_sep. By default the column name is used as the prefix, and ‘_’ as the prefix separator. You can specify prefix and prefix_sep in 3 ways

-string: Use the same value for prefix or prefix_sep for each column to be encoded 


-list: Must be the same length as the number of columns being encoded.

-dict: Mapping column name to prefix

In [19]:
simple = pd.get_dummies(df, prefix='new_prefix')
simple

Unnamed: 0,C,new_prefix_a,new_prefix_b,new_prefix_b.1,new_prefix_c
0,1,1,0,0,1
1,2,0,1,0,1
2,3,1,0,1,0


In [20]:
from_list = pd.get_dummies(df, prefix=['from_A', 'from_B'])
from_list

Unnamed: 0,C,from_A_a,from_A_b,from_B_b,from_B_c
0,1,1,0,0,1
1,2,0,1,0,1
2,3,1,0,1,0


In [21]:
from_dict = pd.get_dummies(df, prefix={'B': 'from_B', 'A': 'from_A'})
from_dict

Unnamed: 0,C,from_A_a,from_A_b,from_B_b,from_B_c
0,1,1,0,0,1
1,2,0,1,0,1
2,3,1,0,1,0


Sometimes it will be useful to only keep k-1 levels of a categorical variable to avoid collinearity 
when feeding the result to statistical models. You can switch to this mode by turn on drop_first.

In [22]:
s = pd.Series(list("abcaa"))
pd.get_dummies(s)

Unnamed: 0,a,b,c
0,1,0,0
1,0,1,0
2,0,0,1
3,1,0,0
4,1,0,0


In [23]:
pd.get_dummies(s, drop_first=True)

Unnamed: 0,b,c
0,0,0
1,1,0
2,0,1
3,0,0
4,0,0
