#### convert code values to categorical

In [1]:
import pandas as pd
# Integer codes: every entry is a position into `categories`.
values = pd.Series([1, 0, 3, 0, 0, 1, 0, 1, 3, 1])

# Lookup table that maps position -> label.
categories = pd.Series(["orange", "apple", "banana", "peach"])

# Positional take() swaps each code for the label stored at that position.
categories.take(values)

1     apple
0    orange
3     peach
0    orange
0    orange
1     apple
0    orange
1     apple
3     peach
1     apple
dtype: object

#### create categorical from column data

In [2]:
import numpy as np
N = 10

# Seeded generator so the sample basket data is identical on every run.
rng = np.random.default_rng(42)

raw_data = {
    "basket_id": range(N),
    # Draw N fruit labels (with replacement) from the `categories` series.
    "fruit": rng.choice(categories, size=N),
    "weight": rng.uniform(5, 9, size=N),
    # Upper bound is exclusive, so counts fall in 6..17.
    "count": rng.integers(6, 18, size=N),
}

df = pd.DataFrame(raw_data)
# set_index returns a new frame for display; `df` keeps "basket_id" as a column.
df.set_index("basket_id")


Unnamed: 0_level_0,fruit,weight,count
basket_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,orange,7.310722,6
1,orange,5.63545,9
2,apple,8.044075,8
3,apple,8.875922,15
4,apple,7.903844,16
5,orange,6.252324,8
6,orange,8.61276,15
7,banana,6.548966,10
8,apple,8.805716,12
9,apple,5.891017,11


In [3]:
# Re-type the "fruit" column as category, then unwrap the array that backs it.
fruit_column = df.loc[:, "fruit"]
fruit_cat_series = fruit_column.astype("category")
fruit_cat = fruit_cat_series.values

# Show the concrete class of the unwrapped array.
type(fruit_cat)

pandas.core.arrays.categorical.Categorical

In [4]:
# The distinct labels present in the column, exposed as an Index.
fruit_cat.categories

Index(['apple', 'banana', 'orange'], dtype='object')

In [5]:
# One small integer per row: the position of that row's label in `fruit_cat.categories`.
fruit_cat.codes

array([2, 2, 0, 0, 0, 2, 2, 1, 0, 0], dtype=int8)

#### create categorical from categories

In [6]:
# No explicit categories are given, so they are inferred from the data
# (the unique values, listed in sorted order).
pd.Categorical(categories)

[orange, apple, banana, peach]
Categories (4, object): [apple, banana, orange, peach]

#### create categorical from codes

In [7]:
# Each code is a position into `categories`; ordered=True makes the order of
# `categories` the comparison order of the resulting labels.
codes = [3, 0, 3, 3, 3, 3, 2, 3, 1, 1]
pd.Categorical.from_codes(codes, categories=categories, ordered=True)

[peach, orange, peach, peach, peach, peach, banana, peach, apple, apple]
Categories (4, object): [orange < apple < banana < peach]

Categories can be any immutable object.

### Computations with categoricals

In [8]:
# Seeded generator so the quartile summary is reproducible across runs.
rng = np.random.default_rng(42)
rnd = rng.random(1000)

# qcut cuts at the sample quartiles, so every label receives an equal share.
bins = pd.qcut(rnd, 4, labels=["Q1", "Q2", "Q3", "Q4"])
series = pd.Series(rnd)

# `bins` is a Categorical with one entry per value; use it as the group key.
series.groupby(bins).agg(["count", "min", "max"])

Unnamed: 0,count,min,max
Q1,250,0.000453,0.24398
Q2,250,0.244513,0.479676
Q3,250,0.480373,0.747607
Q4,250,0.748544,0.997028


### Performance
- It seems groupby is slower with categorical data

In [9]:
N = 100000

# Seeded generator so the draws, the labels and everything derived from them
# are reproducible across runs.
rng = np.random.default_rng(42)
draws = rng.random(N)
labels = pd.Series(rng.choice(["cat1", "cat2", "cat3"], size=N))

# NOTE: this rebinds `categories`, which the earlier cells use for the fruit
# labels; re-run the notebook from the top before re-running those cells.
categories = labels.astype("category")

# memory_usage() is shallow by default: for object data it does not include the
# string payload (pass deep=True to count it).
labels.memory_usage()

800080

In [10]:
# Byte size of the category-typed copy of `labels`, for comparison with the
# size of the plain string Series.
categories.memory_usage()

100184

In [11]:
# Same "value" column twice; only the index differs:
# categorical labels for df_cat, plain string labels for df.
df_cat = pd.DataFrame({"value": draws}, index=categories)
df = pd.DataFrame({"value": draws}, index=labels)

In [12]:
%%timeit
df_cat.groupby(by=df_cat.iloc[0])

23.8 ms ± 5.08 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
%%timeit
df.groupby(by=df.iloc[0])

13.5 ms ± 3.17 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Categorical Functions

In [14]:
# The `.cat` accessor exposes the categorical view of a Series; here, its labels.
categories.cat.categories

Index(['cat1', 'cat2', 'cat3'], dtype='object')

In [15]:
# Integer codes behind the first ten rows (positions into `categories.cat.categories`).
categories.cat.codes.head(10)

0    2
1    1
2    0
3    0
4    2
5    2
6    0
7    2
8    2
9    0
dtype: int8

In [16]:
# Declare a wider category set that also contains "cat4", which never occurs in
# the data. The result is only displayed, not assigned, so `categories` itself
# is left unchanged.
categories.cat.set_categories(["cat1", "cat2", "cat3", "cat4"]).head(10)

0    cat3
1    cat2
2    cat1
3    cat1
4    cat3
5    cat3
6    cat1
7    cat3
8    cat3
9    cat1
dtype: category
Categories (4, object): [cat1, cat2, cat3, cat4]

In [19]:
# Drop categories that have no rows. `categories` still has only its three
# original categories here, and the result lists those same three.
categories.cat.remove_unused_categories().head(10)

0    cat3
1    cat2
2    cat1
3    cat1
4    cat3
5    cat3
6    cat1
7    cat3
8    cat3
9    cat1
dtype: category
Categories (3, object): [cat1, cat2, cat3]

In [21]:
# Boolean mask: True where the row's label is one of the listed values.
categories.isin(["cat1", "cat2"]).head()

0    False
1     True
2     True
3     True
4    False
dtype: bool

In [25]:
# Keep only the rows labelled cat1 or cat2.
keep_mask = categories.isin(["cat1", "cat2"])
no_cat3 = categories[keep_mask]

# Filtering leaves "cat3" declared but unused; prune it from the category set.
no_cat3.cat.remove_unused_categories().head()

1    cat2
2    cat1
3    cat1
6    cat1
9    cat1
dtype: category
Categories (2, object): [cat1, cat2]

### Dummy Variables
- convert categorical data to an indicator matrix with one column per category and 1/0 as values

In [30]:
# One indicator column per category. The dtype is pinned so the matrix holds
# 1/0 integers regardless of pandas version (newer releases default to booleans).
pd.get_dummies(categories, dtype="uint8").head(10)

Unnamed: 0,cat1,cat2,cat3
0,0,0,1
1,0,1,0
2,1,0,0
3,1,0,0
4,0,0,1
5,0,0,1
6,1,0,0
7,0,0,1
8,0,0,1
9,1,0,0
