# 12.1 Categorical Data

In [1]:
import pandas as pd
import numpy as np

In [5]:
# Six fruit labels with heavy repetition: the motivating case for categoricals.
values = pd.Series(
    ['apples', 'apples', 'oranges', 'oranges', 'apples', 'banana']
)

In [6]:
values

0     apples
1     apples
2    oranges
3    oranges
4     apples
5     banana
dtype: object

In [7]:
# Distinct labels, listed in order of first appearance.
values.unique()

array(['apples', 'oranges', 'banana'], dtype=object)

In [8]:
values.value_counts()

apples     3
oranges    2
banana     1
dtype: int64

#### Many data systems (for data warehousing, statistical computing, or other uses) have developed specialized approaches for representing data with repeated values for more efficient storage and computation. In data warehousing, a best practice is to use so-called dimension tables that contain the distinct values, and to store the primary observations as integer keys referencing the dimension table.

In [17]:
# Integer keys (the same six codes repeated twice); each key is a position in
# the dimension table.
values = pd.Series(2 * [0, 1, 0, 0, 2, 3])

In [18]:
values

0     0
1     1
2     0
3     0
4     2
5     3
6     0
7     1
8     0
9     0
10    2
11    3
dtype: int64

In [22]:
# Dimension table: the label at position i is what integer key i stands for.
dimensions = pd.Series([
    'apple',
    'oranges',
    'banana',
    'pineapple',
])

In [23]:
dimensions

0        apple
1      oranges
2       banana
3    pineapple
dtype: object

### We can use the "take" method to restore the original Series of strings:

In [24]:
# `take` performs a positional lookup: each integer key in `values` selects the
# label stored at that position in `dimensions`. The result keeps the looked-up
# positions as its index, which is why index labels repeat in the output.
dimensions.take(values)

0        apple
1      oranges
0        apple
0        apple
2       banana
3    pineapple
0        apple
1      oranges
0        apple
0        apple
2       banana
3    pineapple
dtype: object

In [28]:
# Counter-experiment: put the string labels in the role that the integer keys
# played above, to see whether `take` also works in the reverse direction.
category = pd.Series(
    ['apples', 'apples', 'oranges', 'oranges', 'apples', 'banana']
)

In [29]:
# Integer Series 0, 1, 2 used as the "table" in the reverse experiment.
dims = pd.Series(range(3))

In [30]:
# dims.take(category)
# The reverse direction does not work: `take` expects integer positions as its
# argument, and `category` holds strings.

### Categorical Type in pandas

In [31]:
# Plain Python list: the six-label pattern repeated twice (12 entries).
fruits = 2 * ['apples', 'apples', 'oranges', 'oranges', 'apples', 'banana']

In [33]:
# Number of fruit entries; used as the row count when the demo DataFrame is built.
N = len(fruits)
N

12

In [35]:
# Seed NumPy's legacy global RNG so the random 'count' and 'weight' columns
# come out the same on every Restart & Run All (same seed the notebook uses
# later for `draws`).
np.random.seed(12345)

# One row per entry of `fruits`, with a running basket id plus random values.
df = pd.DataFrame({
    'fruits': fruits,
    'basket_id': np.arange(N),
    'count': np.random.randint(45, 1000, size=N),   # integers in [45, 1000)
    'weight': np.random.uniform(0, 4, size=N)       # floats in [0, 4)
})

In [36]:
df

Unnamed: 0,fruits,basket_id,count,weight
0,apples,0,708,1.696933
1,apples,1,735,3.745569
2,oranges,2,989,1.654496
3,oranges,3,447,0.34393
4,apples,4,203,2.870764
5,banana,5,846,0.971663
6,apples,6,485,1.825643
7,apples,7,342,2.516087
8,oranges,8,770,0.946739
9,oranges,9,671,3.984036


In [37]:
# Convert the string column to pandas' category dtype. The result is bound to a
# new name; `df` itself is not modified by this cell.
fruit_category = df['fruits'].astype('category')

In [38]:
fruit_category

0      apples
1      apples
2     oranges
3     oranges
4      apples
5      banana
6      apples
7      apples
8     oranges
9     oranges
10     apples
11     banana
Name: fruits, dtype: category
Categories (3, object): [apples, banana, oranges]

In [40]:
# `.values` of a category-dtype Series is a pandas Categorical; the following
# cells inspect its `categories` and `codes` attributes.
c = fruit_category.values

In [41]:
c.categories

Index(['apples', 'banana', 'oranges'], dtype='object')

In [42]:
c.value_counts()

apples     6
banana     2
oranges    4
dtype: int64

In [43]:
c.codes

array([0, 0, 2, 2, 0, 1, 0, 0, 2, 2, 0, 1], dtype=int8)

In [44]:
# Store each row's integer category code as a new column. A code is a position
# in `c.categories` (0 = apples, 1 = banana, 2 = oranges, per the outputs above).
# Note: this adds the column to `df` in place, so earlier displays of `df` are stale.
df['fruitCategoricalValues'] = c.codes

In [45]:
df

Unnamed: 0,fruits,basket_id,count,weight,fruitCategoricalValues
0,apples,0,708,1.696933,0
1,apples,1,735,3.745569,0
2,oranges,2,989,1.654496,2
3,oranges,3,447,0.34393,2
4,apples,4,203,2.870764,0
5,banana,5,846,0.971663,1
6,apples,6,485,1.825643,0
7,apples,7,342,2.516087,0
8,oranges,8,770,0.946739,2
9,oranges,9,671,3.984036,2


In [49]:
# Alternative: store the categorical column itself rather than its codes. It
# still displays the labels (see the output below); the integer codes remain
# available through the `.cat.codes` accessor.
df['fruitCategoricalValues_another_way'] = df['fruits'].astype('category')

In [50]:
df

Unnamed: 0,fruits,basket_id,count,weight,fruitCategoricalValues,fruitCategoricalValues_another_way
0,apples,0,708,1.696933,0,apples
1,apples,1,735,3.745569,0,apples
2,oranges,2,989,1.654496,2,oranges
3,oranges,3,447,0.34393,2,oranges
4,apples,4,203,2.870764,0,apples
5,banana,5,846,0.971663,1,banana
6,apples,6,485,1.825643,0,apples
7,apples,7,342,2.516087,0,apples
8,oranges,8,770,0.946739,2,oranges
9,oranges,9,671,3.984036,2,oranges


In [51]:
# Build a Categorical directly from a Python sequence; the categories are
# inferred from the distinct values (bar, baz, foo).
myCategory = pd.Categorical(values=['foo', 'bar', 'baz', 'bar', 'foo'])

In [54]:
myCategory

[foo, bar, baz, bar, foo]
Categories (3, object): [bar, baz, foo]

### If you have obtained categorically encoded data from another source, you can use the alternative `from_codes` constructor:

In [55]:
# Category labels; a label's position in this list is its integer code.
# NOTE(review): 'foor' and 'ara' look like typos of 'foo' and 'bar' — confirm.
categories = ['foor','ara','baz']

In [56]:
# Integer codes, each one an index into `categories` (values used here: 0-2).
codes = [0,1,2,0,1,0,1,1]

In [57]:
# Assemble a Categorical from precomputed integer codes plus the list of
# labels those codes index into.
my_cat = pd.Categorical.from_codes(codes=codes, categories=categories)

In [58]:
my_cat

[foor, ara, baz, foor, ara, foor, ara, ara]
Categories (3, object): [foor, ara, baz]

## Computations with Categoricals

In [80]:
# Fix the seed of NumPy's legacy global RNG so the 1000 standard-normal draws
# are the same on every run.
np.random.seed(12345)
draws = np.random.standard_normal(size=1000)

In [81]:
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [82]:
# Split the range of `draws` into 4 equal-width intervals; the result is an
# ordered Categorical whose categories are the intervals.
bins = pd.cut(draws, 4)
bins

[(-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (0.489, 2.208], ..., (-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (0.489, 2.208], (0.489, 2.208]]
Length: 1000
Categories (4, interval[float64]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.208] < (2.208, 3.928]]

In [83]:
# Quantile-based binning: four equally populated buckets, labelled Q1..Q4
# instead of showing the interval edges.
bins = pd.qcut(
    draws,
    q=4,
    labels=['Q1', 'Q2', 'Q3', 'Q4'],
)
bins

[Q2, Q3, Q2, Q2, Q4, ..., Q3, Q2, Q1, Q3, Q4]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [84]:
bins.codes[:3]

array([1, 2, 1], dtype=int8)

In [85]:
# Wrap the quartile labels in a named Series (the name becomes the column
# header after groupby/reset_index) and peek at the last five entries.
bins = pd.Series(data=bins, name='Quartile')
bins.tail()

995    Q3
996    Q2
997    Q1
998    Q3
999    Q4
Name: Quartile, dtype: category
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [77]:
# Pitfall kept on purpose: the braces around the expression create a *set*
# literal (there is no `key: value` pair), and a set must hash its elements.
# A DataFrame is unhashable, so this raises TypeError before pd.DataFrame is
# even called. Catch and print the error so Restart & Run All does not stop
# here; the next cell builds `results` correctly without the braces.
try:
    results = pd.DataFrame({
        pd.Series(draws).groupby(bins).agg(['count','min','max']).reset_index()
    })
except TypeError as exc:
    print('TypeError:', exc)

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed

In [88]:
# Per-quartile summary of the draws: group by the quartile label, compute
# count/min/max, then turn the group key back into an ordinary column.
results = (
    pd.Series(draws)
    .groupby(bins)
    .agg(['count', 'min', 'max'])
    .reset_index()
)

In [89]:
results

Unnamed: 0,Quartile,count,min,max
0,Q1,250,-2.949343,-0.685484
1,Q2,250,-0.683066,-0.010115
2,Q3,250,-0.010032,0.628894
3,Q4,250,0.634238,3.927528


### Better performance with categoricals

In [None]:
N = 10000000
draws = pd.Series(np.random.randn(N))