# 12.1 Categorical Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
values = pd.Series(['apples','apples','oranges','oranges','apples','banana'])

In [3]:
values

0     apples
1     apples
2    oranges
3    oranges
4     apples
5     banana
dtype: object

In [4]:
# Check unique values 
pd.unique(values)

array(['apples', 'oranges', 'banana'], dtype=object)

In [5]:
values.value_counts()

apples     3
oranges    2
banana     1
dtype: int64

#### Many data systems (for data warehousing, statistical computing, or other uses) have developed specialized approaches for representing data with repeated values for more efficient storage and computation. In data warehousing, a best practice is to use so-called dimension tables containing the distinct values and storing the primary observations as integer keys referencing the dimension table

In [6]:
values = pd.Series([0,1,0,0,2,3]*2)

In [7]:
values

0     0
1     1
2     0
3     0
4     2
5     3
6     0
7     1
8     0
9     0
10    2
11    3
dtype: int64

In [8]:
dimensions = pd.Series(['apple','oranges','banana','pineapple'])

In [9]:
dimensions

0        apple
1      oranges
2       banana
3    pineapple
dtype: object

### We can use the "take" method to restore the original Series of strings:

In [10]:
dimensions.take(values)

0        apple
1      oranges
0        apple
0        apple
2       banana
3    pineapple
0        apple
1      oranges
0        apple
0        apple
2       banana
3    pineapple
dtype: object

In [11]:
# Check otherwise
category = pd.Series(['apples','apples','oranges','oranges','apples','banana'])

In [12]:
dims = pd.Series([0,1,2])

In [13]:
# dims.take(category)
# it doesnt work

### Categorical Type in pandas- Convert a category column to numerical column

In [14]:
fruits = ['apples','apples','oranges','oranges','apples','banana'] * 2

In [15]:
N = len(fruits)
N

12

In [16]:
df = pd.DataFrame({
    'fruits': fruits,
    'basket_id': np.arange(N),
    'count': np.random.randint(45,1000, size=N),
    'weight': np.random.uniform(0,4,size=N)
})

In [17]:
df

Unnamed: 0,fruits,basket_id,count,weight
0,apples,0,304,3.895555
1,apples,1,736,1.060578
2,oranges,2,817,1.045548
3,oranges,3,600,0.318025
4,apples,4,740,1.791273
5,banana,5,687,2.627476
6,apples,6,717,2.994989
7,apples,7,920,3.226749
8,oranges,8,84,3.56609
9,oranges,9,668,2.187944


In [18]:
fruit_category = df['fruits'].astype('category')

In [19]:
fruit_category

0      apples
1      apples
2     oranges
3     oranges
4      apples
5      banana
6      apples
7      apples
8     oranges
9     oranges
10     apples
11     banana
Name: fruits, dtype: category
Categories (3, object): [apples, banana, oranges]

In [56]:
c = fruit_category.values

In [21]:
c.categories

Index(['apples', 'banana', 'oranges'], dtype='object')

In [22]:
c.value_counts()

apples     6
banana     2
oranges    4
dtype: int64

In [23]:
c.codes

array([0, 0, 2, 2, 0, 1, 0, 0, 2, 2, 0, 1], dtype=int8)

In [24]:
df['fruitCategoricalValues'] = c.codes

In [25]:
df

Unnamed: 0,fruits,basket_id,count,weight,fruitCategoricalValues
0,apples,0,304,3.895555,0
1,apples,1,736,1.060578,0
2,oranges,2,817,1.045548,2
3,oranges,3,600,0.318025,2
4,apples,4,740,1.791273,0
5,banana,5,687,2.627476,1
6,apples,6,717,2.994989,0
7,apples,7,920,3.226749,0
8,oranges,8,84,3.56609,2
9,oranges,9,668,2.187944,2


## Another one line transformation from categorical to numerical column

In [57]:
df['fruitCategoricalValues_another_way'] = df['fruits'].astype('category').values.codes

In [58]:
df

Unnamed: 0,fruits,basket_id,count,weight,fruitCategoricalValues,fruitCategoricalValues_another_way
0,apples,0,304,3.895555,0,0
1,apples,1,736,1.060578,0,0
2,oranges,2,817,1.045548,2,2
3,oranges,3,600,0.318025,2,2
4,apples,4,740,1.791273,0,0
5,banana,5,687,2.627476,1,1
6,apples,6,717,2.994989,0,0
7,apples,7,920,3.226749,0,0
8,oranges,8,84,3.56609,2,2
9,oranges,9,668,2.187944,2,2


In [28]:
myCategory = pd.Categorical(['foo','bar','baz','bar','foo'])

In [29]:
myCategory

[foo, bar, baz, bar, foo]
Categories (3, object): [bar, baz, foo]

### If you have obtained categorical encoded data from another source, you can use the alternative from_codes constructor

In [30]:
categories = ['foor','ara','baz']

In [31]:
codes = [0,1,2,0,1,0,1,1]

In [32]:
my_cat = pd.Categorical.from_codes(codes, categories)

In [33]:
my_cat

[foor, ara, baz, foor, ara, foor, ara, ara]
Categories (3, object): [foor, ara, baz]

## Computations with Categoricals

In [34]:
np.random.seed(12345)
draws = np.random.randn(1000)

In [35]:
draws[:5]

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057])

In [36]:
bins = pd.cut(draws, bins=4)
bins

[(-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (0.489, 2.208], ..., (-1.23, 0.489], (-1.23, 0.489], (-1.23, 0.489], (0.489, 2.208], (0.489, 2.208]]
Length: 1000
Categories (4, interval[float64]): [(-2.956, -1.23] < (-1.23, 0.489] < (0.489, 2.208] < (2.208, 3.928]]

In [37]:
# adding quartile names to each cut
bins = pd.qcut(draws, 4, labels=['Q1','Q2','Q3','Q4'])
bins

[Q2, Q3, Q2, Q2, Q4, ..., Q3, Q2, Q1, Q3, Q4]
Length: 1000
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [38]:
bins.codes[:3]

array([1, 2, 1], dtype=int8)

In [39]:
bins = pd.Series(bins, name='Quartile')
bins[-5:]

995    Q3
996    Q2
997    Q1
998    Q3
999    Q4
Name: Quartile, dtype: category
Categories (4, object): [Q1 < Q2 < Q3 < Q4]

In [40]:
results = (pd.Series(draws).groupby(bins).agg(['count', 'min', 'max']).reset_index())

In [41]:
results

Unnamed: 0,Quartile,count,min,max
0,Q1,250,-2.949343,-0.685484
1,Q2,250,-0.683066,-0.010115
2,Q3,250,-0.010032,0.628894
3,Q4,250,0.634238,3.927528


# Categorical Methods

### Series containing categorical data have several special methods similar to the Series.str specialized string methods. This also provides convenient access to the cate‐gories and codes.

In [42]:
s = pd.Series(['a','b','c','a','b']*2)

In [43]:
c_s = s.astype('category')
c_s

0    a
1    b
2    c
3    a
4    b
5    a
6    b
7    c
8    a
9    b
dtype: category
Categories (3, object): [a, b, c]

In [44]:
c_s.values.codes

array([0, 1, 2, 0, 1, 0, 1, 2, 0, 1], dtype=int8)

In [45]:
c_s.value_counts()

b    4
a    4
c    2
dtype: int64

### The special attribute 'cat' provides access to categorical methods:

In [46]:
c_s.cat.categories

Index(['a', 'b', 'c'], dtype='object')

In [47]:
c_s.cat.codes

0    0
1    1
2    2
3    0
4    1
5    0
6    1
7    2
8    0
9    1
dtype: int8

### Suppose that we know the actual set of categories for this data extends beyond the four values observed in the data. We can use the set_categories method to changethem

In [48]:
real_categories = ['a','b','c','d','e']

In [49]:
c_s2 = c_s.cat.set_categories(real_categories)

In [50]:
c_s2

0    a
1    b
2    c
3    a
4    b
5    a
6    b
7    c
8    a
9    b
dtype: category
Categories (5, object): [a, b, c, d, e]

In [51]:
c_s2.value_counts()

b    4
a    4
c    2
e    0
d    0
dtype: int64

In [52]:
c_s2.cat.codes

0    0
1    1
2    2
3    0
4    1
5    0
6    1
7    2
8    0
9    1
dtype: int8

### In large datasets, categoricals are often used as a convenient tool for memory savings and better performance. After you filter a large DataFrame or Series, many of thecategories may not appear in the data. To help with this, we can use the remove_unused_categories method to trim unobserved categories

In [65]:
c_s3 = c_s[c_s.isin(['a','b','c'])]

In [66]:
c_s3

0    a
1    b
2    c
3    a
4    b
5    a
6    b
7    c
8    a
9    b
dtype: category
Categories (3, object): [a, b, c]

## 12.2 Advanced GroupBy Use

Group Transforms and “Unwrapped” GroupBys
In Chapter 10 we looked at the apply method in grouped operations for performing
transformations. There is another built-in method called transform , which is similar
to apply but imposes more constraints on the kind of function you can use:

• It can produce a scalar value to be broadcast to the shape of the group

• It can produce an object of the same shape as the input group

• It must not mutate its input

In [71]:
df = pd.DataFrame({
    'key': ['a','b','c']*3,
    'value': np.random.randint(45,1000, size=9)
})

In [72]:
df

Unnamed: 0,key,value
0,a,223
1,b,702
2,c,353
3,a,979
4,b,928
5,c,643
6,a,848
7,b,896
8,c,317


In [78]:
df.groupby('key').mean()

Unnamed: 0_level_0,value
key,Unnamed: 1_level_1
a,683.333333
b,842.0
c,437.666667


In [76]:
g=df.groupby('key').value

In [77]:
g.mean()

key
a    683.333333
b    842.000000
c    437.666667
Name: value, dtype: float64

Suppose instead we wanted to produce a Series of the same shape as df['value'] but
with values replaced by the average grouped by 'key' . We can pass the function
lambda x: x.mean() to transform :

In [79]:
g.transform(lambda x: x.mean())

0    683.333333
1    842.000000
2    437.666667
3    683.333333
4    842.000000
5    437.666667
6    683.333333
7    842.000000
8    437.666667
Name: value, dtype: float64

In [80]:
g.transform('mean')

0    683.333333
1    842.000000
2    437.666667
3    683.333333
4    842.000000
5    437.666667
6    683.333333
7    842.000000
8    437.666667
Name: value, dtype: float64

In [81]:
def normalize(x):
    return (x-x.mean()/x.std())

In [82]:
g.transform(normalize)

0    221.308603
1    695.114993
2    350.551276
3    977.308603
4    921.114993
5    640.551276
6    846.308603
7    889.114993
8    314.551276
Name: value, dtype: float64

In [83]:
g.apply(normalize)

0    221.308603
1    695.114993
2    350.551276
3    977.308603
4    921.114993
5    640.551276
6    846.308603
7    889.114993
8    314.551276
Name: value, dtype: float64

Built-in aggregate functions like 'mean' or 'sum' are often much faster than a general
apply function. These also have a “fast past” when used with transform . This allows
us to perform a so-called unwrapped group operation:

In [84]:
g.transform('mean')

0    683.333333
1    842.000000
2    437.666667
3    683.333333
4    842.000000
5    437.666667
6    683.333333
7    842.000000
8    437.666667
Name: value, dtype: float64

### Another way


In [85]:
normalize = (df['value']-g.transform('mean'))/g.transform('std')

In [86]:
normalize

0   -1.139424
1   -1.144775
2   -0.473706
3    0.731838
4    0.703219
5    1.148830
6    0.407585
7    0.441556
8   -0.675124
Name: value, dtype: float64

In [93]:
dfw = df.assign(k = df['value'])

In [94]:
dfw

Unnamed: 0,key,value,k
0,a,223,223
1,b,702,702
2,c,353,353
3,a,979,979
4,b,928,928
5,c,643,643
6,a,848,848
7,b,896,896
8,c,317,317
