In [1]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype, union_categoricals

In [104]:
# Categorical variables take a limited, usually fixed, set of possible values.
# Examples are gender, social class, blood type, country affiliation, observation time or rating via Likert scales.
%matplotlib inline

In [3]:
# Build a categorical from raw labels; 'd' is not among the declared
# categories, so both of its occurrences end up as missing values.
cat = pd.Categorical(
    pd.Series(["a", "b", "d", "d"]),
    categories=["a", "b", "c"],
)

In [4]:
# Display the categorical; the two 'd' entries show as NaN because 'd' is not a declared category.
cat

[a, b, NaN, NaN]
Categories (3, object): [a, b, c]

In [8]:
# Sample string data plus an ordered categorical dtype (d < c < a).
# Note that 'b' occurs in the data but is not one of the dtype's categories.
s = pd.Series(["a", "b", "c", "a"])
cat_type = CategoricalDtype(categories=["d", "c", "a"], ordered=True)

In [9]:
# Cast the string Series to the ordered categorical dtype held in cat_type.
s_cat = s.astype(cat_type)

In [10]:
# Show the converted Series: 'b' is outside cat_type's categories, so it appears as NaN.
s_cat

0      a
1    NaN
2      c
3      a
dtype: category
Categories (3, object): [d < c < a]

In [13]:
# Convert the categorical Series back to plain object dtype; the missing entry stays NaN.
s_cat.astype(object)

0      a
1    NaN
2      c
3      a
dtype: object

In [15]:
# Summary of the categorical Series; `count` covers only the non-missing entries.
s_cat.describe()

count     3
unique    2
top       a
freq      2
dtype: object

In [17]:
# Check whether the categories of s_cat carry an order.
s_cat.cat.ordered

True

In [24]:
# Only 'b' and 'c' occur in the data even though four categories are declared.
s = pd.Series(["b", "b", "c"]).astype(
    CategoricalDtype(["a", "b", "c", "d"])
)
s.unique()

[b, c]
Categories (2, object): [b, c]

In [25]:
# List all declared categories, including ones that never occur in the data.
s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [26]:
# Replace the category labels positionally (a, b, c, d -> 1, 2, 3, 4); the result is displayed, s is not reassigned.
s.cat.rename_categories([1,2,3,4])

0    2
1    2
2    3
dtype: category
Categories (4, int64): [1, 2, 3, 4]

In [36]:
# NOTE(review): bare attribute access without a call -- this only displays the
# bound method and writes nothing anywhere. Looks like leftover exploration; confirm and remove.
s.to_sql

<bound method NDFrame.to_sql of 0    b
1    b
2    c
dtype: category
Categories (4, object): [a, b, c, d]>

In [38]:
# Append 'e' as an additional category; the result is displayed, s is not reassigned.
s.cat.add_categories('e')

0    b
1    b
2    c
dtype: category
Categories (5, object): [a, b, c, d, e]

In [39]:
# Drop the category 'a', which no value in s uses.
s.cat.remove_categories('a')

0    b
1    b
2    c
dtype: category
Categories (3, object): [b, c, d]

In [41]:
# Drop every category that has no occurrence in the data ('a' and 'd' here).
s.cat.remove_unused_categories()

0    b
1    b
2    c
dtype: category
Categories (2, object): [b, c]

In [42]:
# Sort by category order. Rebinding the name instead of using inplace=True
# keeps the cell free of hidden in-place mutation and allows method chaining.
s = s.sort_values()

In [43]:
# Display s after sorting.
s

0    b
1    b
2    c
dtype: category
Categories (4, object): [a, b, c, d]

In [44]:
# Column A is an ordered categorical whose category order (e < a < b) is
# deliberately not alphabetical; column B is a plain integer column.
dfs = pd.DataFrame(
    {
        "A": pd.Categorical(
            ["b", "b", "e", "e", "b", "b", "a", "a"],
            categories=["e", "a", "b"],
            ordered=True,
        ),
        "B": [1, 2, 1, 2, 2, 1, 2, 1],
    }
)

In [46]:
# Sort by A, then B; A follows its category order (e < a < b), not alphabetical order.
dfs.sort_values(by=['A','B'])

Unnamed: 0,A,B
2,e,1
3,e,2
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2


In [47]:
# Put the categories in alphabetical order, then sort again:
# the row order now follows a < b < e.
dfs["A"] = dfs["A"].cat.reorder_categories(["a", "b", "e"])
dfs.sort_values(by=["A", "B"])

Unnamed: 0,A,B
7,a,1
6,a,2
0,b,1
5,b,1
1,b,2
4,b,2
2,e,1
3,e,2


In [48]:
# 'd' is a declared category that never occurs in the data.
cats = pd.Categorical(
    ["a", "b", "b", "b", "c", "c", "c"],
    categories=["a", "b", "c", "d"],
)
df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 2, 3, 4, 5]})
df

Unnamed: 0,cats,values
0,a,1
1,b,2
2,b,2
3,b,2
4,c,3
5,c,4
6,c,5


In [49]:
# Mean of `values` per category. observed=False is passed explicitly so the
# unused category 'd' stays in the result (with a NaN mean) regardless of the
# pandas version's default for `observed`.
df.groupby("cats", observed=False).mean()

Unnamed: 0_level_0,values
cats,Unnamed: 1_level_1
a,1.0
b,2.0
c,4.0
d,


In [50]:
# Same data as before, but with string row labels and a category-dtype Series
# (categories inferred from the data) instead of an explicit Categorical.
idx = pd.Index(["h", "i", "j", "k", "l", "m", "n"])
cats = pd.Series(
    ["a", "b", "b", "b", "c", "c", "c"],
    dtype="category",
    index=idx,
)
values = [1, 2, 2, 2, 3, 4, 5]
df = pd.DataFrame({"cats": cats, "values": values}, index=idx)

In [51]:
# Display the frame; rows are labelled h through n.
df

Unnamed: 0,cats,values
h,a,1
i,b,2
j,b,2
k,b,2
l,c,3
m,c,4
n,c,5


In [54]:
# Scalar access by integer position: row 4, column 1.
df.iat[4,1]

3

In [62]:
# Positional slicing with iloc returns a DataFrame (here one row, one column).
df.iloc[4:5,0:1]

Unnamed: 0,cats
l,c


In [60]:
# Scalar access by label: row 'k', column 'cats'.
df.at['k','cats']

'b'

In [63]:
# Rename the categories a, b, c to x, y, z. Assigning to `.cat.categories`
# is deprecated and no longer supported in pandas 2.x, so use
# rename_categories and store the result back in the column.
df["cats"] = df["cats"].cat.rename_categories(["x", "y", "z"])

In [64]:
# Display the frame; the `cats` column now shows the labels x, y, z.
df

Unnamed: 0,cats,values
h,x,1
i,y,2
j,y,2
k,y,2
l,z,3
m,z,4
n,z,5


In [66]:
# Assign a Categorical into part of a non-categorical column: the plain values
# are written into the column. Column "a" is created with object dtype up
# front, because writing strings into an int64 column relies on a silent dtype
# upcast that recent pandas versions deprecate.
df = pd.DataFrame(
    {
        "a": pd.Series([1, 1, 1, 1, 1], dtype=object),
        "b": ["a", "a", "a", "a", "a"],
    }
)
df.loc[1:2, "a"] = pd.Categorical(["b", "b"], categories=["a", "b"])



In [74]:
# Inspect the rows that were just assigned; the output reports object dtype.
df.loc[1:2,'a']

1    b
2    b
Name: a, dtype: object

In [77]:
# union_categoricals combines categoricals whose category sets differ;
# the result's categories are the union of both inputs' categories.
a = pd.Categorical(["1", "b"])
b = pd.Categorical(["c", "d"])
union_categoricals([a, b])

[1, b, c, d]
Categories (4, object): [1, b, c, d]

In [80]:
# Display both inputs: each still has only its own two categories.
a,b

([1, b]
 Categories (2, object): [1, b], [c, d]
 Categories (2, object): [c, d])

In [81]:
# Category-dtype Series; the categories (a, c, d) are inferred from the data.
c = pd.Series(
    ["a", "c", "d"],
    dtype="category",
)

In [82]:
# Second category-dtype Series, with a different inferred category set.
d = pd.Series(
    ["e", "f", "d"],
    dtype="category",
)

In [85]:
# Concatenate two categorical Series whose category sets differ; the output
# shows the result as object dtype. (The stray trailing "." that made this
# cell a syntax error has been removed.)
pd.concat([c, d])

0    a
1    c
2    d
0    e
1    f
2    d
dtype: object

In [86]:
# Plain categorical with categories inferred from the data.
a = pd.Categorical(
    ["a", "b", "c"],
)

In [87]:
# Use the categorical `a` as the index of a new Series of strings.
d = pd.Series(["e", "f", "d"], index=a)

In [88]:
# Display the Series; its index labels come from the categorical `a`.
d

a    e
b    f
c    d
dtype: object

In [89]:
# A categorical passed as `index` yields a CategoricalIndex.
d.index

CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=False, dtype='category')

In [93]:
# `.values` of a category-dtype Series displays as a Categorical, not a plain array.
c.values

[a, c, d]
Categories (3, object): [a, c, d]

In [94]:
# Byte count that `nbytes` reports for this Series.
c.nbytes

27

In [95]:
# Display the categorical Series c.
c

0    a
1    c
2    d
dtype: category
Categories (3, object): [a, c, d]

In [103]:
# Removing a category that is in use: the affected value ('c') is shown as NaN.
c.cat.remove_categories('c')

0      a
1    NaN
2      d
dtype: category
Categories (2, object): [a, d]

In [102]:
# Display c.
# NOTE(review): this cell's execution count (102) is lower than the previous
# cell's (103), so this output was produced before remove_categories was run.
c

0    a
1    c
2    d
dtype: category
Categories (3, object): [a, c, d]