# 第8章 分类数据

In [129]:
import numpy as np
import pandas as pd

In [2]:
# Load the sample table and preview its first rows.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# only runs where that exact directory exists; point it at your own copy of
# table.csv before re-running.
df = pd.read_csv(r'C:\Users\gan\Desktop\joyful-pandas\data\table.csv')
df.head()

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


## 一、category的创建及其性质
### 1.分类变量的创建
#### （a）用Series创建

In [4]:
# Passing dtype='category' to the Series constructor builds a categorical Series directly.
pd.Series(['a','b','c','a','d','b'],dtype = 'category')

0    a
1    b
2    c
3    a
4    d
5    b
dtype: category
Categories (4, object): [a, b, c, d]

In [130]:
# Build a small frame whose column A is categorical from the start while
# column B is a plain list of strings, then display it.
temp_df = pd.DataFrame(
    {
        'A': pd.Series(['a', 'b', 'c', 'a'], dtype='category'),
        'B': list('abcd'),
    }
)
temp_df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,d


In [13]:
# Column A is categorical, while B (built from a plain list) is object.
temp_df.dtypes

A    category
B      object
dtype: object

#### (c) 利用内置Categorical类型创建

In [83]:
# Create the Categorical explicitly, spelling out its categories, then wrap
# it in a Series for display.
cat = pd.Categorical(
    ['a', 'b', 'c', 'a'],
    categories=['a', 'b', 'c'],
)
pd.Series(cat)

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

#### （d）利用cut函数创建
##### 默认使用区间类型为标签

In [17]:
# Bin five random integers into the intervals (0, 10], (10, 30], (30, 60].
# Draw from 1..60 rather than 0..59: with the default right-closed bins the
# value 0 lies outside (0, 10] and would silently come back as NaN.
sample_scores = np.random.randint(1, 61, 5)
pd.cut(sample_scores, [0, 10, 30, 60])

[(10, 30], (30, 60], (30, 60], (10, 30], (0, 10]]
Categories (3, interval[int64]): [(0, 10] < (10, 30] < (30, 60]]

##### 可指定字符为标签

In [19]:
# Same binning with left-closed intervals and explicit string labels.
pd.cut(np.random.randint(0,60,5),[0,10,30,60],right = False,labels=['0-10','10-30','30-60'])
# The two cut() results carry different category dtypes: interval by default,
# object when string labels are supplied.

[30-60, 30-60, 10-30, 30-60, 30-60]
Categories (3, object): [0-10 < 10-30 < 30-60]

### 2.分类变量的结构
##### 一个分类变量包括三个部分，元素值（values）、分类类别（categories）、是否有序（order）
##### 从上面可以看出，使用cut函数创建的分类变量默认为有序分类变量
##### 下面介绍如何获取或修改这些属性
##### （a）describe方法
##### 该方法描述了一个分类序列的情况，包括非缺失值个数、元素值类别数（不是分类类别数）、最多次出现的元素及其频数

In [26]:
# Categorical Series with one missing value and one declared-but-unused
# category ('d'); describe() summarises it.
s = pd.Series(
    pd.Categorical(
        ['a', 'b', 'c', 'a', np.nan],
        categories=['a', 'b', 'c', 'd'],
    )
)
s.describe()

count     4
unique    3
top       a
freq      2
dtype: object

#### (b) categories和ordered属性
##### 查看分类类别和是否排序

In [21]:
# The declared categories, including the unused 'd'.
s.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [29]:
# Whether the categories carry an order; this Series was built without one.
s.cat.ordered

False

### 3. 类别的修改
#### （a）利用 set_categories修改
#####  修改分类，但本身值不会变化

In [32]:
# Rebuild the example Series (values a, b, c, a, NaN; categories a-d) so the
# set_categories demonstration starts from a known state.
s = pd.Series(
    pd.Categorical(
        ['a', 'b', 'c', 'a', np.nan],
        categories=['a', 'b', 'c', 'd'],
    )
)
s

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (4, object): [a, b, c, d]

In [33]:
# Replace the category list: values whose category is no longer listed
# ('a', 'b') become NaN, and 'new_a' is added as a category with no values.
s.cat.set_categories(['new_a','c'])

0    NaN
1    NaN
2      c
3    NaN
4    NaN
dtype: category
Categories (2, object): [new_a, c]

In [34]:
# `s` itself is unchanged: the set_categories call above returned a new Series.
s

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (4, object): [a, b, c, d]

#### (b)利用rename_categories修改
##### 需要注意的是该方法会把值和分类同时修改

In [35]:
# Rename every category by prefixing it with 'new_'; the values follow the
# renamed categories.
s = pd.Series(
    pd.Categorical(
        ['a', 'b', 'c', 'a', np.nan],
        categories=['a', 'b', 'c', 'd'],
    )
)
s.cat.rename_categories(['new_%s' % label for label in s.cat.categories])

0    new_a
1    new_b
2    new_c
3    new_a
4      NaN
dtype: category
Categories (4, object): [new_a, new_b, new_c, new_d]

##### 利用字典修改值 

In [37]:
# A dict renames only the listed categories; 'c' and 'd' keep their names.
s.cat.rename_categories({'a':'new_a','b':'new_b'})

0    new_a
1    new_b
2        c
3    new_a
4      NaN
dtype: category
Categories (4, object): [new_a, new_b, c, d]

#### (c) 利用add_categories添加

In [38]:
# Append a new, as yet unused, category 'e'; the values are untouched.
s = pd.Series(
    pd.Categorical(
        ["a", "b", "c", "a", np.nan],
        categories=['a', 'b', 'c', 'd'],
    )
)
s.cat.add_categories(['e'])

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (5, object): [a, b, c, d, e]

#### (d)利用remove_categories移除

In [40]:
# Drop the category 'd' by name; no value uses it, so the values stay as they are.
s = pd.Series(
    pd.Categorical(
        ["a", "b", "c", "a", np.nan],
        categories=['a', 'b', 'c', 'd'],
    )
)
s.cat.remove_categories(['d'])

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (3, object): [a, b, c]

#### （e）删除元素值未出现的分类类型

In [41]:
# Drop every category that no value uses (here only 'd'), without naming it.
s = pd.Series(
    pd.Categorical(
        ["a", "b", "c", "a", np.nan],
        categories=['a', 'b', 'c', 'd'],
    )
)
s.cat.remove_unused_categories()

0      a
1      b
2      c
3      a
4    NaN
dtype: category
Categories (3, object): [a, b, c]

## 二、分类变量的排序
##### 前面提到，分类数据类型被分为有序和无序，这非常好理解，例如分数区间的高低是有序变量（cut函数），考试科目的类别一般看做无序变量。

### 1.序的建立
#### （a）一般来说会将一个序列转为有序变量，可以利用as_ordered方法

In [42]:
# Convert an ordinary string Series to an (unordered) categorical one.
s = (
    pd.Series(['a', 'd', 'c', 'a'])
    .astype('category')
)
s

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a, c, d]

In [43]:
# Same values, but the categories now carry an order: a < c < d.
s.cat.as_ordered()

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a < c < d]

##### 退化为无序变量，只需要使用as_unordered( )

In [44]:
# as_unordered() returns an unordered categorical.
# NOTE(review): the as_ordered() result above was not assigned, so `s` is
# already unordered at this point.
s.cat.as_unordered()

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a, c, d]

#### (b) 利用set_categories方法中的ordered !!! 参数

In [50]:
# set_categories can establish the order in the same call; 'b' is not among
# the new categories, so that value becomes NaN.
s = (
    pd.Series(['a', 'b', 'c', 'a'])
    .astype('category')
    .cat.set_categories(['a', 'd', 'c'], ordered=True)
)
s

0      a
1    NaN
2      c
3      a
dtype: category
Categories (3, object): [a < d < c]

#### (c) 利用reorder_categories方法
##### 这个方法的特点在于，新设置的分类必须与原分类为同一集合

In [51]:
# Start again from an unordered categorical for the reorder_categories examples.
s = (
    pd.Series(['a', 'd', 'c', 'a'])
    .astype('category')
)
s

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a, c, d]

In [52]:
# The same set of categories in a new order, marked ordered: a < d < c.
s.cat.reorder_categories(['a','d','c'],ordered = True)

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a < d < c]

In [53]:
# reorder_categories needs exactly the existing category set; leaving out 'd'
# is rejected with a ValueError. The expected error is caught and printed so
# that "Restart & Run All" can continue past this demonstration.
try:
    s.cat.reorder_categories(['a','c'],ordered = True)
except ValueError as err:
    print('ValueError:', err)

ValueError: items in new_categories are not the same as in old categories

In [54]:
# Swapping 'd' for a new category 'e' is rejected as well, because the new
# list is not the same set as the old categories. The expected ValueError is
# caught and printed so that "Restart & Run All" can continue.
try:
    s.cat.reorder_categories(['a','c','e'],ordered = True)
except ValueError as err:
    print('ValueError:', err)

ValueError: items in new_categories are not the same as in old categories

### 2.排序
#####  先前在第1章介绍的值排序和索引排序都是适用的

In [55]:
# Draw 50 random quality labels and store them as a categorical Series.
# No random seed is set, so the drawn values differ from run to run.
s = (
    pd.Series(
        np.random.choice(['perfect', 'good', 'fair', 'bad', 'awful'], 50)
    )
    .astype('category')
)
s

0        good
1         bad
2        fair
3        good
4        fair
5         bad
6        good
7         bad
8        good
9         bad
10        bad
11      awful
12       fair
13        bad
14       good
15      awful
16        bad
17      awful
18    perfect
19      awful
20        bad
21      awful
22       good
23       good
24    perfect
25    perfect
26       good
27        bad
28      awful
29       fair
30       good
31      awful
32        bad
33       good
34    perfect
35        bad
36       good
37       fair
38       good
39       good
40       fair
41      awful
42       good
43        bad
44      awful
45    perfect
46    perfect
47      awful
48      awful
49      awful
dtype: category
Categories (5, object): [awful, bad, fair, good, perfect]

In [57]:
# Declare the quality levels as ordered (awful < bad < fair < good < perfect)
# and preview the result.
# NOTE(review): the returned Series is not assigned back to `s`, so `s`
# itself stays unordered in the cells that follow.
s.cat.set_categories(['perfect','good','fair','bad','awful'][::-1],ordered = True).head()

0    good
1     bad
2    fair
3    good
4    fair
dtype: category
Categories (5, object): [awful < bad < fair < good < perfect]

In [58]:
# Sort the values in descending order ('perfect' first, 'awful' last).
# NOTE(review): `s` is still the unordered categorical here; the result
# matches the intended ranking only because the default category order
# [awful, bad, fair, good, perfect] happens to coincide with it.
s.sort_values(ascending=False)

24    perfect
34    perfect
46    perfect
45    perfect
18    perfect
25    perfect
3        good
6        good
8        good
14       good
22       good
23       good
26       good
30       good
33       good
0        good
36       good
38       good
42       good
39       good
12       fair
29       fair
2        fair
4        fair
37       fair
40       fair
35        bad
9         bad
10        bad
5         bad
43        bad
7         bad
1         bad
13        bad
16        bad
32        bad
20        bad
27        bad
15      awful
17      awful
11      awful
41      awful
19      awful
21      awful
44      awful
48      awful
47      awful
28      awful
31      awful
49      awful
dtype: category
Categories (5, object): [awful, bad, fair, good, perfect]

In [59]:
# Pair each categorical label with a random number and use the labels as the
# index, so that index sorting can be shown on a categorical index.
df_sort = pd.DataFrame(
    {
        'cat': s.values,
        'values': np.random.rand(50),
    }
).set_index('cat')
df_sort

Unnamed: 0_level_0,values
cat,Unnamed: 1_level_1
good,0.654763
bad,0.814404
fair,0.062513
good,0.958212
fair,0.778294
bad,0.770169
good,0.991618
bad,0.405158
good,0.165524
bad,0.839724


In [60]:
# Sort the rows by the categorical index 'cat'.
df_sort.sort_index()

Unnamed: 0_level_0,values
cat,Unnamed: 1_level_1
awful,0.476171
awful,0.182044
awful,0.008224
awful,0.822501
awful,0.360678
awful,0.069365
awful,0.824832
awful,0.186094
awful,0.637393
awful,0.613423


## 三、分类变量的比较操作
### 1.与标量或等长序列的比较
#### （a）标量比较

In [61]:
# Unordered categorical Series used for the scalar / sequence comparisons.
s = (
    pd.Series(['a', 'd', 'c', 'a'])
    .astype('category')
)
s

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a, c, d]

In [62]:
# Element-wise comparison with a scalar gives a boolean Series.
s == 'a'

0     True
1    False
2    False
3     True
dtype: bool

#### (b) 等长序列比较

In [64]:
# Element-wise comparison against a plain list of the same length.
s == list('abcd')

0     True
1    False
2     True
3    False
dtype: bool

### 2.与另一分类变量的比较
#### （a） 等式判别（包含等号和不等号）
#####  两个分类变量的等式判别需要满足分类完全相同 ( 分类相同即可，值可以不相同 )

In [74]:
# A categorical compared with itself: categories match, so == is allowed and
# every position is True.
s = (
    pd.Series(["a", "d", "c", "a"])
    .astype('category')
)
s == s

0    True
1    True
2    True
3    True
dtype: bool

In [67]:
# The inequality test is allowed under the same condition; here it is False everywhere.
s != s

0    False
1    False
2    False
3    False
dtype: bool

In [69]:
# Different values but the same categories as `s` ([a, c, d]), so the
# equality comparison is permitted and differs only in the last position.
new_s = (
    pd.Series(["a", "d", "c", "c"])
    .astype('category')
)
s == new_s

0     True
1     True
2     True
3    False
dtype: bool

In [75]:
# s_new gets the category list ['a', 'd', 'e'], which differs from that of
# `s`, so even an equality comparison is refused with a TypeError. The
# expected error is caught and printed so that "Restart & Run All" can
# continue past this demonstration.
s_new = s.cat.set_categories(['a','d','e'])
try:
    s == s_new
except TypeError as err:
    print('TypeError:', err)

TypeError: Categoricals can only be compared if 'categories' are the same.

#### (b)不等式判别（包含>= , <=,<,>）
##### 两个分类变量的不等式判别需要满足两个条件：①分类完全相同；②排序完全相同

In [76]:
# `s` is an unordered categorical, so ordering comparisons such as >= are
# refused with a TypeError. The expected error is caught and printed so that
# "Restart & Run All" can continue past this demonstration.
s = pd.Series(['a','d','c','a']).astype('category')
try:
    s >= s
except TypeError as err:
    print('TypeError:', err)

TypeError: Unordered Categoricals can only compare equality or not

In [77]:
# reorder_categories without ordered=True: the category list is set
# explicitly, but the result remains unordered.
s = (
    pd.Series(['a', 'd', 'c', 'a'])
    .astype('category')
    .cat.reorder_categories(['a', 'c', 'd'])
)
s

0    a
1    d
2    c
3    a
dtype: category
Categories (3, object): [a, c, d]

In [81]:
# With ordered=True the categorical carries an order, so >= is now allowed.
s = (
    pd.Series(['a', 'd', 'c', 'a'])
    .astype('category')
    .cat.reorder_categories(['a', 'c', 'd'], ordered=True)
)
s >= s

0    True
1    True
2    True
3    True
dtype: bool

## 四、问题与练习

#### 【问题一】 如何使用union_categoricals方法？它的作用是什么？

In [91]:
from pandas.api.types import union_categoricals

In [84]:
# First Categorical for the union example; its categories are taken from the values.
s1 = pd.Categorical(
    ['a', 'b', 'c']
)
s1

[a, b, c]
Categories (3, object): [a, b, c]

In [89]:
# The dtype records both the category list and the ordered flag.
s1.dtype

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)

In [97]:
# Second Categorical; it shares only 'a' with s1.
s2 = pd.Categorical(
    ['a', 'd', 'e']
)
s2

[a, d, e]
Categories (3, object): [a, d, e]

In [98]:
# Same kind of dtype as s1, but with a different category list.
s2.dtype

CategoricalDtype(categories=['a', 'd', 'e'], ordered=False)

#### 当各序列的分类类别不完全相同时，分类数据经过 pd.concat 合并后类型会转为 object 类型；如果想要保持分类类型的话，可以借助 union_categoricals 来完成，所有类别必须具有相同的dtype！！！

In [100]:
# Concatenate the two Categoricals; the result is still categorical and its
# categories are the union of both inputs.
union_categoricals([s1,s2])

[a, b, c, a, d, e]
Categories (5, object): [a, b, c, d, e]

#### 【问题二】 利用concat方法将两个序列纵向拼接，它的结果一定是分类变量吗？什么情况下不是？

In [103]:
# Two categorical Series with identical categories: concat keeps the
# category dtype.
a = pd.Series(['a', 'b', 'c', 'd'], dtype='category')
b = pd.Series(['a', 'b', 'c', 'd'], dtype='category')
pd.concat(
    [a, b]
)

0    a
1    b
2    c
3    d
0    a
1    b
2    c
3    d
dtype: category
Categories (4, object): [a, b, c, d]

In [106]:
# Here the two Series have different categories ('d' vs 'e'), so the
# concatenated result is no longer categorical.
a = pd.Series(['a', 'b', 'c', 'd'], dtype='category')
b = pd.Series(['a', 'b', 'c', 'e'], dtype='category')
pd.concat(
    [a, b]
)

0    a
1    b
2    c
3    d
0    a
1    b
2    c
3    e
dtype: object

#### 【问题三】 当使用groupby方法或者value_counts方法时，分类变量的统计结果和普通变量有什么区别？

In [107]:
# value_counts() on a categorical Series; the NaN entry is not counted in the output.
pd.Series(['a','b','d','a',np.nan],dtype = 'category').value_counts()

a    2
d    1
b    1
dtype: int64

In [109]:
# With explicitly declared categories, unused ones ('c', 'e') are reported
# with a count of 0.
pd.Categorical(['a','b','d','a'],categories = ['a','b','c','d','e']).value_counts()

a    2
b    1
c    0
d    1
e    0
dtype: int64

In [132]:
# Group by the categorical column A and count the rows in each group.
# observed=False is stated explicitly: for categorical groupers the implicit
# default is deprecated in recent pandas, and spelling it out keeps the
# "one row per declared category" behaviour and avoids the FutureWarning.
# NOTE(review): this reuses the name `df`, replacing the table loaded at the
# top of the notebook.
df = pd.DataFrame(
    {
        'A': pd.Series(['a', 'b', 'c', 'a'], dtype='category'),
        'B': list('abcd'),
    }
)
df.groupby('A', observed=False).size()

A
a    2
b    1
c    1
dtype: int64