# 7.3 数据转换

In [1]:
import pandas as pd
import numpy as np

# 7.3.5 离散化和装箱

In [2]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

### 1 按指定范围切割

In [3]:
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [4]:
cats.codes # 之前版本是 cats.labels

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [5]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

In [6]:
cats.value_counts

<bound method Categorical.value_counts of [(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]>

In [7]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

#### 默认的区间是前开后闭, 可是修改 right=False

In [8]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

#### 也可以用一个list或数组给labels选项来设定bin的名字：

In [9]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
res = pd.cut(ages, bins, labels=group_names)
res

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [MiddleAged < Senior < YoungAdult < Youth]

In [10]:
res.categories

Index(['MiddleAged', 'Senior', 'YoungAdult', 'Youth'], dtype='object')

### 2 等宽切割

In [11]:
data = np.random.rand(20)
print(data)
pd.cut(data, 4, precision=3) # 切割为四份,区间划分精确到三位小数

[0.67651875 0.3197963  0.47697867 0.72462546 0.29119138 0.79807027
 0.79357362 0.32479623 0.2927407  0.63190487 0.23092703 0.85465141
 0.5074627  0.36727528 0.27575041 0.33252942 0.14043439 0.07875755
 0.44021159 0.55902441]


[(0.661, 0.855], (0.273, 0.467], (0.467, 0.661], (0.661, 0.855], (0.273, 0.467], ..., (0.273, 0.467], (0.078, 0.273], (0.078, 0.273], (0.273, 0.467], (0.467, 0.661]]
Length: 20
Categories (4, interval[float64]): [(0.078, 0.273] < (0.273, 0.467] < (0.467, 0.661] < (0.661, 0.855]]

### 3 等频切割

In [12]:
data = np.random.randn(1000)
cats = pd.qcut(data, 4)
cats

[(-0.663, -0.0203], (0.682, 3.032], (-0.663, -0.0203], (-0.663, -0.0203], (-3.311, -0.663], ..., (-0.663, -0.0203], (-3.311, -0.663], (-0.663, -0.0203], (-0.663, -0.0203], (-0.0203, 0.682]]
Length: 1000
Categories (4, interval[float64]): [(-3.311, -0.663] < (-0.663, -0.0203] < (-0.0203, 0.682] < (0.682, 3.032]]

In [13]:
pd.value_counts(cats)

(0.682, 3.032]       250
(-0.0203, 0.682]     250
(-0.663, -0.0203]    250
(-3.311, -0.663]     250
dtype: int64

#### 也可以指定百分比：

In [14]:
cats2 = pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.]) # 累计的百分比
cats2

[(-1.333, -0.0203], (-0.0203, 1.251], (-1.333, -0.0203], (-1.333, -0.0203], (-3.311, -1.333], ..., (-1.333, -0.0203], (-1.333, -0.0203], (-1.333, -0.0203], (-1.333, -0.0203], (-0.0203, 1.251]]
Length: 1000
Categories (4, interval[float64]): [(-3.311, -1.333] < (-1.333, -0.0203] < (-0.0203, 1.251] < (1.251, 3.032]]

In [15]:
print(type(pd.value_counts(cats2)))
pd.value_counts(cats2)

<class 'pandas.core.series.Series'>


(-0.0203, 1.251]     400
(-1.333, -0.0203]    400
(1.251, 3.032]       100
(-3.311, -1.333]     100
dtype: int64