In [None]:
import pandas as pd
import numpy as np

# Sample ages that the binning examples in this notebook operate on.
ages = [
    20, 22, 25, 27, 21, 23,
    37, 31, 61, 45, 41, 32,
]

In [2]:
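# Split the ages into the groups 18-25, 26-35, 36-60 and 61-and-above using pandas' cut.
# Note: with bins = [18, 25, 35, 60, 100] and the default right-closed intervals,
# the bins actually produced are (18, 25], (25, 35], (35, 60] and (60, 100].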

In [34]:
# Bin edges shared by the cut examples.
bins = [18, 25, 35, 60, 100]

# Default labels=None: the result is a Categorical whose categories are intervals.
cats = pd.cut(ages, bins=bins)

# labels=False: return only the integer index of the bin each value falls in.
cats_f = pd.cut(ages, bins=bins, labels=False)

# Explicit labels (one per bin) are used in place of the interval names.
cats_list = pd.cut(ages, bins=bins, labels=np.array(["a", "b", "c", "d"]))

In [35]:
cats # The return value is a special Categorical object

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [31]:
cats_f # 0 means the value falls in the first bin

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int64)

In [36]:
cats_list # Each value is shown with the label specified for its bin

[a, a, a, b, a, ..., b, d, c, c, b]
Length: 12
Categories (4, object): [a < b < c < d]

In [5]:
cats.codes # 0 means the value falls in the first bin

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [6]:
# The categories are the bin intervals, held in an IntervalIndex
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]],
              closed='right',
              dtype='interval[int64]')

In [7]:
# Count how many values fell into each bin produced by pandas.cut.
# The top-level pd.value_counts is deprecated (pandas >= 2.1), so wrap the
# Categorical in a Series and use Series.value_counts instead.
pd.Series(cats).value_counts()

(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

In [8]:
# Passing right=False changes which side of each interval is closed
# (left-closed, right-open instead of the default).
pd.cut(
    ages,
    bins=[18, 26, 36, 61, 100],
    right=False,
)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [9]:
# Custom bin names: pass a list or array through the labels option.
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
pd.cut(ages, bins=bins, labels=group_names)

[Youth, Youth, Youth, YoungAdult, Youth, ..., YoungAdult, Senior, MiddleAged, MiddleAged, YoungAdult]
Length: 12
Categories (4, object): [Youth < YoungAdult < MiddleAged < Senior]

In [10]:
# 20 uniform samples in [0, 1). A seeded Generator is used so that
# Restart Kernel -> Run All yields the same data every time.
# NOTE: outputs recorded before seeding will differ until the notebook is re-run.
data = np.random.default_rng(seed=0).random(20)

In [33]:
# Passing an integer instead of explicit bin edges makes pandas compute that
# many equal-width bins from the minimum and maximum of the data.
# precision=2: bin labels are stored and displayed with a precision of 2.
data_cut = pd.cut(data, 4, precision=2)
# The top-level pd.value_counts is deprecated (pandas >= 2.1); use
# Series.value_counts to count the values per bin.
pd.Series(data_cut).value_counts()

(0.012, 0.26]    7
(0.75, 0.99]     6
(0.5, 0.75]      6
(0.26, 0.5]      1
dtype: int64

In [12]:
# 1000 samples from the standard normal distribution. A seeded Generator is
# used so that Restart Kernel -> Run All yields the same data every time.
# NOTE: outputs recorded before seeding will differ until the notebook is re-run.
data1 = np.random.default_rng(seed=1).standard_normal(1000)

In [13]:
# qcut chooses the bin edges from the sample quantiles rather than using
# equal widths, so each bin ends up holding the same number of values.
# Here the data is split into 4 such bins.
cats1 = pd.qcut(data1, q=4)

In [14]:
# Inspect the Categorical of intervals produced by qcut
cats1

[(-3.282, -0.726], (-3.282, -0.726], (-0.0252, 0.601], (-0.726, -0.0252], (-3.282, -0.726], ..., (-3.282, -0.726], (0.601, 3.36], (-3.282, -0.726], (0.601, 3.36], (-3.282, -0.726]]
Length: 1000
Categories (4, interval[float64]): [(-3.282, -0.726] < (-0.726, -0.0252] < (-0.0252, 0.601] < (0.601, 3.36]]

In [15]:
# Each of the four quantile bins holds 250 of the 1000 values.
# The top-level pd.value_counts is deprecated (pandas >= 2.1); use
# Series.value_counts instead.
pd.Series(cats1).value_counts()

(0.601, 3.36]        250
(-0.0252, 0.601]     250
(-0.726, -0.0252]    250
(-3.282, -0.726]     250
dtype: int64

In [16]:
# As with cut, qcut also accepts custom quantiles instead of a bin count.
list1 = [0, 0.1, 0.5, 0.9, 1.0]
cats2 = pd.qcut(data1, q=list1)
cats2

[(-3.282, -1.295], (-1.295, -0.0252], (-0.0252, 1.249], (-1.295, -0.0252], (-1.295, -0.0252], ..., (-3.282, -1.295], (1.249, 3.36], (-3.282, -1.295], (1.249, 3.36], (-1.295, -0.0252]]
Length: 1000
Categories (4, interval[float64]): [(-3.282, -1.295] < (-1.295, -0.0252] < (-0.0252, 1.249] < (1.249, 3.36]]

In [17]:
# The gaps between consecutive quantiles in list1 are in the ratio 1:4:4:1,
# so the 1000 data points split 100:400:400:100 across the bins.
# The top-level pd.value_counts is deprecated (pandas >= 2.1); use
# Series.value_counts instead.
pd.Series(cats2).value_counts()


(-0.0252, 1.249]     400
(-1.295, -0.0252]    400
(1.249, 3.36]        100
(-3.282, -1.295]     100
dtype: int64