### Discretization
- Categorizing continuous data into discrete bins

In [14]:
import numpy as np
import pandas as pd

# Reproducible synthetic ages: normal distribution with mean 35 and std dev 20.
# A seeded generator makes every downstream bin and count identical on re-run.
rng = np.random.default_rng(42)
age_data = rng.normal(loc=35, scale=20, size=200)

# Keep only ages inside the overall bin range (18, 100]; pd.cut labels any
# value outside the outermost edges as NaN, so filter both ends up front.
age_data = age_data[(age_data > 18) & (age_data <= 100)]

# Bin edges -> four intervals: (18, 25], (25, 35], (35, 60], (60, 100]
bins = [18, 25, 35, 60, 100]
cats = pd.cut(age_data, bins)

cats


[(35, 60], (60, 100], (35, 60], (35, 60], (35, 60], ..., (60, 100], (18, 25], (60, 100], (35, 60], (35, 60]]
Length: 161
Categories (4, interval[int64]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [19]:
# right=False flips which edge is inclusive: each interval becomes closed on
# the LEFT and open on the right, e.g. [18, 25) instead of (18, 25].
# The result gets its own name so the right-closed `cats` from the previous
# cell is not overwritten and later cells do not depend on execution order.
cats_left_closed = pd.cut(age_data, bins, right=False)
cats_left_closed

[[35, 60), [60, 100), [35, 60), [35, 60), [35, 60), ..., [60, 100), [18, 25), [60, 100), [35, 60), [35, 60)]
Length: 161
Categories (4, interval[int64]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

In [15]:
# Integer code of the bin each age landed in; code i refers to cats.categories[i].
cats.codes

array([2, 3, 2, 2, 2, 0, 2, 3, 2, 3, 0, 1, 0, 2, 3, 2, 2, 2, 1, 1, 2, 2,
       2, 3, 2, 0, 1, 1, 1, 0, 2, 1, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 1, 2,
       2, 2, 2, 3, 3, 2, 1, 2, 2, 0, 2, 2, 1, 2, 1, 3, 2, 1, 2, 3, 1, 1,
       2, 2, 0, 1, 1, 1, 2, 2, 1, 1, 3, 3, 1, 3, 2, 3, 2, 0, 2, 0, 2, 2,
       2, 1, 3, 1, 2, 1, 0, 3, 0, 2, 3, 2, 2, 3, 3, 2, 2, 1, 2, 2, 2, 2,
       0, 2, 0, 1, 2, 3, 0, 0, 2, 1, 1, 3, 0, 0, 1, 1, 0, 2, 1, 0, 2, 1,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 1, 0, 0, 1, 2, 2, 3, 1, 2, 2,
       2, 3, 3, 0, 3, 2, 2], dtype=int8)

In [16]:
# The bin intervals themselves, in order, as an IntervalIndex.
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]]
              closed='right',
              dtype='interval[int64]')

### Named Categories

In [22]:
# Swap the interval notation for readable names: one label per bin, listed in
# the same order as the bin edges.
pd.cut(
    age_data,
    bins=bins,
    labels=['young', 'young-adult', 'middle-aged', 'old'],
)

[middle-aged, old, middle-aged, middle-aged, middle-aged, ..., old, young, old, middle-aged, middle-aged]
Length: 161
Categories (4, object): [young < young-adult < middle-aged < old]

### Cut in Equal Intervals

In [24]:
# An integer `bins` asks pandas to choose the edges itself: the observed range
# of the data is split into that many intervals of equal width.
cats = pd.cut(age_data, bins=4)
cats

[(51.021, 67.414], (67.414, 83.806], (34.628, 51.021], (34.628, 51.021], (34.628, 51.021], ..., (51.021, 67.414], (18.17, 34.628], (67.414, 83.806], (34.628, 51.021], (34.628, 51.021]]
Length: 161
Categories (4, interval[float64]): [(18.17, 34.628] < (34.628, 51.021] < (51.021, 67.414] < (67.414, 83.806]]

In [25]:
# Number of ages per bin, most populated bin first.
# The top-level pd.value_counts() is deprecated (pandas >= 2.1); wrapping the
# categorical in a Series and calling .value_counts() is the supported form.
pd.Series(cats).value_counts()

(34.628, 51.021]    62
(18.17, 34.628]     56
(51.021, 67.414]    34
(67.414, 83.806]     9
dtype: int64

### Cut in Quantiles (qcut)

In [27]:
# qcut places the edges at sample quantiles rather than at equal distances;
# q=4 requests quartiles, so each bin receives roughly the same number of ages.
cats = pd.qcut(age_data, q=4)
cats

[(41.694, 52.103], (52.103, 83.806], (41.694, 52.103], (41.694, 52.103], (30.523, 41.694], ..., (52.103, 83.806], (18.233999999999998, 30.523], (52.103, 83.806], (41.694, 52.103], (41.694, 52.103]]
Length: 161
Categories (4, interval[float64]): [(18.233999999999998, 30.523] < (30.523, 41.694] < (41.694, 52.103] < (52.103, 83.806]]

In [28]:
# Number of ages per quartile bin, most populated bin first.
# The top-level pd.value_counts() is deprecated (pandas >= 2.1); wrapping the
# categorical in a Series and calling .value_counts() is the supported form.
pd.Series(cats).value_counts()

(18.233999999999998, 30.523]    41
(52.103, 83.806]                40
(41.694, 52.103]                40
(30.523, 41.694]                40
dtype: int64

#### Custom Quantiles

In [29]:
# Explicit quantile edges instead of a bin count: the lowest 20% of ages,
# the middle 60%, and the highest 20%.
cats = pd.qcut(age_data, q=[0, 0.2, 0.8, 1])
cats

[(27.774, 55.325], (55.325, 83.806], (27.774, 55.325], (27.774, 55.325], (27.774, 55.325], ..., (55.325, 83.806], (18.233999999999998, 27.774], (55.325, 83.806], (27.774, 55.325], (27.774, 55.325]]
Length: 161
Categories (3, interval[float64]): [(18.233999999999998, 27.774] < (27.774, 55.325] < (55.325, 83.806]]

In [30]:
# Number of ages per custom-quantile bin, most populated bin first.
# The top-level pd.value_counts() is deprecated (pandas >= 2.1); wrapping the
# categorical in a Series and calling .value_counts() is the supported form.
pd.Series(cats).value_counts()

(27.774, 55.325]                96
(18.233999999999998, 27.774]    33
(55.325, 83.806]                32
dtype: int64