This notebook briefly introduces `Series.quantile` and two related tools, `pd.qcut` and `np.percentile`, with a short demonstration of how each one works.

In [15]:
# imports
import pandas as pd
import numpy as np
import random

# Seed every random source up front so the frame -- and every output derived
# from it further down -- is identical on each Restart Kernel -> Run All.
SEED = 42
N_ROWS = 1000
random.seed(SEED)
rng = np.random.default_rng(SEED)

# Demo data: one categorical column drawn uniformly from A/B/C and one integer
# column drawn uniformly from 0..249 (the upper bound 250 is exclusive).
# Both columns are produced with a single vectorized draw instead of a
# 1000-iteration Python loop.
vals = pd.DataFrame({
    'cat': random.choices(['A', 'B', 'C'], k=N_ROWS),
    'val': rng.integers(0, 250, size=N_ROWS),
})
vals

Unnamed: 0,cat,val
0,A,87
1,A,21
2,C,7
3,C,112
4,A,199
...,...,...
995,A,209
996,A,118
997,C,164
998,B,80


In [17]:
# Series.quantile(q) looks up the value sitting at quantile q of the column
# (q is a fraction between 0 and 1, so q=0.2 is the 20th percentile);
# the output below is that 20th-percentile value of `val`
vals['val'].quantile(q=0.2)

48.0

In [22]:
# given an integer, qcut cuts the column into that many quantile-based bins;
# here `val` is split into three bins, with the bin edges chosen by qcut itself
pd.qcut(vals['val'], q=3)

0       (80.0, 166.0]
1      (-0.001, 80.0]
2      (-0.001, 80.0]
3       (80.0, 166.0]
4      (166.0, 249.0]
            ...      
995    (166.0, 249.0]
996     (80.0, 166.0]
997     (80.0, 166.0]
998    (-0.001, 80.0]
999    (166.0, 249.0]
Name: val, Length: 1000, dtype: category
Categories (3, interval[float64]): [(-0.001, 80.0] < (80.0, 166.0] < (166.0, 249.0]]

In [25]:
# to choose the cutoff points yourself, pass qcut a list of quantiles
# (fractions between 0 and 1) instead of a bin count
# note: qcut does not take raw-value cutoffs -- use pd.cut for those
# here the cutoffs are at the 25th and 75th percentiles
pd.qcut(vals['val'], q=[0, 0.25, 0.75, 1])

0       (60.75, 188.0]
1      (-0.001, 60.75]
2      (-0.001, 60.75]
3       (60.75, 188.0]
4       (188.0, 249.0]
            ...       
995     (188.0, 249.0]
996     (60.75, 188.0]
997     (60.75, 188.0]
998     (60.75, 188.0]
999     (188.0, 249.0]
Name: val, Length: 1000, dtype: category
Categories (3, interval[float64]): [(-0.001, 60.75] < (60.75, 188.0] < (188.0, 249.0]]

In [26]:
# the bins can be given names: pass one label per bin, lowest bin first
bin_labels = ['low', 'medium', 'high']
pd.qcut(vals['val'], q=[0, 0.25, 0.75, 1], labels=bin_labels)

0      medium
1         low
2         low
3      medium
4        high
        ...  
995      high
996    medium
997    medium
998    medium
999      high
Name: val, Length: 1000, dtype: category
Categories (3, object): ['low' < 'medium' < 'high']

In [27]:
# np.percentile is NumPy's counterpart of Series.quantile, not its inverse:
# it also maps a percentile to a value, but takes the percentile on a
# 0-100 scale (96) rather than as a 0-1 fraction (0.96)
# the output below is the value at the 96th percentile of the val column
# (to go the other way -- from a value x to its percentile -- use something
# like (vals['val'] <= x).mean() * 100)
np.percentile(vals['val'], 96)

238.0