In [7]:
# import interactive matplotlib for jupyter lab
# modify if necessary
%matplotlib ipympl
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import binny

In [10]:
# Development convenience: re-import binny so that edits to its source
# are picked up without restarting the kernel.
from importlib import reload
# submodules first, the package itself last
reload(binny.binny)
reload(binny.general_helpers)
# being the cell's last expression, this call echoes the reloaded package
reload(binny)

<module 'binny' from '/Users/felix/Documents/projects/devel_python/binny/binny/__init__.py'>

In [309]:
# Synthetic example: a linear relationship hidden under heavy noise.
N = 1000
np.random.seed(123)  # fixed seed, so the numbers below are reproducible

# predictor; drawn before the noise so the random stream is consumed
# in a fixed order
x = np.random.normal(size=N, scale=5)
# noise-free response: straight line with slope 5 and intercept 1
y_tru = 5 * x + 1
# observed response: the same line plus gaussian noise (scale 100)
y_obs = y_tru + np.random.normal(size=N, scale=100)

# both responses side by side, indexed by the predictor
df = pd.DataFrame(dict(y_tru=y_tru, y_obs=y_obs), index=x)

In [310]:
# peek at the first five rows
df.head(n=5)

Unnamed: 0,y_tru,y_obs
-5.428153,-26.140765,-101.023512
4.986727,25.933636,82.693109
1.414892,8.074462,79.889517
-7.531474,-36.657368,-136.595443
-2.893001,-13.465006,34.024826


In [311]:
# Raw scatter of both columns: this looks very noisy indeed.
plt.close(1)  # close figure 1 first, then create it afresh
fig, ax = plt.subplots(num=1)
df.plot.line(ax=ax, style='.', y=['y_tru', 'y_obs'])

FigureCanvasNbAgg()

<matplotlib.axes._subplots.AxesSubplot at 0x145365898>

In [312]:
# binny bins a whole data frame with very little code.
# Here: 10 bins, and for every column of df both the mean and the
# standard error per bin (the default would be just the mean).
binned = binny.bin_df(df, bins=10, bin_col_func=['mean', 'se'])

In [313]:
# voilà: show the binned frame ('mean' and 'se' per column for each bin)
binned

Unnamed: 0_level_0,y_tru,y_tru,y_obs,y_obs
Unnamed: 0_level_1,mean,se,mean,se
x,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
-14.779285,-72.896423,2.231899,-11.023319,38.30645
-10.40197,-51.009852,0.930757,-56.300929,14.676493
-6.349354,-30.74677,0.483002,-36.272024,7.453882
-1.99938,-8.996899,0.344399,-0.513227,5.315838
1.985338,10.926692,0.376014,9.73613,5.483252
6.251109,32.255546,0.509429,28.471377,7.805379
10.260005,52.300025,1.076102,47.935794,23.046125
15.49468,78.473398,4.954823,46.768803,49.561006


In [314]:
# End result: the binned means reconstruct the underlying regularity.
# Binning the "hidden" variable y_tru as well shows that the result
# is not biased.
plt.close(2)
fig, ax = plt.subplots(num=2)
# noise-free relationship as a dashed reference
df.plot.line(ax=ax, y='y_tru', label='ground truth', style='--')
# bin means, with error bars of two standard errors
binned.plot.line(
    ax=ax,
    style='.',
    y=[('y_tru', 'mean'), ('y_obs', 'mean')],
    yerr=[binned['y_tru']['se'] * 2, binned['y_obs']['se'] * 2],
)

FigureCanvasNbAgg()

<matplotlib.axes._subplots.AxesSubplot at 0x145394b70>

In [315]:
# Sometimes the variable we bin over (the index) is not evenly
# distributed. Same responses as before, but indexed by exp(x).
df2 = pd.DataFrame(
    dict(y_tru=y_tru, y_obs=y_obs),
    index=np.exp(x),
)

In [316]:
# The same noisy data against the exponentiated index, drawn on a
# logarithmic x axis.
plt.close(3)
fig, ax = plt.subplots(num=3)
df2.plot.line(ax=ax, logx=True, style='.', y=['y_tru', 'y_obs'])

FigureCanvasNbAgg()

<matplotlib.axes._subplots.AxesSubplot at 0x1454200f0>

In [317]:
# Besides linear bins, binny offers logarithmic ('log') and
# quantile-based ('q') bin sizes; quantile bins contain an equal
# number of samples each. Here: 15 logarithmic bins over df2.
binned_2 = binny.bin_df(
    df2,
    # one of 'lin', 'log', 'q'
    space='log',
    # number of bins
    bins=15,
    # per-bin statistics for every column (default is just the mean);
    # besides the standard error we also keep the sample count
    bin_col_func=['mean', 'se', 'count'],
    # it can make sense to drop extreme outliers
    drop_edges=True,
)

In [318]:
# Log-binned means of df2 on top of the noise-free relationship.
plt.close(4)
fig, ax = plt.subplots(num=4)
# noise-free relationship as a solid reference line
df2.plot.line(ax=ax, y='y_tru', label='ground truth', style='-')
# bin means with error bars of two standard errors, on a log x axis
binned_2.plot.line(
    ax=ax,
    logx=True,
    style='.',
    y=[('y_tru', 'mean'), ('y_obs', 'mean')],
    yerr=[binned_2['y_tru']['se'] * 2, binned_2['y_obs']['se'] * 2],
)

FigureCanvasNbAgg()

<matplotlib.axes._subplots.AxesSubplot at 0x16b075898>

In [319]:
# Same data and statistics as the logarithmic binning of df2, but with
# quantile-based ('q') bins, which contain an equal number of samples
# each.
binned_2q = binny.bin_df(
    df2,
    # one of 'lin', 'log', 'q'
    space='q',
    # number of bins
    bins=15,
    # per-bin statistics for every column (default is just the mean);
    # besides the standard error we also keep the sample count
    bin_col_func=['mean', 'se', 'count'],
    # it can make sense to drop extreme outliers
    drop_edges=True,
)

In [320]:
# With quantile bins the error is no longer influenced by differing
# sample counts in the bins.
plt.close(5)
fig, ax = plt.subplots(num=5)
# noise-free relationship as a solid reference line
df2.plot.line(ax=ax, y='y_tru', label='ground truth', style='-')
# bin means with error bars of two standard errors, on a log x axis
binned_2q.plot.line(
    ax=ax,
    logx=True,
    style='.',
    y=[('y_tru', 'mean'), ('y_obs', 'mean')],
    yerr=[binned_2q['y_tru']['se'] * 2, binned_2q['y_obs']['se'] * 2],
)

FigureCanvasNbAgg()

<matplotlib.axes._subplots.AxesSubplot at 0x178b12eb8>