In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string* holding a Python expression
    (for example ``'df'`` or ``"df.groupby('key').sum()"``).  The expression
    text is used as the caption and its evaluated result is rendered below
    the caption.

    In a rich front end ``_repr_html_`` is used, so every evaluated object
    must provide its own ``_repr_html_`` method; ``__repr__`` is the plain
    text fallback.
    """
    # One floating box per expression: {0} is the caption (the expression
    # text) and {1} the object's HTML.  Both the <p> and the <div> are closed
    # explicitly so the float/padding styling cannot leak into later output.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        """Store the expression strings to be shown."""
        self.args = args

    def _repr_html_(self):
        """Return the HTML boxes for all expressions, joined by newlines."""
        # NOTE(review): eval() runs arbitrary code and resolves names in this
        # module's globals -- only pass trusted, hand-written expressions.
        return '\n'.join(self.template.format(a, eval(a)._repr_html_())
                         for a in self.args)

    def __repr__(self):
        """Return ``expression`` / ``repr(value)`` pairs as plain text."""
        return '\n\n'.join(a + '\n' + repr(eval(a))
                           for a in self.args)

In [None]:
# load the planets dataset (`sns` is imported with the other dependencies in
# the notebook's import cell) and report its (rows, columns) shape
planets = sns.load_dataset('planets')
planets.shape

In [None]:
# preview the first rows of the planets table
planets.head()

In [None]:
# drop rows that contain missing values, then compute summary statistics
summary = planets.dropna().describe()
summary

In [None]:
# Transposed summary: one row per column of `planets`, one column per
# statistic.  It is rebuilt from `planets` instead of `summary = summary.T`
# so the cell is idempotent -- re-running it can never flip the table back
# to its original orientation.
summary = planets.dropna().describe().T
summary

In [None]:
# calculate the inter-quartile range (IQR) of every summarised column.
# Series.to_frame names the resulting column directly; passing a Series to
# pd.DataFrame(..., columns=[...]) only works while the Series is unnamed.
iqr = (summary['75%'] - summary['25%']).to_frame(name='iqr')
iqr

GroupBy

In [None]:
# small example frame: keys 'A', 'B', 'C' each appear twice, 'data' is 0..5
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])
df

In [None]:
# grouping alone displays the GroupBy object itself, not a result table
df.groupby('key')

In [None]:
# applying an aggregation (here: sum) yields one row per distinct key
df.groupby('key').sum()

The GroupBy Object

In [None]:
# reminder of the columns available in the planets table
planets.head()

In [None]:
# column indexing: select 'orbital_period' from the grouped frame and take
# its median within each 'method' group
planets.groupby('method')['orbital_period'].median()

In [None]:
# iteration over groups: each step yields the group label and the sub-frame
# holding that group's rows; print the label (padded to 30 chars) and shape
for (method, group) in planets.groupby('method'):
    print("{0:30s} shape={1}".format(method, group.shape))

In [None]:
# describe() on a grouped column already returns one row per method and one
# column per statistic (pandas >= 0.20); a trailing .unstack() would flatten
# that table into a single long Series, so it is not applied here.
planets.groupby('method')['year'].describe()

Aggregate, Filter, Transform and Apply

In [None]:
# seeded generator so the random 'data2' column is the same on every run
rng = np.random.RandomState(0)
# rebinds `df`: same keys as before, now with two data columns
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)}, columns=['key', 'data1', 'data2'])
df

In [None]:
# aggregate: compute several statistics per key in one call.  The functions
# are given by name -- passing np.median or the builtin max raises a
# FutureWarning on pandas >= 2.1, while the strings select the same grouped
# implementations and produce the same column labels.
df.groupby('key').aggregate(['min', 'median', 'max'])

In [None]:
# a dict picks a different aggregation per column: min of data1, max of data2
df.groupby('key').aggregate({'data1': 'min',
                             'data2': 'max'})

In [None]:
# filtering
def filter_std(x):
    """Tell whether the standard deviation of ``x['data2']`` is above 4.

    ``x`` is expected to be one group's frame containing a 'data2' column.
    """
    data2_spread = x['data2'].std()
    return data2_spread > 4

In [None]:
# show the frame, its per-key standard deviations, and the rows of the groups
# for which filter_std returned True
display('df', "df.groupby('key').std()", "df.groupby('key').filter(filter_std)")

In [None]:
# transformation (output should be the same shape as the input)
def trans_center(x):
    """Centre ``x`` by subtracting its own mean from every value."""
    centre = x.mean()
    return x - centre

In [None]:
# subtract each key's group mean from the values belonging to that key
df.groupby('key').transform(trans_center)

In [None]:
# apply
def norm_by_data(x):
    """Return a copy of ``x`` with 'data1' divided by the sum of 'data2'.

    Parameters
    ----------
    x : DataFrame
        Frame (typically one group) with 'data1' and 'data2' columns.

    Returns
    -------
    DataFrame
        Same columns, in the same order, as ``x``; only 'data1' differs.

    The input is left untouched: ``assign`` builds a new frame instead of
    writing into the group that ``groupby(...).apply`` hands over.
    """
    return x.assign(data1=x['data1'] / x['data2'].sum())

In [None]:
# call norm_by_data on each key's group and combine the returned frames
df.groupby('key').apply(norm_by_data)

Specifying the Split Key

In [None]:
# the key may be a plain list of group labels, one entry per row of df
L = [0, 1, 2, 0, 1, 1]
df.groupby(L).sum()

In [None]:
# a dict can map index labels to group names; 'key' is made the index first
# so that the labels 'A', 'B', 'C' are what the mapping is looked up with
df1 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
df1.groupby(mapping).sum()

In [None]:
# Grouping Example - Count discovered planets by method and by decade
# (start by recalling the columns of the planets table)
planets.head()

In [None]:
# decade label for every row: floor the year to a multiple of ten, turn it
# into text with an 's' suffix, and name the resulting Series 'decade'
decade = ((planets['year'] // 10 * 10).astype(str) + 's').rename('decade')
decade

In [None]:
# total 'number' per (method, decade); the decades are spread over the
# columns and combinations without any rows are shown as 0
(
    planets
    .groupby(['method', decade])['number']
    .sum()
    .unstack()
    .fillna(0)
)