_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 10 - Data Aggregation and Group Operations

In [None]:
import numpy as np
import pandas as pd

## Part 2 - Data Aggregation

In [None]:
# Small demo frame: two string key columns and two columns of random floats.
# A seeded local generator keeps the values identical on every
# Restart & Run All, so the outputs below stay reproducible.
rng = np.random.default_rng(12345)
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': rng.standard_normal(5),
    'data2': rng.standard_normal(5)})
# Bare last expression so the notebook renders the frame with its rich repr.
df

In [None]:
# Group the rows of df by the values in 'key1'; the following cells reuse
# this grouped object.
grouped = df.groupby('key1')

# Calling a DataFrame/Series method (e.g. quantile) on a DataFrameGroupBy object.
# Select the numeric columns first: 'key2' holds strings, and recent pandas
# raises TypeError instead of silently dropping columns it cannot aggregate.
grouped[['data1', 'data2']].quantile(0.9)

In [None]:
# using custom aggregator
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` must provide ``max()`` and ``min()`` methods (e.g. a pandas
    Series or a NumPy array) whose results support subtraction.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest

# Apply the custom aggregator to the numeric columns only: peak_to_peak
# subtracts min from max, which is undefined for the strings in 'key2' and
# raises TypeError on pandas versions that no longer drop failing columns.
grouped[['data1', 'data2']].aggregate(peak_to_peak)

In [None]:
# describe() is available on the grouped object too; the summary is
# transposed here so that each group becomes a column.
grouped.describe().transpose()

### Column-Wise and Multiple Function Application

In [None]:
# Load the tipping dataset from a CSV file, relative to the working directory.
tips = pd.read_csv(filepath_or_buffer='examples/tips.csv')

In [None]:
# Store each tip as a fraction of its total bill in a new 'tip_pct' column,
# then preview the first rows.
tips['tip_pct'] = tips['tip'].div(tips['total_bill'])
tips.head()

In [None]:
# Group the tips by day and by smoker.
grouped = tips.groupby(['day', 'smoker'])

# Restrict the grouped object to a single column.
grouped_pct = grouped['tip_pct']

# Aggregate using the mean method ...
print(grouped_pct.mean())
# ... which is the same as naming the aggregator as a string.
print(grouped_pct.agg('mean'))

# Several aggregators at once; strings and callables can be mixed.
print(grouped_pct.agg(['mean', 'std', peak_to_peak]))

# (label, aggregator) tuples name the output columns.
# 'std' is given by name rather than as np.std: pandas currently remaps the
# NumPy callable to its own groupby std and warns that this will change, so
# the string keeps today's result and avoids the FutureWarning.
grouped_pct.agg([('foo', 'mean'), ('bar', 'std')])

In [None]:
# Apply the same statistics to several columns.
functions = ['count', 'mean', 'max']
# Multiple columns must be selected with a list: indexing a groupby with a
# bare tuple of keys (grouped['a', 'b']) was removed in pandas 2.0.
result = grouped[['tip_pct', 'total_bill']].agg(functions)
print(result)
print('\nStats for tip_pct only:')
print(result['tip_pct'])

# (label, aggregator) pairs give custom column names. Aggregators are passed
# by name ('var', 'max') instead of as NumPy callables, which pandas currently
# remaps to its own groupby methods with a FutureWarning.
germanly = [('Durchschnitt', 'mean'), ('Abweichung', 'var')]
print(grouped[['tip_pct', 'total_bill']].agg(germanly))

print('\nDifferent aggregators to cols:')
# A dict maps each column to its own aggregator(s).
print(grouped.agg({'tip': 'max', 'size': 'sum'}))
grouped.agg({'tip_pct': ['min', 'max', 'mean', 'std'], 'size': 'sum'})

### Returning Aggregated Data Without Row Indexes

In [None]:
# as_index=False keeps 'day' and 'smoker' as ordinary columns instead of
# turning them into the row index.
# numeric_only=True averages only the numeric columns; without it, pandas 2.x
# raises TypeError if tips still contains a non-numeric column besides the
# group keys (NOTE(review): the CSV schema is not visible here — confirm).
tips.groupby(['day', 'smoker'], as_index=False).mean(numeric_only=True)