_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 10 - Data Aggregation and Group Operations

In [None]:
import numpy as np
import pandas as pd

## Part 1 - GroupBy Mechanics

In [None]:
# Seed NumPy's legacy global RNG so the random columns built below are the
# same on every Restart Kernel -> Run All (unseeded data made every printed
# result in this notebook change from run to run).
np.random.seed(0)

# Toy frame: two string key columns and two columns of random floats.
df = pd.DataFrame({
    'key1': ['a', 'a', 'b', 'b', 'a'],
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5)
})

print(df)

# Mean of data1 per key1 value: select the column first, then group it by
# the key1 Series. Nothing is computed until an aggregation such as mean()
# is called on the grouped object.
grouped = df['data1'].groupby(df['key1'])
print('\nSeries.groupby() returns a', type(grouped))
print('\nmeans of data1 grouped by key1:')
print(grouped.mean())

In [None]:
# Two-level grouping: a list of key Series produces a result indexed by
# (key1, key2) pairs.
print('Means of data1 grouped on key1 and key2:')
key_columns = [df['key1'], df['key2']]
means = df['data1'].groupby(key_columns).mean()
print(means)

# Pivot the inner index level into columns; the bare expression is the
# cell's displayed output.
print('\nUnstacking the means:')
means.unstack()

In [None]:
# Group keys do not have to be columns of the frame: any list or array with
# one entry per row works. Here one key is a plain list, the other an ndarray.
states = ['Ohio', 'California', 'California', 'Ohio', 'Ohio']
years = np.array([2005, 2005, 2006, 2005, 2006])

external_keys = [states, years]
df['data1'].groupby(external_keys).mean()

In [None]:
# Passing column names (instead of Series) lets the frame look the keys up
# itself.
#
# numeric_only=True is needed when grouping on key1 alone: the string column
# key2 is then part of the data being averaged, and pandas >= 2.0 raises
# TypeError for it instead of silently dropping it as older versions did.
print('Grouping df on key1 - key2 is discarded from resulting columns being not numeric:')
print(df.groupby('key1').mean(numeric_only=True))

# With both key columns used for grouping, only the numeric data columns
# remain, so a plain mean() is fine.
print('\nGrouping df on key1 and key2:')
print(df.groupby(['key1', 'key2']).mean())

In [None]:
# Number of rows in each group, first for the two-key grouping and then for
# key1 alone.
for size_keys in (['key1', 'key2'], ['key1']):
    print(df.groupby(size_keys).size())

### Iterating Over Groups

In [None]:
# Iterating a GroupBy yields (key, sub-frame) pairs, one per group.
print('Iterating on df grouped by key1:')
for key1_value, rows in df.groupby('key1'):
    print('Name:', key1_value)
    print('Group:\n', rows)

# When grouping on several keys, the first element of each pair is a tuple
# holding one value per key.
print('\nIterating on df grouped by key1 and key2:')
for key_pair, rows in df.groupby(['key1', 'key2']):
    print('Tuple of keys:', key_pair)
    print('Group:\n', rows)

In [None]:
# Collect the (key, sub-frame) pairs produced by the groupby into a dict so
# each group can be looked up directly by its key1 value.
pieces = {key1_value: rows for key1_value, rows in df.groupby('key1')}
for wanted in ('a', 'b'):
    print(pieces[wanted])

### Selecting a Column or Subset of Columns

In [None]:
by_key1 = df.groupby('key1')

# Indexing the frame's GroupBy with a single column name is the same as
# selecting that column first and grouping it by the key1 Series.
print(by_key1['data1'], df['data1'].groupby(df['key1']))

# Likewise with a list of column names versus a sub-frame grouped by key1.
print(by_key1[['data2']], df[['data2']].groupby(df['key1']))

In [None]:
# Select data2 with a list of names (double brackets) on the two-key
# grouping, then average it per (key1, key2) group.
data2_by_keys = df.groupby(['key1', 'key2'])[['data2']]
data2_by_keys.mean()

### Grouping with Dicts and Series

In [None]:
# 5x5 frame of random floats: one row per person, columns 'a'..'e'.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    columns=['a', 'b', 'c', 'd', 'e'],
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# Blank out columns b and c for the third row (Wes) so the aggregations
# below have missing values to deal with.
people.iloc[2, [1, 2]] = np.nan

print(people)

# Column label -> group name. 'f' is not a column of `people`; the unused
# key is simply ignored by the grouping.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue', 'd': 'blue', 'e': 'red', 'f' : 'orange'}

# Group the *columns* by colour and sum within each row.
# groupby(..., axis=1) is deprecated since pandas 2.1, so transpose, group
# the (now row) labels, and transpose back to the original orientation.
by_colour_sum = people.T.groupby(mapping).sum().T
print(by_colour_sum)

# Same grouping with the mapping supplied as a Series; count() reports how
# many non-missing values each person has per colour.
by_colour_count = people.T.groupby(pd.Series(mapping)).count().T
print(by_colour_count)

### Grouping with Functions

In [None]:
# A callable used as the group key is applied to each index label, so the
# people are bucketed by the length of their names before summing.
by_name_length = people.groupby(len)
by_name_length.sum()

### Grouping by Index Levels

In [None]:
# Two-level column index: country ('cty') on the outer level, tenor on the
# inner level.
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'])
hier_df = pd.DataFrame(
    np.random.randn(4, 5),
    columns=columns)
print(hier_df)

# Count the columns per country for every row.
# groupby(level=..., axis=1) is deprecated since pandas 2.1, so transpose to
# put the MultiIndex on the rows, group by its 'cty' level, and transpose
# the result back.
cty_counts = hier_df.T.groupby(level='cty').count().T
print(cty_counts)