# 数据聚合和分组运算

In [1]:
import numpy as np
from numpy import NaN as NA
import pandas as pd
from numpy.random import randn
from numpy.linalg import inv,qr, eig ,det ,svd

import matplotlib.pyplot as plt
import random

from pandas import DataFrame , Series

In [3]:
df = DataFrame({ 'key1':list('aabba'),
                'key2':'one,two,one,two,one'.split(','),
                'data1':np.random.randn(5),
                'data2':np.random.randn(5)
            
})

In [4]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.882622,-0.259932,a,one
1,0.542232,-0.217811,a,two
2,-1.931732,0.00845,b,one
3,-0.199144,0.240973,b,two
4,0.195455,0.062002,a,one


In [5]:
grouped = df['data1'].groupby(df['key1'])

In [6]:
grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f9b869cd780>

In [8]:
grouped.mean()

key1
a   -0.048311
b   -1.065438
Name: data1, dtype: float64

In [9]:
means = df['data1'].groupby([df['key1'], df['key2']]).mean()

In [10]:
means

key1  key2
a     one    -0.343583
      two     0.542232
b     one    -1.931732
      two    -0.199144
Name: data1, dtype: float64

In [11]:
means.unstack()

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.343583,0.542232
b,-1.931732,-0.199144


In [14]:
states = np.array('Ohio,Calfornia,California,Ohio,Ohio'.split(','))

In [17]:
years = np.array([2005, 2005, 2006, 2005, 2006])

In [18]:
df['data1'].groupby([states, years]).mean()

Calfornia   2005    0.542232
California  2006   -1.931732
Ohio        2005   -0.540883
            2006    0.195455
Name: data1, dtype: float64

In [19]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.882622,-0.259932,a,one
1,0.542232,-0.217811,a,two
2,-1.931732,0.00845,b,one
3,-0.199144,0.240973,b,two
4,0.195455,0.062002,a,one


In [20]:
df.groupby('key1').mean()

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,-0.048311,-0.13858
b,-1.065438,0.124711


In [21]:
df.groupby(['key1', 'key2']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,data1,data2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,one,-0.343583,-0.098965
a,two,0.542232,-0.217811
b,one,-1.931732,0.00845
b,two,-0.199144,0.240973


In [22]:
df.groupby(['key1', 'key2']).size()

key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64

## 对分组进行迭代

In [23]:
for name,group in df.groupby('key1'):
    print(name)
    print(group)

a
      data1     data2 key1 key2
0 -0.882622 -0.259932    a  one
1  0.542232 -0.217811    a  two
4  0.195455  0.062002    a  one
b
      data1     data2 key1 key2
2 -1.931732  0.008450    b  one
3 -0.199144  0.240973    b  two


In [24]:
for (k1, k2), group in df.groupby('key1,key2'.split(',')):
    print(k1,k2)
    print(group)
    print('=========')

a one
      data1     data2 key1 key2
0 -0.882622 -0.259932    a  one
4  0.195455  0.062002    a  one
a two
      data1     data2 key1 key2
1  0.542232 -0.217811    a  two
b one
      data1    data2 key1 key2
2 -1.931732  0.00845    b  one
b two
      data1     data2 key1 key2
3 -0.199144  0.240973    b  two


In [25]:
pieces =dict(list(df.groupby(['key1'])))

In [26]:
pieces['b']

Unnamed: 0,data1,data2,key1,key2
2,-1.931732,0.00845,b,one
3,-0.199144,0.240973,b,two


In [27]:
list(df.groupby(['key1']))

[('a',       data1     data2 key1 key2
  0 -0.882622 -0.259932    a  one
  1  0.542232 -0.217811    a  two
  4  0.195455  0.062002    a  one), ('b',       data1     data2 key1 key2
  2 -1.931732  0.008450    b  one
  3 -0.199144  0.240973    b  two)]

In [29]:
dict([('a', 'n')])

{'a': 'n'}

In [30]:
df.dtypes

data1    float64
data2    float64
key1      object
key2      object
dtype: object

In [31]:
grouped = df.groupby(df.dtypes, axis = 1)

In [32]:
dict(list(grouped))

{dtype('float64'):       data1     data2
 0 -0.882622 -0.259932
 1  0.542232 -0.217811
 2 -1.931732  0.008450
 3 -0.199144  0.240973
 4  0.195455  0.062002, dtype('O'):   key1 key2
 0    a  one
 1    a  two
 2    b  one
 3    b  two
 4    a  one}

In [33]:
df

Unnamed: 0,data1,data2,key1,key2
0,-0.882622,-0.259932,a,one
1,0.542232,-0.217811,a,two
2,-1.931732,0.00845,b,one
3,-0.199144,0.240973,b,two
4,0.195455,0.062002,a,one


In [34]:
grouped.mean()

DataError: No numeric types to aggregate

## 选取一个或者一组列

In [35]:
df.groupby('key1')['data1']

<pandas.core.groupby.SeriesGroupBy object at 0x7f9b85d41e80>

In [36]:
df.groupby('key1')[['data2']]

<pandas.core.groupby.DataFrameGroupBy object at 0x7f9b85d417f0>

In [37]:
df.groupby('key1')['data1'].mean()

key1
a   -0.048311
b   -1.065438
Name: data1, dtype: float64

In [38]:
df.groupby('key1')[['data2']].mean()

Unnamed: 0_level_0,data2
key1,Unnamed: 1_level_1
a,-0.13858
b,0.124711


In [39]:
s_grouped = df.groupby(['key1', 'key2'])['data2']

In [40]:
s_grouped

<pandas.core.groupby.SeriesGroupBy object at 0x7f9b85d41dd8>

In [41]:
s_grouped.mean()

key1  key2
a     one    -0.098965
      two    -0.217811
b     one     0.008450
      two     0.240973
Name: data2, dtype: float64

## 通过字典和Series进行分组

In [42]:
people = DataFrame(np.random.randn(5, 5),
                   columns=list('abcde'),
                   index='Joe,Steve,Wes,Jim,Travis'.split(',')

)

In [43]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.110559,0.944368,-0.804101,-0.297503,-0.397996
Steve,-0.228949,0.121923,0.138203,0.151283,-0.339019
Wes,0.872632,-0.465398,-2.200225,-0.719175,-0.075825
Jim,-0.390272,1.972085,1.0758,-1.082002,1.806319
Travis,0.998766,0.866858,-0.60479,0.400033,-0.97847


In [45]:
people.loc[2:3, list('bc')] = np.nan

In [46]:
people

Unnamed: 0,a,b,c,d,e
Joe,-0.110559,0.944368,-0.804101,-0.297503,-0.397996
Steve,-0.228949,0.121923,0.138203,0.151283,-0.339019
Wes,0.872632,,,-0.719175,-0.075825
Jim,-0.390272,1.972085,1.0758,-1.082002,1.806319
Travis,0.998766,0.866858,-0.60479,0.400033,-0.97847


In [48]:
mapping = {
    'a':'red',
    'b':'red',
    'c':'blue',
    'b':'blue',
    'e':'red',
    'f':'orange',
    
}

In [49]:
by_column = people.groupby(mapping, axis=1)

In [50]:
by_column.sum()

Unnamed: 0,blue,red
Joe,0.140267,-0.508555
Steve,0.260127,-0.567967
Wes,,0.796807
Jim,3.047885,1.416048
Travis,0.262068,0.020296
