In [52]:
import numpy as np
import pandas as pd

tips = pd.read_csv('tips.csv')
tips.drop(['smoker'], axis=1, inplace=True)

In [53]:
tips.head()

Unnamed: 0,total_bill,tip,day,time,size
0,16.99,1.01,Sun,Dinner,2
1,10.34,1.66,Sun,Dinner,3
2,21.01,3.5,Sun,Dinner,3
3,23.68,3.31,Sun,Dinner,2
4,24.59,3.61,Sun,Dinner,4


### 参数传递

In [54]:
group_df = tips.groupby('day')

In [55]:
def fun_agg(group, func_name='mean'):
    if func_name == 'mean':
        return group.mean()
    else:
        return group.sum()

In [56]:
# 可参与运算的列
group_df.apply(fun_agg, func_name='mean')

Unnamed: 0_level_0,total_bill,tip,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,17.151579,2.734737,2.105263
Sat,20.441379,2.993103,2.517241
Sun,21.41,3.255132,2.842105
Thur,17.682742,2.771452,2.451613


In [57]:
# 全部列
group_df.agg(fun_agg, func_name='mean')

Unnamed: 0_level_0,total_bill,tip,time,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,17.151579,2.734737,,2.105263
Sat,20.441379,2.993103,,2.517241
Sun,21.41,3.255132,,2.842105
Thur,17.682742,2.771452,,2.451613


### apply函数的特点

In [58]:
def app_f1(group):
    print(group) # 每个分类下的所有的数据为一个group
    return 2, 3, 4

# 函数返回值的拼接
group_df.apply(app_f1)

     total_bill   tip  day    time  size
90        28.97  3.00  Fri  Dinner     2
91        22.49  3.50  Fri  Dinner     2
92         5.75  1.00  Fri  Dinner     2
93        16.32  4.30  Fri  Dinner     2
94        22.75  3.25  Fri  Dinner     2
95        40.17  4.73  Fri  Dinner     4
96        27.28  4.00  Fri  Dinner     2
97        12.03  1.50  Fri  Dinner     2
98        21.01  3.00  Fri  Dinner     2
99        12.46  1.50  Fri  Dinner     2
100       11.35  2.50  Fri  Dinner     2
101       15.38  3.00  Fri  Dinner     2
220       12.16  2.20  Fri   Lunch     2
221       13.42  3.48  Fri   Lunch     2
222        8.58  1.92  Fri   Lunch     1
223       15.98  3.00  Fri   Lunch     3
224       13.42  1.58  Fri   Lunch     2
225       16.27  2.50  Fri   Lunch     2
226       10.09  2.00  Fri   Lunch     2
     total_bill   tip  day    time  size
19        20.65  3.35  Sat  Dinner     3
20        17.92  4.08  Sat  Dinner     2
21        20.29  2.75  Sat  Dinner     2
22        15.77 

day
Fri     (2, 3, 4)
Sat     (2, 3, 4)
Sun     (2, 3, 4)
Thur    (2, 3, 4)
dtype: object

In [59]:
def app_f2(group):
    ser = pd.Series([2, 3, 4], index=['one', 'two', 'three'])
    print(ser)
    return ser

group_df.apply(app_f2)

one      2
two      3
three    4
dtype: int64
one      2
two      3
three    4
dtype: int64
one      2
two      3
three    4
dtype: int64
one      2
two      3
three    4
dtype: int64


Unnamed: 0_level_0,one,two,three
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Fri,2,3,4
Sat,2,3,4
Sun,2,3,4
Thur,2,3,4


In [60]:
def app_f3(group):
    return group.head(3)

group_df.apply(app_f3)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,day,time,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Fri,90,28.97,3.0,Fri,Dinner,2
Fri,91,22.49,3.5,Fri,Dinner,2
Fri,92,5.75,1.0,Fri,Dinner,2
Sat,19,20.65,3.35,Sat,Dinner,3
Sat,20,17.92,4.08,Sat,Dinner,2
Sat,21,20.29,2.75,Sat,Dinner,2
Sun,0,16.99,1.01,Sun,Dinner,2
Sun,1,10.34,1.66,Sun,Dinner,3
Sun,2,21.01,3.5,Sun,Dinner,3
Thur,77,27.2,4.0,Thur,Lunch,4


In [61]:
def app_f4(group):
    mx = group.max()
    mi = group.min()

    # 多层索引
    index_out = np.repeat(mi.index.values, 2)
    index_inner = np.array(['max', 'min'] * 5)
    all_index = np.stack((index_out, index_inner), axis=0)
    new_columns = pd.MultiIndex.from_arrays(all_index)

    values = np.stack((mx.values, mi.values), axis=0).T.flatten()
    result = pd.Series(values, index=new_columns)
    print(result)
    return result

group_df.apply(app_f4)

total_bill  max     40.17
            min      5.75
tip         max      4.73
            min       1.0
day         max       Fri
            min       Fri
time        max     Lunch
            min    Dinner
size        max         4
            min         1
dtype: object
total_bill  max     50.81
            min      3.07
tip         max      10.0
            min       1.0
day         max       Sat
            min       Sat
time        max    Dinner
            min    Dinner
size        max         5
            min         1
dtype: object
total_bill  max     48.17
            min      7.25
tip         max       6.5
            min      1.01
day         max       Sun
            min       Sun
time        max    Dinner
            min    Dinner
size        max         6
            min         2
dtype: object
total_bill  max     43.11
            min      7.51
tip         max       6.7
            min      1.25
day         max      Thur
            min      Thur
time        max     Lu

Unnamed: 0_level_0,total_bill,total_bill,tip,tip,day,day,time,time,size,size
Unnamed: 0_level_1,max,min,max,min,max,min,max,min,max,min
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
Fri,40.17,5.75,4.73,1.0,Fri,Fri,Lunch,Dinner,4,1
Sat,50.81,3.07,10.0,1.0,Sat,Sat,Dinner,Dinner,5,1
Sun,48.17,7.25,6.5,1.01,Sun,Sun,Dinner,Dinner,6,2
Thur,43.11,7.51,6.7,1.25,Thur,Thur,Lunch,Dinner,6,1


### agg函数的特点

In [62]:
def agg_f1(group):
    return 2, 3, 4

group_df.agg(agg_f1) # 不包括用于groupby的组

Unnamed: 0_level_0,total_bill,tip,time,size
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Fri,"(2, 3, 4)","(2, 3, 4)","(2, 3, 4)","(2, 3, 4)"
Sat,"(2, 3, 4)","(2, 3, 4)","(2, 3, 4)","(2, 3, 4)"
Sun,"(2, 3, 4)","(2, 3, 4)","(2, 3, 4)","(2, 3, 4)"
Thur,"(2, 3, 4)","(2, 3, 4)","(2, 3, 4)","(2, 3, 4)"


In [65]:
def agg_f2(group):
    print(group) # 每个分类下的每个列为一个group
    print(group.mean()) # 标量
    # ★★★★★返回值必须为标量(agg函数的作用为聚合,聚合就是数据由向量缩减为标量).若返回值不为标量,请用apply函数
    return group.mean() # 根据每个group的顺序(即下面group_df.index的顺序)和名字将计算的结果进行相应的拼接

group_df.agg(agg_f2)

90     28.97
91     22.49
92      5.75
93     16.32
94     22.75
95     40.17
96     27.28
97     12.03
98     21.01
99     12.46
100    11.35
101    15.38
220    12.16
221    13.42
222     8.58
223    15.98
224    13.42
225    16.27
226    10.09
Name: total_bill, dtype: float64
17.151578947368417
90     28.97
91     22.49
92      5.75
93     16.32
94     22.75
95     40.17
96     27.28
97     12.03
98     21.01
99     12.46
100    11.35
101    15.38
220    12.16
221    13.42
222     8.58
223    15.98
224    13.42
225    16.27
226    10.09
Name: total_bill, dtype: float64
17.151578947368417
90     28.97
91     22.49
92      5.75
93     16.32
94     22.75
95     40.17
96     27.28
97     12.03
98     21.01
99     12.46
100    11.35
101    15.38
220    12.16
221    13.42
222     8.58
223    15.98
224    13.42
225    16.27
226    10.09
Name: Fri, dtype: float64
17.151578947368417
90     3.00
91     3.50
92     1.00
93     4.30
94     3.25
95     4.73
96     4.00
97     1.50
98     3.00
99

ValueError: Shape of passed values is (5, 4), indices imply (4, 4)