## 函数映射apply
* 将函数应用于Series的每个单独元素
* 沿DataFrame的轴(index或columns)应用函数

In [1]:
import numpy as np
import pandas as pd

In [2]:
'''
func : function
    Function to apply to each column or row.
    
axis : {0 or 'index', 1 or 'columns'}, default 0
    Axis along which the function is applied:

    * 0 or 'index': apply function to each column.
    * 1 or 'columns': apply function to each row.

result_type : {'expand', 'reduce', 'broadcast', None}, default None
    These only act when ``axis=1`` (columns):

    * 'expand' : list-like results will be turned into columns.
    * 'reduce' : returns a Series if possible rather than expanding
      list-like results. This is the opposite of 'expand'.
    * 'broadcast' : results will be broadcast to the original shape
      of the DataFrame, the original index and columns will be
      retained.

    The default behaviour (None) depends on the return value of the
    applied function: list-like results will be returned as a Series
    of those. However if the apply function returns a Series these
    are expanded to columns.

**kwds
    Additional keyword arguments to pass as keywords arguments to
    `func`.
'''
s = pd.Series([20, 21, 12],
              index=['London', 'New York', 'Helsinki'])
s

London      20
New York    21
Helsinki    12
dtype: int64

In [3]:
def square(x):
    """Print the received value and return it squared.

    When used with ``Series.apply``, ``x`` is a single element of the
    Series. When used with ``DataFrame.apply``, ``x`` is one row or one
    column of the DataFrame (chosen by the ``axis`` argument). In both
    cases the per-call return values are assembled into the final result.

    :param x: the element, row or column handed over by ``apply``
    :return: ``x`` raised to the power of two
    """
    print('x:', x)
    squared = x ** 2
    return squared


# One-shot iterator over the index labels of s. classify() below pulls one
# label per call with next(), so the iterator is exhausted after a single
# pass over s and has to be re-created before classify is applied again.
index = iter(list(s.index))


def classify(x):
    """Keep a value only when its magnitude is greater than 16.

    Besides filtering, each call prints the value and the next label taken
    from the module-level ``index`` iterator, which is meant to be the
    index label belonging to ``x``.

    :param x: a single element
    :return: ``x`` when ``abs(x) > 16``, otherwise ``None``
    """
    print('x:', x)
    label = next(index)  # index label that goes with x (consumed in call order)
    print("element_index:", label)
    # A None result appears as NaN in the Series built by apply.
    return x if abs(x) > 16 else None

In [4]:
s.apply(func=square)

x: 20
x: 21
x: 12


London      400
New York    441
Helsinki    144
dtype: int64

In [5]:
s.apply(classify)

x: 20
element_index: London
x: 21
element_index: New York
x: 12
element_index: Helsinki


London      20.0
New York    21.0
Helsinki     NaN
dtype: float64

In [6]:
s  # s itself is unchanged by the apply calls above

London      20
New York    21
Helsinki    12
dtype: int64

In [7]:
s.apply('I am a {}'.format)

London      I am a 20
New York    I am a 21
Helsinki    I am a 12
dtype: object

In [8]:
s_list = pd.Series([[20], [21], [12]],
                   index=['London', 'New York', 'Helsinki'])
s_list

London      [20]
New York    [21]
Helsinki    [12]
dtype: object

In [9]:
def list_append(lst):
    """Append 999 to ``lst`` in place and hand the same object back.

    :param lst: a mutable sequence supporting ``append``
    :return: the very same ``lst`` object, now one element longer
    """
    marker = 999
    lst.append(marker)  # mutates the caller's object; no copy is made
    return lst


s_list.apply(list_append)

London      [20, 999]
New York    [21, 999]
Helsinki    [12, 999]
dtype: object

In [10]:
s_list  # s_list has changed: its elements are mutable lists, which list_append modified in place

London      [20, 999]
New York    [21, 999]
Helsinki    [12, 999]
dtype: object

In [11]:
df = pd.DataFrame([[4, 9], ] * 3,
                  columns=['A', 'B'])
df

Unnamed: 0,A,B
0,4,9
1,4,9
2,4,9


In [12]:
df.apply(square)  # axis defaults to 0, so square receives one column at a time

x: 0    4
1    4
2    4
Name: A, dtype: int64
x: 0    9
1    9
2    9
Name: B, dtype: int64


Unnamed: 0,A,B
0,16,81
1,16,81
2,16,81


In [13]:
df.apply(np.sqrt)

Unnamed: 0,A,B
0,2.0,3.0
1,2.0,3.0
2,2.0,3.0


In [14]:
df.apply(np.sum, axis=0)  # the result is a Series

A    12
B    27
dtype: int64

In [15]:

df.apply(lambda x: [1, 2], axis=1)  # using an anonymous (lambda) function

0    [1, 2]
1    [1, 2]
2    [1, 2]
dtype: object

In [16]:
# Passing result_type='expand' will expand list-like results to columns of a DataFrame
df.apply(lambda x: [1, 2], axis=1, result_type='expand')  # expanded into columns

Unnamed: 0,0,1
0,1,2
1,1,2
2,1,2


In [17]:
# Passing result_type='broadcast' will ensure the same shape result
df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')  # shape and column names stay unchanged

Unnamed: 0,A,B
0,1,2
1,1,2
2,1,2


In [18]:
# Returning a Series inside the function is similar to passing result_type='expand'. The resulting column names will be the Series index.
df.apply(lambda x: pd.Series([1, 2, 3], index=['foo', 'bar', 'tom']), axis=1)  # a Series return value is expanded into columns automatically

Unnamed: 0,foo,bar,tom
0,1,2,3
1,1,2,3
2,1,2,3


In [19]:
def f(x):
    """Compute the minimum and maximum of one DataFrame column.

    Shows how a single applied function can return several statistics at
    once by packing them into a Series.

    :param x: one column of the DataFrame (a Series)
    :return: Series with the labels ``'min'`` and ``'max'``
    """
    print("x:", x, type(x))
    # Use the Series reductions for both statistics. The builtin ``min``
    # iterates element by element and yields NaN whenever the first element
    # is NaN, whereas ``Series.min`` skips NaN just like ``Series.max``.
    return pd.Series([x.min(), x.max()], index=['min', 'max'])


df.apply(f, axis=0)

x: 0    4
1    4
2    4
Name: A, dtype: int64 <class 'pandas.core.series.Series'>
x: 0    9
1    9
2    9
Name: B, dtype: int64 <class 'pandas.core.series.Series'>


Unnamed: 0,A,B
min,4,9
max,4,9


In [20]:
def g(x, func1=np.mean):
    """Run ``func1`` on ``x`` and return whatever it produces.

    :param x: the data to operate on, e.g. one column of a DataFrame
    :param func1: the callable to execute; ``np.mean`` unless overridden
    :return: the result of ``func1(x)``
    """
    outcome = func1(x)
    return outcome


df.apply(func=g, func1=np.sqrt, axis=0)  # extra keyword arguments (func1 here) are forwarded to the applied function g

Unnamed: 0,A,B
0,2.0,3.0
1,2.0,3.0
2,2.0,3.0


In [21]:
def get_stats(group, gfunc=np.sum):
    """Summarise one row or column, with a column-specific third statistic.

    ``group`` is a single row or column of the DataFrame, i.e. a Series.
    The Series named ``'A'`` reports its peak-to-peak range (``'ptp'``);
    every other Series reports its ``'mean'`` instead.

    :param group: the Series handed over by ``DataFrame.apply``
    :param gfunc: callable forwarded to ``g`` as ``func1``
    :return: Series labelled ``'min'``, ``'max'``, ``'ptp'`` or ``'mean'``, and ``'g'``
    """
    # Different columns get a different third statistic.
    if group.name == 'A':
        extra_label, extra_value = 'ptp', np.ptp(group)
    else:
        extra_label, extra_value = 'mean', group.mean()

    values = [group.min(), group.max(), extra_value, g(group, func1=gfunc)]
    labels = ['min', 'max', extra_label, 'g']
    return pd.Series(values, index=labels)

In [22]:
# apply yields one column of statistics per DataFrame column; the double
# unstack() pivots that so A/B become the rows and the statistic names
# become the columns (missing statistics show up as NaN).
df.apply(func=get_stats, gfunc=np.sum, axis=0).unstack().unstack()




Unnamed: 0,g,max,mean,min,ptp
A,12.0,4.0,,4.0,0.0
B,27.0,9.0,9.0,9.0,
