In [41]:
import pandas as pd
import numpy as np

# Efficiency

- Always vectorize operations
- Using NumPy (by calling .values) could also improve performance

- .iloc is faster when selecting rows
- .loc is faster when selecting columns (with the syntax df.loc[:, [...]])
- .sample is faster for random sampling (specify axis for rows or columns)

# .replace

**.replace** is faster than using .loc

In [18]:
# Small demo frame: an integer column and a column of single-letter strings.
df = pd.DataFrame({
    'colA': [1, 2, 3, 4, 5],
    'colB': ['A', 'B', 'C', 'D', 'E'],
})

In [22]:
# Rebind instead of using inplace=True: the resulting frame is the same, but
# the call returns the new DataFrame, so it can be chained and inspected.
df = df.replace('A', 'Substitute')

In [23]:
# Display the frame: every 'A' now reads 'Substitute'.
df

Unnamed: 0,colA,colB
0,1,Substitute
1,2,B
2,3,C
3,4,D
4,5,E


It also accepts lists of values (for the values to find, the replacement values, or both):

In [24]:
# Several values to find, one shared replacement; returns a new frame.
df.replace(to_replace=['B', 'C'], value='Also Replaced')

Unnamed: 0,colA,colB
0,1,Substitute
1,2,Also Replaced
2,3,Also Replaced
3,4,D
4,5,E


In [25]:
# Two parallel lists: each value to find is paired with the replacement at
# the same position. Returns a new frame; df itself is left untouched.
df.replace(to_replace=['B', 'C'], value=['B was replaced', 'C was replaced'])

Unnamed: 0,colA,colB
0,1,Substitute
1,2,B was replaces
2,3,C was replaced
3,4,D
4,5,E


We can also use a dictionary (this is more efficient than the previous approach):

In [28]:
# Mapping of value-to-find -> replacement, passed as the single argument.
replace_dict = dict(B='B was replaced', C='C was replaced')
df.replace(to_replace=replace_dict)

Unnamed: 0,colA,colB
0,1,Substitute
1,2,B was replaced
2,3,C was replaced
3,4,D
4,5,E


# .apply and .transform + groupby  
https://stackoverflow.com/questions/27517425/apply-vs-transform-on-a-group-object

- apply passes all the columns for each group as a DataFrame. It can return a scalar, series, or DataFrame
- transform passes each column for each group individually. It must return a sequence the same length as the group

In [43]:
# Two states with two rows each, plus two numeric columns to operate on.
df = pd.DataFrame(dict(
    State=['Texas', 'Texas', 'Florida', 'Florida'],
    a=[4, 5, 1, 3],
    b=[6, 10, 3, 11],
))
df

Unnamed: 0,State,a,b
0,Texas,4,6
1,Texas,5,10
2,Florida,1,3
3,Florida,3,11


apply receives each group as a sub-DataFrame, so it can combine several columns. The result below is a Series with a two-level index (the group key `State`, then the original row label); the rightmost values are the results:

In [48]:
# Each group arrives as a sub-DataFrame, so two columns can be combined.
df.groupby('State').apply(lambda grp: grp['a'] - grp['b'])

State     
Florida  2   -2
         3   -8
Texas    0   -2
         1   -5
dtype: int64

transform operates on each column individually, so the operation above would not work. The examples below show how it can be used (note that if the function returns a scalar, that value is repeated for every row of the group):

In [52]:
# Element-wise function: the output has the same length as each group.
df.groupby('State').transform(lambda col: col * 2)

Unnamed: 0,a,b
0,8,12
1,10,20
2,2,6
3,6,22


In [53]:
# Scalar-returning function: the per-group total is repeated on every row.
df.groupby('State').transform(lambda col: col.sum())

Unnamed: 0,a,b
0,9,16
1,9,16
2,4,14
3,4,14


# groupby + apply + multiple functions

In [54]:
# Four rows of uniform random values in columns a-d, in two groups of two.
# NOTE: no seed is set in this cell, so the numbers change on every run.
df = pd.DataFrame(
    np.random.rand(4, 4),
    columns=['a', 'b', 'c', 'd'],
).assign(group=[0, 0, 1, 1])
df

Unnamed: 0,a,b,c,d,group
0,0.100077,0.842549,0.45496,0.201828,0
1,0.534102,0.08419,0.28655,0.307996,0
2,0.09773,0.93341,0.646344,0.872681,1
3,0.260526,0.220976,0.543182,0.061414,1


Solution 1: dictionary mapping column names to aggregation functions

In [57]:
def min_max(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest
# The aggregated column is labelled with the callable's __name__. ``def``
# already sets it to 'min_max'; the explicit assignment is the trick to reuse
# for callables without a readable name (e.g. lambdas).
min_max.__name__ = 'min_max'

# Column -> aggregation(s). A list gives one output column per function;
# a callable is applied to the column of each group.
agg_spec = {
    'a': ['sum', 'max'],
    'b': 'mean',
    'c': 'sum',
    'd': min_max,
}
df.groupby('group').agg(agg_spec)

Unnamed: 0_level_0,a,a,b,c,d
Unnamed: 0_level_1,sum,max,mean,sum,min_max
group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
0,0.634179,0.534102,0.46337,0.74151,0.106168
1,0.358256,0.260526,0.577193,1.189527,0.811267


Solution 2: if you need multiple column interaction on each group, use a custom function with the operations (it should return a pandas Series)

In [58]:
def f(x):
    """Summarise one group as a labelled Series of four statistics.

    ``x`` is indexed by the column names 'a', 'b', 'c' and 'd'; here it is the
    sub-DataFrame that ``groupby(...).apply`` passes for each group.

    Returns a pandas Series with, in order: the sum of 'a' (a_sum), the max of
    'a' (a_max), the mean of 'b' (b_mean) and the sum of the row-wise product
    of 'c' and 'd' (c_d_prodsum).
    """
    labels = ['a_sum', 'a_max', 'b_mean', 'c_d_prodsum']
    col_a = x['a']
    stats = [
        col_a.sum(),
        col_a.max(),
        x['b'].mean(),
        (x['c'] * x['d']).sum(),
    ]
    # Passing index= pins the order of the output labels explicitly.
    return pd.Series(dict(zip(labels, stats)), index=labels)

In [59]:
# One row per group; the labels of the Series returned by f become columns.
df.groupby(by='group').apply(f)

Unnamed: 0_level_0,a_sum,a_max,b_mean,c_d_prodsum
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.634179,0.534102,0.46337,0.18008
1,0.358256,0.260526,0.577193,0.597412


Better than having to do the operations individually:

In [62]:
# A single cross-column statistic per group: the sum of c * d.
df.groupby('group').apply(lambda grp: grp['c'].mul(grp['d']).sum())

group
0    0.180080
1    0.597412
dtype: float64

# groupby + filter

In [67]:
# Keep only the rows of groups whose column 'a' totals more than 0.6.
df.groupby(by='group').filter(lambda grp: grp['a'].sum() > 0.6)

Unnamed: 0,a,b,c,d,group
0,0.100077,0.842549,0.45496,0.201828,0
1,0.534102,0.08419,0.28655,0.307996,0
