# Data Wrangling Using Pandas: Group Data

In [1]:
import pandas as pd
import numpy as np
#%load_ext line_profiler
%load_ext Cython
from joblib import Parallel, delayed

# 1. Introduction

What does it mean to Group Data? 

* Split: Segment data based on criteria

* Apply: Aggregate - Transform - Filter


In [39]:
# Load data sets <Shift-Tab> see inside function parenthesis to see more arguments.
df_bal = pd.read_csv('/Users/stewarta/Documents/DATA/Home Data/bureau_balance.csv')

In [3]:
#df_bal = df_bal.query('SK_ID_BUREAU in [5715448, 5041336]')
#df_bal.reset_index(drop=True, inplace=True)
#df_bal = df_bal.query('SK_ID_BUREAU == 5715448' )
print(df_bal)

          SK_ID_BUREAU  MONTHS_BALANCE STATUS
0              5715448               0      C
1              5715448              -1      C
2              5715448              -2      C
3              5715448              -3      C
4              5715448              -4      C
5              5715448              -5      C
6              5715448              -6      C
7              5715448              -7      C
8              5715448              -8      C
9              5715448              -9      0
10             5715448             -10      0
11             5715448             -11      X
12             5715448             -12      X
13             5715448             -13      X
14             5715448             -14      0
15             5715448             -15      0
16             5715448             -16      0
17             5715448             -17      0
18             5715448             -18      0
19             5715448             -19      0
20             5715448            

In [74]:
df_bal.index

Int64Index([       0,        1,        2,        3,        4,        5,
                   6,        7,        8,        9,       10,       11,
                  12,       13,       14,       15,       16,       17,
                  18,       19,       20,       21,       22,       23,
                  24,       25,       26, 27299901, 27299902, 27299903,
            27299904, 27299905, 27299906, 27299907, 27299908, 27299909,
            27299910, 27299911, 27299912, 27299913, 27299914, 27299915,
            27299916, 27299917, 27299918, 27299919, 27299920, 27299921,
            27299922, 27299923, 27299924],
           dtype='int64')

In [4]:
df_bal.reset_index(drop=True, inplace=True)

In [40]:
df_bal = df_bal.set_index('SK_ID_BUREAU')

In [71]:
df_bal.index

Int64Index([5715448, 5715448, 5715448, 5715448, 5715448, 5715448, 5715448,
            5715448, 5715448, 5715448, 5715448, 5715448, 5715448, 5715448,
            5715448, 5715448, 5715448, 5715448, 5715448, 5715448, 5715448,
            5715448, 5715448, 5715448, 5715448, 5715448, 5715448],
           dtype='int64', name='SK_ID_BUREAU')

In [5]:
df_bal['WEIGHT'] = pd.Series(np.random.rand(len(df_bal)))
df_bal.tail()

Unnamed: 0_level_0,MONTHS_BALANCE,STATUS,WEIGHT
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5041336,-47,X,0.132415
5041336,-48,X,0.132415
5041336,-49,X,0.132415
5041336,-50,X,0.132415
5041336,-51,X,0.132415


# 2. Split a Dataframe into Groups

Split: Segment data based on criteria

Pandas objects can be split on any of their axes. Splits are created using the using the groupby() function.  

We form groups by passeing one or more columns and and axis to the groupby function. Default axis = 0



In [41]:
group_bal = df_bal.groupby('SK_ID_BUREAU')

## Inspecting Groups
A single group can be selected using get_group()

In [34]:
group_bal.get_group(5715448) # returns a dataframe!

Unnamed: 0_level_0,MONTHS_BALANCE,STATUS,WEIGHT
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
5715448,0,C,0.546438
5715448,-1,C,0.546438
5715448,-2,C,0.546438
5715448,-3,C,0.546438
5715448,-4,C,0.546438
5715448,-5,C,0.546438
5715448,-6,C,0.546438
5715448,-7,C,0.546438
5715448,-8,C,0.546438
5715448,-9,0,0.546438


## Iterating through Groups¶

Try going back to dask and changing the way I iterate over groups...
import dask.dataframe as dd
import pandas as pd
pdf = pd.DataFrame({'A':[1, 2, 3, 4, 5], 'B':['1','1','a','a','a']})
ddf = dd.from_pandas(pdf, npartitions = 3)
groups = ddf.groupby('B')

for group in pdf['B'].unique():
    print groups.get_group(group)

Unnamed: 0_level_0,MONTHS_BALANCE,STATUS
SK_ID_BUREAU,Unnamed: 1_level_1,Unnamed: 2_level_1
5001709,0,C
5001709,-71,C
5001709,-70,C
5001709,-69,C
5001709,-68,C
5001709,-67,C
5001709,-66,C
5001709,-65,C
5001709,-64,C
5001709,-63,C


In [19]:
%%timeit -n 1 -r 1 -t x = range(10)
count = 0
for name, group in group_bal:
    print(type(name) ) ## <class 'int'>
    print(type(group)) ## <class 'pandas.core.frame.DataFrame'>
    count += 1
    if count == 1:
        break
    pass

<class 'int'>
<class 'pandas.core.frame.DataFrame'>
2.05 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [20]:
%%timeit -n 1 -r 1 -t x = range(10)
count = 0
for name, group in group_bal['MONTHS_BALANCE']: ##pandas.core.groupby.groupby.SeriesGroupBy
    print(type(name) ) ## <class 'int'>
    print(type(group)) ## <class 'pandas.core.series.Series'>
    count += 1
    if count == 1:
        break
    pass

<class 'int'>
<class 'pandas.core.series.Series'>
1.06 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# 3. Simple Operations on Groups

## Single Operations on Groups

Compute a statistics for each group: first, last, nth, mean, sum, std, var, min, max, size, count, describe,sem

NOTE: Not every method is supported in dask : describe, nth, sem

### Variance for each Group

2.42 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [35]:
%%timeit -n 1 -r 1 -t x = range(10)
group_bal['MONTHS_BALANCE'].var()

2.42 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Sum for a selected group and column...

In [36]:
%%timeit -n 1 -r 1 -t x = range(10)
group_bal['MONTHS_BALANCE'].sum()

403 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


# Aggregate

## Multiple Operations on Groups

The aggregation API allows one to use one or more operations over the specified axis. 

* agg is an alias for aggregate. Use the alias.

* pass multiple aggregation arguments as a list

* You can also pass **named methods** as strings. These will return a Series of the aggregated output. Example: df.agg(['sum', 'mean'])

* a NumPy Mathematical Functions: https://docs.scipy.org/doc/numpy-1.13.0/reference/routines.math.html 


* NOTE: Using a single function is equivalent to apply().

### Apply Multiple Operations to the Same Column

Pandas: 1.46 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

In [20]:
%%timeit -n 1 -r 1 -t x = range(10)
group_bal['MONTHS_BALANCE'].agg([np.sum, np.mean, np.std])

1.41 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### Apply Multiple Operations to Different Columns

In [21]:
%%timeit -n 1 -r 1 -t x = range(10)
group_bal.agg({'MONTHS_BALANCE' : np.mean, 'WEIGHT' : np.var})

671 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## Transformation Operations

The transform() method returns an object that is indexed the same (same size) as the original. This API allows you to provide multiple operations at the same time rather than one-by-one. Its API is quite similar to the .agg API
Some examples:

* Standardize data (zscore) within a group.
* Filling NAs within groups with a value derived from each group.


In [24]:
%%timeit -n 1 -r 1 -t x = range(10)
# this is just ridiculous
group_bal['MONTHS_BALANCE'].transform(zscore)
gmeans = group_bal['MONTHS_BALANCE'].mean() #pandas.core.series.Series
gstds = group_bal['MONTHS_BALANCE'].std()

892 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


TypeError: 'SeriesGroupBy' object is not callable

In [None]:
for group in 

In [92]:
zscore = lambda x: (x.values - x.mean()) / x.std()
printer = lambda x: print(type(x))

In [None]:
%%timeit -n 1 -r 1 -t x = range(10)
group_bal['MONTHS_BALANCE'].apply(zscore)

In [95]:
%prun -s ncalls group_bal['MONTHS_BALANCE'].apply(zscore)

TypeError: <lambda>() got an unexpected keyword argument 'axis'

In [114]:
#%%timeit -n 1 -r 1 -t x = range(10)
def myapply1(group):
    result = {}
    for name, g in group:
        #print(type(g['MONTHS_BALANCE'].values))
        x = g['MONTHS_BALANCE'].values
        result[name] = (x - x.mean()) / x.std()
    return result

In [33]:
%prun -l 4 myapply1(group_bal)

NameError: name 'myapply1' is not defined

In [42]:
%%cython
# typed cython
cdef double myapply2(double group):
    result = {}
    for name, g in group:
        #print(type(g['MONTHS_BALANCE'].values))
        x = g['MONTHS_BALANCE'].values
        result[name] = (x - x.mean()) / x.std()
    return result

In [37]:
%%cython
cdef double myapply3(double group):
    result = {}
    for name, g in group:
        #print(type(g['MONTHS_BALANCE'].values))
        x = g['MONTHS_BALANCE'].values
        result[name] = (x - x.mean()) / x.std()
    return result

In [43]:
%prun -l 10  myapply2(group_bal)

  if __name__ == '__main__':


         256663320 function calls (253393727 primitive calls) in 372.846 seconds

   Ordered by: internal time
   List reduced from 283 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
  3269586   24.764    0.000   24.764    0.000 {method 'reduce' of 'numpy.ufunc' objects}
   817395   23.785    0.000   39.234    0.000 _methods.py:86(_var)
        1   16.975   16.975  371.932  371.932 {_cython_magic_809f0aedde4e723899d85e0dc3ccef2b.myapply2}
   817396   15.920    0.000   39.150    0.000 internals.py:3363(_rebuild_blknos_and_blklocs)
   817398   13.911    0.000   52.947    0.000 base.py:255(__new__)
 44957066   10.694    0.000   13.105    0.000 {built-in method builtins.isinstance}
   817395    7.360    0.000   22.955    0.000 _methods.py:53(_mean)
   817395    7.113    0.000  157.950    0.000 internals.py:3869(get_slice)
  3269582    7.111    0.000   12.090    0.000 generic.py:4378(__setattr__)
  1634790    6.832    0.000   23.331   

 

In [35]:
def myfun():
    return group_bal['MONTHS_BALANCE'].apply(lambda x: (x - x.mean()) / x.std())

In [82]:
lst = list(range(1000))
%lprun -f myfun myfun()

Timer unit: 1e-06 s

Total time: 0.009466 s
File: <ipython-input-35-287dcce4b523>
Function: myfun at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def myfun():
     2         1       9466.0   9466.0    100.0      return group_bal['MONTHS_BALANCE'].apply(lambda x: (x - x.mean()) / x.std())

## 3. Filter:

Discard some groups, according to a group-wise computation that evaluates True or False. 

Some examples:

* Discard data that belongs to groups with only a few members.
* Filter out data based on the group sum or mean.   

DASK : 

apply(): 2min 46s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

sum(): 21.8 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)

Pandas:

### This takes a long time ...

In [11]:
%%timeit -n 1 -r 1 -t x = range(10)
## Take AWAY: 2min 45s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
group_bal['MONTHS_BALANCE'].apply(lambda x: x.sum() > 0 )

2min 4s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


### # Same result, but reformulated based on documentation

In [9]:
%%timeit -n 1 -r 1 -t x = range(10)
## Take: AWAY 601 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
group_bal['MONTHS_BALANCE'].sum().apply(lambda x : x > 0)

601 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


## How do I use the result to ?

In [None]:
# some ops in dask optimized # what is the same time for the op in panda