# Aggregate Rows
* `group_and_agg`: Groups and aggregates. Provides a user interface similar to that of MS Access.
* `most_freq_elt`: Aggregates a Series to the most frequent scalar element.<br>
   Like Series.mode, but always returns a scalar.
* `top_values`: Aggregates a Series to a list of the most frequent elements.<br>
   Can also return the counts of the most frequent elements.  
   
* `first_valid_value`: Returns the first non-NaN value of a Series.    
* `last_valid_value`:  Returns the last non-NaN value of a Series.      

* `agg_words`: Aggregates a Series of strings to a long string.<br>
   A space is always placed between the elements, the order is preserved.
* `agg_words_nodup`: Aggregates a Series of strings (e.g. signal words) to a long string.
   Like agg_words, but duplicates are removed.
* `agg_strings_nospace`: Aggregates a Series of strings into one long string.<br>
   Like agg_words, but no separators between the substrings.  
   
* `agg_to_list`: Aggregates a Series to a list. 
   Normally this can also be done with a simple 'list', 
   but in combination with transform this does not work.
   Then agg_to_list can be used as a substitute.
* `agg_dicts`: Aggregates a Series of dicts to a single dict.<br>
   If a key occurs more than once, the value is overwritten.
* `agg_dicts_2dd`: Aggregates a Series of dicts to a single defaultdict(list).<br>
   I.e. a key may occur more than once; its values are collected. The values are always lists. 
* `agg_defaultdicts`: Aggregates a Series of defaultdict(list) to a single defaultdict(list).

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

In [3]:
import numpy      as np
import pandas     as pd 
import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

grid = pak.grid

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions

time: 10.8 s


## group_and_agg()

In [3]:
# Show the signature and docstring of group_and_agg
?pak.group_and_agg

time: 69.3 ms


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mgroup_and_agg[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_origins[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_funcs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_names[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropna[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Groups and aggregates. Provides a user interface similar to that of MS Access.
* col_origins: list of all columns to process
* col_funcs:   list of all functions to apply to the columns above. 
               Sometimes you have to use strings, sometimes function names.
               'group' or '' = grouping. 
* col_names:   list of new names for the result columns. Optional. Space = default name will

### Basic example

In [4]:
# Basic example: group the people table by age_class and birthplace,
# then aggregate first_name and age within every group.
df = pak.people()
pak.group_and_agg(
    df=df,
    col_origins=['age_class', 'birthplace', 'first_name',  'age', 'age', 'first_name'],
    col_funcs  =['group',     'group',      pak.agg_words, 'min', 'max', 'min'],
)

Unnamed: 0,age_class,birthplace,first_name_agg_words,age_min,age_max,first_name_min
0,20,Berlin,Horst Anton Clara Karina Tanja Henri Karla Mar...,21,29,Anton
1,20,Bremen,Julian Paul Marcel Christian Ali Dieter Christ...,22,29,Ali
2,30,Berlin,Martin Elisabeth Georg Anna Carina Hertha Ella...,30,39,Anna
3,30,Bremen,Claus Marvin Josef Emma Niclas Ingo Marc Nick ...,31,39,Claus
4,40,Bremen,Isabella,42,42,Isabella


time: 127 ms


### col_names parameter

In [5]:
# col_names only renames the result columns; an empty string keeps the default name.
df = pak.people()
pak.group_and_agg(
    df=df,
    col_origins=['age_class', 'birthplace', 'first_name',     'age', 'age', 'first_name'],
    col_funcs  =['group',     'group',      pak.agg_words,    'min', 'max', 'min'       ],
    col_names  =['AGE_CLASS', 'BIRTHPLACE', 'first_name_agg', '',    '',    ''          ],
)

Unnamed: 0,AGE_CLASS,BIRTHPLACE,first_name_agg,age_min,age_max,first_name_min
0,20,Berlin,Ingeborg Jürgen Marianne Alexandra Luca Louisa...,22,29,Alexandra
1,20,Bremen,Sofia Jannick Berndt Lennard Gisela Marina Hel...,21,29,Berndt
2,30,Berlin,Helena Benjamin Heinrich Tina Malte Moritz Mai...,30,38,Ayleen
3,30,Bremen,Volker Willy Melanie Elena Gerd Fynn Anna Frie...,30,37,Alexander
4,40,Berlin,Willi,41,41,Willi


time: 110 ms


### dropna parameter
Is NaN a regular groupable value? Default: No, NaNs are dropped.

In [6]:
# Create test data
# and save it for later comparisons.
# np.nan is used instead of np.NaN: the np.NaN alias was removed in NumPy 2.0.

rows = [
    ['A',  None,  1,      ],
    ['A',  'B',   None,   ],
    ['A',  'B',   np.nan, ],
    [None, 'B',   9,      ],

    ['A',  'B',   2,      ],
    ['A',  'B',   8,      ],

    ['Q',  'B',   3,      ],
    ['A',  'Q',   7,      ],
]

data = pak.dataframe(rows)
data

Unnamed: 0,A,B,C
0,A,,1.0
1,A,B,
2,A,B,
3,,B,9.0
4,A,B,2.0
5,A,B,8.0
6,Q,B,3.0
7,A,Q,7.0


time: 35.1 ms


In [7]:
# dropna=True is the default.
# Rows with NaN in a grouping column (A or B) are dropped,
# so their C values 1 and 9 do not come into play at all.

r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'B',     'C',   'C'  ],   # all columns to be processed
    col_funcs  =['group', 'group', 'min', 'max'],   # the functions to apply to these columns
)

r

Unnamed: 0,A,B,C_min,C_max
0,A,B,2.0,8.0
1,A,Q,7.0,7.0
2,Q,B,3.0,3.0


time: 64.7 ms


In [8]:
# With dropna=False, NaN forms groups of its own.
# Grouped by A and B this way, the C values 1 and 9 are included in the result.

r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'B',     'C',    'C'   ],
    col_funcs  =['group', 'group', 'min',  'max' ],
    col_names  =['A',     'B',     'Cmin', 'Cmax'],   # new names for the result columns
    dropna=False,
)

r

Unnamed: 0,A,B,Cmin,Cmax
0,A,B,2.0,8.0
1,A,Q,7.0,7.0
2,A,,1.0,1.0
3,Q,B,3.0,3.0
4,,B,9.0,9.0


time: 58.8 ms


## Aggregate functions min(), max(), first(), last()
More aggregate functions: https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/

In [9]:
# Create test data
# (np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0)
rows = [
    ['A',  'B',   None,   ],
    ['A',  'B',   np.nan, ],

    ['A',  'B',   2,      ],
    ['A',  'B',   3,      ],
    ['A',  'Q',   8,      ],
    ['A',  'Q',   7,      ],

    ['Q',  'B',   3,      ],
]
data = pak.dataframe(rows)
data

Unnamed: 0,A,B,C
0,A,B,
1,A,B,
2,A,B,2.0
3,A,B,3.0
4,A,Q,8.0
5,A,Q,7.0
6,Q,B,3.0


time: 49.6 ms


In [10]:
# min, max, first and last of C within every (A, B) group
r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'B',     'C',   'C',   'C',     'C'   ],
    col_funcs  =['group', 'group', 'min', 'max', 'first', 'last'],
)
r

Unnamed: 0,A,B,C_min,C_max,C_first,C_last
0,A,B,2.0,3.0,2.0,3.0
1,A,Q,7.0,8.0,8.0,7.0
2,Q,B,3.0,3.0,3.0,3.0


time: 88.6 ms


## Aggregate functions count(), size(), unique()

In [11]:
# Create test data
# (np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0)
rows = [
    ['A',  'B',   None,   ],
    ['A',  'B',   np.nan, ],

    ['A',  'B',   2,      ],
    ['A',  'B',   3,      ],
    ['A',  'Q',   8,      ],
    ['A',  'Q',   8,      ],

    ['Q',  'B',   3,      ],
]
data = pak.dataframe(rows)
data

Unnamed: 0,A,B,C
0,A,B,
1,A,B,
2,A,B,2.0
3,A,B,3.0
4,A,Q,8.0
5,A,Q,8.0
6,Q,B,3.0


time: 41.5 ms


In [12]:
# count   : number of values,           NaN is not counted
# size    : number of values,           NaN is included in the count
# nunique : number of different values, NaN is not counted

r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'B',     'C',     'C',    'C'      ],
    col_funcs  =['group', 'group', 'count', 'size', 'nunique'],
)

r

Unnamed: 0,A,B,C_count,C_size,C_nunique
0,A,B,2,4,2
1,A,Q,2,2,1
2,Q,B,1,1,1


time: 54.1 ms


## Aggregate functions most_freq_elt() and top_values()
Identify the most frequent elements and determine their counts

In [13]:
# Show the signature and docstring of most_freq_elt
?pak.most_freq_elt

time: 32.8 ms


[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mmost_freq_elt[0m[0;34m([0m[0mseries[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregates a Series to the most frequent scalar element.
Like Series.mode, but always returns a scalar.
If two elements are equally frequent, just any one is returned 
Example:
df = pak.people()
df.groupby('age_class')['first_name'].apply(pak.most_freq_elt)    
[0;31mFile:[0m      /media/me/DATA/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      function


In [14]:
# Run the docstring example: the most frequent first name per age class
df = pak.people()
(
    df.groupby('age_class')['first_name']
      .apply(pak.most_freq_elt)
)

age_class
20    Michael
30     Dagmar
40    Michael
Name: first_name, dtype: object

time: 105 ms


In [15]:
# Show the signature and docstring of top_values
?pak.top_values

time: 31.2 ms


[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mtop_values[0m[0;34m([0m[0mseries[0m[0;34m,[0m [0mlimit[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m [0mcount[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregates a Series to a list of the most frequent elements.
Or, if there is only one, this single element.
Can also return the counts of the most frequent elements.    
* limit: limits the length of the resulting list
* count: count=False shows the most frequent elements (default)
         cont=True   shows the corresponding counts of the elements
Example:
df = pak.people()
df.groupby('age_class')['first_name'].apply(pak.top_values)

Caution, does not work well with very long data sets.
There are partials preconfigured for 3, 5, 10, 20, 100, 1000 elements,
i.e. top_values_100 or top_values_count_20
[0;31mFile:[0m      /media/me/DATA/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      functio

In [16]:
# Run the docstring example: the most frequent first names per age class
df = pak.people()
(
    df.groupby('age_class')['first_name']
      .apply(pak.top_values)
)

age_class
20      [Lars, Erika, Maik]
30    [Sylvia, Olaf, Tanja]
40          [Oskar, Louisa]
Name: first_name, dtype: object

time: 107 ms


In [17]:
# Create test data
# (np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0)
rows = [
    ['A',  None,   ],
    ['A',  None,   ],
    ['A',  np.nan, ],

    ['A',  2,      ],
    ['A',  2,      ],
    ['A',  3,      ],
    ['A',  3,      ],
    ['A',  4,      ],
    ['A',  None,   ],
    ['B',  'p',    ],
    ['B',  'p',    ],
    ['B',  'z',    ],
]

data = pak.dataframe(rows)
data

Unnamed: 0,A,B
0,A,
1,A,
2,A,
3,A,2
4,A,2
5,A,3
6,A,3
7,A,4
8,A,
9,B,p


time: 102 ms


In [18]:
# most_freq_elt     like mode, but always returns a scalar and never NaN
# mode              only returns a scalar if the result is unique. Again, no NaN.
#                   It has to be passed explicitly as pd.Series.mode.
# top_values        always returns a list of the most frequent elements, NaN is counted like a normal element.
#                   There are top_values preconfigured for 3, 5, 10, 20, 100, 1000 elements.
# top_values_count  provides the list of frequencies matching top_values.

r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'B',               'B',            'B',               'B'                    ],
    col_funcs  =['group', pak.most_freq_elt, pd.Series.mode, pak.top_values_10, pak.top_values_count_10],
    col_names  =['A',     'B_mostfreq',      '',             'B_top',           'B_count'              ],
)

r

Unnamed: 0,A,B_mostfreq,B_mode,B_top,B_count
0,A,2,"[2, 3]","[None, 2, 3, nan, 4]","[3, 2, 2, 1, 1]"
1,B,p,p,"[p, z]","[2, 1]"


time: 62.7 ms


## Aggregate functions first_valid_value() and last_valid_value()

In [21]:
# Show the signature and docstring of first_valid_value
?pak.first_valid_value

time: 30.6 ms


[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mfirst_valid_value[0m[0;34m([0m[0mseries[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Returns the first not-nan values of a Series. 
[0;31mFile:[0m      /media/me/DATA/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      function


In [23]:
# NaN at the start, in the middle and at the end is skipped.
# (np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0)
s = pd.Series([np.nan, 1, np.nan, 'z', np.nan])
assert pak.first_valid_value(s) == 1
assert pak.last_valid_value(s) == 'z'

time: 48.5 ms


In [27]:
# A Series without any valid value yields None.
# (np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0)
s = pd.Series([np.nan, None])
assert pak.first_valid_value(s) is None
assert pak.last_valid_value(s) is None

time: 68.5 ms


## Aggregate strings
* agg_words, agg_words_nodup, agg_strings_nospace

In [None]:
# Show the IPython help for agg_words
?pak.agg_words

In [None]:
# Show the IPython help for agg_words_nodup
?pak.agg_words_nodup

In [None]:
# Show the IPython help for agg_strings_nospace
?pak.agg_strings_nospace

In [None]:
# Create test data
# (np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0)
rows = [
    ['A',  None,        ],
    ['A',  None,        ],
    ['A',  np.nan,      ],

    ['A',  'Who',       ],
    ['A',  'is is',     ],
    ['A',  'who',       ],
    ['A',  'and',       ],
    ['A',  'who',       ],
    ['A',  'is is is',  ],
    ['A',  None,        ],
    ['A',  'what?',     ],
    ['B',  '',          ],
    ['B',  '   ',       ],  # several narrow spaces
    ['B',  'zz',        ],
    ['B',  ' ',         ],  # one narrow space
    ['B',  'zz',        ],
]

data = pak.dataframe(rows)
data

In [None]:
# agg_words :           Collects all scalars together like words in a sentence. NaN is ignored.
# agg_words_nodup :     Collects all scalars together like signal words in a set: No dups. Order is preserved. NaN is ignored.
# agg_strings_nospace : Just hangs everything one after the other. NaN is ignored.
#
# col_names is not passed here, so the result columns get their default names.

r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'B',           'B',                 'B'                    ],
    col_funcs  =['group', pak.agg_words, pak.agg_words_nodup, pak.agg_strings_nospace],
)

r

## Aggregate lists
* list, explode, sum

In [None]:
# Create test data
# (np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0)
rows = [
    ['A',  None,   ],
    ['A',  None,   ],
    ['A',  np.nan, ],

    ['A',  2,      ],
    ['A',  2,      ],
    ['A',  3,      ],
    ['A',  3,      ],
    ['A',  None,   ],
    ['A',  4,      ],
    ['B',  'p',    ],
    ['B',  'p',    ],
    ['B',  'z',    ],
]

data = pak.dataframe(rows)
data

### list aggregates scalars to lists

In [None]:
# The builtin list collects all values of a group into one list
# (by the way, this works the same way with set).
r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'B' ],
    col_funcs  =['group', list],
    col_names  =['A',     'B' ],
)

r

### explode turns lists back into scalars

In [None]:
# Undo the aggregation: every list element in column B gets a row of its own again
r.explode(column='B')

In [None]:
# After exploding and resetting the index, the result equals the original test data.
exploded = r.explode('B')
r = pak.reset_index(exploded)
assert pak.check_equal(data, r)

### agg_to_list()

In [None]:
# Show the IPython help for agg_to_list
?pak.agg_to_list

### sum aggregates lists to longer lists

In [None]:
# Create test data: column B holds lists (plus some missing values)
# (np.nan instead of np.NaN: the np.NaN alias was removed in NumPy 2.0)
rows = [
    ['A',  None,                   ],
    ['A',  [None, None],           ],
    ['A',  np.nan,                 ],

    ['A',  ['Who', 'is', 'who'],   ],
    ['A',  ['and', 'qq'],          ],
    ['A',  ['who', 'qq'],          ],
    ['A',  ['is', 'qq'],           ],
    ['A',  [None,  'qq'],          ],
    ['A',  ['what?', 'qq'],        ],
    ['B',  [''],                   ],
    ['B',  ['xx', 'yy'],           ],
    ['B',  ['zz'],                 ],
]

data = pak.dataframe(rows)
data

In [None]:
# Summing lists concatenates them into one longer list per group.
# The string 'sum' is passed instead of the builtin sum: pandas >= 2.1 deprecates
# builtin callables in groupby aggregations and recommends the string alias
# to keep the current behaviour.
# NOTE(review): assumes group_and_agg forwards col_funcs to a pandas aggregation - confirm.
r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'B'  ],
    col_funcs  =['group', 'sum'],
    col_names  =['A',     'B'  ],
)

r

## Aggregate dicts

In [None]:
# Create test data: column D holds dicts
data = [
    ['A',  {'a':1},           ],
    ['A',  {'a':1},           ],
    ['A',  {'b':2, 'c':3 },   ],
    ['A',  {'a':4},           ],
    ['B',  {'bb':22},         ],
    # ['B',  {},              ],   # empty dict: currently not part of the test data
]

data = pak.dataframe(data)
data.columns = ['A','D']

# DD starts as a copy of D ...
data['DD'] = data['D'].copy()

# ... and dict_to_defaultdict then turns the dicts in that column into defaultdicts
data = pak.dict_to_defaultdict(data, col='DD')
data

In [None]:
# agg_dicts      aggregates dicts; if a key occurs more than once, the value is overwritten
# agg_dicts_2dd  does the same, but returns defaultdicts, so no values are lost

r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'D',           'D'              ],
    col_funcs  =['group', pak.agg_dicts, pak.agg_dicts_2dd],
    col_names  =['A',     'D_dict',      'D_defaultdict'  ],
)

r

## Aggregate defaultdicts

In [None]:
# See the test data again (columns A, D and DD from the "Aggregate dicts" section)
data

In [None]:
# agg_defaultdicts aggregates the defaultdicts of a group into a single defaultdict

r = pak.group_and_agg(
    df=data,
    col_origins=['A',     'DD'                ],
    col_funcs  =['group', pak.agg_defaultdicts],
    col_names  =['A',     'DD'                ],
)

r