__[Open and try this file online (Colab)](https://colab.research.google.com/github/djekra/pandasklar/blob/master/jupyter/55_Aggregate_Rows.ipynb)__

# Aggregate Rows
* `group_and_agg`: Groups and aggregates. Provides a user interface similar to that of MS Access.
* `most_freq_elt`: Aggregates a Series to the most frequent scalar element.<br>
   Like Series.mode, but always returns a scalar.
* `top_values`: Aggregates a Series to a list of the most frequent elements.<br>
   Can also return the counts of the most frequent elements.  
   
* `first_valid_value`: Returns the first non-NaN value of a Series.
* `last_valid_value`: Returns the last non-NaN value of a Series.

* `agg_words`: Aggregates a Series of strings to a long string.<br>
   A space is always placed between the elements, the order is preserved.
* `agg_words_nodup`: Aggregates a Series of strings (e.g. signal words) to a long string.
   Like agg_words, but duplicates are removed.
* `agg_strings_nospace`: Aggregates a Series of strings into one long string.<br>
   Like agg_words, but no separators between the substrings.  
   
* `agg_to_list`: Aggregates a Series to a list. 
   Normally this can also be done with a simple 'list', 
   but in combination with transform this does not work.
   Then agg_to_list can be used as a substitute.
* `agg_dicts`: Aggregates a Series of dicts to a single dict.<br>
   If a key occurs more than once, the value is overwritten.
* `agg_dicts_2dd`: Aggregates a Series of dicts to a single defaultdict(list).<br>
   I.e. a key may occur more than once without losing values: the values of each key are collected in a list.
* `agg_defaultdicts`: Aggregates a Series of defaultdict(list) to a single defaultdict(list).

In [1]:
# blab init: make the `blab` helper package available and run its startup notebook.
try:
    import blab
except ImportError as e:
    # blab is not installed yet (e.g. on a fresh Colab runtime): install it, then retry the import
    !pip install blab
    import blab    
# blab_startup() returns what is handed to %run below
# (presumably the path of a startup notebook/script — TODO confirm against the blab docs)
startup_notebook = blab.blab_startup()
%run $startup_notebook 

blab init
environment['in_colab']     = False
environment['dropbox_path'] = /home/me/Data_Linux/Dropbox
environment['lib_path']     = /home/me/Data_Linux/Dropbox/31_Projekte/01_Python/libs
Start Time: 13:30:49


In [2]:
import numpy      as np
import pandas     as pd 
import bpyth      as bpy

# pandasklar: import it as `pak`, installing it first if it is missing
try:
    import pandasklar as pak 
except ImportError as e:
    # not installed yet: install via pip, then retry the import
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose: switch on the VERBOSE config option, which makes verbose=True the default
# for all pandasklar functions (see the message printed by this cell)
pak.Config.set('VERBOSE', True)

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions



## group_and_agg()

In [3]:
# Show the signature and docstring of group_and_agg (IPython `?` help)
?pak.group_and_agg

[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mgroup_and_agg[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_origins[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_funcs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_names[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdropna[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0moptimize[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Groups and aggregates. Provides a user interface similar to that of MS Access.
* col_origins: list of all columns to process
* col_funcs:   list of all functions to apply to the columns above. 
               Sometimes you have to use strings, sometimes function names.
               'group' or '' = grouping. 
* col_nam

### Basic example

In [4]:
# Try the example: group the people table by age_class and birthplace and
# aggregate first_name and age.
# The two lists are read column by column: col_funcs[i] is applied to col_origins[i],
# 'group' marks a grouping column. Result columns are named <origin>_<func>.
df = pak.people()
pak.group_and_agg( df, 
                   col_origins=['age_class', 'birthplace', 'first_name',  'age', 'age', 'first_name'],
                   col_funcs  =['group',     'group',      pak.agg_words, 'min', 'max', 'min'],
             ) 

94 rows less, now 6 rows


Unnamed: 0,age_class,birthplace,first_name_agg_words,age_min,age_max,first_name_min
0,20,Berlin,Richard Anni Tom Gerda Susanne Vanessa Marta E...,22,29,Anni
1,20,Bremen,Yannick Steffen Anneliese Bernhard Antje Nina ...,22,29,Andreas
2,30,Berlin,Uwe Klara Detlef Kirsten Ole Dennis Willy Andr...,30,39,Andre
3,30,Bremen,Viktoria Jacob Christopher Evelyn Hans Tanja L...,30,37,Christopher
4,40,Berlin,Nico Bernd Jannis Lina,40,41,Bernd
5,40,Bremen,Willy Jacob,40,41,Jacob


### col_names parameter

In [5]:
# col_names only renames the result columns.
# An empty string keeps the automatically generated name (e.g. age_min).
# NOTE(review): pak.people() is called again here; the recorded outputs of this and the
# previous cell differ, so the sample data is presumably random — confirm.
df = pak.people()
pak.group_and_agg( df, 
                   col_origins=['age_class', 'birthplace', 'first_name',     'age', 'age', 'first_name'],
                   col_funcs  =['group',     'group',      pak.agg_words,    'min', 'max', 'min'       ],
                   col_names  =['AGE_CLASS', 'BIRTHPLACE', 'first_name_agg', '',    '',    ''          ],                  
             ) 

95 rows less, now 5 rows


Unnamed: 0,AGE_CLASS,BIRTHPLACE,first_name_agg,age_min,age_max,first_name_min
0,20,Berlin,Artur Frank Jannik Tanja Jakob Christiane Jann...,20,29,Andre
1,20,Bremen,Noah Rebecca Tom Raphael Hannelore Clara Fried...,21,29,Anja
2,30,Berlin,Ulrike Mohammed Renate Mia Ilse Melissa Christ...,30,39,Artur
3,30,Bremen,Denis Carina Anna Nikolas Bärbel Nele Lars Gus...,30,37,Anette
4,40,Berlin,Stefan,42,42,Stefan


### dropna parameter
Is NaN a regular groupable value? Default: No, NaNs are dropped.

In [6]:
# Create test data.
# The resulting DataFrame `data` is used by the following cells for comparison.

rows = [
    # rows with a missing value (None / NaN) in at least one column
    ['A',  None,  1,      ],
    ['A',  'B',   None,   ],
    ['A',  'B',   np.nan, ],   # np.nan, not np.NaN: that alias was removed in NumPy 2.0
    [None, 'B',   9,      ],

    # complete rows of the group (A, B)
    ['A',  'B',   2,      ],
    ['A',  'B',   8,      ],

    # groups consisting of a single row
    ['Q',  'B',   3,      ],
    ['A',  'Q',   7,      ],
]

# pak.dataframe turns the list of rows into a DataFrame (columns A, B, C in the output)
data = pak.dataframe(rows)
data

Input rtype=('list', 'list', 'str') shape=(8, 3)
rotated=False Output rtype=('DataFrame', 'Series') shape=(8, 3)


Unnamed: 0,A,B,C
0,A,,1.0
1,A,B,
2,A,B,
3,,B,9.0
4,A,B,2.0
5,A,B,8.0
6,Q,B,3.0
7,A,Q,7.0


In [7]:
# dropna=True is the default: NaN is not a groupable value, such rows are dropped.
# Grouped by A and B this way, the extreme values 1 and 9 never reach the aggregation.

r = pak.group_and_agg(
    data,
    col_origins=['A',     'B',     'C',   'C'  ],  # all columns to be processed
    col_funcs  =['group', 'group', 'min', 'max'],  # the function applied to each of them
)

r

5 rows less, now 3 rows


Unnamed: 0,A,B,C_min,C_max
0,A,B,2.0,8.0
1,A,Q,7.0,7.0
2,Q,B,3.0,3.0


In [8]:
# With dropna=False, NaN forms groups of its own.
# Grouped by A and B this way, the extreme values 1 and 9 show up in the result.

r = pak.group_and_agg(
    data,
    col_origins=['A',     'B',     'C',    'C'   ],
    col_funcs  =['group', 'group', 'min',  'max' ],
    col_names  =['A',     'B',     'Cmin', 'Cmax'],  # new names for the result columns
    dropna=False,
)

r

3 rows less, now 5 rows


Unnamed: 0,A,B,Cmin,Cmax
0,A,B,2.0,8.0
1,A,Q,7.0,7.0
2,A,,1.0,1.0
3,Q,B,3.0,3.0
4,,B,9.0,9.0


## Aggregate functions min(), max(), first(), last()
More aggregate functions: https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/

In [9]:
# Create test data: two grouping columns (A, B) and a value column (C).
rows = [
    # rows of group (A, B) without a value
    ['A',  'B',   None,   ],
    ['A',  'B',   np.nan, ],   # np.nan, not np.NaN: that alias was removed in NumPy 2.0

    # rows with values
    ['A',  'B',   2,      ],
    ['A',  'B',   3,      ],
    ['A',  'Q',   8,      ],
    ['A',  'Q',   7,      ],

    ['Q',  'B',   3,      ],
]

# pak.dataframe turns the list of rows into a DataFrame (columns A, B, C in the output)
data = pak.dataframe(rows)
data

Input rtype=('list', 'list', 'str') shape=(7, 3)
rotated=False Output rtype=('DataFrame', 'Series') shape=(7, 3)


Unnamed: 0,A,B,C
0,A,B,
1,A,B,
2,A,B,2.0
3,A,B,3.0
4,A,Q,8.0
5,A,Q,7.0
6,Q,B,3.0


In [10]:
# Four aggregations of the same column C per (A, B) group.
# As the recorded output shows, 'first' and 'last' skip the missing values.
r = pak.group_and_agg(
    data,
    col_origins=['A',     'B',     'C',   'C',   'C',     'C'   ],
    col_funcs  =['group', 'group', 'min', 'max', 'first', 'last'],
)
r

4 rows less, now 3 rows


Unnamed: 0,A,B,C_min,C_max,C_first,C_last
0,A,B,2.0,3.0,2.0,3.0
1,A,Q,7.0,8.0,8.0,7.0
2,Q,B,3.0,3.0,3.0,3.0


## Aggregate functions count(), size(), unique()

In [11]:
# Create test data: like before, but group (A, Q) now holds the value 8 twice.
rows = [
    # rows of group (A, B) without a value
    ['A',  'B',   None,   ],
    ['A',  'B',   np.nan, ],   # np.nan, not np.NaN: that alias was removed in NumPy 2.0

    # rows with values
    ['A',  'B',   2,      ],
    ['A',  'B',   3,      ],
    ['A',  'Q',   8,      ],
    ['A',  'Q',   8,      ],

    ['Q',  'B',   3,      ],
]

# pak.dataframe turns the list of rows into a DataFrame (columns A, B, C in the output)
data = pak.dataframe(rows)
data

Input rtype=('list', 'list', 'str') shape=(7, 3)
rotated=False Output rtype=('DataFrame', 'Series') shape=(7, 3)


Unnamed: 0,A,B,C
0,A,B,
1,A,B,
2,A,B,2.0
3,A,B,3.0
4,A,Q,8.0
5,A,Q,8.0
6,Q,B,3.0


In [12]:
# count   counts the values,             NaN is not counted
# size    counts the rows,               NaN is included in the count
# nunique counts the distinct values,    NaN is not counted

r = pak.group_and_agg(
    data,
    col_origins=['A',     'B',     'C',     'C',    'C'      ],
    col_funcs  =['group', 'group', 'count', 'size', 'nunique'],
)

r

4 rows less, now 3 rows


Unnamed: 0,A,B,C_count,C_size,C_nunique
0,A,B,2,4,2
1,A,Q,2,2,1
2,Q,B,1,1,1


## Aggregate functions most_freq_elt() and top_values()
Identify the most frequent elements and determine their counts.

In [13]:
# Show the signature and docstring of most_freq_elt (IPython `?` help)
?pak.most_freq_elt

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mmost_freq_elt[0m[0;34m([0m[0mseries[0m[0;34m,[0m [0minaccurate_limit[0m[0;34m=[0m[0;34m([0m[0;36m10000[0m[0;34m,[0m [0;36m1000[0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregates a Series to the most frequent scalar element.
Like Series.mode, but always returns a scalar.
If two elements are equally frequent, just any one is returned .
* inaccurate_limit: If the data is bigger than this, examine take a sample of this size.
  The first value is for hashable datatypes, the second for non_hashable datatypes.
  Set inaccurate_limit=(None,None) if you don't accept inaccuraties.

Example:
df = pak.people()
df.groupby('age_class')['first_name'].apply(pak.most_freq_elt)    
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      function


In [14]:
# Run the example from the docstring: the most frequent first name per age class
df = pak.people()
(
    df
    .groupby('age_class')['first_name']
    .apply(pak.most_freq_elt)
)

age_class
20    Carina
30       Tom
40      Anja
Name: first_name, dtype: object

In [15]:
# Show the signature and docstring of top_values (IPython `?` help)
?pak.top_values

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mtop_values[0m[0;34m([0m[0mseries[0m[0;34m,[0m [0mlimit[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m [0mcount[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregates a Series to a list of the most frequent elements.
Or, if there is only one, this single element.
Can also return the counts of the most frequent elements.    
* limit: limits the length of the resulting list
* count: count=False shows the most frequent elements (default)
         cont=True   shows the corresponding counts of the elements
Example:
df = pak.people()
df.groupby('age_class')['first_name'].apply(pak.top_values)

Caution, does not work well with very long data sets.
There are partials preconfigured for 3, 5, 10, 20, 100, 1000 elements,
i.e. top_values_100 or top_values_count_20
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      function


In [16]:
# Run the example from the docstring: the most frequent first names per age class
df = pak.people()
(
    df
    .groupby('age_class')['first_name']
    .apply(pak.top_values)
)

age_class
20          [Ingo, Marc, Maik]
30    [Lars, Jessica, Florian]
40           [Isabella, Sonja]
Name: first_name, dtype: object

In [17]:
# Create test data: group A mixes missing values and numbers, group B holds strings.
rows = [
    # group A: missing values
    ['A',  None,   ],
    ['A',  None,   ],
    ['A',  np.nan, ],   # np.nan, not np.NaN: that alias was removed in NumPy 2.0

    # group A: numbers, with 2 and 3 being equally frequent
    ['A',  2,      ],
    ['A',  2,      ],
    ['A',  3,      ],
    ['A',  3,      ],
    ['A',  4,      ],
    ['A',  None,   ],

    # group B: 'p' is the most frequent element
    ['B',  'p',    ],
    ['B',  'p',    ],
    ['B',  'z',    ],
]

# pak.dataframe turns the list of rows into a DataFrame (columns A, B in the output)
data = pak.dataframe(rows)
data

Input rtype=('list', 'list', 'str') shape=(12, 2)
rotated=False Output rtype=('DataFrame', 'Series') shape=(12, 2)


Unnamed: 0,A,B
0,A,
1,A,
2,A,
3,A,2
4,A,2
5,A,3
6,A,3
7,A,4
8,A,
9,B,p


In [18]:
# most_freq_elt     like mode, but always returns a scalar and never NaN
# mode              returns a scalar only if the result is unique. No NaN either.
# top_values        returns the most frequent elements as a list; None/NaN are counted like normal elements.
#                   There are top_values variants preconfigured for 3, 5, 10, 20, 100, 1000 elements.
# top_values_count  returns the matching list of frequencies.

r = pak.group_and_agg(
    data,
    col_origins=['A',     'B',               'B',            'B',               'B'                    ],
    # mode has to be passed explicitly as the function pd.Series.mode
    col_funcs  =['group', pak.most_freq_elt, pd.Series.mode, pak.top_values_10, pak.top_values_count_10],
    col_names  =['A',     'B_mostfreq',      '',             'B_top',           'B_count'              ],
)

r

10 rows less, now 2 rows


Unnamed: 0,A,B_mostfreq,B_mode,B_top,B_count
0,A,2,"[2, 3]","[None, 2, 3, nan, 4]","[3, 2, 2, 1, 1]"
1,B,p,p,"[p, z]","[2, 1]"


## Aggregate functions first_valid_value() and last_valid_value()

In [19]:
# Show the signature and docstring of first_valid_value (IPython `?` help)
?pak.first_valid_value

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0mfirst_valid_value[0m[0;34m([0m[0mseries[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m Returns the first not-nan values of a Series. 
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/pandas.py
[0;31mType:[0m      function


In [20]:
# A mixed Series: the first valid value is 1, the last valid value is 'z'.
# (np.nan instead of np.NaN: that alias was removed in NumPy 2.0)
s = pd.Series([np.nan, 1, np.nan, 'z', np.nan])
assert pak.first_valid_value(s) == 1
assert pak.last_valid_value(s) == 'z'

In [21]:
# A Series without any valid value: both functions return None.
# (np.nan instead of np.NaN: that alias was removed in NumPy 2.0)
s = pd.Series([np.nan, None])
assert pak.first_valid_value(s) is None
assert pak.last_valid_value(s)  is None

## Aggregate strings
* agg_words, agg_words_nodup, agg_strings_nospace

In [22]:
# Show the signature and docstring of agg_words (IPython `?` help)
?pak.agg_words

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0magg_words[0m[0;34m([0m[0mseries[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregates a Series of strings to a long string.
A space is always placed between the elements,
the order is preserved.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      function


In [23]:
# Show the signature and docstring of agg_words_nodup (IPython `?` help)
?pak.agg_words_nodup

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0magg_words_nodup[0m[0;34m([0m[0mseries[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregates a Series of strings (e.g. signal words) to a long string.
A space is always placed between the individual elements,
the order is preserved,
duplicates are removed.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      function


In [24]:
# Show the signature and docstring of agg_strings_nospace (IPython `?` help)
?pak.agg_strings_nospace

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0magg_strings_nospace[0m[0;34m([0m[0mseries[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregates a Series of strings into one long string.
No separators between the substrings.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      function


In [25]:
# Create test data: group A holds words and missing values,
# group B holds empty and whitespace-only strings next to a duplicated word.
rows = [
    # group A: missing values
    ['A',  None,        ],
    ['A',  None,        ],
    ['A',  np.nan,      ],   # np.nan, not np.NaN: that alias was removed in NumPy 2.0

    # group A: words, some of them repeated
    ['A',  'Who',       ],
    ['A',  'is is',     ],
    ['A',  'who',       ],
    ['A',  'and',       ],
    ['A',  'who',       ],
    ['A',  'is is is',  ],
    ['A',  None,        ],
    ['A',  'what?',     ],

    # group B
    ['B',  '',          ],
    ['B',  '   ',       ],  # several narrow spaces
    ['B',  'zz',        ],
    ['B',  ' ',         ],  # one narrow space
    ['B',  'zz',        ],
]

# pak.dataframe turns the list of rows into a DataFrame (columns A, B in the output)
data = pak.dataframe(rows)
data

Input rtype=('list', 'list', 'str') shape=(16, 2)
rotated=False Output rtype=('DataFrame', 'Series') shape=(16, 2)


Unnamed: 0,A,B
0,A,
1,A,
2,A,
3,A,Who
4,A,is is
5,A,who
6,A,and
7,A,who
8,A,is is is
9,A,


In [26]:
# agg_words:           joins all strings like words in a sentence. NaN is ignored.
# agg_words_nodup:     joins them like signal words in a set: no duplicates, order is preserved. NaN is ignored.
# agg_strings_nospace: simply appends everything one after the other. NaN is ignored.

r = pak.group_and_agg(
    data,
    col_origins=['A',     'B',           'B',                 'B'                    ],
    col_funcs  =['group', pak.agg_words, pak.agg_words_nodup, pak.agg_strings_nospace],
)

r

14 rows less, now 2 rows


Unnamed: 0,A,B_agg_words,B_agg_words_nodup,B_agg_strings_nospace
0,A,Who is is who and who is is is what?,Who is who and what?,Whois iswhoandwhois is iswhat?
1,B,zz zz,zz,zz zz


## Aggregate lists
* list, explode, sum

In [27]:
# Create test data: group A mixes missing values and numbers, group B holds strings.
rows = [
    # group A: missing values
    ['A',  None,   ],
    ['A',  None,   ],
    ['A',  np.nan, ],   # np.nan, not np.NaN: that alias was removed in NumPy 2.0

    # group A: numbers with one more missing value in between
    ['A',  2,      ],
    ['A',  2,      ],
    ['A',  3,      ],
    ['A',  3,      ],
    ['A',  None,   ],
    ['A',  4,      ],

    # group B
    ['B',  'p',    ],
    ['B',  'p',    ],
    ['B',  'z',    ],
]

# pak.dataframe turns the list of rows into a DataFrame (columns A, B in the output)
data = pak.dataframe(rows)
data

Input rtype=('list', 'list', 'str') shape=(12, 2)
rotated=False Output rtype=('DataFrame', 'Series') shape=(12, 2)


Unnamed: 0,A,B
0,A,
1,A,
2,A,
3,A,2
4,A,2
5,A,3
6,A,3
7,A,
8,A,4
9,B,p


### list aggregates scalars to lists

In [28]:
# The builtin list collects all scalars of a group, missing values included, into one list
r = pak.group_and_agg(
    data,
    col_origins=['A',     'B' ],
    col_funcs  =['group', list],  # by the way, this works the same way with set
    col_names  =['A',     'B' ],
)

r

10 rows less, now 2 rows


Unnamed: 0,A,B
0,A,"[None, None, nan, 2, 2, 3, 3, None, 4]"
1,B,"[p, p, z]"


### explode turns lists back into scalars

In [29]:
# Undo the list aggregation: explode() gives every list element its own row again
# (the index of the aggregated row is repeated, as the output shows)
r.explode('B')

Unnamed: 0,A,B
0,A,
0,A,
0,A,
0,A,2
0,A,2
0,A,3
0,A,3
0,A,
0,A,4
1,B,p


In [30]:
# Round trip: after exploding the list column and resetting the repeated index,
# the result must equal the original test data.
exploded = r.explode('B')
r = pak.reset_index(exploded)
assert pak.check_equal(data, r)

### agg_to_list()

In [31]:
# Show the signature and docstring of agg_to_list (IPython `?` help)
?pak.agg_to_list

[0;31mSignature:[0m [0mpak[0m[0;34m.[0m[0magg_to_list[0m[0;34m([0m[0mseries[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Aggregates a Series to a list. 
Normally this can also be done with a simple 'list', but in combination with transform this does not work.
Then agg_to_list can be used as a substitute.
[0;31mFile:[0m      ~/Data_Linux/Dropbox/31_Projekte/01_Python/git/pandasklar/src/pandasklar/aggregate.py
[0;31mType:[0m      function


### sum aggregates lists to longer lists

In [32]:
# Create test data: the second column holds lists (plus a few missing values).
rows = [
    # group A: missing values and a list of missing values
    ['A',  None,                   ],
    ['A',  [None, None],           ],
    ['A',  np.nan,                 ],   # np.nan, not np.NaN: that alias was removed in NumPy 2.0

    # group A: lists of words
    ['A',  ['Who', 'is', 'who'],   ],
    ['A',  ['and', 'qq'],          ],
    ['A',  ['who', 'qq'],          ],
    ['A',  ['is', 'qq'],           ],
    ['A',  [None,  'qq'],          ],
    ['A',  ['what?', 'qq'],        ],

    # group B
    ['B',  [''],                   ],
    ['B',  ['xx', 'yy'],           ],
    ['B',  ['zz'],                 ],
]

# pak.dataframe turns the list of rows into a DataFrame (columns A, B in the output)
data = pak.dataframe(rows)
data

Input rtype=('list', 'list', 'str') shape=(-77, -77)
rotated=False Output rtype=('DataFrame', 'Series') shape=(12, 2)


Unnamed: 0,A,B
0,A,
1,A,"[None, None]"
2,A,
3,A,"[Who, is, who]"
4,A,"[and, qq]"
5,A,"[who, qq]"
6,A,"[is, qq]"
7,A,"[None, qq]"
8,A,"[what?, qq]"
9,B,[]


In [33]:
# The builtin sum concatenates the lists of each group into one longer list
r = pak.group_and_agg(
    data,
    col_origins=['A',     'B'],
    col_funcs  =['group', sum],
    col_names  =['A',     'B'],
)

r

10 rows less, now 2 rows


Unnamed: 0,A,B
0,A,"[None, None, Who, is, who, and, qq, who, qq, i..."
1,B,"[, xx, yy, zz]"


## Aggregate dicts

In [34]:
# Create test data: a grouping column and a column of dicts
rows = [
    ['A',  {'a':1},           ],
    ['A',  {'a':1},           ],
    ['A',  {'b':2, 'c':3 },   ],
    ['A',  {'a':4},           ],
    ['B',  {'bb':22},         ],
    #['B',  {},                ],
]

data = pak.dataframe(rows)
data.columns = ['A','D']

# DD starts out as a copy of column D ...
data['DD'] = data.D.copy()

# ... and dict_to_defaultdict then converts the dicts in that column into defaultdicts
# (every value ends up wrapped in a list, as the output shows)
data = pak.dict_to_defaultdict(data, col='DD')
data

Input rtype=('list', 'list', 'str') shape=(-77, -77)
rotated=False Output rtype=('DataFrame', 'Series') shape=(5, 2)


Unnamed: 0,A,D,DD
0,A,{'a': 1},{'a': [1]}
1,A,{'a': 1},{'a': [1]}
2,A,"{'b': 2, 'c': 3}","{'b': [2], 'c': [3]}"
3,A,{'a': 4},{'a': [4]}
4,B,{'bb': 22},{'bb': [22]}


In [35]:
# agg_dicts      merges the dicts of a group; conflicting values are overwritten
# agg_dicts_2dd  does the same, but returns defaultdicts, so no values are lost

r = pak.group_and_agg(
    data,
    col_origins=['A',     'D',           'D'              ],
    col_funcs  =['group', pak.agg_dicts, pak.agg_dicts_2dd],
    col_names  =['A',     'D_dict',      'D_defaultdict'  ],
)

r

3 rows less, now 2 rows


Unnamed: 0,A,D_dict,D_defaultdict
0,A,"{'a': 4, 'b': 2, 'c': 3}","{'a': [1, 4], 'b': [2], 'c': [3]}"
1,B,{'bb': 22},{'bb': [22]}


## Aggregate defaultdicts

In [36]:
# See the test data again (column DD holds the defaultdict version of column D)
data

Unnamed: 0,A,D,DD
0,A,{'a': 1},{'a': [1]}
1,A,{'a': 1},{'a': [1]}
2,A,"{'b': 2, 'c': 3}","{'b': [2], 'c': [3]}"
3,A,{'a': 4},{'a': [4]}
4,B,{'bb': 22},{'bb': [22]}


In [37]:
# agg_defaultdicts merges the defaultdicts of each group into a single defaultdict

r = pak.group_and_agg(
    data,
    col_origins=['A',     'DD'                ],
    col_funcs  =['group', pak.agg_defaultdicts],
    col_names  =['A',     'DD'                ],
)

r

3 rows less, now 2 rows


Unnamed: 0,A,DD
0,A,"{'a': [1, 4], 'b': [2], 'c': [3]}"
1,B,{'bb': [22]}
