In [25]:
# blab init
# Ask blab for its startup notebook and execute that notebook in this kernel.
# NOTE(review): the startup notebook presumably also switches on the per-cell
# "time: ..." lines visible in the outputs — confirm in blab.
import blab
startup_notebook = blab.blab_startup()
# IPython expands $startup_notebook to the value of the Python variable
%run $startup_notebook

blab init
blab found local libs at D:\Dropbox\31_Projekte\01_Python\libs
Start Time: 20:45:35


time: 126 ms


# Explode and Implode Dictionaries
* `explode_dict`: Like pandas explode, but for a dict.<br>
  Turns dictionaries into two columns (key, value) and additional rows, if needed.
* `implode_to_dict`: Reversal of explode_dict.<br>
  Groups rows and turns two columns (key, value) into one dict. 
* `cols_to_dict`: Moves columns into a dict or defaultdict. 


## Using dictionaries in pandas DataFrames
Pandas columns usually contain exactly 1 property. 
* A column of dicts or defaultdicts, however, can hold any number of properties per row. Pandasklar provides methods to explode and implode these dicts into rows.
* You can even move a bunch of columns into a single dict column, and back. This is like imploding / exploding into columns.
* See also: Jupyter notebook about Aggregating rows

In [26]:
import numpy      as np
import pandas     as pd 
#import bpyth      as bpy
import pandasklar as pak 
# Short alias for pak.grid (not referenced in the cells shown in this part of the notebook)
grid = pak.grid

time: 120 ms


## Exploding and imploding dicts

### explode_dict()

In [27]:
# Display signature and docstring of explode_dict (IPython help syntax)
?pak.explode_dict

time: 121 ms


[1;31mSignature:[0m
[0mpak[0m[1;33m.[0m[0mexplode_dict[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0mcol_dict[0m[1;33m,[0m[1;33m
[0m    [0mcol_key[0m[1;33m=[0m[1;34m'key'[0m[1;33m,[0m[1;33m
[0m    [0mcol_value[0m[1;33m=[0m[1;34m'value'[0m[1;33m,[0m[1;33m
[0m    [0mfrom_defaultdict[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Like pandas explode, but for a dict.
Turns dictionaries into two columns (key, value) and additional rows, if needed.
* col_dict:          name of the column that contains the dict to explode
* col_key:           name of the new column for the keys of the dict
* col_value:         name of the new column for the values of the dict   
* from_defaultdict:  Should an additional explode be executed? 
                     This can be useful for defaultdicts. Otherwise you get lists.
[1;31mFile:[0m      d:\dropbox\31_projekte\01_python\git\p

In [28]:
# Test data: column A is a group label, column D holds one dict per row.
data = pak.dataframe([
    ['A', {'a': 1}],
    ['A', {'a': 1}],
    ['A', {'b': 2, 'c': 3}],
    ['A', {'a': 4}],
    ['B', {'bb': 22}],
    # ['B', {}],   # disabled in the original: a row with an empty dict
])
data.columns = ['A', 'D']
data

Unnamed: 0,A,D
0,A,{'a': 1}
1,A,{'a': 1}
2,A,"{'b': 2, 'c': 3}"
3,A,{'a': 4}
4,B,{'bb': 22}


time: 121 ms


In [29]:
# Explode column D into key/value pairs: one output row per dict entry.
# The target column names are spelled out here; they equal the defaults.
# Note that the integer values are displayed as floats in the result.
r = pak.explode_dict(
    data,
    col_dict='D',
    col_key='key',
    col_value='value',
)
r

Unnamed: 0,A,key,value
0,A,a,1.0
1,A,a,1.0
2,A,b,2.0
2,A,c,3.0
3,A,a,4.0
4,B,bb,22.0


time: 125 ms


### implode_to_dict()

In [30]:
# Display signature and docstring of implode_to_dict (IPython help syntax)
?pak.implode_to_dict

time: 119 ms


[1;31mSignature:[0m
[0mpak[0m[1;33m.[0m[0mimplode_to_dict[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0mcols_group[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcol_key[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcol_value[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcol_result[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0muse_defaultdict[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Reversal of explode_dict.
Groups rows and turns two columns (key, value) into one dict. 
* cols_group       is a string or list of names by which to group.
                   This affects the width of the results. None=no grouping.
* col_key          is the name of the column containing the keys.
* col_value        is the name of the column containing the values
* col_result       is the name of the column containing the results
* use_def

In [31]:
# Preparation for undoing the explode:
# the exploded rows still carry the label of their source row in the index.
# Name that index 'i' and turn it into a regular column, so it can serve
# as a grouping key that keeps the original rows apart.
r = r.rename_axis('i').reset_index()
r

Unnamed: 0,i,A,key,value
0,0,A,a,1.0
1,1,A,a,1.0
2,2,A,b,2.0
3,2,A,c,3.0
4,3,A,a,4.0
5,4,B,bb,22.0


time: 121 ms


In [32]:
# Undo the explode: group by the source row number 'i' (plus A) so that every
# original row gets its own dict back, then remove the helper column 'i' again.
df = pak.drop_cols(
    pak.implode_to_dict(
        r,
        cols_group=['i', 'A'],
        col_key='key',
        col_value='value',
        col_result='D',
    ),
    'i',
)
df

Unnamed: 0,A,D
0,A,{'a': 1.0}
1,A,{'a': 1.0}
2,A,"{'b': 2.0, 'c': 3.0}"
3,A,{'a': 4.0}
4,B,{'bb': 22.0}


time: 136 ms


In [33]:
# equal!
# Round-trip check: explode_dict followed by implode_to_dict reproduces the
# original frame. The check passes although the displayed values changed
# from 1 to 1.0 on the way.
# NOTE(review): presumably check_equal compares by value, not by dtype — confirm.
assert pak.check_equal(data,df)

time: 123 ms


## Exploding and imploding defaultdicts

In [34]:
# Test data: column A is a group label, column D holds one dict per row.
data = pak.dataframe([
    ['A', {'a': 1}],
    ['A', {'a': 1}],
    ['A', {'b': 2, 'c': 3}],
    ['A', {'a': 4}],
    ['B', {'bb': 22}],
    # ['B', {}],   # disabled in the original: a row with an empty dict
])
data.columns = ['A', 'D']

# DD starts out as a copy of column D ...
data['DD'] = data['D'].copy()

# ... and is then converted: dict_to_defaultdict turns the dicts of the
# given column into defaultdicts (values are displayed wrapped in lists).
data = pak.dict_to_defaultdict(data, col='DD')
data

Unnamed: 0,A,D,DD
0,A,{'a': 1},{'a': [1]}
1,A,{'a': 1},{'a': [1]}
2,A,"{'b': 2, 'c': 3}","{'b': [2], 'c': [3]}"
3,A,{'a': 4},{'a': [4]}
4,B,{'bb': 22},{'bb': [22]}


time: 125 ms


In [35]:
# Imploding to a plain dict keeps only one value per key: if a key occurs
# more than once within a group, the last assignment wins.
# Here group A ends up with 'a': 4.0, the earlier 'a': 1.0 entries are lost.
# (r is the exploded frame with the helper column 'i' from further up.)
pak.implode_to_dict(
    r,
    cols_group='A',
    col_key='key',
    col_value='value',
    col_result='D',
)

Unnamed: 0,A,D
0,A,"{'a': 4.0, 'b': 2.0, 'c': 3.0}"
1,B,{'bb': 22.0}


time: 135 ms


### implode_to_defaultdict()
implode_to_defaultdict() is implode_to_dict(...,use_defaultdict=True)

In [36]:
# Imploding into defaultdicts instead collects all values of a key in a list,
# so repeated keys lose nothing.
dd1 = pak.implode_to_defaultdict(
    r,
    cols_group='A',
    col_key='key',
    col_value='value',
    col_result='DD',
)
dd1

Unnamed: 0,A,DD
0,A,"{'a': [1.0, 4.0], 'b': [2.0], 'c': [3.0]}"
1,B,{'bb': [22.0]}


time: 132 ms


### Explode defaultdicts

In [37]:
# defaultdict columns can be exploded as well.
# from_defaultdict=True also explodes the value lists into single rows;
# with from_defaultdict=False the value column would contain lists.
df = pak.explode_dict(
    dd1,
    col_dict='DD',
    from_defaultdict=True,
)
df

Unnamed: 0,A,key,value
0,A,a,1.0
0,A,a,4.0
0,A,b,2.0
0,A,c,3.0
1,B,bb,22.0


time: 123 ms


In [38]:
# And back again: implode the key/value rows into defaultdicts per group A.
dd2 = pak.implode_to_dict(
    df,
    cols_group='A',
    col_key='key',
    col_value='value',
    col_result='DD',
    use_defaultdict=True,
)
dd2

Unnamed: 0,A,DD
0,A,"{'a': [1.0, 4.0], 'b': [2.0], 'c': [3.0]}"
1,B,{'bb': [22.0]}


time: 132 ms


## Implode columns into dict

### cols_to_dict()

In [39]:
# Display signature and docstring of cols_to_dict (IPython help syntax)
?pak.cols_to_dict

time: 119 ms


[1;31mSignature:[0m
[0mpak[0m[1;33m.[0m[0mcols_to_dict[0m[1;33m([0m[1;33m
[0m    [0mdf[0m[1;33m,[0m[1;33m
[0m    [0mcol_dict[0m[1;33m=[0m[1;34m''[0m[1;33m,[0m[1;33m
[0m    [0mcols_add[0m[1;33m=[0m[1;33m[[0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0muse_defaultdict[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mdrop[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Moves columns into a dict or defaultdict.
This is 
* col_dict:         name of the target column. Can be empty, but may already contain a dict or defaultdict. 
* cols_add:         Columns to be packed.
* use_defaultdict:  Should a defaultdict be used as data structure? Otherwise keys can only occur once.
* drop:             Should the packed columns be dropped (>> move) or not (>> copy)?
[1;31mFile:[0m      d:\dropbox\31_projekte\01_python\git\pandasklar\src\pandasklar\aggregate.py
[1;31mType:[0m      functi

In [40]:
# Test data: a dict column D (one entry is None) and two plain columns A and B.
# Row 2 uses the key 'B', which is also the name of a column.
data = pak.dataframe([
    [{'x': 1},         'who', 111],
    [{'y': 2, 'z': 3}, 'is',  222],
    [{'B': 4},         'who', 333],
    [None,             '!!!', 444],
    [{'x': 1},         '',    None],
])
data.columns = ['D', 'A', 'B']
data

Unnamed: 0,D,A,B
0,{'x': 1},who,111.0
1,"{'y': 2, 'z': 3}",is,222.0
2,{'B': 4},who,333.0
3,,!!!,444.0
4,{'x': 1},,


time: 123 ms


In [41]:
# Columns A and B are moved into the dicts of column D.
# Row 2 shows a key conflict: D already had a key 'B' (value 4), which is
# overwritten by the value of column B (333.0).
# Row 4 shows that the empty string and the missing value are not added.
r = pak.cols_to_dict(
    data,
    col_dict='D',
    cols_add=['A', 'B'],
)
r

Unnamed: 0,D
0,"{'x': 1, 'A': 'who', 'B': 111.0}"
1,"{'y': 2, 'z': 3, 'A': 'is', 'B': 222.0}"
2,"{'B': 333.0, 'A': 'who'}"
3,"{'A': '!!!', 'B': 444.0}"
4,{'x': 1}


time: 123 ms


In [45]:
# explode it
# NOTE(review): the call below is disabled and no reason is recorded — TODO
# confirm whether explode_dict works on this frame, then re-enable or remove.
#df = pak.explode_dict(r, 'D') 
#df

time: 119 ms


### cols_to_defaultdict()

In [46]:
# Test data: column DD will hold defaultdicts, A and B are plain columns.
# Rows 2 and 3 use the keys 'A' and 'B', which are also column names.
data = pak.dataframe([
    [{'x': 1},         'who', 111],
    [{'y': 2, 'z': 3}, 'is',  222],
    [{'A': 4},         'who', 333],
    [{'B': 4},         '!!!', 444],
    [{'x': 1},         '',    None],
])
data.columns = ['DD', 'A', 'B']

# dict_to_defaultdict turns the dicts of the given column into defaultdicts.
data = pak.dict_to_defaultdict(data, col='DD')
data

Unnamed: 0,DD,A,B
0,{'x': [1]},who,111.0
1,"{'y': [2], 'z': [3]}",is,222.0
2,{'A': [4]},who,333.0
3,{'B': [4]},!!!,444.0
4,{'x': [1]},,


time: 123 ms


In [47]:
# Columns A and B are moved into the defaultdicts of column DD.
# On a key conflict nothing is overwritten: the new value is appended to
# the list already stored under that key (see rows 2 and 3).
r = pak.cols_to_defaultdict(
    data,
    col_dict='DD',
    cols_add=['A', 'B'],
)
r

Unnamed: 0,DD
0,"{'x': [1], 'A': ['who'], 'B': [111.0]}"
1,"{'y': [2], 'z': [3], 'A': ['is'], 'B': [222.0]}"
2,"{'A': [4, 'who'], 'B': [333.0]}"
3,"{'B': [4, 444.0], 'A': ['!!!']}"
4,{'x': [1]}


time: 125 ms


In [49]:
# explode it
# NOTE(review): the call below is disabled and no reason is recorded — TODO
# confirm whether explode_dict works on this frame, then re-enable or remove.
#df = pak.explode_dict(r, 'DD', from_defaultdict=True) 
#df

time: 195 ms
