# Explode and Implode Dictionaries
* `explode_dict`: Like pandas explode, but for a dict.<br>
  Turns dictionaries into two columns (key, value) and additional rows, if needed.
* `implode_to_dict`: Reversal of explode_dict.<br>
  Groups rows and turns two columns (key, value) into one dict. 
* `cols_to_dict`: Moves columns into a dict or defaultdict. 


## Using dictionaries in pandas DataFrames
Pandas columns usually contain exactly 1 property. 
* By using columns of dict or defaultdict, a single column can hold any number of properties. Pandasklar provides methods to explode and implode the dicts into rows.
* You can even move a bunch of columns into a single dict column, and back. This is like imploding / exploding into columns.
* See also: Jupyter notebook about Aggregating rows

In [1]:
# blab init
try:
    import blab
except ImportError as e:
    !pip install blab
    import blab    
startup_notebook = blab.blab_startup()
%run $startup_notebook 

In [3]:
import numpy      as np
import pandas     as pd 
#import bpyth      as bpy

# pandasklar
try:
    import pandasklar as pak 
except ImportError as e:
    !pip install pandasklar
    import pandasklar as pak   
    
# verbose
pak.Config.set('VERBOSE', True)

grid = pak.grid

VERBOSE = True
--> setting verbose=True as default for all pandasklar functions

time: 10.8 s


## Exploding and imploding dicts

### explode_dict()

In [3]:
# Show the signature and docstring of explode_dict (IPython help syntax)
?pak.explode_dict

time: 79.6 ms


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mexplode_dict[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_dict[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_key[0m[0;34m=[0m[0;34m'key'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_value[0m[0;34m=[0m[0;34m'value'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfrom_defaultdict[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Like pandas explode, but for a dict.
Turns dictionaries into two columns (key, value) and additional rows, if needed.
* col_dict:          name of the column that contains the dict to explode
* col_key:           name of the new column for the keys of the dict
* col_value:         name of the new column for the values of the dict   
* from_defaultdict:  Should an additional explode be executed? 
                     This can be useful for defaultdicts. Otherwise you get l

In [4]:
# Test data: one group label and one dict per row
rows = [
    ['A', {'a': 1}],
    ['A', {'a': 1}],
    ['A', {'b': 2, 'c': 3}],
    ['A', {'a': 4}],
    ['B', {'bb': 22}],
    # ['B', {}],   # row with an empty dict, currently left out
]

data = pak.dataframe(rows)
data.columns = ['A', 'D']
data

Unnamed: 0,A,D
0,A,{'a': 1}
1,A,{'a': 1}
2,A,"{'b': 2, 'c': 3}"
3,A,{'a': 4}
4,B,{'bb': 22}


time: 44.3 ms


In [5]:
# Explode the dicts in column D into key-value pairs.
# col_key='key' and col_value='value' are the defaults and therefore omitted.
r = pak.explode_dict(data, col_dict='D')
r

Unnamed: 0,A,key,value
0,A,a,1.0
1,A,a,1.0
2,A,b,2.0
2,A,c,3.0
3,A,a,4.0
4,B,bb,22.0


time: 51.8 ms


### implode_to_dict()

In [6]:
# Show the signature and docstring of implode_to_dict (IPython help syntax)
?pak.implode_to_dict

time: 29.4 ms


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mimplode_to_dict[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcols_group[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_key[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_value[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_result[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_defaultdict[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Reversal of explode_dict.
Groups rows and turns two columns (key, value) into one dict. 
* cols_group       is a string or list of names by which to group.
                   This affects the width of the results. None=no grouping.
* col_key          is the name of the column containing the keys.
* col_value        is the name of the column containing the values


In [7]:
# Preparation for the undo:
# the original row index has to become a regular column named 'i',
# so that it can be used as a grouping key for the implode step.
r = r.rename_axis('i').reset_index()
r

Unnamed: 0,i,A,key,value
0,0,A,a,1.0
1,1,A,a,1.0
2,2,A,b,2.0
3,2,A,c,3.0
4,3,A,a,4.0
5,4,B,bb,22.0


time: 46.7 ms


In [8]:
# implode_to_dict: group by the former index 'i' (and A), rebuild the dicts
# in column D, then drop the helper column 'i' again
df = pak.drop_cols(
    pak.implode_to_dict(
        r,
        cols_group=['i', 'A'],
        col_key='key',
        col_value='value',
        col_result='D',
    ),
    'i',
)
df

Unnamed: 0,A,D
0,A,{'a': 1.0}
1,A,{'a': 1.0}
2,A,"{'b': 2.0, 'c': 3.0}"
3,A,{'a': 4.0}
4,B,{'bb': 22.0}


time: 64.1 ms


In [9]:
# Round trip check: explode_dict followed by implode_to_dict
# gives back a frame that check_equal accepts as equal to the original data
assert pak.check_equal(data,df)

time: 35.9 ms


## Exploding and imploding defaultdicts

In [10]:
# Test data: one group label and one dict per row
rows = [
    ['A', {'a': 1}],
    ['A', {'a': 1}],
    ['A', {'b': 2, 'c': 3}],
    ['A', {'a': 4}],
    ['B', {'bb': 22}],
    # ['B', {}],   # row with an empty dict, currently left out
]

data = pak.dataframe(rows)
data.columns = ['A', 'D']

# DD starts out as a copy of D ...
data['DD'] = data['D'].copy()

# ... and dict_to_defaultdict then turns the dicts in that column into defaultdicts
data = pak.dict_to_defaultdict(data, col='DD')
data

Unnamed: 0,A,D,DD
0,A,{'a': 1},{'a': [1]}
1,A,{'a': 1},{'a': [1]}
2,A,"{'b': 2, 'c': 3}","{'b': [2], 'c': [3]}"
3,A,{'a': 4},{'a': [4]}
4,B,{'bb': 22},{'bb': [22]}


time: 57.8 ms


In [11]:
# Imploding to a plain dict keeps only one value per key:
# if a key occurs more than once within a group, the last assignment wins.
# That is why {'a': 1} does not show up in the result.
pak.implode_to_dict(
    r,
    cols_group='A',
    col_key='key',
    col_value='value',
    col_result='D',
)

Unnamed: 0,A,D
0,A,"{'a': 4.0, 'b': 2.0, 'c': 3.0}"
1,B,{'bb': 22.0}


time: 55.9 ms


### implode_to_defaultdict()
`implode_to_defaultdict()` is shorthand for `implode_to_dict(..., use_defaultdict=True)`.

In [12]:
# Imploding to defaultdicts instead collects all values per key, so nothing is lost
dd1 = pak.implode_to_defaultdict(
    r,
    cols_group='A',
    col_key='key',
    col_value='value',
    col_result='DD',
)
dd1

Unnamed: 0,A,DD
0,A,"{'a': [1.0, 4.0], 'b': [2.0], 'c': [3.0]}"
1,B,{'bb': [22.0]}


time: 61.6 ms


### Explode defaultdicts

In [13]:
# defaultdicts can be exploded as well;
# from_defaultdict=True requests the additional explode
# (from_defaultdict=False creates lists)
df = pak.explode_dict(dd1, col_dict='DD', from_defaultdict=True)
df

Unnamed: 0,A,key,value
0,A,a,1.0
0,A,a,4.0
0,A,b,2.0
0,A,c,3.0
1,B,bb,22.0


time: 36.9 ms


In [14]:
# And back again: implode the key/value rows into defaultdicts
dd2 = pak.implode_to_dict(
    df,
    cols_group='A',
    col_key='key',
    col_value='value',
    col_result='DD',
    use_defaultdict=True,
)
dd2

Unnamed: 0,A,DD
0,A,"{'a': [1.0, 4.0], 'b': [2.0], 'c': [3.0]}"
1,B,{'bb': [22.0]}


time: 69 ms


## Implode columns into dict

### cols_to_dict()

In [15]:
# Show the signature and docstring of cols_to_dict (IPython help syntax)
?pak.cols_to_dict

time: 31.5 ms


[0;31mSignature:[0m
[0mpak[0m[0;34m.[0m[0mcols_to_dict[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mdf[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcol_dict[0m[0;34m=[0m[0;34m''[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcols_add[0m[0;34m=[0m[0;34m[[0m[0;34m][0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0muse_defaultdict[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdrop[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Moves columns into a dict or defaultdict.
This is 
* col_dict:         name of the target column. Can be empty, but may already contain a dict or defaultdict. 
* cols_add:         Columns to be packed.
* use_defaultdict:  Should a defaultdict be used as data structure? Otherwise keys can only occur once.
* drop:             Should the packed columns be dropped (>> move) or not (>> copy)?
[0;31mFile:[0m      /media/me/DATA/Dropbox/31_Projekte/01_

In [16]:
# Test data: a dict column D (one entry is None) plus two plain columns A and B
rows = [
    [{'x': 1},          'who', 111],
    [{'y': 2, 'z': 3},  'is',  222],
    [{'B': 4},          'who', 333],
    [None,              '!!!', 444],
    [{'x': 1},          '',    None],
]

data = pak.dataframe(rows)
data.columns = ['D', 'A', 'B']
data

Unnamed: 0,D,A,B
0,{'x': 1},who,111.0
1,"{'y': 2, 'z': 3}",is,222.0
2,{'B': 4},who,333.0
3,,!!!,444.0
4,{'x': 1},,


time: 37.3 ms


In [17]:
# Columns A and B are moved into the dicts of column D.
# Note row 2: the existing key 'B' gets overwritten by the value of column B.
r = pak.cols_to_dict(
    data,
    col_dict='D',
    cols_add=['A', 'B'],
)
r

Unnamed: 0,D
0,"{'x': 1, 'A': 'who', 'B': 111.0}"
1,"{'y': 2, 'z': 3, 'A': 'is', 'B': 222.0}"
2,"{'B': 333.0, 'A': 'who'}"
3,"{'A': '!!!', 'B': 444.0}"
4,{'x': 1}


time: 39.9 ms


In [18]:
# explode it
# NOTE(review): this step is disabled; re-enable it or delete the cell
# once explode_dict has been checked against the cols_to_dict result.
#df = pak.explode_dict(r, 'D') 
#df

time: 30.2 ms


### cols_to_defaultdict()

In [19]:
# Test data: a dict column DD plus two plain columns A and B.
# Rows 2 and 3 use keys 'A' and 'B', which collide with the column names.
rows = [
    [{'x': 1},          'who', 111],
    [{'y': 2, 'z': 3},  'is',  222],
    [{'A': 4},          'who', 333],
    [{'B': 4},          '!!!', 444],
    [{'x': 1},          '',    None],
]

data = pak.dataframe(rows)
data.columns = ['DD', 'A', 'B']

# dict_to_defaultdict turns the dicts in the given column into defaultdicts
data = pak.dict_to_defaultdict(data, col='DD')
data

Unnamed: 0,DD,A,B
0,{'x': [1]},who,111.0
1,"{'y': [2], 'z': [3]}",is,222.0
2,{'A': [4]},who,333.0
3,{'B': [4]},!!!,444.0
4,{'x': [1]},,


time: 45.7 ms


In [20]:
# Columns A and B are moved into the defaultdicts of column DD.
# On a key conflict nothing is overwritten: the new value is appended.
r = pak.cols_to_defaultdict(
    data,
    col_dict='DD',
    cols_add=['A', 'B'],
)
r

Unnamed: 0,DD
0,"{'x': [1], 'A': ['who'], 'B': [111.0]}"
1,"{'y': [2], 'z': [3], 'A': ['is'], 'B': [222.0]}"
2,"{'A': [4, 'who'], 'B': [333.0]}"
3,"{'B': [4, 444.0], 'A': ['!!!']}"
4,{'x': [1]}


time: 46.6 ms


In [21]:
# explode it
# NOTE(review): this step is disabled; re-enable it or delete the cell
# once explode_dict(..., from_defaultdict=True) has been checked against
# the cols_to_defaultdict result.
#df = pak.explode_dict(r, 'DD', from_defaultdict=True) 
#df

time: 43.2 ms
