# Adding Settings to Normalize

This notebook tests the new `expand_dicts` and `expand_lists` settings of `normalize`, which switch the expansion of dict and list columns on or off.

## Imports

In [1]:
import os
import sys
sys.path.insert(0, os.pardir)

In [2]:
import pandas as pd
from flat_table import mapper, normalize

## Sample Dataframe

In [3]:
# Sample records: one tuple per row, holding a scalar id, a nested dict
# (or a missing value), a list of strings, a list of dicts and a flat dict.
# The missing user_info cell is a plain float NaN; `pd.np.nan` is avoided
# because the `pandas.np` alias no longer exists in current pandas releases.
data = [
    (
        1001,
        {'first_name': 'john', 'last_name': 'smith', 'phones': {'mobile': '201-..', 'home': '978-..'}},
        ['a', 'b'],
        [{'zip': '07014', 'city': 'clifton'}],
        {'membership': True, 'memberid': '1231'},
    ),
    (
        1002,
        float('nan'),
        ['one', 'two', 'three', 'four'],
        [{'zip': '07014', 'address1': '1 Journal Square'}],
        {'membership': False, 'memberid': ''},
    ),
    (
        1003,
        {'first_name': 'marry', 'last_name': 'kate', 'gender': 'female'},
        ['w', 'y', 'z'],
        [{'zip': '10001', 'city': 'new york'}, {'zip': '10008', 'city': 'brooklyn'}],
        {'membership': True, 'memberid': '9754'},
    ),
]
df = pd.DataFrame(data, columns=['id', 'user_info', 'name_list', 'address', 'membership'])

In [4]:
# Preview the sample frame; it has only three rows, so head() shows all of it.
df.head()

Unnamed: 0,id,user_info,name_list,address,membership
0,1001,"{'first_name': 'john', 'last_name': 'smith', '...","[a, b]","[{'zip': '07014', 'city': 'clifton'}]","{'membership': True, 'memberid': '1231'}"
1,1002,,"[one, two, three, four]","[{'zip': '07014', 'address1': '1 Journal Squar...","{'membership': False, 'memberid': ''}"
2,1003,"{'first_name': 'marry', 'last_name': 'kate', '...","[w, y, z]","[{'zip': '10001', 'city': 'new york'}, {'zip':...","{'membership': True, 'memberid': '9754'}"


## Settings

In [5]:
# Build the mapper table for the sample frame and display it. The displayed
# table has one row per column / nested key, with `parent`, `child`, `type`
# and `obj` columns.
mp = mapper(df)
mp

Unnamed: 0,parent,child,type,obj
0,.,id,int,"0 1001 1 1002 2 1003 Name: id, dtype:..."
1,.,user_info,dict,"0 {'first_name': 'john', 'last_name': 'smit..."
2,user_info,user_info.gender,str,0 NaN 1 NaN 2 female Name: user...
3,user_info,user_info.phones.home,str,0 978-.. 1 NaN 2 NaN Name: user...
4,user_info,user_info.phones.mobile,str,0 201-.. 1 NaN 2 NaN Name: user...
5,user_info,user_info.last_name,str,0 smith 1 NaN 2 kate Name: user_in...
6,user_info,user_info.first_name,str,0 john 1 NaN 2 marry Name: user_in...
7,.,name_list,list,"0 [a, b] 1 [one, two, t..."
8,,name_list,str,0 a 0 b 1 one 1 two 1 ...
9,.,address,list,"0 [{'zip': '07014', 'city': 'cl..."


### Find the Original DataFrame Columns

In [6]:
# Rows whose parent is '.' are the columns of the original frame.
org_df = mp.loc[mp['parent'] == '.']
org_df

Unnamed: 0,parent,child,type,obj
0,.,id,int,"0 1001 1 1002 2 1003 Name: id, dtype:..."
1,.,user_info,dict,"0 {'first_name': 'john', 'last_name': 'smit..."
7,.,name_list,list,"0 [a, b] 1 [one, two, t..."
9,.,address,list,"0 [{'zip': '07014', 'city': 'cl..."
14,.,membership,dict,"0 {'membership': True, 'memberid': '1231'} ..."


### Settings

In [7]:
def select_mapper_rows(mapping, expand_dicts=False, expand_lists=False):
    """Pick the mapper rows to keep for the given expansion settings.

    Parameters
    ----------
    mapping : pandas.DataFrame
        Mapper table with at least the `parent`, `child` and `type` columns.
    expand_dicts : bool, default False
        When True, top-level dict columns are replaced by their child rows.
    expand_lists : bool, default False
        When True, list columns are replaced by their non-container rows.

    Returns
    -------
    pandas.DataFrame
        The selected rows of `mapping`. When exactly one flag is set, the
        result is re-sorted by the mapper's index.
    """
    kinds = mapping['type']
    # Rows whose parent is '.' are the columns of the original frame.
    top_level = mapping[mapping['parent'] == '.']

    # Both expanded: keep every row that is not itself a container.
    if expand_dicts and expand_lists:
        return mapping[(kinds != 'dict') & (kinds != 'list')]

    # Dicts expanded, lists kept as they are.
    if expand_dicts:
        dict_columns = top_level.loc[top_level['type'] == 'dict', 'child']
        expanded = mapping[mapping['parent'].isin(dict_columns)]
        untouched = top_level[top_level['type'] != 'dict']
        return pd.concat([expanded, untouched]).sort_index()

    # Lists expanded, dicts kept as they are.
    if expand_lists:
        # NOTE(review): list columns are looked up in the whole mapper, not
        # only among top-level rows (unlike the dict branch) - confirm that
        # lists nested inside unexpanded dicts are meant to be picked up.
        list_columns = mapping.loc[kinds == 'list', 'child']
        # A row belongs to a list either through its parent or, for the
        # elements of a plain list, through sharing the list's child name.
        related = mapping['parent'].isin(list_columns) | mapping['child'].isin(list_columns)
        expanded = mapping[related & ~kinds.isin(['dict', 'list'])]
        untouched = top_level[top_level['type'] != 'list']
        return pd.concat([expanded, untouched]).sort_index()

    # Nothing expanded: the original columns only.
    return top_level


expand_dicts = False
expand_lists = False

dataframe = select_mapper_rows(mp, expand_dicts=expand_dicts, expand_lists=expand_lists)
dataframe

Unnamed: 0,parent,child,type,obj
0,.,id,int,"0 1001 1 1002 2 1003 Name: id, dtype:..."
1,.,user_info,dict,"0 {'first_name': 'john', 'last_name': 'smit..."
7,.,name_list,list,"0 [a, b] 1 [one, two, t..."
9,.,address,list,"0 [{'zip': '07014', 'city': 'cl..."
14,.,membership,dict,"0 {'membership': True, 'memberid': '1231'} ..."


### After Implementation

In [8]:
# Expand dict columns only; the list columns (name_list, address) stay as-is.
normalize(df, expand_dicts=True, expand_lists=False)

dict True list False


Unnamed: 0,index,id,name_list,address,user_info.gender,user_info.phones.home,user_info.phones.mobile,user_info.last_name,user_info.first_name,membership.memberid,membership.membership
0,0,1001,"[a, b]","[{'zip': '07014', 'city': 'clifton'}]",,978-..,201-..,smith,john,1231.0,True
1,1,1002,"[one, two, three, four]","[{'zip': '07014', 'address1': '1 Journal Squar...",,,,,,,False
2,2,1003,"[w, y, z]","[{'zip': '10001', 'city': 'new york'}, {'zip':...",female,,,kate,marry,9754.0,True


In [9]:
# Expand list columns only; the dict columns (user_info, membership) stay as-is.
normalize(df, expand_dicts=False, expand_lists=True)

dict False list True


Unnamed: 0,index,id,user_info,membership,name_list,address.address1,address.city,address.zip
0,0,1001,"{'first_name': 'john', 'last_name': 'smith', '...","{'membership': True, 'memberid': '1231'}",a,,clifton,7014
1,0,1001,"{'first_name': 'john', 'last_name': 'smith', '...","{'membership': True, 'memberid': '1231'}",b,,clifton,7014
2,1,1002,,"{'membership': False, 'memberid': ''}",one,1 Journal Square,,7014
3,1,1002,,"{'membership': False, 'memberid': ''}",two,1 Journal Square,,7014
4,1,1002,,"{'membership': False, 'memberid': ''}",three,1 Journal Square,,7014
5,1,1002,,"{'membership': False, 'memberid': ''}",four,1 Journal Square,,7014
6,2,1003,"{'first_name': 'marry', 'last_name': 'kate', '...","{'membership': True, 'memberid': '9754'}",w,,new york,10001
7,2,1003,"{'first_name': 'marry', 'last_name': 'kate', '...","{'membership': True, 'memberid': '9754'}",w,,brooklyn,10008
8,2,1003,"{'first_name': 'marry', 'last_name': 'kate', '...","{'membership': True, 'memberid': '9754'}",y,,new york,10001
9,2,1003,"{'first_name': 'marry', 'last_name': 'kate', '...","{'membership': True, 'memberid': '9754'}",y,,brooklyn,10008
