In [None]:

#  Here we demonstrate the following Dask DataFrame techniques:
#
#     .  How to create a Dask DataFrame from an array.
#
#     .  Given a source Dask DataFrame, pivot a string column into an array.
#
#        We might use this technique when given a flat file of data from
#        which we derive nodes and edges.


In [None]:

#  Notebook-wide setting: the partition count requested when the pandas
#  DataFrame is converted into a Dask DataFrame.
#
NUM_PARTITIONS = 3

#  Marker line so the cell shows some output when it is run.
#
print("--")


In [None]:

#  Libraries used by this notebook.
#
#  NumPy and pandas build the in-memory source data that is then handed
#  to Dask.
#
import numpy as np
import pandas as pd
   #
#  NOTE(review): the da and dd aliases are not referenced by any cell
#  visible in this notebook; confirm whether they are still needed.
#
import dask.array as da
import dask.dataframe as dd
   #
#  from_pandas converts the pandas DataFrame into a Dask DataFrame.
#
from dask.dataframe import from_pandas

#  NOTE(review): tabulate is not referenced by any cell visible in this
#  notebook; confirm whether it is still needed.
#
from tabulate import tabulate

#  Marker line so the cell shows some output when it is run.
#
print("--")


In [95]:

#  Create our source Dask DataFrame from an array
#
#  Each row pairs one name (col1) with a comma-separated string of
#  further names (col2).

np_name = np.array([
       #
   ["Mary"   , "Bob, Tim, Dave"            ],
   ["Joyce"  , "Allen, Tom, Dave"          ],
   ["Alice"  , "Earl"                      ],
       #
   ], dtype="str")

#  NumPy array  ->  pandas DataFrame  ->  Dask DataFrame, with
#  NUM_PARTITIONS partitions requested.
#
pd_name = pd.DataFrame(np_name, columns = ["col1", "col2"])
   #
dd_name = from_pandas(pd_name, npartitions = NUM_PARTITIONS)


#  iterrows() yields (index, row) tuples, where row is a pandas Series
#  labeled by column name.  Read the values by label; integer keys such
#  as row[0] on a string-labeled Series depend on a positional fallback
#  that pandas has deprecated.
#
for l_each in dd_name.iterrows():
      #
   l_row     = l_each[1]
      #
   l_col1    = l_row["col1"]
   l_col2    = l_row["col2"]
      #
   print("Names:  %-16s   %-32s" % (l_col1, l_col2))

print("--")

#  Sample output,
#
#  Names:  Mary               Bob, Tim, Dave                  
#  Names:  Joyce              Allen, Tom, Dave                
#  Names:  Alice              Earl
#  --


Names:  Mary               Bob, Tim, Dave                  
Names:  Joyce              Allen, Tom, Dave                
Names:  Alice              Earl                            
--


In [None]:

#  Build a DataFrame holding only the distinct state_code values, shown
#  two ways.
#
#  NOTE(review): dd_airports is not created by any cell visible in this
#  notebook; confirm it is defined before this cell runs on a fresh kernel.

#  First approach: group on state_code and count the rows in each group,
#  so every unique key is returned together with its count.
#
l_by_state = dd_airports.groupby("state_code")
dd_states  = l_by_state.aggregate({"state_code": "count"})
   #
for l_state in dd_states.iterrows():
   print(l_state)

print("")

#  Second approach: keep only the state_code column and drop the
#  duplicate rows, which leaves just the keys.
#
l_state_column = dd_airports[["state_code"]]
dd_states      = l_state_column.drop_duplicates()
   #
for l_state in dd_states.iterrows():
   print(l_state)

#  Sample output,
#
#  ('IL', state_code    1 Name: IL, dtype: int64)
#  ('WI', state_code    1 Name: WI, dtype: int64)
#  ('CA', state_code    2 Name: CA, dtype: int64)
#  ('CO', state_code    1 Name: CO, dtype: int64)
#  
#  (0, state_code    WI Name: 0, dtype: object)
#  (1, state_code    IL Name: 1, dtype: object)
#  (2, state_code    CA Name: 2, dtype: object)
#  (4, state_code    CO Name: 4, dtype: object)


In [None]:

#  Edge list between the airports and the states derived above: keep
#  only the two key columns, one row per airport.
#
#  NOTE(review): dd_airports is not created by any cell visible in this
#  notebook; confirm it is defined before this cell runs on a fresh kernel.

l_edge_columns = ["airport_code", "state_code"]
   #
dd_edges = dd_airports[l_edge_columns]
   #
for l_edge in dd_edges.iterrows():
   print(l_edge)

#  Sample output,
#
#  (0, airport_code    MKE state_code       WI Name: 0, dtype: object)
#  (1, airport_code    ORD state_code       IL Name: 1, dtype: object)
#  (2, airport_code    SJC state_code       CA Name: 2, dtype: object)
#  (3, airport_code    LAX state_code       CA Name: 3, dtype: object)
#  (4, airport_code    DEN state_code       CO Name: 4, dtype: object)
