In [None]:

#  Here we demonstrate the following Dask DataFrame techniques:
#
#     .  How to create a Dask DataFrame from an array.
#
#     .  Given a source Dask DataFrame, create a second new DataFrame with just
#        unique values, and the ability to link back to the source.
#
#        We might use this technique when given a flat file of data from which
#        we derive nodes and edges.


In [None]:

#  Notebook-wide setting:  the number of partitions requested when the
#  Pandas DataFrame is converted into a Dask DataFrame (npartitions).
#
NUM_PARTITIONS = 3

print("--")


In [None]:

#  Imports used by the cells of this notebook.
#
#  NOTE(review):  "da", "dd" and "tabulate" are not referenced by any of the
#  cells visible here -- confirm nothing else needs them before removing.

import numpy as np
import pandas as pd
   #
import dask.array as da
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate

#  Prints a short marker line once the cell has run
print("--")


In [None]:

#  Build the source Dask DataFrame in three hops:
#
#     NumPy array  -->  Pandas DataFrame  -->  Dask DataFrame
#
#  One row per airport;  every value is stored as a string.

np_airports = np.array(
   [
      ["MKE", "Milwaukee",      "WI", "Airport"],
      ["ORD", "Chicago O-Hare", "IL", "Airport"],
      ["SJC", "San Jose",       "CA", "Airport"],
      ["LAX", "Los Angeles",    "CA", "Airport"],
      ["DEN", "Denver",         "CO", "Airport"],
   ],
   dtype = "str",
   )

#  Attach the column names, in the same order as the array columns above
#
pd_airports = pd.DataFrame(
   np_airports,
   columns = ["airport_code", "airport_name", "state_code", "LABEL"],
   )

#  Split the Pandas DataFrame into NUM_PARTITIONS Dask partitions
#
dd_airports = from_pandas(pd_airports, npartitions = NUM_PARTITIONS)


#  List every airport, one formatted line per row
#
for l_airport in dd_airports.itertuples():
   print("Airport:  %3s   %-18s   %-2s   %-10s" % (
      l_airport.airport_code,
      l_airport.airport_name,
      l_airport.state_code,
      l_airport.LABEL,
      ))

print("--")

#  Sample output,
#
#  Airport:  MKE   Milwaukee            WI   Airport
#  Airport:  ORD   Chicago O-Hare       IL   Airport
#  Airport:  SJC   San Jose             CA   Airport
#  Airport:  LAX   Los Angeles          CA   Airport
#  Airport:  DEN   Denver               CO   Airport


In [None]:

#  Derive the unique state_code values from dd_airports, three different ways.
#
#  NOTE:  every approach rebinds dd_states, so only the result of approach
#         three is still bound to that name once this cell has finished.


#  Approach one:  the keys only
#
#  Keep just the state_code column (as a DataFrame), then drop repeated rows.
#
dd_states = dd_airports[["state_code"]].drop_duplicates()

for l_state in dd_states.itertuples():
   print(l_state)

print("")


#  Approach two:  the keys, plus a count per unique key
#
#  Don't use this one;  as the sample output below shows, the count comes
#  back in a column that is also called state_code, the same name as the
#  grouping key held in the index.
#
dd_states = dd_airports.groupby("state_code").aggregate({"state_code": "count"})

for l_state in dd_states.itertuples():
   print(l_state)

print("")


#  Approach three:  the keys, plus a count per unique key, and sorted DESC
#
#  NOTE:  this prints the type of the source, dd_airports, not of dd_states.
#
print(type(dd_airports))

#  compute() materializes the per-key counts, so the reset_index() and
#  sort_values() steps that follow run on the computed (Pandas) result.
#  From here on dd_states is no longer a lazy Dask object.
#
dd_states = (
   dd_airports
   .groupby("state_code")["state_code"]
   .count()
   .compute()
   .reset_index(name = "count")
   .sort_values(by = "count", ascending = False)
   )

for l_state in dd_states.itertuples():
   print(l_state)

print("")


#  Sample output,
#
#     Pandas(Index=0, state_code='WI')
#     Pandas(Index=1, state_code='IL')
#     Pandas(Index=2, state_code='CA')
#     Pandas(Index=4, state_code='CO')
#
#     Pandas(Index='IL', state_code=1)
#     Pandas(Index='WI', state_code=1)
#     Pandas(Index='CA', state_code=2)
#     Pandas(Index='CO', state_code=1)
#
#     <class 'dask.dataframe.core.DataFrame'>
#     Pandas(Index=2, state_code='CA', count=2)
#     Pandas(Index=0, state_code='IL', count=1)
#     Pandas(Index=1, state_code='WI', count=1)
#     Pandas(Index=3, state_code='CO', count=1)


In [63]:

#  The edge between the airports and the states above:  keep only the two
#  key columns of dd_airports, giving one (airport_code, state_code) row
#  per source row.
#
dd_edges = dd_airports[["airport_code", "state_code"]]

for l_edge in dd_edges.itertuples():
   print(l_edge)

#  Sample output,
#
#     Pandas(Index=0, airport_code='MKE', state_code='WI')
#     Pandas(Index=1, airport_code='ORD', state_code='IL')
#     Pandas(Index=2, airport_code='SJC', state_code='CA')
#     Pandas(Index=3, airport_code='LAX', state_code='CA')
#     Pandas(Index=4, airport_code='DEN', state_code='CO')


Pandas(Index=0, airport_code='MKE', state_code='WI')
Pandas(Index=1, airport_code='ORD', state_code='IL')
Pandas(Index=2, airport_code='SJC', state_code='CA')
Pandas(Index=3, airport_code='LAX', state_code='CA')
Pandas(Index=4, airport_code='DEN', state_code='CO')
