In [None]:

#  Here we demonstrate the following Dask DataFrame techniques:
#
#     .  How to create a Dask DataFrame from an array.
#
#     .  Given a source Dask DataFrame, create a second, new DataFrame with just
#        the unique values, and the ability to link back to the source.
#
#        We might use this technique when given a flat file of data from which
#        we derive nodes and edges.


In [None]:

#  Partition count passed as npartitions when the Dask DataFrame is built
NUM_PARTITIONS = 3

print("--")


In [None]:

import numpy as np
import pandas as pd
   #  Dask collections
import dask.array as da
import dask.dataframe as dd
   #  Direct import of the pandas -> Dask DataFrame constructor
from dask.dataframe import from_pandas

#  NOTE(review): da, dd and tabulate are not referenced by any executed code in
#  the cells visible here -- confirm they are still needed before removing.
from tabulate import tabulate

#  Emit a marker line as this cell's output
print("--")


In [None]:

#  Create our source Dask DataFrame from an array
#
#  Pipeline:  NumPy array  ->  pandas DataFrame (named columns)  ->  Dask DataFrame
#  split into NUM_PARTITIONS partitions.

np_airports = np.array([
       #
   ["MKE", "Milwaukee"     , "WI", "Airport"],
   ["ORD", "Chicago O-Hare", "IL", "Airport"],
   ["SJC", "San Jose"      , "CA", "Airport"],
   ["LAX", "Los Angeles"   , "CA", "Airport"],
   ["DEN", "Denver"        , "CO", "Airport"],
       #
   ], dtype="str")

pd_airports = pd.DataFrame(np_airports, columns = ["airport_code", "airport_name", "state_code", "LABEL"])
   #
dd_airports = from_pandas(pd_airports, npartitions = NUM_PARTITIONS)


#  iterrows() yields (index, row) pairs, where row is a Series labelled by the
#  column names.  Read each field by its column label:  integer keys on a
#  label-indexed Series (row[0], row[1], ..) rely on a positional fallback that
#  recent pandas versions deprecate.
#
for _, l_row in dd_airports.iterrows():
      #
   l_airport_code   = l_row["airport_code"]
   l_airport_name   = l_row["airport_name"]
   l_state_code     = l_row["state_code"]
   l_LABEL          = l_row["LABEL"]
      #
   print("Airport:  %3s   %-18s   %-2s   %-10s" % (l_airport_code, l_airport_name, l_state_code, l_LABEL))

print("--")

#  Sample output,
#
#  Airport:  MKE   Milwaukee            WI   Airport   
#  Airport:  ORD   Chicago O-Hare       IL   Airport   
#  Airport:  SJC   San Jose             CA   Airport   
#  Airport:  LAX   Los Angeles          CA   Airport   
#  Airport:  DEN   Denver               CO   Airport  


In [49]:

#  Derive a DataFrame that holds only the distinct state_code values


#  Approach one:  keys only -- project the single column, then drop duplicates
#
state_key_frame = dd_airports[["state_code"]]
dd_states = state_key_frame.drop_duplicates()
   #
for l_pair in dd_states.iterrows():
   print(l_pair)

print("")


#  Approach two:  group on the key, which also yields a count per unique key
#
airports_by_state = dd_airports.groupby("state_code")
dd_states = airports_by_state.aggregate({"state_code": "count"})
   #
for l_pair in dd_states.iterrows():
   print(l_pair)

print("")


#  Approach three:  count per unique key, and sorted by that count, descending.
#  .compute() is called on the grouped count first, so reset_index() and
#  sort_values() operate on the computed result.
#
state_counts = dd_airports.groupby("state_code")["state_code"].count()
dd_states = (
   state_counts
   .compute()
   .reset_index(name="count")
   .sort_values(by="count", ascending=False)
   )

for l_pair in dd_states.iterrows():
   print(l_pair)

print("")


#  Sample output,
#
#     (0, state_code    WI Name: 0, dtype: object)
#     (1, state_code    IL Name: 1, dtype: object)
#     (2, state_code    CA Name: 2, dtype: object)
#     (4, state_code    CO Name: 4, dtype: object)
#     
#     ('IL', state_code    1 Name: IL, dtype: int64)
#     ('WI', state_code    1 Name: WI, dtype: int64)
#     ('CA', state_code    2 Name: CA, dtype: int64)
#     ('CO', state_code    1 Name: CO, dtype: int64)
#     
#     (2, state_code    CA count          2 Name: 2, dtype: object) 
#     (0, state_code    IL count          1 Name: 0, dtype: object)
#     (1, state_code    WI count          1 Name: 1, dtype: object)
#     (3, state_code    CO count          1 Name: 3, dtype: object)



(0, state_code    WI
Name: 0, dtype: object)
(1, state_code    IL
Name: 1, dtype: object)
(2, state_code    CA
Name: 2, dtype: object)
(4, state_code    CO
Name: 4, dtype: object)

('IL', state_code    1
Name: IL, dtype: int64)
('WI', state_code    1
Name: WI, dtype: int64)
('CA', state_code    2
Name: CA, dtype: int64)
('CO', state_code    1
Name: CO, dtype: int64)

(2, state_code    CA
count          2
Name: 2, dtype: object)
(0, state_code    IL
count          1
Name: 0, dtype: object)
(1, state_code    WI
count          1
Name: 1, dtype: object)
(3, state_code    CO
count          1
Name: 3, dtype: object)



In [None]:

#  The edge between airports and states is the pair of key columns, one row
#  per airport

edge_columns = ["airport_code", "state_code"]
dd_edges = dd_airports[edge_columns]
   #
for l_edge in dd_edges.iterrows():
   print(l_edge)

#  Sample output,
#
#  (0, airport_code    MKE state_code       WI Name: 0, dtype: object)
#  (1, airport_code    ORD state_code       IL Name: 1, dtype: object)
#  (2, airport_code    SJC state_code       CA Name: 2, dtype: object)
#  (3, airport_code    LAX state_code       CA Name: 3, dtype: object)
#  (4, airport_code    DEN state_code       CO Name: 4, dtype: object)


In [48]:

#  Per-state airport counts, highest count first.  Only the state_code column
#  is projected before grouping; .compute() runs before reset_index() and
#  sort_values().
#
dd_states1 = (
   dd_airports[["state_code"]]
   .groupby("state_code")["state_code"]
   .count()
   .compute()
   .reset_index(name="count")
   .sort_values(by="count", ascending=False)
   )

#  Not executed -- draft of merging the counts with another frame
#  (dd_states2 and out are not defined in the cells visible here):
#
#     dd_states = dd_states1.merge(dd_states2, out, on=["state_code"], how="left")

#  dd_states1 is already sorted by count, so it is iterated as-is
#
for l_pair in dd_states1.iterrows():
   print(l_pair)

print("")




#  Reference snippet (not executed):  count rows per group, then merge the
#  counts back onto the source frame.
#
#  import pandas as pd
#  import dask.dataframe as dd
#
#  df = pd.DataFrame({"A":[0,0,1,1,1,2,2],
#                     "B":[1,2,3,4,5,6,7]})
#
#  df = dd.from_pandas(df, npartitions=2)
#
#  out = df.groupby("A")["B"]\
#          .count()\
#          .compute()\
#          .reset_index(name="new_column")
#
#  df = dd.merge(df, out, on=["A"], how="left")



(2, state_code    CA
count          2
Name: 2, dtype: object)
(0, state_code    IL
count          1
Name: 0, dtype: object)
(1, state_code    WI
count          1
Name: 1, dtype: object)
(3, state_code    CO
count          1
Name: 3, dtype: object)

