In [None]:

#  Here we demonstrate the following Dask DataFrame techniques:
#
#     .  How to create a Dask DataFrame from a NumPy array (by way of a pandas DataFrame).
#
#     .  How to operate on two or more columns of a DataFrame inside map_partitions().


In [2]:

#  Number of partitions requested when the Dask DataFrame is created

NUM_PARTITIONS = 3

print("--")


--


In [3]:

import numpy as np
import pandas as pd
   #
import dask.array as da
import dask.dataframe as dd
   #
from dask.dataframe import from_pandas

from tabulate import tabulate

print("--")


--


In [4]:

#  Create our source Dask DataFrame from an array
#
#     Step 1:  a NumPy array of strings, one row per airport
#     Step 2:  a pandas DataFrame over that array, with named columns
#              (the conversion to Dask follows below)

np_airports = np.array(
   [
      ["MKE", "Milwaukee"     , "WI", "Airport"],
      ["ORD", "Chicago O-Hare", "IL", "Airport"],
      ["SJC", "San Jose"      , "CA", "Airport"],
      ["LAX", "Los Angeles"   , "CA", "Airport"],
      ["DEN", "Denver"        , "CO", "Airport"],
   ],
   dtype = "str",
)

pd_airports = pd.DataFrame(
   np_airports,
   columns = ["airport_code", "airport_name", "state_code", "LABEL"],
)
   #
#  Convert the pandas DataFrame to a Dask DataFrame with NUM_PARTITIONS partitions
dd_airports = from_pandas(pd_airports, npartitions=NUM_PARTITIONS)

#  Print every row to confirm the contents of the new Dask DataFrame
for l_each in dd_airports.itertuples():
   print(
      "Airport:  %3s   %-18s   %-2s   %-10s"
      % (l_each.airport_code, l_each.airport_name, l_each.state_code, l_each.LABEL)
   )

print("--")

#  Sample output,
#
#  Airport:  MKE   Milwaukee            WI   Airport   
#  Airport:  ORD   Chicago O-Hare       IL   Airport   
#  Airport:  SJC   San Jose             CA   Airport   
#  Airport:  LAX   Los Angeles          CA   Airport   
#  Airport:  DEN   Denver               CO   Airport  


Airport:  MKE   Milwaukee            WI   Airport   
Airport:  ORD   Chicago O-Hare       IL   Airport   
Airport:  SJC   San Jose             CA   Airport   
Airport:  LAX   Los Angeles          CA   Airport   
Airport:  DEN   Denver               CO   Airport   
--


In [5]:

#  How to access two or more columns from the row
#
#     Start from a Dask DataFrame holding only the two columns that the
#     function below reads (airport_name and state_code).

dd_concat = dd_airports[["airport_name", "state_code"]]


def f_concat(l_arg1):
   """Join airport_name and state_code as "<airport_name>, <state_code>".

   Both values are read as attributes of the single argument, so one call
   has access to more than one column at a time.  The result of the "+"
   expression is returned as is.
   """
   l_name  = l_arg1.airport_name
   l_state = l_arg1.state_code

   return (l_name + ", " + l_state)


#  Run f_concat over the partitions and attach its result as column "col3"
dd_concat = dd_concat.assign(col3 = dd_concat.map_partitions(f_concat))


#  Show every row, now including the new column
for l_each in dd_concat.itertuples():
   print(l_each)


Pandas(Index=0, airport_name='Milwaukee', state_code='WI', col3='Milwaukee, WI')
Pandas(Index=1, airport_name='Chicago O-Hare', state_code='IL', col3='Chicago O-Hare, IL')
Pandas(Index=2, airport_name='San Jose', state_code='CA', col3='San Jose, CA')
Pandas(Index=3, airport_name='Los Angeles', state_code='CA', col3='Los Angeles, CA')
Pandas(Index=4, airport_name='Denver', state_code='CO', col3='Denver, CO')


In [10]:

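#  Another two or more columns, now with conditional ("if") logic
#
#     map() calls the function once per element, so a plain "if" works there.
#     map_partitions() calls the function once per partition, so a comparison on
#     a column yields a whole Series of booleans and a plain "if" on it fails.
#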

def f_func(i_arg1):
   """Return airport_name where state_code sorts before "M", otherwise "DDD".

   The comparison on i_arg1.state_code is evaluated for all rows at once,
   so it cannot be used in a plain Python "if".  This form of "if" block,

      if (i_arg1.state_code < "M"):
         return i_arg1.airport_name
      else:
         return "DDD"

   throws this error,
      >> ValueError('The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().')

   so the choice is made element by element with np.where() instead.

   Returns whatever np.where() produces: one value per row of i_arg1.
   """
   #  One boolean per row:  True where the state code sorts before "M"
   l_is_before_m = i_arg1.state_code < "M"

   return np.where(l_is_before_m, i_arg1.airport_name, "DDD")
    

#  Run f_func over the partitions and attach its result as column "col6"
dd_airports = dd_airports.assign(col6 = dd_airports.map_partitions(f_func))


#  Show every row, now including the new column
for l_each in dd_airports.itertuples():
   print(l_each)



Pandas(Index=0, airport_code='MKE', airport_name='Milwaukee', state_code='WI', LABEL='Airport', col6='DDD')
Pandas(Index=1, airport_code='ORD', airport_name='Chicago O-Hare', state_code='IL', LABEL='Airport', col6='Chicago O-Hare')
Pandas(Index=2, airport_code='SJC', airport_name='San Jose', state_code='CA', LABEL='Airport', col6='San Jose')
Pandas(Index=3, airport_code='LAX', airport_name='Los Angeles', state_code='CA', LABEL='Airport', col6='Los Angeles')
Pandas(Index=4, airport_code='DEN', airport_name='Denver', state_code='CO', LABEL='Airport', col6='Denver')
