In [None]:

#  Here we demonstrate the following Dask DataFrame techniques:
#
#     .  How to create a Dask DataFrame from an array.
#
#     .  Given a source Dask DataFrame, create a second new DataFrame with just
#        unique values, and the ability to link back to the source.
#
#        We might use this technique when given a flat file of data from which
#        we derive nodes and edges.


In [None]:

#  Partition count handed to from_pandas() when the source Dask DataFrame
#  is built further down.
#
NUM_PARTITIONS: int = 3

print("--")


In [None]:

#  Array and tabular building blocks used to assemble the source data
import numpy as np
import pandas as pd
   #
#  Dask collections
#  NOTE(review): da, dd and tabulate are not referenced in the cells visible
#  here -- confirm whether later cells need them.
import dask.array as da
import dask.dataframe as dd
   #
#  from_pandas turns a Pandas DataFrame into a partitioned Dask DataFrame
from dask.dataframe import from_pandas

from tabulate import tabulate

#  Printed at the end of each cell; appears to serve as a "cell finished" marker
print("--")


In [None]:

#  Create our source Dask DataFrame from an array
#
#  Flow:  NumPy array of strings  ->  Pandas DataFrame (named columns)  ->
#         Dask DataFrame split into NUM_PARTITIONS partitions.

np_airports = np.array([
       #
   ["MKE", "Milwaukee"     , "WI", "Airport"],
   ["ORD", "Chicago O-Hare", "IL", "Airport"],
   ["SJC", "San Jose"      , "CA", "Airport"],
   ["LAX", "Los Angeles"   , "CA", "Airport"],
   ["DEN", "Denver"        , "CO", "Airport"],
       #
   ], dtype="str")

pd_airports = pd.DataFrame(np_airports, columns = ["airport_code", "airport_name", "state_code", "LABEL"])
   #
dd_airports = from_pandas(pd_airports, npartitions = NUM_PARTITIONS)


#  iterrows() yields (index, row) pairs, where each row is a Series keyed by
#  column name.  Read the fields by label:  integer keys on a label-indexed
#  Series  (the former  l_each[1][0]  form)  are a deprecated positional
#  lookup in Pandas and emit a FutureWarning.
#
for l_index, l_row in dd_airports.iterrows():
      #
   l_airport_code   = l_row["airport_code"]
   l_airport_name   = l_row["airport_name"]
   l_state_code     = l_row["state_code"]
   l_LABEL          = l_row["LABEL"]
      #
   print("Airport:  %3s   %-18s   %-2s   %-10s" % (l_airport_code, l_airport_name, l_state_code, l_LABEL))

print("--")

#  Sample output,
#
#  Airport:  MKE   Milwaukee            WI   Airport   
#  Airport:  ORD   Chicago O-Hare       IL   Airport   
#  Airport:  SJC   San Jose             CA   Airport   
#  Airport:  LAX   Los Angeles          CA   Airport   
#  Airport:  DEN   Denver               CO   Airport  


#  Write DataFrame to local TXT file  (local to Jupyter Docker container)

In [None]:

#  Persist the airports DataFrame as pipe-delimited text on the filesystem
#  that is local to the Jupyter Docker container.

print(type(dd_airports))

#  Collapse to a single partition before writing  (presumably so that only
#  one output file is produced under the target folder -- TODO confirm).
#
dd_airports2 = dd_airports.repartition(npartitions = 1)
   #
dd_airports2.to_csv(
   "42_parent_folder",
   sep   = "|",
   index = None,
   )


#  Write DataFrame to Google Cloud Storage (GCS), for easier download later

In [None]:

#  Settings:
#     Need a publicly or privately accessible storage bucket on GCP or similar ..
#
#  From,
#     https://stackoverflow.com/questions/36314797/write-a-pandas-dataframe-to-google-cloud-storage-or-bigquery
#     https://stackoverflow.com/questions/29325458/dictionary-column-in-pandas-dataframe/29325954#29325954

import os
from google.cloud import storage


#  Shared GCS handles for all of the work below ..
#
#  Browse the bucket at,  https://console.cloud.google.com/storage/browser/farrell-bucket

l_bucket = "farrell-bucket"

#  Tell the Google client library where our key file lives.  Presumably
#  storage.Client() reads this variable, so it is set before the client is
#  created.
#
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/home/jovyan/work/My_KG_NoteBooks/03_MyKeys.json"

g_client = storage.Client()
g_bucket = g_client.get_bucket(l_bucket)

print("--")


In [None]:

#  The upload technique below worked for Pandas DataFrames, not Dask
#  DataFrames, so the Dask DataFrame is materialized as Pandas first.

dd_airports2      = dd_airports.repartition(npartitions = 1)
dd_airports2_AsPd = dd_airports2.compute()                    #  Now a Pandas DataFrame

#  print(type(dd_airports2_AsPd))


#  Render the Pandas copy as pipe-delimited text  (header and index both
#  switched off)  and upload that string to the bucket as text/plain.
#
l_file    = "42_write_test/node.txt"
l_blob    = g_bucket.blob(l_file)
l_payload = dd_airports2_AsPd.to_csv(sep = "|", header = None, index = None)
   #
l_blob.upload_from_string(l_payload, "text/plain")


#  Parquet variant;  this was working, then started throwing a new error about "path"
#
#  NOTE(review):  the call below is made on dd_airports2  (the Dask
#  DataFrame),  not on the Pandas copy -- possibly the cause of the
#  "path" error.  Confirm.
#
#  l_file = "42_write_test/node.parquet"
#     #
#  g_bucket.blob(l_file).upload_from_string(dd_airports2.to_parquet(engine="pyarrow", version="2.6"), "application/octet-stream")


print("--")
