# Dask

[Dask DataFrame API](https://docs.dask.org/en/latest/dataframe-api.html)

[Dask DataFrames](https://examples.dask.org/dataframe.html)

[Create and Store Dask DataFrames](https://docs.dask.org/en/latest/dataframe-create.html)

[Dask insert](https://stackoverflow.com/questions/54344793/create-sql-table-from-dask-dataframe-using-map-partitions-and-pd-df-to-sql)

[Dask Examples](https://examples.dask.org/dataframes/01-data-access.html)

[SQL insert](https://www.dataquest.io/blog/sql-insert-tutorial/)

[SQLAlchemy examples](https://www.compose.com/articles/using-postgresql-through-sqlalchemy/)

In [1]:
# One-time setup: uncomment the next line to install Dask together with its optional dependencies.
#!python -m pip install "dask[complete]"

In [2]:
%%time
# List the contents of the local zoo/ directory to see which data files are available.
!ls -la zoo/

total 3736716
drwxr-xr-x  2 jovyan 1000       4096 Aug 18 08:46 .
drwxr-xr-x 10 jovyan 1000       4096 Nov 23 17:41 ..
-rw-r--r--  1 jovyan 1000    6558302 Jan  9  2014 all_ones_benchmark.csv
-rw-r--r--  1 jovyan 1000     271676 Dec 11  2019 all_ones_benchmark.zip
-rw-r--r--  1 jovyan 1000     271657 Dec 11  2019 all_zeros_benchmark.zip
-rw-r--r--  1 jovyan 1000     532227 Dec 11  2019 central_pixel_benchmark.zip
-rw-r--r--  1 jovyan 1000 1904094733 Aug 18 08:38 galaxy-zoo-the-galaxy-challenge.zip
-rw-r--r--  1 jovyan 1000 1079570035 Dec 11  2019 images_test_rev1.zip
-rw-r--r--  1 jovyan 1000  830207257 Dec 11  2019 images_training_rev1.zip
-rw-r--r--  1 jovyan 1000    4854569 Dec 11  2019 training_solutions_rev1.zip
CPU times: user 9.3 ms, sys: 10.4 ms, total: 19.7 ms
Wall time: 1.16 s


In [3]:
import dask
import dask.dataframe as dd
import pandas as pd
from sqlalchemy import create_engine

# Small in-memory sample table (three books) held as a plain pandas DataFrame.
data = pd.DataFrame(
    [
        (12345, "Python Programming", 29),
        (12346, "Learn MySQL", 23),
        (12347, "Data Science Cookbook", 27),
    ],
    columns=["book_id", "title", "price"],
)

# Load the benchmark CSV as a Dask DataFrame and preview its last three rows.
df = dd.read_csv("zoo/all_ones_benchmark.csv")
df.tail(n=3)

Unnamed: 0,GalaxyID,Class1.1,Class1.2,Class1.3,Class2.1,Class2.2,Class3.1,Class3.2,Class4.1,Class4.2,...,Class9.3,Class10.1,Class10.2,Class10.3,Class11.1,Class11.2,Class11.3,Class11.4,Class11.5,Class11.6
79972,999990,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
79973,999994,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
79974,999996,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [4]:
# Display the Dask index object; the output describes its structure (partitions, dtype), not the values.
df.index

Dask Index Structure:
npartitions=1
    int64
      ...
dtype: int64
Dask Name: from-delayed, 4 tasks

In [5]:
# Number of partitions the Dask DataFrame is split into.
df.npartitions

1

In [6]:
# Print every column name, one per line.
# Iterating a DataFrame yields its column labels, so read them straight from the
# `df.columns` metadata instead of calling `df.compute()`, which would load the
# whole CSV into memory just to list the header.
for column_name in df.columns:
    print(column_name)

GalaxyID
Class1.1
Class1.2
Class1.3
Class2.1
Class2.2
Class3.1
Class3.2
Class4.1
Class4.2
Class5.1
Class5.2
Class5.3
Class5.4
Class6.1
Class6.2
Class7.1
Class7.2
Class7.3
Class8.1
Class8.2
Class8.3
Class8.4
Class8.5
Class8.6
Class8.7
Class9.1
Class9.2
Class9.3
Class10.1
Class10.2
Class10.3
Class11.1
Class11.2
Class11.3
Class11.4
Class11.5
Class11.6


In [7]:
# Show the dtype of each column of the Dask DataFrame.
df.dtypes

GalaxyID     int64
Class1.1     int64
Class1.2     int64
Class1.3     int64
Class2.1     int64
Class2.2     int64
Class3.1     int64
Class3.2     int64
Class4.1     int64
Class4.2     int64
Class5.1     int64
Class5.2     int64
Class5.3     int64
Class5.4     int64
Class6.1     int64
Class6.2     int64
Class7.1     int64
Class7.2     int64
Class7.3     int64
Class8.1     int64
Class8.2     int64
Class8.3     int64
Class8.4     int64
Class8.5     int64
Class8.6     int64
Class8.7     int64
Class9.1     int64
Class9.2     int64
Class9.3     int64
Class10.1    int64
Class10.2    int64
Class10.3    int64
Class11.1    int64
Class11.2    int64
Class11.3    int64
Class11.4    int64
Class11.5    int64
Class11.6    int64
dtype: object

In [None]:
# Insert the Dask DataFrame into a SQL database.
# Fill in the connection settings below before running this cell.
_USER = ""
_PASSWORD = ""
_HOST = ""
_DB = ""
_DB_TABLE = ""

# SQLAlchemy 1.4+ only accepts the "postgresql" dialect name; the old "postgres"
# alias was removed and makes create_engine() fail with NoSuchModuleError.
# MySQL alternative: db_url = f"mysql+pymysql://{_USER}:{_PASSWORD}@{_HOST}/{_DB}"
db_url = f"postgresql://{_USER}:{_PASSWORD}@{_HOST}/{_DB}"

# Engine for ad-hoc statements and for pandas' `to_sql(con=...)`.
connection = create_engine(db_url)

# SQLAlchemy examples
#connection.execute("SELECT * FROM test")
#connection.execute("CREATE TABLE IF NOT EXISTS films (title text, director text, year text)")
#connection.execute("INSERT INTO films (title, director, year) VALUES ('Doctor Strange', 'Scott Derrickson', '2016')")

# pandas example (a pandas DataFrame accepts an Engine through `con=`):
#data.to_sql(_DB_TABLE, con=connection, if_exists="append", chunksize=1000)

# Dask's DataFrame.to_sql takes the database URI as a string (`uri=`), not an
# Engine through `con=` like pandas does, because each partition is written by a
# task that opens its own connection. Passing `con=connection` raises TypeError.
df.to_sql(_DB_TABLE, uri=db_url, if_exists="append", index=True, chunksize=1000)

# Manual alternative: one delayed pandas `to_sql` call per partition.
# Run this INSTEAD of the call above, never in addition to it, otherwise every
# row is appended to the table twice.
#dto_sql = dask.delayed(pd.DataFrame.to_sql)
#out = [dto_sql(part, _DB_TABLE, db_url, if_exists="append", index=True) for part in df.to_delayed()]
#dask.compute(*out)