# Client setup

In [1]:
# Execution backend for the Dask client created in the next cell.
# That cell handles exactly two values: 'local' and 'cluster'.
development = "local"

In [2]:
from dask.distributed import Client, progress
import dask_jobqueue
import dask.dataframe as dd

# (Re)create the Dask client used by the rest of the notebook.
#
# `development` (set in the first cell) selects the backend:
#   'local'   -> Client(...) starting 4 worker processes with 2 threads each
#   'cluster' -> a SLURM-backed cluster created through dask_jobqueue
#
# Re-running this cell first shuts down the client left over from a previous
# execution so that stale schedulers/workers do not accumulate.
try:
    client.shutdown()
except NameError:
    # Fresh kernel: no client has been created yet, nothing to stop.
    pass
except Exception as exc:
    # Best effort only: a failed shutdown of the old client must not prevent
    # a new one from starting. KeyboardInterrupt/SystemExit are deliberately
    # NOT caught here (a bare `except:` would swallow them).
    print(f"Ignoring error while shutting down the previous client: {exc!r}")

if development == 'local':
    client = Client(n_workers=4, threads_per_worker=2, timeout="200s")
elif development == 'cluster':
    # NOTE(review): no cluster.scale(...) / cluster.adapt(...) call is visible
    # in this notebook; dask_jobqueue clusters submit no worker jobs until one
    # of those is called -- confirm workers are requested somewhere.
    cluster = dask_jobqueue.SLURMCluster(
        queue='all',
        processes=2,
        cores=16,
        memory='32GB',
        scheduler_options={'dashboard_address': ':8087'},
        death_timeout=120,  # seconds
    )
    client = Client(cluster, timeout="120s")
else:
    # Fail loudly instead of leaving `client` undefined (or pointing at the
    # client that was just shut down) when the mode is misspelled.
    raise ValueError(
        f"Unknown development mode {development!r}; expected 'local' or 'cluster'"
    )

# Show the cluster widget (dashboard link, workers, memory) as the cell output.
display(client.cluster)

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 8,Total memory: 15.91 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:50750,Workers: 4
Dashboard: http://127.0.0.1:8787/status,Total threads: 8
Started: Just now,Total memory: 15.91 GiB

0,1
Comm: tcp://127.0.0.1:50771,Total threads: 2
Dashboard: http://127.0.0.1:50777/status,Memory: 3.98 GiB
Nanny: tcp://127.0.0.1:50753,
Local directory: C:\Users\Expoil\AppData\Local\Temp\dask-scratch-space\worker-8tmsvva5,Local directory: C:\Users\Expoil\AppData\Local\Temp\dask-scratch-space\worker-8tmsvva5

0,1
Comm: tcp://127.0.0.1:50770,Total threads: 2
Dashboard: http://127.0.0.1:50772/status,Memory: 3.98 GiB
Nanny: tcp://127.0.0.1:50754,
Local directory: C:\Users\Expoil\AppData\Local\Temp\dask-scratch-space\worker-3kibu56_,Local directory: C:\Users\Expoil\AppData\Local\Temp\dask-scratch-space\worker-3kibu56_

0,1
Comm: tcp://127.0.0.1:50769,Total threads: 2
Dashboard: http://127.0.0.1:50773/status,Memory: 3.98 GiB
Nanny: tcp://127.0.0.1:50755,
Local directory: C:\Users\Expoil\AppData\Local\Temp\dask-scratch-space\worker-5xoihr9w,Local directory: C:\Users\Expoil\AppData\Local\Temp\dask-scratch-space\worker-5xoihr9w

0,1
Comm: tcp://127.0.0.1:50776,Total threads: 2
Dashboard: http://127.0.0.1:50779/status,Memory: 3.98 GiB
Nanny: tcp://127.0.0.1:50756,
Local directory: C:\Users\Expoil\AppData\Local\Temp\dask-scratch-space\worker-qe2j8lb0,Local directory: C:\Users\Expoil\AppData\Local\Temp\dask-scratch-space\worker-qe2j8lb0


# Task: convert the parking-violations CSV to Parquet and HDF5

In [3]:
# Columns that are forced to be parsed as text when reading the CSV
# (presumably because automatic dtype inference is unreliable for them -- confirm).
text_columns = [
    'House Number',
    'Issuer Command',
    'Issuer Squad',
    'Time First Observed',
    'Unregistered Vehicle?',
    'Violation Description',
    'Violation Legal Code',
    'Violation Post Code',
]
dtype = {column: str for column in text_columns}
# The one explicitly numeric override.
dtype['Violation Location'] = 'float64'

# Lazily load the CSV with Dask, split into 64MB blocks.
csv_path = '../data/Parking_Violations_Issued_-_Fiscal_Year_2023.csv'
df = dd.read_csv(csv_path, dtype=dtype, blocksize='64MB')

print(df.dtypes)

# Both outputs are written from an all-string view of the frame.
# `astype` is lazy, so naming it once does not change what each writer computes.
df_as_text = df.astype(str)

# Parquet copy of the dataset.
df_as_text.to_parquet('../data/Parking_Violations_Issued_-_Fiscal_Year_2023.parquet')

# HDF5 copy: the `*` in the path expands to one file per partition.
# Kept as the last expression so the list of written files is the cell output.
df_as_text.to_hdf(
    '../data/hdf5/data-*.hdf',
    '/data',
    mode='w',
    complib='blosc',
    complevel=9,
    min_itemsize=75,
)

Summons Number                         int64
Plate ID                              object
Registration State                    object
Plate Type                            object
Issue Date                            object
Violation Code                         int64
Vehicle Body Type                     object
Vehicle Make                          object
Issuing Agency                        object
Street Code1                           int64
Street Code2                           int64
Street Code3                           int64
Vehicle Expiration Date                int64
Violation Location                   float64
Violation Precinct                     int64
Issuer Precinct                        int64
Issuer Code                            int64
Issuer Command                        object
Issuer Squad                          object
Violation Time                        object
Time First Observed                   object
Violation County                      object
Violation 



['../data/hdf5/data-00.hdf',
 '../data/hdf5/data-01.hdf',
 '../data/hdf5/data-02.hdf',
 '../data/hdf5/data-03.hdf',
 '../data/hdf5/data-04.hdf',
 '../data/hdf5/data-05.hdf',
 '../data/hdf5/data-06.hdf',
 '../data/hdf5/data-07.hdf',
 '../data/hdf5/data-08.hdf',
 '../data/hdf5/data-09.hdf',
 '../data/hdf5/data-10.hdf',
 '../data/hdf5/data-11.hdf',
 '../data/hdf5/data-12.hdf',
 '../data/hdf5/data-13.hdf',
 '../data/hdf5/data-14.hdf',
 '../data/hdf5/data-15.hdf',
 '../data/hdf5/data-16.hdf',
 '../data/hdf5/data-17.hdf',
 '../data/hdf5/data-18.hdf',
 '../data/hdf5/data-19.hdf',
 '../data/hdf5/data-20.hdf',
 '../data/hdf5/data-21.hdf',
 '../data/hdf5/data-22.hdf',
 '../data/hdf5/data-23.hdf',
 '../data/hdf5/data-24.hdf',
 '../data/hdf5/data-25.hdf',
 '../data/hdf5/data-26.hdf',
 '../data/hdf5/data-27.hdf',
 '../data/hdf5/data-28.hdf',
 '../data/hdf5/data-29.hdf',
 '../data/hdf5/data-30.hdf',
 '../data/hdf5/data-31.hdf',
 '../data/hdf5/data-32.hdf',
 '../data/hdf5/data-33.hdf',
 '../data/hdf5

# Client shutdown

In [4]:
# Tear down the Dask client created in the setup cell now that the work is done.
client.shutdown()

