# Using Data Structures Effectively

## NumPy arrays

In [1]:
import numpy as np

### NumPy array functionality

In [2]:
python_2d_list = [[1, 3, 5], [2, 4, 6], [7, 9, 11]]

In [3]:
first_column = [python_2d_list[i][0] for i in range(len(python_2d_list))]

In [4]:
np_2d_array = np.array([[1, 3, 5],
                        [2, 4, 6], 
                        [7, 9, 11]])

In [5]:
first_columm = np_2d_array[:, 0]

### NumPy array performance

In [6]:
mixed_type_list = ["one", 2, 3.14]

In [7]:
mixed_type_array = np.array(["one", 2, 3.14])

In [8]:
print(mixed_type_array)

['one' '2' '3.14']


In [9]:
integer_array = np.array([1, 2, 3])

In [10]:
integer_array.dtype

dtype('int64')

In [11]:
array_to_fill = np.zeros(1000)

In [12]:
random_int_array = np.random.randint(1, 100000, 100000)
random_int_list = list(random_int_array)

In [13]:
%%timeit -r 7 -n 100
sum(random_int_list)



1.63 ms ± 35.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%%timeit -r 7 -n 100
np.sum(random_int_array)

57 μs ± 5.76 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [15]:
random_int_array_32 = random_int_array.astype(np.int32)

In [16]:
random_int_array.dtype

dtype('int32')

In [17]:
random_int_array_32.nbytes

400000

In [18]:
random_int_array.nbytes

400000

In [19]:
small_array = np.array([1, 3, 5], dtype=np.int16)

### Parallel array operations with Dask

In [23]:
import dask.array as da

In [34]:
large_np_array = np.random.randint(1, 100000, 1000000000)

In [35]:
%%timeit -r 1 -n 7
np.max(large_np_array)

159 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 7 loops each)


In [36]:
large_dask_array = da.random.randint(1, 100000, 1000000000)

In [37]:
array_max = large_dask_array.max()

In [38]:
%%timeit -r 1 -n 7
array_max = large_dask_array.max()
array_max.compute()

880 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 7 loops each)


In [39]:
from dask.distributed import Client

client = Client(n_workers=4)
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 4
Total threads: 16,Total memory: 31.77 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:55561,Workers: 0
Dashboard: http://127.0.0.1:8787/status,Total threads: 0
Started: Just now,Total memory: 0 B

0,1
Comm: tcp://127.0.0.1:55585,Total threads: 4
Dashboard: http://127.0.0.1:55587/status,Memory: 7.94 GiB
Nanny: tcp://127.0.0.1:55564,
Local directory: C:\Users\EVERTO~1.BAN\AppData\Local\Temp\dask-scratch-space\worker-hpw5ve7u,Local directory: C:\Users\EVERTO~1.BAN\AppData\Local\Temp\dask-scratch-space\worker-hpw5ve7u

0,1
Comm: tcp://127.0.0.1:55581,Total threads: 4
Dashboard: http://127.0.0.1:55582/status,Memory: 7.94 GiB
Nanny: tcp://127.0.0.1:55565,
Local directory: C:\Users\EVERTO~1.BAN\AppData\Local\Temp\dask-scratch-space\worker-3mpbmz65,Local directory: C:\Users\EVERTO~1.BAN\AppData\Local\Temp\dask-scratch-space\worker-3mpbmz65

0,1
Comm: tcp://127.0.0.1:55590,Total threads: 4
Dashboard: http://127.0.0.1:55591/status,Memory: 7.94 GiB
Nanny: tcp://127.0.0.1:55566,
Local directory: C:\Users\EVERTO~1.BAN\AppData\Local\Temp\dask-scratch-space\worker-hra0rko9,Local directory: C:\Users\EVERTO~1.BAN\AppData\Local\Temp\dask-scratch-space\worker-hra0rko9

0,1
Comm: tcp://127.0.0.1:55584,Total threads: 4
Dashboard: http://127.0.0.1:55586/status,Memory: 7.94 GiB
Nanny: tcp://127.0.0.1:55567,
Local directory: C:\Users\EVERTO~1.BAN\AppData\Local\Temp\dask-scratch-space\worker-3h4pwc2h,Local directory: C:\Users\EVERTO~1.BAN\AppData\Local\Temp\dask-scratch-space\worker-3h4pwc2h


In [40]:
large_dask_array = da.random.randint(1, 100000, 1000000000)

In [41]:
array_max = large_dask_array.max()

In [42]:
%%time
array_max.compute()

CPU times: total: 109 ms
Wall time: 1.07 s


np.int32(99999)