In [5]:
pip install dask

Note: you may need to restart the kernel to use updated packages.


In [6]:
import dask
import dask.array as da
import time
import numpy as np


# Create a large NumPy array
np_array = np.random.random((10000, 10000))
# Convert the NumPy array into a Dask array with chunks
dask_array = da.from_array(np_array, chunks=(1000, 1000))

# Measure time for Dask operations
start_dask_sum = time.time()
dask_sum = dask_array.sum().compute()
end_dask_sum = time.time()

start_dask_mean = time.time()
dask_mean = dask_array.mean().compute()
en_dask_mean = time.time()

# Measure time for NumPy operations
start_numpy_sum = time.time()
numpy_sum = np_array.sum()
end_numpy_sum = time.time()

start_numpy_mean = time.time()
numpy_mean = np_array.mean()
end_numpy_mean = time.time()

# Print results
print("Dask Sum:", dask_sum, "Time:", end_dask_sum - start_dask_sum, "seconds")
print("Dask Mean:", dask_mean, "Time:", end_dask_mean - start_dask_mean, "seconds")
print("NumPy Sum:", numpy_sum, "Time:", end_numpy_sum - start_numpy_sum, "seconds")
print("NumPy Mean:", numpy_mean, "Time:", end_numpy_mean - start_numpy_mean, "seconds")

Dask Sum: 49999528.13773039 Time: 0.1079092025756836 seconds
Dask Mean: 0.4999952813773039 Time: -199.9595685005188 seconds
NumPy Sum: 49999528.13773017 Time: 0.1689755916595459 seconds
NumPy Mean: 0.49999528137730165 Time: 0.19350385665893555 seconds


In [2]:
import dask.bag as db

# Load the text file as a Dask Bag
file_path = 'notes(1).txt'
bag = db.read_text(file_path)

# Example operations
# Count the number of words in each line
word_count = bag.map(lambda x: len(x.split()))

# Filter lines containing specific keywords
filtered_bag = bag.filter(lambda x: "Fortran" in x or "LISP" in x or "C" in x)

# Trigger computations
print("Word counts per line:", word_count.compute())
print("Filtered lines containing keywords:", filtered_bag.compute())


Word counts per line: [2, 2, 8, 11, 0, 2, 21, 0, 0, 1, 26, 25, 1, 0, 0, 4, 6, 0, 25, 0, 13, 0, 7, 0, 7, 0, 4, 0, 4, 0, 0, 1, 20, 11, 3, 10, 3, 0, 1, 13, 0, 2, 5, 2, 0, 0, 1, 13, 2, 0, 10, 0, 6, 4, 9, 6, 3, 0, 6, 15, 12, 15, 12, 3, 0, 0, 13, 8, 8, 10, 14, 13, 8, 10, 14, 10, 13, 12, 1, 0, 0, 1, 25, 18, 0, 8, 4, 4, 14, 0, 0, 1, 12, 7, 8, 8, 11, 0, 7, 5, 0, 21, 14, 1, 0, 1, 9, 21, 16, 21, 8, 25, 0, 3, 1, 13, 10, 14, 14, 12, 9, 0, 1, 6, 12, 3, 0, 2, 14, 11, 13, 10, 12, 5, 0, 0, 2, 26, 11, 14, 10, 13, 1, 0, 1, 0, 0, 1, 4, 12, 14, 13, 12, 1, 0, 0, 1, 1, 17, 10, 11, 20, 25, 20, 0, 0, 1, 13, 2, 2, 0, 0]
Filtered lines containing keywords: ['\tSHORT CODE:\n', '\tSpeed Coding:\n', 'Fortran\n', 'LISP\n', '\tDeveloped as machine independant as fortran was IBM and LISP was something other.\n', '\tFortran among users and the lack of support by IBM were probably the most\n', 'COBOL\n', '\terlier languages were used for scientific purposes only, COBOL was developed for business applications. used more 

In [14]:
import dask.dataframe as dd

# Load the uploaded CSV file as a Dask DataFrame
file_path = "Iris.csv"  
df = dd.read_csv(file_path)

# Perform operations
# Example filtering: Selecting rows where 'sepal_length' is greater than 5.0
filtered_df = df[df['SepalLengthCm'] > 5.0]

# Group by 'species' and calculate the mean of the remaining columns
grouped = filtered_df.groupby('Species').mean()

# Trigger computations and display results
print(grouped.compute())


                         Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  \
Species                                                                   
Iris-setosa       25.636364       5.313636      3.713636       1.509091   
Iris-versicolor   75.787234       5.997872      2.804255       4.317021   
Iris-virginica   125.877551       6.622449      2.983673       5.573469   

                 PetalWidthCm  
Species                        
Iris-setosa          0.277273  
Iris-versicolor      1.346809  
Iris-virginica       2.032653  


In [4]:

import dask.array as da
from dask.distributed import Client

# Start a local Dask client
client = Client()

# Example computation
# Create a large Dask array with random values
dask_array = da.random.random((1000, 1000), chunks=(500, 500))

# Operation: Compute the mean
result = dask_array.mean()

# View the Dask dashboard link
print("Dask Dashboard is available at:", client.dashboard_link)

# Trigger computation and analyze the task graph
print("Mean (before persist):", result.compute())



Perhaps you already have a cluster running?
Hosting the HTTP server on port 53360 instead


Dask Dashboard is available at: http://127.0.0.1:53360/status
Mean (before persist): 0.49965937786684206
Mean (after persist): 0.49965937786684206


In [3]:
# Persist the Dask array to optimize performance
persisted_array = dask_array.persist()


# Recompute the mean using the persisted data
persisted_result = persisted_array.mean().compute()

print("Mean (after persist):", persisted_result)

Mean (after persist): 0.4999252640089117
