# Exercise 1: Introduction to dask.delayed

In [18]:
pip install dask

Note: you may need to restart the kernel to use updated packages.


In [22]:
from dask import delayed, compute


# Wrap functions with delayed

@delayed

def square(x):

    return x ** 2


@delayed

def sum_of_squares(numbers):

    return sum(numbers)


# Parallel execution

numbers = range(10)

squared = [square(x) for x in numbers]

result = sum_of_squares(squared)


# Compute the final result

final_result = compute(result)

print("Sum of squares:", final_result)


Sum of squares: (285,)


# Exercise 2: Dask Arrays vs. NumPy

In [25]:
pip install dask numpy

Note: you may need to restart the kernel to use updated packages.


In [27]:
import numpy as np
import dask.array as da
import time

# Step 1: Create a large random array using NumPy and compute the sum of squares
n = 10**7  # Size of the array

# Timing NumPy
start_time_numpy = time.time()
numpy_array = np.random.random(n)
numpy_result = np.sum(numpy_array ** 2)  # Compute sum of squares
end_time_numpy = time.time()

numpy_time = end_time_numpy - start_time_numpy
print(f"NumPy Sum of Squares: {numpy_result}")
print(f"NumPy Execution Time: {numpy_time:.4f} seconds")

# Step 2: Create the same array using Dask Arrays (with chunk size of 10^6)
chunk_size = 10**6

# Timing Dask
start_time_dask = time.time()
dask_array = da.random.random(n, chunks=chunk_size)
dask_result = da.sum(dask_array ** 2).compute()  # Compute sum of squares
end_time_dask = time.time()

dask_time = end_time_dask - start_time_dask
print(f"Dask Sum of Squares: {dask_result}")
print(f"Dask Execution Time: {dask_time:.4f} seconds")

# Step 3: Compare Results
performance_improvement = numpy_time / dask_time if dask_time < numpy_time else dask_time / numpy_time
print(f"Performance Ratio (slower method vs faster method): {performance_improvement:.2f}")


NumPy Sum of Squares: 3332503.7232706896
NumPy Execution Time: 0.1292 seconds
Dask Sum of Squares: 3335073.849102824
Dask Execution Time: 0.0615 seconds
Performance Ratio (slower method vs faster method): 2.10


# task 03:

In [1]:
pip install dask[complete] pandas


Collecting pyarrow-hotfix (from dask[complete])
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Installing collected packages: pyarrow-hotfix
Successfully installed pyarrow-hotfix-0.6
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np

# Set the random seed for reproducibility
np.random.seed(42)

# Generate synthetic data
n_rows = 10**7  # 10 million rows (approximately 1GB depending on the number of columns)
n_cols = 5

data = {
    'Category': np.random.choice(['A', 'B', 'C', 'D'], size=n_rows),
    'Value': np.random.rand(n_rows) * 100,
    'Score': np.random.randint(1, 100, size=n_rows)
}

df = pd.DataFrame(data)

# Save to CSV
df.to_csv('large_data.csv', index=False)


In [5]:
import dask.dataframe as dd

# Load CSV into Dask DataFrame
dask_df = dd.read_csv('large_data.csv')

# Check the first few rows to ensure the data is loaded correctly
print(dask_df.head())


  Category      Value  Score
0        C  51.665332     27
1        D  93.733723     87
2        A  45.094102     80
3        C  67.448808     30
4        C  18.884791     73


In [7]:
# Compute the mean of the 'Value' column
mean_value = dask_df['Value'].mean().compute()
print(f"Mean of Value column: {mean_value}")


Mean of Value column: 50.00310602031473


In [9]:
# Filter rows where 'Score' > 50
filtered_dask_df = dask_df[dask_df['Score'] > 50]
print(filtered_dask_df.head())


  Category      Value  Score
1        D  93.733723     87
2        A  45.094102     80
4        C  18.884791     73
8        C  33.391188     55
9        B  72.793317     64


In [11]:
# Group by 'Category' and compute summary statistics
grouped_dask_df = dask_df.groupby('Category').agg({
    'Value': 'mean',
    'Score': 'mean'
}).compute()

print(grouped_dask_df)


              Value      Score
Category                      
C         49.999157  49.978683
D         49.994485  49.981524
A         50.012384  49.992427
B         50.006399  50.014610


In [13]:
import time

# Load the data with pandas
start_time = time.time()
pandas_df = pd.read_csv('large_data.csv')

# Compute the mean of the 'Value' column
mean_value_pandas = pandas_df['Value'].mean()

# Filter rows where 'Score' > 50
filtered_pandas_df = pandas_df[pandas_df['Score'] > 50]

# Group by 'Category' and compute summary statistics
grouped_pandas_df = pandas_df.groupby('Category').agg({
    'Value': 'mean',
    'Score': 'mean'
})

end_time = time.time()

# Print the results
print(f"Mean of Value column (Pandas): {mean_value_pandas}")
print(filtered_pandas_df.head())
print(grouped_pandas_df)
print(f"Pandas execution time: {end_time - start_time} seconds")


Mean of Value column (Pandas): 50.00310602031466
  Category      Value  Score
1        D  93.733723     87
2        A  45.094102     80
4        C  18.884791     73
8        C  33.391188     55
9        B  72.793317     64
              Value      Score
Category                      
A         50.012384  49.992427
B         50.006399  50.014610
C         49.999157  49.978683
D         49.994485  49.981524
Pandas execution time: 2.697392702102661 seconds


In [15]:
# Dask operations execution time
start_time_dask = time.time()

# Compute the mean of the 'Value' column
mean_value_dask = dask_df['Value'].mean().compute()

# Filter rows where 'Score' > 50
filtered_dask_df = dask_df[dask_df['Score'] > 50]

# Group by 'Category' and compute summary statistics
grouped_dask_df = dask_df.groupby('Category').agg({
    'Value': 'mean',
    'Score': 'mean'
}).compute()

end_time_dask = time.time()

# Print the results
print(f"Mean of Value column (Dask): {mean_value_dask}")
print(filtered_dask_df.head())
print(grouped_dask_df)
print(f"Dask execution time: {end_time_dask - start_time_dask} seconds")


Mean of Value column (Dask): 50.00310602031473
  Category      Value  Score
1        D  93.733723     87
2        A  45.094102     80
4        C  18.884791     73
8        C  33.391188     55
9        B  72.793317     64
              Value      Score
Category                      
C         49.999157  49.978683
D         49.994485  49.981524
A         50.012384  49.992427
B         50.006399  50.014610
Dask execution time: 2.318621873855591 seconds
