# Big Data Scalability Demo: Dask & PySpark

This notebook demonstrates scalable analytics using Dask and PySpark on a synthetic clickstream dataset.

In [None]:
import numpy as np
import pandas as pd
import time

# Build a reproducible synthetic clickstream table.
N = 1_000_000  # row count; raise or lower it to scale the demo
np.random.seed(42)

# The random columns are drawn in this exact order (user_id, session_id,
# clicks) so the seeded global random stream yields the same values on each run.
data = pd.DataFrame({
    'user_id': np.random.randint(low=1, high=10000, size=N),
    'session_id': np.random.randint(low=1, high=100000, size=N),
    'clicks': np.random.poisson(lam=3, size=N),
    # One event per second, starting at midnight on 2021-01-01.
    'timestamp': pd.date_range(start='2021-01-01', periods=N, freq='s'),
})

# Preview the first rows as the cell output.
data.head()


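The cells below run the same per-user click aggregation with pandas, Dask, and PySpark. Run the notebook locally; the Dask and PySpark cells print an install hint if their package is missing.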


In [None]:
# Pandas baseline: total clicks per user, computed in memory.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; time.time() is wall-clock time and can be too coarse (or be
# adjusted mid-run) for a sub-second operation like this one.
start = time.perf_counter()
result_pandas = data.groupby('user_id')['clicks'].sum().reset_index()
end = time.perf_counter()
print(f'Pandas processing time: {end - start:.2f} seconds')

# Last expression of the cell, so Jupyter renders the preview.
result_pandas.head()

No Parquet files found. Generate data with the notebook first.


In [None]:
# Dask benchmark: the same per-user click total as the pandas cell, computed
# on a Dask DataFrame split into 4 partitions.
try:
    import dask.dataframe as dd
except ImportError:
    # The try block covers only the import, so an ImportError raised later
    # (while building or computing the result) is not misreported as
    # "Dask not installed".
    dd = None
    print('⚠️ Dask not installed. Please install with: pip install dask')

# Stays None when Dask is unavailable so the final expression below is safe.
result_dask = None
if dd is not None:
    ddf = dd.from_pandas(data, npartitions=4)
    # Timing starts after partitioning, so only the groupby/sum is measured.
    # perf_counter() is a monotonic, high-resolution clock meant for timing.
    start = time.perf_counter()
    result_dask = ddf.groupby('user_id')['clicks'].sum().compute()
    end = time.perf_counter()
    print(f'Dask processing time: {end - start:.2f} seconds')

# Jupyter renders only a cell's last top-level expression; a bare
# `result_dask.head()` nested inside try/if is evaluated and then discarded.
result_dask.head() if result_dask is not None else None

In [None]:
# PySpark benchmark: the same per-user click total, run on a local Spark
# session that uses all available cores ('local[*]').
try:
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import sum as spark_sum
except ImportError:
    # The try block covers only the imports, so an ImportError raised while the
    # job runs is not misreported as "PySpark not installed".
    SparkSession = None
    print('⚠️ PySpark not installed. Please install with: pip install pyspark')

if SparkSession is not None:
    spark = SparkSession.builder.master('local[*]').appName('BigDataDemo').getOrCreate()
    try:
        sdf = spark.createDataFrame(data)

        # perf_counter() is a monotonic, high-resolution clock meant for timing.
        start = time.perf_counter()
        result_spark = sdf.groupBy('user_id').agg(spark_sum('clicks').alias('total_clicks'))
        # show() stays inside the timed region: it is the call that prints the
        # aggregated rows, so the measurement includes producing them.
        result_spark.show(5)
        end = time.perf_counter()
        print(f'PySpark processing time: {end - start:.2f} seconds')
    finally:
        # Always shut the session down, even if the conversion or the job
        # fails; otherwise the local Spark session is left running.
        # NOTE: result_spark belongs to this session — presumably it cannot be
        # evaluated again after stop(); confirm before reusing it in later cells.
        spark.stop()