# **Apply Big Data Handling Strategies**

In [None]:
# 🔽 STEP 1: Install kaggle CLI
!pip install -q kaggle

# 🔽 STEP 2: Upload kaggle.json API token
from google.colab import files
print("Please upload your kaggle.json file:")
files.upload()

# 🔽 STEP 3: Set up Kaggle CLI config
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 🔽 STEP 4: Download your desired dataset
# Replace with your actual competition or dataset name
!kaggle datasets download mkechinov/ecommerce-behavior-data-from-multi-category-store

!unzip ecommerce-behavior-data-from-multi-category-store.zip
!ls

In [None]:
import pandas as pd
import time
import dask.dataframe as dd
import polars as pl

# Constants
# Input file, the event type being averaged, and the column that carries the value.
FILENAME = '2019-Oct.csv'
TARGET_EVENT_TYPE = 'purchase'
MEASUREMENT_COLUMN = 'price'

# Only these columns are read from the CSV by the strategies below.
USECOLS = ['event_type', MEASUREMENT_COLUMN]

# Rows per chunk for the chunked-processing strategy.
CHUNKSIZE = 10 ** 5

# Compact dtypes applied at read time by the dtype-optimised strategies.
DTYPE_MAP = dict(event_type='category', price='float32')

In [None]:
def print_strategy_results(name, mean_price, mem_usage, exec_time, throughput):
    """
    Print a fixed-layout report for one strategy and return the same figures.

    Parameters:
        name: label shown in the report header and stored under "strategy".
        mean_price: mean purchase price, printed with two decimals.
        mem_usage: memory footprint in MB, printed with two decimals.
        exec_time: elapsed time in seconds, printed with two decimals.
        throughput: rows per second, printed with no decimals.

    Returns:
        dict with keys "strategy", "mean_price", "memory_usage_mb",
        "execution_time_sec" and "throughput_rows_per_sec" holding the
        arguments unchanged.
    """
    divider = "-" * 50

    print(f"📊 {name}")
    print(divider)
    print(f"Mean Purchase Price : {mean_price:.2f}")
    print(f"Memory Usage        : {mem_usage:.2f} MB")
    print(f"Execution Time      : {exec_time:.2f} seconds")
    print(f"Throughput          : {throughput:.0f} rows/sec")
    # Closing rule followed by one blank line.
    print(divider, end="\n\n")

    summary = {}
    summary["strategy"] = name
    summary["mean_price"] = mean_price
    summary["memory_usage_mb"] = mem_usage
    summary["execution_time_sec"] = exec_time
    summary["throughput_rows_per_sec"] = throughput
    return summary

def compute_mean_purchase_price(df):
    """
    Return the mean of MEASUREMENT_COLUMN over the rows of `df` whose
    'event_type' equals TARGET_EVENT_TYPE.
    """
    is_target_event = df['event_type'] == TARGET_EVENT_TYPE
    return df.loc[is_target_event, MEASUREMENT_COLUMN].mean()

# Collects the summary dict returned by print_strategy_results for every strategy;
# the plotting cell turns this list into a DataFrame. Each strategy cell appends
# to it, so re-running a strategy cell without re-running this one adds a
# duplicate entry.
results = []

In [None]:
# 🧪 Load Sample for Inspection
# Read only the first rows: the report below needs column names, dtypes and a
# per-column memory profile, none of which require holding the whole file
# (with every column) in memory.
INSPECTION_ROWS = 100_000
df_sample = pd.read_csv(FILENAME, nrows=INSPECTION_ROWS)
print("📊" + "="*40)
print("     🔍 DATASET INSPECTION REPORT")
print("📊" + "="*40 + "\n")
print("🧾 Shape (Rows, Columns):")
print(f"    ➤ {df_sample.shape[0]} rows, {df_sample.shape[1]} columns\n")
print("📌 Column Names:")
print("    ➤ " + "\n    ➤ ".join(df_sample.columns.tolist()) + "\n")
print("⚙️ Data Types:")
for col, dtype in df_sample.dtypes.items():
    print(f"    ➤ {col.ljust(20)} : {dtype}")
print("\n💾 Memory Usage (MB):")
# deep=True so object (string) columns are measured by content, not pointer size.
mem_usage = df_sample.memory_usage(deep=True) / (1024 ** 2)
for col, usage in mem_usage.items():
    print(f"    ➤ {col.ljust(20)} : {usage:.4f} MB")
print(f"\n    🧮 Total Memory : {mem_usage.sum():.4f} MB\n")

In [None]:
# 📊 Strategy 1: Load Less Data
# The timer covers the read as well as the aggregation, so the elapsed time and
# throughput are comparable with the chunked, Dask and Polars cells, which also
# time their reads. perf_counter is a monotonic high-resolution clock.
start = time.perf_counter()
df_lite = pd.read_csv(FILENAME, usecols=USECOLS, nrows=100_000)
mean_lite = compute_mean_purchase_price(df_lite)
time_lite = time.perf_counter() - start
mem_lite = df_lite.memory_usage(deep=True).sum() / (1024 ** 2)
throughput_lite = len(df_lite) / time_lite if time_lite > 0 else float('nan')
results.append(print_strategy_results("Strategy 1: Load Less Data", mean_lite, mem_lite, time_lite, throughput_lite))

In [None]:
# 📊 Strategy 2: Optimize Data Types
# Same rows and columns as Strategy 1, but parsed straight into the compact
# dtypes from DTYPE_MAP. The read is inside the timed region so the elapsed
# time is comparable with the cells that time their own reads.
start = time.perf_counter()
df_optimized = pd.read_csv(FILENAME, usecols=USECOLS, dtype=DTYPE_MAP, nrows=100_000)
mean_opt = compute_mean_purchase_price(df_optimized)
time_opt = time.perf_counter() - start
mem_opt = df_optimized.memory_usage(deep=True).sum() / (1024 ** 2)
throughput_opt = len(df_optimized) / time_opt if time_opt > 0 else float('nan')
results.append(print_strategy_results("Strategy 2: Optimize Data Types", mean_opt, mem_opt, time_opt, throughput_opt))

In [None]:
# 📊 Strategy 3: Sampling
# Draws a reproducible 1% sample (fixed random_state) from df_optimized, which
# the Strategy 2 cell must have created first. Drawing the sample is the work
# this strategy does, so it sits inside the timed region.
start = time.perf_counter()
df_sampled = df_optimized.sample(frac=0.01, random_state=42)
mean_samp = compute_mean_purchase_price(df_sampled)
time_samp = time.perf_counter() - start
mem_samp = df_sampled.memory_usage(deep=True).sum() / (1024 ** 2)
throughput_samp = len(df_sampled) / time_samp if time_samp > 0 else float('nan')
results.append(print_strategy_results("Strategy 3: Sampling", mean_samp, mem_samp, time_samp, throughput_samp))

In [None]:
# 📊 Strategy 4: Chunked Processing
# Streams the file CHUNKSIZE rows at a time and keeps only running totals, so
# at most one chunk is held in memory at once.
total_sum = 0.0    # running sum of purchase prices, accumulated as a Python float
total_count = 0    # purchase rows with a non-null price (denominator of the mean)
total_rows = 0     # every row read, purchases or not (numerator of the throughput)
peak_mem = 0       # largest single-chunk footprint seen, in MB
start = time.perf_counter()

for chunk in pd.read_csv(FILENAME, usecols=USECOLS, dtype=DTYPE_MAP, chunksize=CHUNKSIZE):
    total_rows += len(chunk)
    purchase_prices = chunk.loc[chunk['event_type'] == TARGET_EVENT_TYPE, MEASUREMENT_COLUMN]
    # float(): widen each float32 partial sum before adding it to the total.
    total_sum += float(purchase_prices.sum())
    # count() ignores nulls just like sum() does, so sum / count matches what
    # Series.mean() reports for the same rows.
    total_count += int(purchase_prices.count())
    mem = chunk.memory_usage(deep=True).sum() / (1024 ** 2)
    peak_mem = max(peak_mem, mem)

# No purchases at all -> NaN instead of a ZeroDivisionError.
mean_chunk = total_sum / total_count if total_count else float('nan')
time_chunk = time.perf_counter() - start
# Throughput is rows processed per second, the same definition the other
# strategies use (they divide the number of rows loaded by the elapsed time).
throughput_chunk = total_rows / time_chunk if time_chunk > 0 else float('nan')
results.append(print_strategy_results("Strategy 4: Chunked Processing", mean_chunk, peak_mem, time_chunk, throughput_chunk))

In [None]:
#  📊 Strategy 5a: Parallel Processing with Dask
def run_dask():
    """
    Compute the mean purchase price, row count and memory footprint with Dask.

    The three quantities are built lazily and evaluated by a single
    dask.compute call, so the timed region does not re-evaluate the CSV read
    once per quantity.

    Returns:
        dict with keys "mean" (mean of MEASUREMENT_COLUMN over rows whose
        event_type equals TARGET_EVENT_TYPE), "shape" ((rows, columns)),
        "mem" (deep memory usage in MB), "time" (elapsed seconds) and
        "throughput" (rows per second, NaN if no time elapsed).
    """
    # Parent package of dask.dataframe; imported here because the notebook's
    # import cell only binds `dd`.
    import dask

    start = time.perf_counter()
    ddf = dd.read_csv(FILENAME, usecols=USECOLS)

    # Lazy expressions only: nothing is read until dask.compute below.
    purchases = ddf[ddf['event_type'] == TARGET_EVENT_TYPE]
    lazy_mean = purchases[MEASUREMENT_COLUMN].mean()
    lazy_rows = ddf[MEASUREMENT_COLUMN].size  # element count of one column == row count
    lazy_mem_bytes = ddf.memory_usage(index=True, deep=True).sum()

    mean, n_rows, mem_bytes = dask.compute(lazy_mean, lazy_rows, lazy_mem_bytes)

    shape = (int(n_rows), len(ddf.columns))
    mem = mem_bytes / (1024 ** 2)
    t = time.perf_counter() - start
    throughput = shape[0] / t if t > 0 else float('nan')
    return {"mean": mean, "shape": shape, "mem": mem, "time": t, "throughput": throughput}

# Run the Dask benchmark once, then report it and record it for the final charts.
dask_result = run_dask()
results.append(
    print_strategy_results(
        "Strategy 5a: Dask Processing",
        dask_result["mean"],
        dask_result["mem"],
        dask_result["time"],
        dask_result["throughput"],
    )
)

In [None]:
# Strategy 5b: Polars
def run_polars():
    """
    Compute the mean purchase price, shape and estimated size with Polars.

    Returns:
        dict with keys "mean" (mean of MEASUREMENT_COLUMN, cast to Float64,
        over rows whose event_type equals TARGET_EVENT_TYPE), "shape"
        (the frame's shape), "mem" (estimated size in MB), "time" (elapsed
        seconds) and "throughput" (rows per second, NaN if no time elapsed).
    """
    started_at = time.time()

    frame = pl.read_csv(FILENAME, columns=USECOLS)
    is_target_event = pl.col('event_type') == TARGET_EVENT_TYPE
    target_prices = frame.filter(is_target_event)[MEASUREMENT_COLUMN]
    mean = target_prices.cast(pl.Float64).mean()

    shape = frame.shape
    mem = frame.estimated_size() / (1024 ** 2)

    elapsed = time.time() - started_at
    if elapsed > 0:
        throughput = shape[0] / elapsed
    else:
        throughput = float('nan')

    return {"mean": mean, "shape": shape, "mem": mem, "time": elapsed, "throughput": throughput}

# Run the Polars benchmark once, then report it and record it for the final charts.
polars_result = run_polars()
results.append(
    print_strategy_results(
        "Strategy 5b: Polars Processing",
        polars_result["mean"],
        polars_result["mem"],
        polars_result["time"],
        polars_result["throughput"],
    )
)

In [None]:
# 📊 Final Comparison Table
!pip install pandas dask polars tabulate
!pip install pandas dask polars tabulate matplotlib

from tabulate import tabulate

table_data = [
    ["Load Less Data", mem_lite, time_lite, throughput_lite],
    ["Optimize Data Types", mem_opt, time_opt, throughput_opt],
    ["Sampling", mem_samp, time_samp, throughput_samp],
    ["Chunked Processing", peak_mem, time_chunk, throughput_chunk],
    ["Dask", dask_result["mem"], dask_result["time"], dask_result["throughput"]],
    ["Polars", polars_result["mem"], polars_result["time"], polars_result["throughput"]]
]

headers = ["Strategy", "Memory Usage (MB)", "Execution Time (s)", "Throughput (rows/sec)"]

print("\n📈 FINAL COMPARISON TABLE")
print(tabulate(table_data, headers=headers, tablefmt="grid"))

In [None]:
import matplotlib.pyplot as plt
results_df = pd.DataFrame(results)

# One bar chart per metric, drawn in this order.
# Each entry: (results_df column, bar colour, y-axis label, chart title).
chart_specs = [
    # Memory Usage
    ('memory_usage_mb', 'skyblue', 'Memory Usage (MB)', '📊 Memory Usage by Strategy'),
    # Execution Time
    ('execution_time_sec', 'salmon', 'Execution Time (seconds)', '⏱️ Execution Time by Strategy'),
    # Throughput
    ('throughput_rows_per_sec', 'lightgreen', 'Throughput (rows/sec)', '🚀 Throughput by Strategy'),
]

for metric_column, bar_colour, y_label, chart_title in chart_specs:
    plt.figure(figsize=(10, 6))
    plt.bar(results_df['strategy'], results_df[metric_column], color=bar_colour)
    # Strategy names are long; tilt them so they do not overlap.
    plt.xticks(rotation=45, ha='right')
    plt.ylabel(y_label)
    plt.title(chart_title)
    plt.tight_layout()
    plt.show()