In [2]:
import pandas as pd
import fireducks.pandas as fd_pandas
import numpy as np
import time
import tempfile
import os
import io
import re

In [3]:
# Measure execution time for Pandas read
start = time.perf_counter()
pd_df = pd.read_csv("fake_users.csv")
pandas_time = time.perf_counter() - start
print(f"Pandas read time: {pandas_time:.4f} seconds")

# Measure execution time for Fireducks read
start = time.perf_counter()
fd_df = fd_pandas.read_csv("fake_users.csv")
fireducks_time = time.perf_counter() - start
print(f"Fireducks read time: {fireducks_time:.4f} seconds")

# Comparison
if pandas_time < fireducks_time:
    speedup = fireducks_time / pandas_time
    print(f"Pandas was {speedup:.2f}x faster")
else:
    speedup = pandas_time / fireducks_time
    print(f"Fireducks was {speedup:.2f}x faster")

# --- Single-line validation (compare DataFrames) ---
print("\n✅ Outputs match within tolerance" if np.allclose(
    pd_df.sort_values("account_balance").reset_index(drop=True)["account_balance"].values,
    fd_df.sort_values("account_balance").reset_index(drop=True)["account_balance"].values,
    rtol=1e-5, atol=1e-8
) else "\n❌ Outputs do not match")

Pandas read time: 27.8696 seconds
Fireducks read time: 0.0427 seconds
Fireducks was 652.75x faster

✅ Outputs match within tolerance


In [7]:
# Create a temporary file for the test
temp_file = tempfile.NamedTemporaryFile(delete=False)
temp_file_path = temp_file.name
temp_file.close()  # Close it so we can write to it later

# Pandas to_csv
start = time.perf_counter()
pd_df.to_csv(temp_file_path, index=False)
pandas_time = time.perf_counter() - start
print(f"Pandas to_csv time: {pandas_time:.4f} seconds")

# Fireducks to_csv
start = time.perf_counter()
fd_df.to_csv(temp_file_path, index=False)
fireducks_time = time.perf_counter() - start
print(f"Fireducks to_csv time: {fireducks_time:.4f} seconds")

# Comparison
if pandas_time < fireducks_time:
    speedup = fireducks_time / pandas_time
    print(f"Pandas was {speedup:.2f}x faster")
else:
    speedup = pandas_time / fireducks_time
    print(f"Fireducks was {speedup:.2f}x faster")

# Clean up the temporary file
os.remove(temp_file_path)


Pandas to_csv time: 26.1454 seconds
Fireducks to_csv time: 5.2534 seconds
Fireducks was 4.98x faster


In [13]:
# --- Pandas .info() ---
pd_buffer = io.StringIO()
pd_df.info(buf=pd_buffer)
pd_info_output = pd_buffer.getvalue()

# --- Fireducks .info() ---
fd_buffer = io.StringIO()
fd_df.info(buf=fd_buffer)
fd_info_output = fd_buffer.getvalue()

# --- Extract and compare memory usage ---
def extract_memory_usage(info_output):
    # Match lines like "memory usage: 1.2 MB" or "memory usage: 88.7+ KB"
    match = re.search(r"memory usage: ([\d.,+]+)\s*(\w+)", info_output.lower())
    if match:
        number_str, unit = match.groups()
        number = float(re.sub(r'[+,]', '', number_str))  # Remove commas or pluses
        unit = unit.lower()
        multiplier = {"bytes": 1, "kb": 1024, "mb": 1024**2, "gb": 1024**3}.get(unit, 1)
        return number * multiplier
    return None

pd_memory = extract_memory_usage(pd_info_output)
fd_memory = extract_memory_usage(fd_info_output)

if pd_memory is not None and fd_memory is not None:
    print("\n📊 Memory Usage Comparison:")
    print(f"Pandas:    {pd_memory / 1024:.2f} KB")
    print(f"Fireducks: {fd_memory / 1024:.2f} KB")

    if pd_memory < fd_memory:
        diff = (fd_memory - pd_memory) / fd_memory * 100
        print(f"✅ Pandas used {diff:.1f}% less memory")
    else:
        diff = (pd_memory - fd_memory) / pd_memory * 100
        print(f"✅ Fireducks used {diff:.1f}% less memory")
else:
    print("⚠️ Could not parse memory usage from one of the outputs.")



📊 Memory Usage Comparison:
Pandas:    356454.40 KB
Fireducks: 352153.60 KB
✅ Fireducks used 1.2% less memory


In [24]:
# Define the list of columns to select
selected_columns = ["name", "email", "account_balance", "is_active"]

# --- Pandas column selection ---
start = time.perf_counter()
pd_selected = pd_df[selected_columns]
pandas_time = time.perf_counter() - start
print(f"Pandas column selection time: {pandas_time:.6f} seconds")

# --- Fireducks column selection ---
start = time.perf_counter()
fd_selected = fd_df[selected_columns]
fireducks_time = time.perf_counter() - start
print(f"Fireducks column selection time: {fireducks_time:.6f} seconds")

# --- Comparison ---
if pandas_time < fireducks_time:
    speedup = fireducks_time / pandas_time
    print(f"✅ Pandas was {speedup:.2f}x faster")
else:
    speedup = pandas_time / fireducks_time
    print(f"✅ Fireducks was {speedup:.2f}x faster")

# --- Single-line validation (sort and compare column values) ---
print("\n✅ Outputs match within tolerance" if np.allclose(
    pd_selected.sort_values("account_balance").reset_index(drop=True)["account_balance"].values,
    fd_selected.sort_values("account_balance").reset_index(drop=True)["account_balance"].values,
    rtol=1e-5, atol=1e-8
) else "\n❌ Outputs do not match")

Pandas column selection time: 0.230647 seconds
Fireducks column selection time: 0.000357 seconds
✅ Fireducks was 645.37x faster

✅ Outputs match within tolerance


In [None]:
# Define filter threshold
balance_threshold = 20000

# --- Pandas filter ---
start = time.perf_counter()
pd_filtered = pd_df[pd_df["account_balance"] > balance_threshold]
pandas_time = time.perf_counter() - start
print(f"Pandas filter time: {pandas_time:.6f} seconds")

# --- Fireducks filter ---
start = time.perf_counter()
fd_filtered = fd_df[fd_df["account_balance"] > balance_threshold]
fireducks_time = time.perf_counter() - start
print(f"Fireducks filter time: {fireducks_time:.6f} seconds")

# --- Comparison ---
if pandas_time < fireducks_time:
    speedup = fireducks_time / pandas_time
    print(f"✅ Pandas was {speedup:.2f}x faster")
else:
    speedup = pandas_time / fireducks_time
    print(f"✅ Fireducks was {speedup:.2f}x faster")

# --- Single-line validation (sort and compare account balances) ---
print("\n✅ Outputs match within tolerance" if np.allclose(
    pd_filtered.sort_values("account_balance").reset_index(drop=True)["account_balance"].values,
    fd_filtered.sort_values("account_balance").reset_index(drop=True)["account_balance"].values,
    rtol=1e-5, atol=1e-8
) else "\n❌ Outputs do not match")

Pandas filter time: 1.067617 seconds
Fireducks filter time: 0.000714 seconds
✅ Fireducks was 1494.57x faster
✅ Outputs match within tolerance


In [None]:
# Define grouping and aggregation columns
group_col = "job_title"
agg_col = "account_balance"

# --- Pandas groupby ---
start = time.perf_counter()
pd_grouped = pd_df.groupby(group_col)[agg_col].mean()
pandas_time = time.perf_counter() - start
print(f"Pandas groupby mean time: {pandas_time:.6f} seconds")

# --- Fireducks groupby ---
start = time.perf_counter()
fd_grouped = fd_df.groupby(group_col)[agg_col].mean()
fireducks_time = time.perf_counter() - start
print(f"Fireducks groupby mean time: {fireducks_time:.6f} seconds")

# --- Comparison ---
if pandas_time < fireducks_time:
    speedup = fireducks_time / pandas_time
    print(f"✅ Pandas was {speedup:.2f}x faster")
else:
    speedup = pandas_time / fireducks_time
    print(f"✅ Fireducks was {speedup:.2f}x faster")

    # --- Single-line validation ---
print("\n✅ Outputs match within tolerance" if np.allclose(
    pd_grouped.sort_index().values, 
    fd_grouped.sort_index().values, 
    rtol=1e-5, atol=1e-8
) else "\n❌ Outputs do not match")

Pandas groupby mean time: 0.539820 seconds
Fireducks groupby mean time: 0.000333 seconds
✅ Fireducks was 1623.31x faster

✅ Outputs match within tolerance


In [27]:
# Define the column to sort by
sort_column = "account_balance"

# --- Pandas sort_values ---
start = time.perf_counter()
pd_sorted = pd_df.sort_values(by=sort_column)
pandas_time = time.perf_counter() - start
print(f"Pandas sort_values time: {pandas_time:.6f} seconds")

# --- Fireducks sort_values ---
start = time.perf_counter()
fd_sorted = fd_df.sort_values(by=sort_column)
fireducks_time = time.perf_counter() - start
print(f"Fireducks sort_values time: {fireducks_time:.6f} seconds")

# --- Comparison ---
if pandas_time < fireducks_time:
    speedup = fireducks_time / pandas_time
    print(f"✅ Pandas was {speedup:.2f}x faster")
else:
    speedup = pandas_time / fireducks_time
    print(f"✅ Fireducks was {speedup:.2f}x faster")

# --- Single-line validation (compare sorted DataFrames) ---
print("\n✅ Outputs match within tolerance" if np.allclose(
    pd_sorted.reset_index(drop=True)[sort_column].values,
    fd_sorted.reset_index(drop=True)[sort_column].values,
    rtol=1e-5, atol=1e-8
) else "\n❌ Outputs do not match")

Pandas sort_values time: 4.739213 seconds
Fireducks sort_values time: 0.000256 seconds
✅ Fireducks was 18489.08x faster

✅ Outputs match within tolerance


In [12]:
# Define the size of the sample data
size = 1000000  # Reduced size for testing to avoid memory issues

# Create smaller subset of data based on the defined size
pd_df_small = pd_df.sample(size)
merge_df_small = pd_df_small[["name", "email", "job_title", "company"]].copy()

# --- Pandas merge with smaller data ---
start = time.perf_counter()
pd_merged = pd_df_small.merge(merge_df_small, on="name", how="inner")
pandas_time = time.perf_counter() - start
print(f"Pandas merge time: {pandas_time:.6f} seconds")

# --- Fireducks merge with same size data ---
start = time.perf_counter()
# Explicitly convert to pandas for Fireducks DataFrame
fd_df_small = fd_df.sample(size).to_pandas()  # Convert to Pandas before merge
fd_merged = fd_df_small.merge(merge_df_small, on="name", how="inner")
fireducks_time = time.perf_counter() - start
print(f"Fireducks merge time: {fireducks_time:.6f} seconds")

# --- Comparison ---
if pandas_time < fireducks_time:
    speedup = fireducks_time / pandas_time
    print(f"✅ Pandas was {speedup:.2f}x faster")
else:
    speedup = pandas_time / fireducks_time
    print(f"✅ Fireducks was {speedup:.2f}x faster")

Pandas merge time: 9.585626 seconds
Fireducks merge time: 9.803472 seconds
✅ Pandas was 1.02x faster


In [14]:
# Define the size of the sample data
size = 1000000  # You can adjust this depending on the available memory

# Create smaller subset of data based on the defined size
pd_df_small = pd_df.sample(size)
merge_df_small = pd_df_small[["name", "email", "job_title", "company"]].copy()

# --- Pandas concat --- (Concatenate along rows, axis 0)
start = time.perf_counter()
pd_concat = pd.concat([pd_df_small, merge_df_small], axis=0)
pandas_time = time.perf_counter() - start
print(f"Pandas concat time: {pandas_time:.6f} seconds")

# --- Fireducks concat --- (Concatenate along rows, axis 0)
start = time.perf_counter()
fd_df_small = fd_df.sample(size).to_pandas()  # Convert to Pandas for concat
fd_concat = pd.concat([fd_df_small, merge_df_small], axis=0)
fireducks_time = time.perf_counter() - start
print(f"Fireducks concat time: {fireducks_time:.6f} seconds")

# --- Comparison ---
if pandas_time < fireducks_time:
    speedup = fireducks_time / pandas_time
    print(f"✅ Pandas was {speedup:.2f}x faster")
else:
    speedup = pandas_time / fireducks_time
    print(f"✅ Fireducks was {speedup:.2f}x faster")


Pandas concat time: 0.579931 seconds
Fireducks concat time: 2.483235 seconds
✅ Pandas was 4.28x faster
