Load data with pandas from a CSV file

In [6]:
import pandas as pd
import time

# Benchmark: load the whole pipe-delimited CSV snapshot into a pandas DataFrame.
# perf_counter() is a monotonic, high-resolution clock, so the measured interval
# cannot be distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# low_memory=False turns off pandas' internal chunked parsing, so each column's
# dtype is inferred from the complete file instead of chunk by chunk.
df = pd.read_csv(
    r"C:\Users\es422\Documents\xentity\BLM\MLRS\Data\Snapshots\MLRS\2025-03-02_MLRS_Full\2025-03-02\consolidated_tables\CR_FULL_BLM_CASE.csv",
    sep='|',
    low_memory=False,
)

# Stop the clock before printing so console output is not part of the timing.
elapsed = time.perf_counter() - start_time

print(f"Number of rows in file: {df.shape[0]}")
print(f"Time taken: {elapsed:.2f} seconds")

Number of rows in file: 6187847
Time taken: 86.26 seconds


Load data with pandas from a Parquet file

In [7]:
import pandas as pd
import time  # imported here as well so the cell does not rely on an earlier cell

# Benchmark: load the Parquet version of the same table into a pandas DataFrame.
# perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring an elapsed interval.
start_time = time.perf_counter()

df = pd.read_parquet(
    r"C:\Users\es422\Documents\xentity\BLM\MLRS\Data\Snapshots\MLRS\2025-03-02_MLRS_Full\2025-03-02\consolidated_tables\CR_FULL_BLM_CASE.parquet"
)

# Stop the clock before printing so console output is not part of the timing.
elapsed = time.perf_counter() - start_time

print(f"Number of rows in file: {df.shape[0]}")
print(f"Time taken: {elapsed:.2f} seconds")

Number of rows in file: 6187847
Time taken: 39.09 seconds


Count rows with PySpark from a Parquet file

In [8]:
import os
import sys
import time  # imported here as well so the cell does not rely on an earlier cell

# Set Hadoop environment variables
hadoop_home = r'C:\hadoop'
hadoop_bin = r'C:\hadoop\bin'
os.environ['HADOOP_HOME'] = hadoop_home

# Append the Hadoop bin directory to PATH only when it is not already listed,
# so re-running this cell does not keep growing PATH with duplicate entries.
# os.pathsep is the platform's PATH separator (';' on Windows).
current_path = os.environ.get('PATH', '')
if hadoop_bin not in current_path.split(os.pathsep):
    os.environ['PATH'] = (
        current_path + os.pathsep + hadoop_bin if current_path else hadoop_bin
    )

# Create SparkSession with datetime rebase configuration
from pyspark.sql import SparkSession

# NOTE(review): getOrCreate() hands back an already-running session if one
# exists; in that case spark.driver.memory presumably has no effect - confirm
# by restarting the kernel before benchmarking.
spark = (
    SparkSession.builder
    .appName("LocalSpark")
    .config("spark.driver.memory", "16g")
    .config("spark.sql.parquet.datetimeRebaseModeInRead", "LEGACY")
    .config("spark.sql.parquet.datetimeRebaseModeInWrite", "LEGACY")
    .config("spark.sql.parquet.int96RebaseModeInWrite", "LEGACY")
    .getOrCreate()
)

# Session start-up is deliberately excluded: the timer starts only now.
# perf_counter() is monotonic, so the interval is immune to clock adjustments.
start_time = time.perf_counter()

df = spark.read.parquet(
    r"C:\Users\es422\Documents\xentity\BLM\MLRS\Data\Snapshots\MLRS\2025-03-02_MLRS_Full\2025-03-02\consolidated_tables\CR_FULL_BLM_CASE.parquet"
)

# NOTE(review): count() only needs the number of rows, so Spark likely does not
# materialize the column data here; the timing may not be directly comparable
# with the pandas cells, which load the full table - verify before concluding.
row_count = df.count()
elapsed = time.perf_counter() - start_time

print(f"Number of rows in file: {row_count}")
print(f"Time taken: {elapsed:.2f} seconds")

Number of rows in file: 6187847
Time taken: 3.76 seconds


Load data with Dask from a CSV file

In [9]:
import dask.dataframe as dd
import time  # imported here as well so the cell does not rely on an earlier cell

# Benchmark: count the rows of the pipe-delimited CSV snapshot with Dask.
# perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring an elapsed interval.
start_time = time.perf_counter()

# Every column is read as 'object', so no dtype inference takes place.
df = dd.read_csv(
    r"C:\Users\es422\Documents\xentity\BLM\MLRS\Data\Snapshots\MLRS\2025-03-02_MLRS_Full\2025-03-02\consolidated_tables\CR_FULL_BLM_CASE.csv",
    sep='|',
    dtype='object',
    low_memory=False,
)

# The Dask DataFrame is lazy: the row count is only evaluated by .compute().
row_count = df.shape[0].compute()
elapsed = time.perf_counter() - start_time

print(f"Number of rows: {row_count}")
print(f"Time taken: {elapsed:.2f} seconds")

Number of rows: 6187847
Time taken: 101.83 seconds


Load data with Dask from a Parquet file

In [10]:
import dask.dataframe as dd
import time  # imported here as well so the cell does not rely on an earlier cell

# Benchmark: count the rows of the Parquet version of the table with Dask.
# perf_counter() is monotonic and high-resolution, which makes it the
# appropriate clock for measuring an elapsed interval.
start_time = time.perf_counter()

df = dd.read_parquet(
    r"C:\Users\es422\Documents\xentity\BLM\MLRS\Data\Snapshots\MLRS\2025-03-02_MLRS_Full\2025-03-02\consolidated_tables\CR_FULL_BLM_CASE.parquet"
)

# The Dask DataFrame is lazy: the row count is only evaluated by .compute().
row_count = df.shape[0].compute()
elapsed = time.perf_counter() - start_time

print(f"Number of rows: {row_count}")
print(f"Time taken: {elapsed:.2f} seconds")

Number of rows: 6187847
Time taken: 21.30 seconds
