In [1]:
import pandas as pd
import os
import time

# Absolute location of the source CSV that the benchmark below is built from.
csv_file = "C:/Users/Gurkanwal Singh/Documents/DOUGLAS SEM6/Adv Topics in Data Analytics/all_stocks_5yr.csv"

# Parse the whole file into a DataFrame using pandas' default CSV options.
data = pd.read_csv(filepath_or_buffer=csv_file)

In [2]:
def expand_dataset(df: pd.DataFrame, multiplier: int) -> pd.DataFrame:
    """Return ``df`` stacked on top of itself ``multiplier`` times.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to replicate. It is not modified.
    multiplier : int
        Number of repetitions; must be at least 1.

    Returns
    -------
    pd.DataFrame
        A new frame with ``len(df) * multiplier`` rows and a fresh
        0..n-1 integer index (``ignore_index=True``).

    Raises
    ------
    ValueError
        If ``multiplier`` is smaller than 1.
    """
    if multiplier < 1:
        # pd.concat would also raise ValueError on an empty list, but with the
        # opaque message "No objects to concatenate"; fail with a clear one.
        raise ValueError(f"multiplier must be a positive integer, got {multiplier!r}")

    # pd.concat returns a new DataFrame, so copying each input beforehand only
    # adds `multiplier` extra full-size copies to peak memory. Repeating the
    # same reference is enough.
    return pd.concat([df] * multiplier, ignore_index=True)

# --- Benchmark configuration -------------------------------------------------
# Directory that receives the generated data_<scale>.csv / .parquet files.
OUTPUT_DIR = "C:/Users/Gurkanwal Singh/Documents/DOUGLAS SEM6/Adv Topics in Data Analytics"

# Row-count multipliers to benchmark; the labels are derived from them so the
# two lists cannot drift apart.
scaling_factors = [1, 10, 100]
labels = [f"{factor}x" for factor in scaling_factors]
benchmark_data = []


def _timed(action):
    """Call ``action()`` once and return the elapsed time in seconds.

    ``time.perf_counter()`` is used instead of ``time.time()``: it is a
    monotonic, high-resolution clock intended for measuring intervals, so the
    result is not affected by system clock adjustments. The return value of
    ``action`` is discarded.
    """
    started = time.perf_counter()
    action()
    return time.perf_counter() - started


# --- Run the benchmark -------------------------------------------------------
# NOTE(review): every operation is timed exactly once per scale, so the figures
# include run-to-run noise; repeat and average if tighter numbers are needed.
for factor, tag in zip(scaling_factors, labels):
    # The 1x case reuses the loaded frame directly instead of duplicating it.
    dataset_expanded = data if factor == 1 else expand_dataset(data, factor)

    # Output files for this scale.
    # NOTE(review): `csv_file` rebinds the name that the first cell uses for the
    # input CSV path; after this loop it points at the last generated file.
    csv_file = f"{OUTPUT_DIR}/data_{tag}.csv"
    parquet_file = f"{OUTPUT_DIR}/data_{tag}.parquet"

    # CSV: write, then read back what was just written.
    csv_write_duration = _timed(
        lambda: dataset_expanded.to_csv(csv_file, index=False)
    )
    csv_read_duration = _timed(lambda: pd.read_csv(csv_file))

    # Parquet (pyarrow engine, snappy compression): write, then read back.
    parquet_write_duration = _timed(
        lambda: dataset_expanded.to_parquet(
            parquet_file, engine="pyarrow", compression="snappy", index=False
        )
    )
    parquet_read_duration = _timed(
        lambda: pd.read_parquet(parquet_file, engine="pyarrow")
    )

    # On-disk sizes, converted from bytes using 1024 * 1024 bytes per MB.
    csv_file_size = os.path.getsize(csv_file) / (1024 * 1024)
    parquet_file_size = os.path.getsize(parquet_file) / (1024 * 1024)

    # One result row per scale; all values rounded to two decimals.
    benchmark_data.append({
        "Scale": tag,
        "CSV Write Time (s)": round(csv_write_duration, 2),
        "CSV Read Time (s)": round(csv_read_duration, 2),
        "Parquet Write Time (s)": round(parquet_write_duration, 2),
        "Parquet Read Time (s)": round(parquet_read_duration, 2),
        "CSV File Size (MB)": round(csv_file_size, 2),
        "Parquet File Size (MB)": round(parquet_file_size, 2)
    })

# Collect the per-scale rows into a DataFrame and print it as a text table.
benchmark_results_df = pd.DataFrame(benchmark_data)
print(benchmark_results_df)

  Scale  CSV Write Time (s)  CSV Read Time (s)  Parquet Write Time (s)  \
0    1x                1.45               0.26                    0.25   
1   10x               19.75               3.62                    2.75   
2  100x              227.02              41.40                   31.29   

   Parquet Read Time (s)  CSV File Size (MB)  Parquet File Size (MB)  
0                   0.09               28.80                   10.15  
1                   0.82              288.01                   95.35  
2                  10.31             2880.05                  951.71  
