## Final Project for Data 605 - Big data systems
- Name: Mohammed Ateeq Ur Rehman
- UID: 120872334

In [2]:
# Run this if libraires are not already installed
%pip install petastorm pyarrow pandas matplotlib torch torchvision tensorflow

[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import time
import pandas as pd
import pyarrow.parquet as pq
import os
from pyarrow import fs
import petastorm_bitcoin_processing_utils as utils

##  Part 1: Data Ingestion

### Task: Get the data using coingecko API

In [4]:
output_dir = 'test_bitcoin_data'
os.makedirs(output_dir, exist_ok=True)

base_url = "https://api.coingecko.com/api/v3"
historical_data = []

In [5]:
# Parameters
interval_sec = 30  # every 30 seconds
duration_min = 2   # run for 2 minutes
save_interval = 2  # save every 2 fetches

end_time = time.time() + duration_min * 60
fetch_count = 0
historical_data = []

output_file = os.path.join(output_dir, "bitcoin_price_data.csv")

while time.time() < end_time:
    try:
        price_data = utils.fetch_current_price(base_url)
        historical_data.append(price_data)
        fetch_count += 1
        print(f"Collected data at {price_data['timestamp']}")

        # Save to file every `save_interval` fetches
        if fetch_count % save_interval == 0:
            new_df = pd.DataFrame(historical_data)
            historical_data = []  # clear buffer after saving

            # Append to existing file or create new one
            if os.path.exists(output_file):
                new_df.to_csv(output_file, mode='a', header=False, index=False)
            else:
                new_df.to_csv(output_file, mode='w', header=True, index=False)

    except Exception as e:
        print(f"Error fetching data: {e}")
        break  # Use break instead of exit in notebooks/scripts

    time.sleep(interval_sec)
print("Data collection complete.")

Collected data at 2025-05-18T19:07:40.643565
Collected data at 2025-05-18T19:08:09.099452
Collected data at 2025-05-18T19:08:37.488312
Collected data at 2025-05-18T19:09:05.949489
Collected data at 2025-05-18T19:09:34.336577
Data collection complete.


## Part2: Batch Data Storage (Parquet Format)

### Task: Convert the CSV to parquet using PETASTROM

In [6]:
# 1. Load all price CSVs
csv_folder = r"test_bitcoin_data"
df = utils.load_all_csvs_from_folder(csv_folder)
print(f"Loaded {len(df)} rows from CSV files.")

# 2. Save to Parquet
parquet_path = "file:///test_bitcoin_data/parquet"
utils.save_to_parquet_arrow(df, parquet_path)

# 3. Load from Parquet and preview
batches = list(utils.load_from_parquet(input_dir='test_bitcoin_data\\parquet'))

Combined 3 CSV files.
Loaded 386 rows from CSV files.
Loading data from file:///data/test_bitcoin_data\parquet...


OSError: Passed non-file path: /data/test_bitcoin_data\parquet

Columns of the parquet file

In [None]:
parquet_path = "test_bitcoin_data/parquet/data.parquet"
table = pq.read_table(parquet_path)
schema_columns = table.column_names
print("Columns:", table.column_names)

Sample batch data from parquet file

In [None]:
for batch in batches[:1]:
    df = pd.DataFrame(batch)
    print("Batch shape:", df.shape)
    print(df.head())