In [4]:
from pathlib import Path
from dotenv import load_dotenv
import os
import pandas as pd

load_dotenv()
API_KEY = os.getenv("API_KEY")
data_raw = Path(os.getenv("DATA_RAW"))
data_processed = Path(os.getenv("DATA_PROCESSED"))
csv_path = data_raw / "api_pull.csv"

if API_KEY is not None:
    print("API KEY Loaded") 
if csv_path.exists():
    print("Data Loaded")

API KEY Loaded
Data Loaded


### Saving csv file to `data/raw` and parquet file to `data/processed`

In [17]:
parquet_path = data_processed / "api_pull.parquet"

df = pd.read_csv(csv_path)

if "Unnamed: 0" in df.columns:
    df = df.drop(columns=["Unnamed: 0"])
    
df.to_csv(csv_path, index=False)
df.to_parquet(parquet_path, index=False)
print("Saved both files")

Saved both files


In [24]:
df_csv = pd.read_csv(csv_path)
df_parquet = pd.read_parquet(parquet_path)

### Validation function to check data from both files

In [23]:
import pandas as pd

def validate_dataframes(df_csv, df_parquet, critical_cols: dict):
    """
    Validate shape, datatype and if both files contain the same data
    """
    results = []
    results.append({"check": "shape_match","csv_value": df_csv.shape,"parquet_value": df_parquet.shape,"pass": df_csv.shape == df_parquet.shape})

    for col, expected_dtype in critical_cols.items():
        csv_dtype = str(df_csv.dtypes.get(col, "missing"))
        pq_dtype = str(df_parquet.dtypes.get(col, "missing"))
        pass_check = (csv_dtype == pq_dtype == expected_dtype)
        results.append({"check": f"dtype[{col}]","csv_value": csv_dtype,"parquet_value": pq_dtype,"expected": expected_dtype, "pass": pass_check})

    return pd.DataFrame(results)

critical_columns = {
    "contractID": "object",
    "symbol": "object",
    "expiration": "object",
    "strike": "float64",
    "volume": "int64",
    "implied_volatility": "float64",
}

validation_df = validate_dataframes(df_csv, df_parquet, critical_columns)
display(validation_df) 

Unnamed: 0,check,csv_value,parquet_value,pass,expected
0,shape_match,"(1710, 20)","(1710, 20)",True,
1,dtype[contractID],object,object,True,object
2,dtype[symbol],object,object,True,object
3,dtype[expiration],object,object,True,object
4,dtype[strike],float64,float64,True,float64
5,dtype[volume],int64,int64,True,int64
6,dtype[implied_volatility],float64,float64,True,float64


### Utility Functions

#### `write_d)ne`
Saves a DataFrame to disk.  
- Routes automatically by file suffix (`.csv` → `to_csv`, `.parquet` → `to_parquet`).  
- Ensures the parent directory exists before writing.  
- Raises a clear error if the Parquet engine is missing.  

#### `tataFrame`
Loads a DataFrame from disk.  
- Routes automatically by file suffix (`.csv` → `read_csv`, `.parquet` → `read_parquet`).  
- Raises a clear error for unsupported file types. 

In [28]:
import pandas as pd
from pathlib import Path

def write_df(df, path):
    """
    Save DataFrame to CSV or Parquet depending on file suffix.
    Ensures directories exist.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    if path.suffix == ".csv":
        df.to_csv(path, index=False)
        print(f"Saved CSV")
    elif path.suffix == ".parquet":
        try:
            df.to_parquet(path, index=False)
            print(f"Saved Parquet")
        except ImportError:
            print("Missing Parquet engine.")
    else:
        raise ValueError(f"Unsupported file type: {path.suffix}")


def read_df(path):
    """
    Load DataFrame from CSV or Parquet depending on file suffix.
    """
    path = Path(path)

    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    if path.suffix == ".csv":
        return pd.read_csv(path)
    elif path.suffix == ".parquet":
        try:
            return pd.read_parquet(path)
        except ImportError:
            raise ImportError("Missing Parquet engine.")
    else:
        raise ValueError(f"Unsupported file type: {path.suffix}")

csv_path = data_raw / "api_pull.csv"
parquet_path = data_processed / "api_pull.parquet"

# Write
write_df(df, csv_path)
write_df(df, parquet_path)

# Read
df_csv = read_df(csv_path)
df_parquet = read_df(parquet_path)

Saved CSV
Saved Parquet
