# Parquet2Csv 

Wolfrank Guzman 



In [1]:
#version 2

# convert_1min_csv_to_parquet_in_place.py
# Saves one merged .parquet file per currency pair folder

import pandas as pd
from pathlib import Path
from collections import defaultdict

# ----------------------------------------------------------------------
# Configuration: directory holding one sub-folder per currency pair
# ----------------------------------------------------------------------
ROOT_FOLDER = r"C:\Users\Wolfrank\Desktop\AlgoHaus\OandaHistoricalData\1MinCharts"

# ----------------------------------------------------------------------
# Discovery
# ----------------------------------------------------------------------
root = Path(ROOT_FOLDER)

# Keep only the directories whose name is exactly 7 characters long and
# contains an underscore -- the shape of names such as "EUR_USD".
pair_folders = []
for entry in root.iterdir():
    if not entry.is_dir():
        continue
    if "_" in entry.name and len(entry.name) == 7:
        pair_folders.append(entry)

print(f"Found {len(pair_folders)} currency pair folders.\n")
print("Processing...\n")

# For every currency-pair folder: load all CSVs beneath it, merge them,
# de-duplicate and sort on the "time" column, and write a single
# "<PAIR>.parquet" file next to the source data.
for folder in pair_folders:
    pair_name = folder.name  # e.g., "EUR_USD"
    print(f"→ {pair_name}")

    # Find all CSV files recursively inside this folder. Sorted so the load
    # order -- and therefore which row `keep="first"` retains below -- is
    # the same on every run and every filesystem.
    csv_files = sorted(folder.rglob("*.csv"))

    if not csv_files:
        print(f"   No CSV files found in {pair_name}/\n")
        continue

    dataframes = []
    for csv_path in csv_files:
        # Best effort: a file that cannot be read is reported and skipped so
        # one bad CSV does not abort the whole pair.
        try:
            # NOTE: `parse_dates=True` was dropped on purpose. Without an
            # `index_col` it only attempts to parse the index, so it never
            # touched the time column; the conversion happens after the merge.
            df = pd.read_csv(csv_path, low_memory=False)
            # Standardize column names
            df.columns = df.columns.str.strip().str.lower()
            df.rename(columns={
                "datetime": "time",
                "date": "time",
                "timestamp": "time"
            }, inplace=True)
            dataframes.append(df)
            print(f"   Loaded: {csv_path.name} ({len(df):,} rows)")
        except Exception as e:
            print(f"   Failed to read {csv_path.name}: {e}")

    if not dataframes:
        print(f"   No valid data loaded for {pair_name}\n")
        continue

    # Merge all CSVs
    combined = pd.concat(dataframes, ignore_index=True)

    if "time" in combined.columns:
        # Convert textual timestamps to real datetimes so that sorting is
        # chronological, Parquet stores a timestamp column, and the period
        # summary below can be formatted. Numeric columns are left alone:
        # their unit (s / ms / ns) cannot be determined from here.
        if not pd.api.types.is_numeric_dtype(combined["time"]):
            try:
                combined["time"] = pd.to_datetime(combined["time"])
            except (ValueError, TypeError) as e:
                print(f"   Warning: could not parse 'time' as datetimes ({e}); keeping raw values")

        # Deduplicate and sort by time
        combined = combined.drop_duplicates(subset="time", keep="first")
        combined = combined.sort_values("time").reset_index(drop=True)

    # Save Parquet in the same folder
    parquet_path = folder / f"{pair_name}.parquet"
    combined.to_parquet(parquet_path, compression="snappy", index=False)

    rows = len(combined)
    period = "?"
    if "time" in combined.columns and rows > 0:
        try:
            start = combined["time"].iloc[0].strftime("%Y-%m-%d")
            end = combined["time"].iloc[-1].strftime("%Y-%m-%d")
            period = f"{start} → {end}"
        except (AttributeError, ValueError):
            # AttributeError: values are not datetimes (conversion skipped or
            # failed). ValueError: the boundary value is NaT.
            period = "?"

    print(f"   Saved: {parquet_path.name} → {rows:,} rows ({period})\n")

print("All done! Each currency pair folder now has its merged .parquet file.")

Found 20 currency pair folders.

Processing...

→ AUD_CHF
   Loaded: AUD_CHF_M1_2014_2025_Data.csv (3,895,000 rows)
   Saved: AUD_CHF.parquet → 3,895,000 rows (?)

→ AUD_JPY
   Loaded: AUD_JPY_M1_2014_2025_Data.csv (3,990,000 rows)
   Saved: AUD_JPY.parquet → 3,990,000 rows (?)

→ AUD_NZD
   Loaded: AUD_NZD_M1_2014_2025_Data.csv (3,995,000 rows)
   Saved: AUD_NZD.parquet → 3,995,000 rows (?)

→ AUD_USD
   Loaded: AUD_USD_M1_2014_2025_Data.csv (3,725,000 rows)
   Saved: AUD_USD.parquet → 3,725,000 rows (?)

→ CAD_CHF
   Loaded: CAD_CHF_M1_2014_2025_Data.csv (3,875,000 rows)
   Saved: CAD_CHF.parquet → 3,875,000 rows (?)

→ CHF_JPY
   Loaded: CHF_JPY_M1_2014_2025_Data.csv (4,005,000 rows)
   Saved: CHF_JPY.parquet → 4,005,000 rows (?)

→ EUR_AUD
   Loaded: EUR_AUD_M1_2014_2025_Data.csv (3,935,000 rows)
   Saved: EUR_AUD.parquet → 3,935,000 rows (?)

→ EUR_CHF
   Loaded: EUR_CHF_M1_2014_2025_Data.csv (3,765,000 rows)
   Saved: EUR_CHF.parquet → 3,765,000 rows (?)

→ EUR_GBP
   Loaded: EUR