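# DateRangeFinder
Scans Parquet data files for the date range they cover: first by looking for dates in the filenames, then by reading the timestamp column inside each file.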

In [5]:
import os
import re
from pathlib import Path
from datetime import datetime

# Folder handed to diagnostic_scan() at the bottom of this cell.
# NOTE(review): machine-specific absolute Windows path; edit it before running elsewhere.
folder_path = Path(r'D:\compressedworld\AlgoHaus\OandaHistoricalData\1MinCharts')

def _first_valid_date(name, date_pattern):
    """Return the first real calendar date that `date_pattern` finds in `name`, or None.

    Every candidate position is tried, not just the first regex hit, so a
    digit run that merely looks like a date (month 13, day 45, ...) does not
    hide a genuine date later in the same name.
    """
    pos = 0
    while True:
        match = date_pattern.search(name, pos)
        if match is None:
            return None
        try:
            year, month, day = (int(part) for part in match.groups())
            return datetime(year, month, day)
        except ValueError:
            # Not a calendar date: resume one character after this candidate's
            # start so overlapping candidates are considered as well.
            pos = match.start() + 1


def diagnostic_scan(target_dir):
    """Print the earliest and latest date embedded in .parquet filenames.

    Searches `target_dir` and all of its subfolders for ``*.parquet`` entries,
    extracts one date per filename and prints a short report. Only filenames
    are inspected; file contents are never read.

    Args:
        target_dir: ``pathlib.Path`` of the folder to scan.

    Returns:
        None. All results are reported through ``print``.
    """
    if not target_dir.exists():
        print(f"Path does not exist: {target_dir}")
        return

    # A year 20xx followed by a two-digit month and day, each optionally
    # separated by '-', '/' or '_' (so both YYYYMMDD and YYYY-MM-DD match).
    date_pattern = re.compile(r'(20\d{2})[-/_]?(\d{2})[-/_]?(\d{2})')

    parquet_files = list(target_dir.rglob("*.parquet"))

    if not parquet_files:
        print("No .parquet files found in this folder or its subfolders.")
        return

    print(f"Scanning {len(parquet_files)} parquet files...")

    dates = []
    example_name = None  # first filename that actually yielded a date
    for file in parquet_files:
        found = _first_valid_date(file.name, date_pattern)
        if found is None:
            continue
        dates.append(found)
        if example_name is None:
            example_name = file.name

    if dates:
        print("-" * 30)
        print(f"Earliest Date Found: {min(dates).date()}")
        print(f"Latest Date Found:   {max(dates).date()}")
        print("-" * 30)
        # Show a file that really carries a date, not just the first file listed
        print(f"Example file format found: {example_name}")
    else:
        print("Files found, but no valid dates could be extracted from the filenames.")
        print(f"Sample filename: {parquet_files[0].name}")

# Run the filename-based scan on folder_path; the report is printed and nothing is returned.
diagnostic_scan(folder_path)

Scanning 20 parquet files...
Files found, but no valid dates could be extracted from the filenames.
Sample filename: AUD_CHF.parquet


In [6]:
import pandas as pd
from pathlib import Path

# Root directory handed to get_parquet_internal_ranges() below.
# NOTE(review): hardcoded absolute Windows path, the same value as folder_path in the previous cell.
root_path = Path(r'D:\compressedworld\AlgoHaus\OandaHistoricalData\1MinCharts')

def _read_time_column(file, time_cols):
    """Return ``(series, None)`` for the first name in `time_cols` that `file` yields.

    When no candidate can be read, returns ``(None, last_error)`` where
    `last_error` is the exception raised by the final attempt (or None when
    `time_cols` is empty).
    """
    last_error = None
    for col in time_cols:
        try:
            # Request a single column so the rest of the file is not loaded.
            # Indexing inside the try also covers a reader that returns a
            # frame without the requested column instead of raising.
            return pd.read_parquet(file, columns=[col])[col], None
        except Exception as exc:  # not a bare except: Ctrl-C must still interrupt
            last_error = exc
    return None, last_error


def get_parquet_internal_ranges(
    base_dir,
    time_cols=('time', 'datetime', 'Timestamp', 'date', 'Date/Time'),
):
    """Read the timestamp column of every parquet file and report date ranges.

    Each immediate subfolder of `base_dir` is treated as one group (the
    notebook uses one folder per currency pair). If `base_dir` has no
    subfolders, the parquet files directly inside it form a single group.
    Folders and files are visited in sorted order so output is reproducible.

    Args:
        base_dir: ``pathlib.Path`` of the root directory to scan.
        time_cols: Candidate timestamp column names, tried in order for each
            file; the first one that can be read is used.

    Returns:
        ``None`` if `base_dir` does not exist. Otherwise a dict mapping folder
        name to ``(earliest, latest)`` timestamps. Folders with no parquet
        files, or with no usable timestamps, are omitted.

    Files that cannot be read, expose none of `time_cols`, or contain no
    non-null timestamps are reported with a ``[!]`` line and skipped.
    """
    if not base_dir.exists():
        print(f"Error: Path {base_dir} not found.")
        return

    # Dictionary to store results {Folder: (MinDate, MaxDate)}
    results = {}

    # Find all subdirectories (Currency Pairs)
    subfolders = sorted(f for f in base_dir.iterdir() if f.is_dir())

    if not subfolders:
        print("No subfolders found. Scanning root for parquet files...")
        subfolders = [base_dir]

    for folder in subfolders:
        parquet_files = sorted(folder.glob("*.parquet"))
        if not parquet_files:
            continue

        all_dates = []
        print(f"Scanning {folder.name} ({len(parquet_files)} files)...")

        for file in parquet_files:
            try:
                raw_times, read_error = _read_time_column(file, time_cols)
                if raw_times is None:
                    print(
                        f"  [!] No readable timestamp column in {file.name} "
                        f"(tried {list(time_cols)}): {read_error}"
                    )
                    continue

                # Convert to datetime if it's currently a string or integer
                times = pd.to_datetime(raw_times)
                first, last = times.min(), times.max()

                # An empty or all-null column yields NaT, which would make the
                # min()/max() below depend on file order.
                if pd.isna(first) or pd.isna(last):
                    print(f"  [!] No usable timestamps in {file.name}")
                    continue

                all_dates.extend((first, last))
            except Exception as e:
                print(f"  [!] Could not read {file.name}: {e}")

        if all_dates:
            results[folder.name] = (min(all_dates), max(all_dates))

    return results

# Run the scan. The result is None when root_path does not exist, otherwise a
# dict of {folder name: (earliest, latest)} that may be empty.
summary = get_parquet_internal_ranges(root_path)

# Display Results
print("\n" + "="*50)
print(f"{'CURRENCY PAIR':<15} | {'START DATE':<12} | {'END DATE':<12}")
print("-" * 50)
if summary:
    for pair, (start, end) in summary.items():
        # Pad the dates to the header's column width so the table lines up.
        # str() is required: a format spec applied directly to a date object
        # is interpreted as a strftime pattern, not as a width.
        print(f"{pair:<15} | {str(start.date()):<12} | {str(end.date()):<12}")
else:
    print("No data found.")

Scanning AUD_CHF (1 files)...
Scanning AUD_JPY (1 files)...
Scanning AUD_NZD (1 files)...
Scanning AUD_USD (1 files)...
Scanning CAD_CHF (1 files)...
Scanning CHF_JPY (1 files)...
Scanning EUR_AUD (1 files)...
Scanning EUR_CHF (1 files)...
Scanning EUR_GBP (1 files)...
Scanning EUR_JPY (1 files)...
Scanning EUR_USD (1 files)...
Scanning GBP_CHF (1 files)...
Scanning GBP_USD (1 files)...
Scanning NZD_CHF (1 files)...
Scanning NZD_JPY (1 files)...
Scanning NZD_USD (1 files)...
Scanning USD_CAD (1 files)...
Scanning USD_CHF (1 files)...
Scanning USD_JPY (1 files)...
Scanning USD_THB (1 files)...

CURRENCY PAIR   | START DATE   | END DATE    
--------------------------------------------------
AUD_CHF         | 2014-12-09 | 2025-10-06
AUD_JPY         | 2014-12-10 | 2025-10-06
AUD_NZD         | 2014-12-10 | 2025-10-06
AUD_USD         | 2014-12-09 | 2025-10-06
CAD_CHF         | 2014-12-10 | 2025-10-06
CHF_JPY         | 2014-12-08 | 2025-10-06
EUR_AUD         | 2014-12-09 | 2025-10-06
EUR_CHF 