In [2]:
import zipfile
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import shutil

def _decode_isd_vsd(raw, source, marker=b"Isd\tVsd", header_pad=3):
    """Decode the (Isd, Vsd) sample pairs stored after the header marker.

    Parameters
    ----------
    raw : bytes
        Full contents of one ``.dat`` file.
    source : path-like or str
        Only used to name the offending file in error messages.
    marker : bytes
        Byte sequence that ends the header.
    header_pad : int
        Number of bytes skipped between the end of the marker and the first
        sample. NOTE(review): the meaning of these 3 bytes is not visible
        here (presumably line terminator/padding) -- confirm against the
        acquisition software.

    Returns
    -------
    numpy.ndarray
        Read-only ``(n, 2)`` little-endian float32 array; column 0 is Isd,
        column 1 is Vsd.

    Raises
    ------
    ValueError
        If ``marker`` does not occur in ``raw``.
    """
    marker_pos = raw.find(marker)
    if marker_pos < 0:
        # bytes.find() signals "not found" with -1; continuing would decode
        # header bytes as samples and silently corrupt the dataset.
        raise ValueError(f"Header marker {marker!r} not found in {source}")

    offset = marker_pos + len(marker) + header_pad  # KEY: +3 offset!
    payload = raw[offset:]
    # One row is two float32 values (8 bytes); drop any incomplete trailing row.
    usable = len(payload) - (len(payload) % 8)
    return np.frombuffer(payload[:usable], dtype='<f4').reshape(-1, 2)


def load_all_background_files(zip_path, hour=18, minute_start=5, minute_end=21,
                              sample_period_s=0.00002):
    """Load all background files (18:05-18:21 by default) using offset +3.

    The archive is extracted to a temporary directory (removed again on
    exit, even on error). Every ``*.dat`` file whose stem, split on ``.``,
    carries the hour in field 2 and the minute in field 3 inside the
    requested window is decoded and the chunks are concatenated in sorted
    path order.

    Parameters
    ----------
    zip_path : str or path-like
        Location of the zip archive holding the raw ``.dat`` files.
    hour : int
        Hour a file must carry to be selected.
    minute_start, minute_end : int
        Inclusive minute window within ``hour``.
    sample_period_s : float
        Spacing used to synthesise the ``time_s`` column (sample index
        times this value).

    Returns
    -------
    pandas.DataFrame
        Columns ``time_s``, ``Isd`` and ``Vsd``, one row per sample.

    Raises
    ------
    ValueError
        If no file falls inside the time window, or a selected file lacks
        the ``Isd\\tVsd`` header marker.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        print(f"Extracting {zip_path}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)

        # Names starting with '._' are skipped (macOS resource-fork
        # side-cars, not measurement data).
        real_files = sorted(
            f for f in Path(temp_dir).rglob("*.dat")
            if not f.name.startswith('._')
        )

        # Filter for the background period
        background_files = []
        for f in real_files:
            parts = f.stem.split('.')
            if len(parts) < 4:
                continue
            try:
                file_hour, file_minute = int(parts[2]), int(parts[3])
            except ValueError:
                # Name does not follow the timestamp convention; ignore it.
                continue
            if file_hour == hour and minute_start <= file_minute <= minute_end:
                background_files.append(f)

        print(f"Found {len(background_files)} background files")
        if not background_files:
            raise ValueError(
                f"No background files for {hour:02d}:{minute_start:02d}-"
                f"{hour:02d}:{minute_end:02d} found in {zip_path}"
            )

        # Load all files
        data_chunks = []
        for i, filepath in enumerate(background_files):
            data_chunks.append(_decode_isd_vsd(filepath.read_bytes(), filepath))

            if (i + 1) % 100 == 0:
                print(f"  Loaded {i + 1}/{len(background_files)}")

        print(f"\nConcatenating {len(data_chunks)} files...")
        # vstack copies, so the result no longer references the file buffers.
        combined = np.vstack(data_chunks)

        # Create DataFrame
        background_df = pd.DataFrame({
            'time_s': np.arange(len(combined)) * sample_period_s,
            'Isd': combined[:, 0],
            'Vsd': combined[:, 1]
        })

        # Guard against files that contained a header but no samples.
        duration_s = background_df['time_s'].iloc[-1] if len(background_df) else 0.0

        print("\nBackground Dataset Created:")
        print(f"  Samples: {len(background_df):,}")
        print(f"  Duration: {duration_s/60:.2f} minutes")
        print(f"  Memory: {background_df.memory_usage(deep=True).sum()/1024**2:.0f} MB")
        print(f"\n  Isd: mean={background_df['Isd'].mean():.6f}, std={background_df['Isd'].std():.6f}")
        print(f"  Vsd: mean={background_df['Vsd'].mean():.3e}, std={background_df['Vsd'].std():.3e}")

        return background_df

# Build the background dataset from the raw-data archive.
# NOTE: absolute, machine-specific path -- adjust before running elsewhere.
BACKGROUND_ZIP_PATH = "/Users/jacob/Desktop/ecc physics/09-20-2006/background-rawdata.zip"
background_df = load_all_background_files(BACKGROUND_ZIP_PATH)

Extracting /Users/jacob/Desktop/ecc physics/09-20-2006/background-rawdata.zip...
Found 925 background files
  Loaded 100/925
  Loaded 200/925
  Loaded 300/925
  Loaded 400/925
  Loaded 500/925
  Loaded 600/925
  Loaded 700/925
  Loaded 800/925
  Loaded 900/925

Concatenating 925 files...

Background Dataset Created:
  Samples: 47,273,698
  Duration: 15.76 minutes
  Memory: 721 MB

  Isd: mean=0.064609, std=0.039479
  Vsd: mean=2.748e-08, std=7.738e-09


In [3]:
# Persist the assembled dataset so later sessions can skip the extraction step.
# Parquet alternative (currently disabled):
#   background_df.to_parquet('background_data.parquet')
# CSV is used instead; it produces a larger file.
background_csv_path = 'background_data.csv'
background_df.to_csv(background_csv_path, index=False)