In [2]:
import zipfile
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import shutil

def load_edc_first_5min(zip_path):
    """Load first 5 minutes of EDC data (18:26-18:31) using offset +3"""
    
    temp_dir = tempfile.mkdtemp()
    
    try:
        print(f"Extracting {zip_path}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)
        
        temp_path = Path(temp_dir)
        all_files = sorted(temp_path.rglob("*.dat"))
        real_files = [f for f in all_files if not f.name.startswith('._')]
        
        # Filter for EDC period (18:26-18:59)
        edc_files = []
        for f in real_files:
            parts = f.stem.split('.')
            if len(parts) >= 4:
                try:
                    hour, minute = int(parts[2]), int(parts[3])
                    if hour == 18 and 26 <= minute <= 59:
                        edc_files.append(f)
                except ValueError:
                    continue
        
        print(f"Found {len(edc_files)} EDC files")
        
        # Target: 5 minutes
        target_samples = 5 * 60 * 50000
        print(f"Target: {target_samples:,} samples (~5 minutes)")
        
        data_chunks = []
        total_samples = 0
        
        for i, filepath in enumerate(edc_files):
            with open(filepath, 'rb') as f:
                raw = f.read()
            
            marker_pos = raw.find(b"Isd\tVsd")
            offset = marker_pos + len(b"Isd\tVsd") + 3  # KEY: +3 offset!
            trim_len = len(raw) - offset
            trimmed = raw[offset:offset + (trim_len - (trim_len % 8))]
            chunk = np.frombuffer(trimmed, dtype='<f4').reshape(-1, 2)
            
            if total_samples + len(chunk) > target_samples:
                remaining = target_samples - total_samples
                data_chunks.append(chunk[:remaining])
                total_samples += remaining
                print(f"  Loaded {i+1} files, stopped at {total_samples:,} samples")
                break
            else:
                data_chunks.append(chunk)
                total_samples += len(chunk)
            
            if (i + 1) % 50 == 0:
                print(f"  {i+1} files, {total_samples:,} samples")
        
        combined = np.vstack(data_chunks)
        
        edc_df = pd.DataFrame({
            'time_s': np.arange(len(combined)) * 0.00002,
            'Isd': combined[:, 0],
            'Vsd': combined[:, 1]
        })
        
        print(f"\nEDC Dataset:")
        print(f"  Samples: {len(edc_df):,}")
        print(f"  Duration: {edc_df['time_s'].iloc[-1]/60:.2f} minutes")
        print(f"  Memory: {edc_df.memory_usage(deep=True).sum()/1024**2:.0f} MB")
        print(f"\n  Isd: mean={edc_df['Isd'].mean():.6f}, std={edc_df['Isd'].std():.6f}")
        print(f"  Range: [{edc_df['Isd'].min():.6f}, {edc_df['Isd'].max():.6f}]")
        
        return edc_df
    
    finally:
        shutil.rmtree(temp_dir)

# Load EDC data
edc_df = load_edc_first_5min("09-20-2006/fluctuation-rawdata.zip")

# Save for future use
edc_df.to_csv('edc_data.csv', index=False)
print("\nSaved to 'edc_data.csv'")

Extracting 09-20-2006/fluctuation-rawdata.zip...
Found 2030 EDC files
Target: 15,000,000 samples (~5 minutes)
  50 files, 2,500,000 samples
  100 files, 5,000,000 samples
  150 files, 7,500,000 samples
  200 files, 10,000,000 samples
  250 files, 12,500,000 samples
  300 files, 15,000,000 samples
  Loaded 301 files, stopped at 15,000,000 samples

EDC Dataset:
  Samples: 15,000,000
  Duration: 5.00 minutes
  Memory: 229 MB

  Isd: mean=0.070311, std=0.027307
  Range: [0.031258, 0.124999]

Saved to 'edc_data.csv'


In [10]:
import zipfile
import numpy as np
import pandas as pd
from pathlib import Path
import tempfile
import shutil

def load_all_edc_data(zip_path):
    """Load ALL EDC data (18:26-18:59) using offset +3"""
    
    temp_dir = tempfile.mkdtemp()
    
    try:
        print(f"Extracting {zip_path}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(temp_dir)
        
        temp_path = Path(temp_dir)
        all_files = sorted(temp_path.rglob("*.dat"))
        real_files = [f for f in all_files if not f.name.startswith('._')]
        
        # Filter for EDC period (18:26-18:59)
        edc_files = []
        for f in real_files:
            parts = f.stem.split('.')
            if len(parts) >= 4:
                try:
                    hour, minute = int(parts[2]), int(parts[3])
                    if hour == 18 and 26 <= minute <= 59:
                        edc_files.append(f)
                except ValueError:
                    continue
        
        print(f"Found {len(edc_files)} EDC files")
        print(f"Estimated: ~{len(edc_files) * 50000:,} samples")
        
        # Load all files
        data_chunks = []
        total_samples = 0
        
        for i, filepath in enumerate(edc_files):
            with open(filepath, 'rb') as f:
                raw = f.read()
            
            marker_pos = raw.find(b"Isd\tVsd")
            offset = marker_pos + len(b"Isd\tVsd") + 3  # +3 offset
            trim_len = len(raw) - offset
            trimmed = raw[offset:offset + (trim_len - (trim_len % 8))]
            chunk = np.frombuffer(trimmed, dtype='<f4').reshape(-1, 2)
            
            data_chunks.append(chunk)
            total_samples += len(chunk)
            
            if (i + 1) % 200 == 0:
                print(f"  {i+1}/{len(edc_files)} files, {total_samples:,} samples so far")
        
        print(f"\nConcatenating {len(data_chunks)} chunks...")
        combined = np.vstack(data_chunks)
        
        print("Creating DataFrame...")
        edc_full_df = pd.DataFrame({
            'time_s': np.arange(len(combined)) * 0.00002,
            'Isd': combined[:, 0],
            'Vsd': combined[:, 1]
        })
        
        print(f"\nComplete EDC Dataset:")
        print(f"  Samples: {len(edc_full_df):,}")
        print(f"  Duration: {edc_full_df['time_s'].iloc[-1]/60:.2f} minutes")
        print(f"  Memory: {edc_full_df.memory_usage(deep=True).sum()/1024**2:.0f} MB")
        print(f"\n  Isd: mean={edc_full_df['Isd'].mean():.6f}, std={edc_full_df['Isd'].std():.6f}")
        print(f"  Range: [{edc_full_df['Isd'].min():.6f}, {edc_full_df['Isd'].max():.6f}]")
        
        return edc_full_df
    
    finally:
        shutil.rmtree(temp_dir)

# Load complete EDC dataset
print("Loading FULL EDC dataset (this may take a few minutes)...\n")
edc_full_df = load_all_edc_data("09-20-2006/fluctuation-rawdata.zip")

# Save as CSV
print("\nSaving to CSV...")
edc_full_df.to_csv('edc_full_data.csv', index=False)
print("Saved as 'edc_full_data.csv'")

print(f"\nFile size: ~{len(edc_full_df) * 30 / 1024**2:.0f} MB (estimated)")

Loading FULL EDC dataset (this may take a few minutes)...

Extracting 09-20-2006/fluctuation-rawdata.zip...
Found 2030 EDC files
Estimated: ~101,500,000 samples
  200/2030 files, 10,000,000 samples so far
  400/2030 files, 20,000,000 samples so far
  600/2030 files, 30,000,000 samples so far
  800/2030 files, 40,000,000 samples so far
  1000/2030 files, 50,000,000 samples so far
  1200/2030 files, 60,000,000 samples so far
  1400/2030 files, 70,000,000 samples so far
  1600/2030 files, 80,000,000 samples so far
  1800/2030 files, 90,000,000 samples so far
  2000/2030 files, 100,000,000 samples so far

Concatenating 2030 chunks...
Creating DataFrame...

Complete EDC Dataset:
  Samples: 101,535,470
  Duration: 33.85 minutes
  Memory: 1549 MB

  Isd: mean=0.070324, std=0.048659
  Range: [0.031250, 0.124999]

Saving to CSV...
Saved as 'edc_full_data.csv'

File size: ~2905 MB (estimated)
