In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

# Configure matplotlib backend properly
plt.switch_backend('TkAgg')  # Try different backend if needed
base_dir = 'historic_data'

class ROISelector:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax = None
        
    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            linewidth=2, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        # Auto-close after 4 selections
        if len(self.rois) >= 4:
            plt.close(self.fig)

    def select_rois(self, background, title):
        """Interactive ROI selection with proper figure management."""
        self.rois = []
        self.fig, self.ax = plt.subplots(figsize=(10, 10))
        self.ax.imshow(background, cmap='gray')
        self.ax.set_title(f"{title}\nSelect 4 ROIs (click & drag)\nClose window when done")
        
        rs = RectangleSelector(self.ax, self.on_select,
                              useblit=True,
                              button=[1],
                              minspanx=5, minspany=5,
                              spancoords='pixels',
                              interactive=True)
        
        plt.show(block=True)  # Critical for proper interaction
        return self.rois

def process_hdf4(file_path):
    """Read HDF4 file and extract background image."""
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')  # Update to your dataset name
        data = dataset.get()
        background = data[1, :, :]  # Second element is background
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def process_year(year):
    """Process all files for a specific year."""
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        print(f"Skipping {year_dir}: Directory not found")
        return
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    if not hdf_files:
        print(f"No HDF files found in {year_dir}")
        return
    
    # Find first valid file for ROI selection
    selector = ROISelector()
    rois = []
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        background = process_hdf4(file_path)
        if background is not None:
            try:
                rois = selector.select_rois(background, 
                                           f"First valid image for {year}: {file_name}")
                if len(rois) != 4:
                    print(f"Warning: Selected {len(rois)} ROIs instead of 4. Using first 4.")
                    rois = rois[:4]
                break
            except Exception as e:
                print(f"ROI selection failed: {str(e)}")
                return
    else:
        print(f"No valid files found in {year_dir}")
        return
    
    print(f"Selected ROIs for {year}: {rois}")
    
    # Process all files with selected ROIs
    stats_list = []
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        background = process_hdf4(file_path)
        if background is None:
            continue
        
        fig, ax = plt.subplots(figsize=(10, 10))
        ax.imshow(background, cmap='gray')
        ax.set_title(f'Background: {file_name} ({year})')
        
        file_stats = []
        for roi_idx, (x, y, w, h) in enumerate(rois):
            # Draw rectangle
            rect = plt.Rectangle((x, y), w, h,
                                linewidth=2, edgecolor='red', facecolor='none')
            ax.add_patch(rect)
            
            # Calculate statistics
            roi_data = background[y:y+h, x:x+w]
            mean = np.mean(roi_data)
            std = np.std(roi_data)
            
            # Add annotation
            ax.text(x, y-10, f'ROI {roi_idx+1}: μ={mean:.2f}, σ={std:.2f}',
                    color='red', fontsize=8, backgroundcolor='white')
            
            file_stats.append({
                'Year': year,
                'File': file_name,
                'ROI': roi_idx+1,
                'Mean': mean,
                'Std': std
            })
        
        # Save plot
        plot_dir = os.path.join(year_dir, 'plots')
        os.makedirs(plot_dir, exist_ok=True)
        plot_path = os.path.join(plot_dir, f'{os.path.splitext(file_name)[0]}_plot.png')
        plt.savefig(plot_path)
        plt.close()
        
        stats_list.extend(file_stats)
    
    # Save statistics
    if stats_list:
        df = pd.DataFrame(stats_list)
        csv_path = os.path.join(year_dir, 'roi_statistics.csv')
        df.to_csv(csv_path, index=False)
        print(f"Saved statistics for {year} to {csv_path}")

# Main execution
for year in range(2015, 2026):
    process_year(year)

print("Processing completed.")

Selected ROIs for 2015: [(109, 214, 323, 386), (117, 210, 323, 386), (665, 228, 243, 385), (65, 754, 343, 192)]
Saved statistics for 2015 to historic_data\2015\roi_statistics.csv
Selected ROIs for 2016: [(159, 139, 291, 438), (740, 183, 184, 398), (528, 745, 357, 184), (173, 692, 182, 259)]
Saved statistics for 2016 to historic_data\2016\roi_statistics.csv
No HDF files found in historic_data\2017
Skipping historic_data\2018: Directory not found
Skipping historic_data\2019: Directory not found
Skipping historic_data\2020: Directory not found
Skipping historic_data\2021: Directory not found
Skipping historic_data\2022: Directory not found
Skipping historic_data\2023: Directory not found
Skipping historic_data\2024: Directory not found
Skipping historic_data\2025: Directory not found
Processing completed.


In [3]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC

# Configure matplotlib backend for interactivity
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

class ROISelector:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax = None
        
    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            linewidth=2, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        # Auto-close after 4 selections
        if len(self.rois) >= 4:
            plt.close(self.fig)

    def select_rois(self, background, title):
        """Interactive ROI selection with proper figure management."""
        self.rois = []
        self.fig, self.ax = plt.subplots(figsize=(10, 10))
        self.ax.imshow(background, cmap='gray')
        self.ax.set_title(f"{title}\nSelect 4 ROIs (click & drag)\nClose window when done")
        
        rs = RectangleSelector(self.ax, self.on_select,
                              useblit=True,
                              button=[1],
                              minspanx=5, minspany=5,
                              spancoords='pixels',
                              interactive=True)
        
        plt.show(block=True)
        return self.rois

def process_hdf4(file_path):
    """Read HDF4 file and extract background image."""
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')  # Update to your dataset name
        data = dataset.get()
        background = data[1, :, :]  # Second element is background
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def process_year(year):
    """Process all files for a specific year."""
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        print(f"Skipping {year_dir}: Directory not found")
        return
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    if not hdf_files:
        print(f"No HDF files found in {year_dir}")
        return
    
    # ROI Selection
    selector = ROISelector()
    rois = []
    
    # Find first valid file for ROI selection
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        background = process_hdf4(file_path)
        if background is not None:
            try:
                rois = selector.select_rois(background, f"First valid image for {year}: {file_name}")
                if len(rois) != 4:
                    print(f"Selected {len(rois)} ROIs. Using first 4.")
                    rois = rois[:4]
                break
            except Exception as e:
                print(f"ROI selection failed: {str(e)}")
                return
    else:
        print(f"No valid files found in {year_dir}")
        return
    
    print(f"\n{'='*40}")
    print(f"Processing year {year} with ROIs at:")
    for idx, (x, y, w, h) in enumerate(rois):
        print(f"ROI {idx+1}: X={x}-{x+w}, Y={y}-{y+h}")
    print(f"{'='*40}\n")
    
    # Process all files
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        background = process_hdf4(file_path)
        if background is None:
            continue
        
        print(f"\nYear {year} - File: {file_name}")
        for roi_idx, (x, y, w, h) in enumerate(rois):
            roi_data = background[y:y+h, x:x+w]
            mean = np.mean(roi_data)
            std = np.std(roi_data)
            
            print(f"  ROI {roi_idx+1}:")
            print(f"    Mean/Average: {mean:.4f}")
            print(f"    Standard Deviation: {std:.4f}")
            print(f"    Value Range: {np.min(roi_data):.2f} - {np.max(roi_data):.2f}")

# Main execution
for year in range(2015, 2026):
    process_year(year)

print("\nProcessing completed. All results printed above.")


Processing year 2015 with ROIs at:
ROI 1: X=61-478, Y=80-246
ROI 2: X=58-454, Y=706-1004
ROI 3: X=577-1004, Y=46-550
ROI 4: X=566-1004, Y=640-999


Year 2015 - File: EPWROSS_2w_527cw_150um.hdf
  ROI 1:
    Mean/Average: 656.6950
    Standard Deviation: 23.6948
    Value Range: 618.00 - 2650.00
  ROI 2:
    Mean/Average: 658.5005
    Standard Deviation: 21.0857
    Value Range: 617.00 - 2681.00
  ROI 3:
    Mean/Average: 660.7198
    Standard Deviation: 29.8071
    Value Range: 614.00 - 5803.00
  ROI 4:
    Mean/Average: 662.0486
    Standard Deviation: 25.6554
    Value Range: 618.00 - 3559.00

Year 2015 - File: EPW_2wFiber_200umPH_CCD.hdf
  ROI 1:
    Mean/Average: 603.6989
    Standard Deviation: 5.7911
    Value Range: 533.00 - 649.00
  ROI 2:
    Mean/Average: 605.6432
    Standard Deviation: 5.8730
    Value Range: 534.00 - 654.00
  ROI 3:
    Mean/Average: 603.9088
    Standard Deviation: 5.8106
    Value Range: 525.00 - 662.00
  ROI 4:
    Mean/Average: 605.2995
    Standard De

In [34]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC

# Configure matplotlib backend for interactivity
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

class ROISelector:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax = None
        
    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            linewidth=2, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        # Auto-close after 2 selections
        if len(self.rois) >= 2:
            plt.close(self.fig)

    def select_rois(self, background, title):
        """Interactive ROI selection with proper figure management."""
        self.rois = []
        self.fig, self.ax = plt.subplots(figsize=(10, 10))
        self.ax.imshow(background, cmap='gray')
        self.ax.set_title(f"{title}\nSelect 2 ROIs (click & drag)\nClose window when done")
        
        rs = RectangleSelector(self.ax, self.on_select,
                              useblit=True,
                              button=[1],
                              minspanx=5, minspany=5,
                              spancoords='pixels',
                              interactive=True)
        
        plt.show(block=True)
        return self.rois

def process_hdf4(file_path):
    """Read HDF4 file and extract background image."""
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')  # Update to your dataset name
        data = dataset.get()
        background = data[1, :, :]  # Second element is background
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def process_year(year):
    """Process all files for a specific year."""
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        print(f"Skipping {year_dir}: Directory not found")
        return
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    if not hdf_files:
        print(f"No HDF files found in {year_dir}")
        return
    
    # ROI Selection
    selector = ROISelector()
    rois = []
    
    # Find first valid file for ROI selection
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        background = process_hdf4(file_path)
        if background is not None:
            try:
                rois = selector.select_rois(background, f"First valid image for {year}: {file_name}")
                if len(rois) != 2:
                    print(f"Selected {len(rois)} ROIs. Using first 2.")
                    rois = rois[:2]
                break
            except Exception as e:
                print(f"ROI selection failed: {str(e)}")
                return
    else:
        print(f"No valid files found in {year_dir}")
        return
    
    # Process all files
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        background = process_hdf4(file_path)
        if background is None:
            continue
        
        # Collect all pixel values from all ROIs
        all_pixels = []
        stats = []
        
        print(f"\nYear {year} - File: {file_name}")
        for roi_idx, (x, y, w, h) in enumerate(rois):
            try:
                roi_data = background[y:y+h, x:x+w]
                all_pixels.extend(roi_data.flatten())
                
                # Calculate statistics
                mean = np.mean(roi_data)
                std = np.std(roi_data)
                stats.append((mean, std))
                
                print(f"  ROI {roi_idx+1}:")
                print(f"    Mean: {mean:.4f}")
                print(f"    Std Dev: {std:.4f}")
                
            except Exception as e:
                print(f"  Error processing ROI {roi_idx+1}: {str(e)}")
                continue
        
        # Create combined histogram
        if all_pixels:
            plt.figure(figsize=(10, 6))
            plt.hist(all_pixels, bins=50, color='blue', alpha=0.7,
                    edgecolor='black', density=False)
            plt.title(f"Pixel Value Distribution\n{file_name} ({year})")
            plt.xlabel("Pixel Value")
            plt.ylabel("Frequency")
            plt.grid(True, alpha=0.3)
            
            # Add statistics to plot
            stats_text = "\n".join([f"ROI {i+1}: μ={m:.2f} σ={s:.2f}" 
                                  for i, (m, s) in enumerate(stats)])
            plt.annotate(stats_text, xy=(0.98, 0.98), xycoords='axes fraction',
                        ha='right', va='top', fontsize=9,
                        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
            
            plt.tight_layout()
            plt.show()

# Main execution
for year in range(2015, 2026):
    process_year(year)

print("\nProcessing completed. All histograms displayed.")


Year 2015 - File: EPWROSS_2w_527cw_150um.hdf
  ROI 1:
    Mean: 657.3184
    Std Dev: 21.1244
  ROI 2:
    Mean: 660.7423
    Std Dev: 25.2478

Year 2015 - File: EPW_2wFiber_200umPH_CCD.hdf
  ROI 1:
    Mean: 604.4100
    Std Dev: 5.8834
  ROI 2:
    Mean: 604.4996
    Std Dev: 5.8544

Year 2015 - File: IAWROSS_2w_532cw_150um.hdf
  ROI 1:
    Mean: 631.1819
    Std Dev: 26.6383
  ROI 2:
    Mean: 636.2883
    Std Dev: 30.1290

Year 2015 - File: IAW_2wFiber_200umPH_CCD.hdf
  ROI 1:
    Mean: 605.6520
    Std Dev: 6.1153
  ROI 2:
    Mean: 605.7463
    Std Dev: 6.2014
Selected 0 ROIs. Using first 2.

Year 2016 - File: EPW_ccd_o_tcc.hdf

Year 2016 - File: IAW_ccd_o_tcc.hdf

Year 2016 - File: epw_ross_4w_263p25cw.hdf

Year 2016 - File: iaw_ross_4w_263p25cw.hdf
No HDF files found in historic_data\2017
Skipping historic_data\2018: Directory not found
Skipping historic_data\2019: Directory not found
Skipping historic_data\2020: Directory not found
Skipping historic_data\2021: Directory not f

In [33]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

class ROIAnalyzer:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax_image = None
        self.ax_hist = None
        
    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        if len(self.rois) >= 4:
            return
        
        # Get coordinates
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax_image.add_patch(rect)
        self.fig.canvas.draw()
        
        # Auto-process when 4 ROIs are selected
        if len(self.rois) == 4:
            self.process_rois()

    def process_rois(self):
        """Calculate statistics and update histogram."""
        # Collect all pixel values
        all_pixels = []
        stats = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.background[y:y+h, x:x+w]
                pixels = roi_data.flatten()
                all_pixels.extend(pixels)
                stats.append((np.mean(pixels), np.std(pixels)))
            except Exception as e:
                print(f"Error processing ROI: {str(e)}")
                continue
        
        # Update histogram
        self.ax_hist.clear()
        if all_pixels:
            self.ax_hist.hist(all_pixels, bins=50, color='blue', 
                            alpha=0.7, edgecolor='black')
            self.ax_hist.set_title("Pixel Value Distribution")
            self.ax_hist.set_xlabel("Pixel Value")
            self.ax_hist.set_ylabel("Frequency")
            self.ax_hist.grid(True, alpha=0.3)
            
            # Add statistics
            stats_text = "\n".join(
                [f"ROI {i+1}: μ={m:.2f} σ={s:.2f}" 
                 for i, (m, s) in enumerate(stats)]
            )
            self.ax_hist.text(0.98, 0.98, stats_text,
                            transform=self.ax_hist.transAxes,
                            ha='right', va='top',
                            bbox=dict(facecolor='white', alpha=0.8))
            
        self.fig.canvas.draw()

def process_hdf4(file_path):
    """Read HDF4 file and extract background image."""
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')  # Update dataset name
        data = dataset.get()
        background = data[1, :, :]  # Second element is background
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    """Analyze a single file with combined interface."""
    background = process_hdf4(file_path)
    if background is None:
        return
    
    analyzer = ROIAnalyzer()
    analyzer.background = background
    
    # Create figure with 2 subplots
    analyzer.fig = plt.figure(figsize=(14, 6))
    gs = analyzer.fig.add_gridspec(1, 2, width_ratios=[1.2, 1])
    
    # Image subplot
    analyzer.ax_image = analyzer.fig.add_subplot(gs[0])
    analyzer.ax_image.imshow(background, cmap='gray')
    analyzer.ax_image.set_title(f"Select 4 ROIs\n{os.path.basename(file_path)} ({year})")
    
    # Histogram subplot
    analyzer.ax_hist = analyzer.fig.add_subplot(gs[1])
    analyzer.ax_hist.set_title("Histogram will update after 4 ROI selections")
    analyzer.ax_hist.axis('off')
    
    # Add ROI selector
    rs = RectangleSelector(analyzer.ax_image, analyzer.on_select,
                          useblit=True,
                          button=[1],
                          minspanx=5, minspany=5,
                          spancoords='pixels',
                          interactive=True)
    
    # Add instructions
    analyzer.fig.text(0.5, 0.02, 
                    "Select 4 ROIs by click-and-drag | Close window to continue",
                    ha='center', fontsize=10)
    
    plt.tight_layout()
    plt.show()
    
    print(f"\nProcessed: {os.path.basename(file_path)}")
    print(f"Total pixels analyzed: {sum([w*h for x,y,w,h in analyzer.rois])}")

def process_year(year):
    """Process all files in a year directory."""
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        print(f"Skipping {year_dir} - not found")
        return
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        analyze_file(file_path, year)

# Main execution
for year in range(2015, 2026):
    process_year(year)

print("\nProcessing completed. All files analyzed.")


Processed: EPWROSS_2w_527cw_150um.hdf
Total pixels analyzed: 0

Processed: EPW_2wFiber_200umPH_CCD.hdf
Total pixels analyzed: 0

Processed: IAWROSS_2w_532cw_150um.hdf
Total pixels analyzed: 0

Processed: IAW_2wFiber_200umPH_CCD.hdf
Total pixels analyzed: 0

Processed: EPW_ccd_o_tcc.hdf
Total pixels analyzed: 0

Processed: IAW_ccd_o_tcc.hdf
Total pixels analyzed: 0

Processed: epw_ross_4w_263p25cw.hdf
Total pixels analyzed: 0

Processed: iaw_ross_4w_263p25cw.hdf
Total pixels analyzed: 0
Skipping historic_data\2018 - not found
Skipping historic_data\2019 - not found
Skipping historic_data\2020 - not found
Skipping historic_data\2021 - not found
Skipping historic_data\2022 - not found
Skipping historic_data\2023 - not found
Skipping historic_data\2024 - not found
Skipping historic_data\2025 - not found

Processing completed. All files analyzed.


In [32]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

class ROIAnalyzer:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax_image = None
        self.ax_hist = None
        self.stats = {}
        
    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        if len(self.rois) >= 2:
            return
        
        # Get coordinates
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax_image.add_patch(rect)
        self.fig.canvas.draw()
        
        # Auto-process when 2 ROIs are selected
        if len(self.rois) == 2:
            self.process_rois()

    def process_rois(self):
        """Calculate statistics and update histogram."""
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.background[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                print(f"Error processing ROI: {str(e)}")
                continue
        
        # Calculate combined statistics
        if combined_pixels:
            stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels)
            }
            
            # Update histogram
            self.ax_hist.clear()
            n, bins, patches = self.ax_hist.hist(combined_pixels, bins=50, 
                                                color='blue', alpha=0.7, 
                                                edgecolor='black')
            
            # Add statistics annotation
            stats_text = (f"Combined Statistics:\n"
                         f"Mean: {stats['mean']:.2f}\n"
                         f"Median: {stats['median']:.2f}\n"
                         f"Std Dev: {stats['std']:.2f}\n"
                         f"Total Pixels: {len(combined_pixels):,}")
            
            self.ax_hist.text(0.98, 0.98, stats_text,
                            transform=self.ax_hist.transAxes,
                            ha='right', va='top',
                            bbox=dict(facecolor='white', alpha=0.8))
            
            self.ax_hist.set_title("Combined Pixel Distribution")
            self.ax_hist.set_xlabel("Pixel Value")
            self.ax_hist.set_ylabel("Frequency")
            self.ax_hist.grid(True, alpha=0.3)
            
            self.fig.canvas.draw()

def process_hdf4(file_path):
    """Read HDF4 file and extract background image."""
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')  # Update dataset name if needed
        data = dataset.get()
        background = data[1, :, :]  # Second element is background
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    """Analyze a single file with combined interface."""
    background = process_hdf4(file_path)
    if background is None:
        return
    
    analyzer = ROIAnalyzer()
    analyzer.background = background
    
    # Create figure with 2 subplots
    analyzer.fig = plt.figure(figsize=(14, 6))
    gs = analyzer.fig.add_gridspec(1, 2, width_ratios=[1.2, 1])
    
    # Image subplot
    analyzer.ax_image = analyzer.fig.add_subplot(gs[0])
    analyzer.ax_image.imshow(background, cmap='gray')
    analyzer.ax_image.set_title(f"Select 2 ROIs\n{os.path.basename(file_path)} ({year})")
    
    # Histogram subplot
    analyzer.ax_hist = analyzer.fig.add_subplot(gs[1])
    analyzer.ax_hist.set_title("Histogram will update after 2 ROI selections")
    analyzer.ax_hist.axis('off')
    
    # Add ROI selector
    rs = RectangleSelector(analyzer.ax_image, analyzer.on_select,
                          useblit=True,
                          button=[1],
                          minspanx=5, minspany=5,
                          spancoords='pixels',
                          interactive=True)
    
    # Add instructions
    analyzer.fig.text(0.5, 0.02, 
                    "Select 2 ROIs by click-and-drag | Close window to continue",
                    ha='center', fontsize=10)
    
    plt.tight_layout()
    plt.show()
    
    print(f"\nProcessed: {os.path.basename(file_path)}")
    print(f"Combined Statistics:")
    print(f"- Mean: {analyzer.stats.get('mean', 'N/A'):.2f}")
    print(f"- Median: {analyzer.stats.get('median', 'N/A'):.2f}")
    print(f"- Std Dev: {analyzer.stats.get('std', 'N/A'):.2f}")
    print(f"- Total Pixels: {analyzer.stats.get('total_pixels', 'N/A'):,}")

def process_year(year):
    """Process all files in a year directory."""
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        print(f"Skipping {year_dir} - not found")
        return
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        analyze_file(file_path, year)

# Main execution
for year in range(2015, 2026):
    process_year(year)

print("\nProcessing completed. All files analyzed.")


Processed: EPWROSS_2w_527cw_150um.hdf
Combined Statistics:


ValueError: Unknown format code 'f' for object of type 'str'

In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

class ROIAnalyzer:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax_image = None
        self.ax_hist = None
        self.stats = {}  # Initialize stats dictionary
        
    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        if len(self.rois) >= 2:
            return
        
        # Get coordinates
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax_image.add_patch(rect)
        self.fig.canvas.draw()
        
        # Auto-process when 2 ROIs are selected
        if len(self.rois) == 2:
            self.process_rois()

    def process_rois(self):
        """Calculate statistics and update histogram."""
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.background[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                print(f"Error processing ROI: {str(e)}")
                continue
        
        # Calculate combined statistics
        self.stats = {}  # Reset stats
        if combined_pixels:
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total_pixels': len(combined_pixels)
            }
            
            # Update histogram
            self.ax_hist.clear()
            n, bins, patches = self.ax_hist.hist(combined_pixels, bins=50, 
                                                color='blue', alpha=0.7, 
                                                edgecolor='black')
            
            # Add statistics annotation
            stats_text = (f"Combined Statistics:\n"
                         f"Mean: {self.stats['mean']:.2f}\n"
                         f"Median: {self.stats['median']:.2f}\n"
                         f"Std Dev: {self.stats['std']:.2f}\n"
                         f"Total Pixels: {self.stats['total_pixels']:,}")
            
            self.ax_hist.text(0.98, 0.98, stats_text,
                            transform=self.ax_hist.transAxes,
                            ha='right', va='top',
                            bbox=dict(facecolor='white', alpha=0.8))
            
            self.ax_hist.set_title("Combined Pixel Distribution")
            self.ax_hist.set_xlabel("Pixel Value")
            self.ax_hist.set_ylabel("Frequency")
            self.ax_hist.grid(True, alpha=0.3)
            
            self.fig.canvas.draw()

def process_hdf4(file_path):
    """Read HDF4 file and extract background image."""
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')  # Update dataset name if needed
        data = dataset.get()
        background = data[1, :, :]  # Second element is background
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    """Analyze a single file with combined interface."""
    background = process_hdf4(file_path)
    if background is None:
        return
    
    analyzer = ROIAnalyzer()
    analyzer.background = background
    
    # Create figure with 2 subplots
    analyzer.fig = plt.figure(figsize=(14, 6))
    gs = analyzer.fig.add_gridspec(1, 2, width_ratios=[1.2, 1])
    
    # Image subplot
    analyzer.ax_image = analyzer.fig.add_subplot(gs[0])
    analyzer.ax_image.imshow(background, cmap='gray')
    analyzer.ax_image.set_title(f"Select 2 ROIs\n{os.path.basename(file_path)} ({year})")
    
    # Histogram subplot
    analyzer.ax_hist = analyzer.fig.add_subplot(gs[1])
    analyzer.ax_hist.set_title("Histogram will update after 2 ROI selections")
    analyzer.ax_hist.axis('off')
    
    # Add ROI selector
    rs = RectangleSelector(analyzer.ax_image, analyzer.on_select,
                          useblit=True,
                          button=[1],
                          minspanx=5, minspany=5,
                          spancoords='pixels',
                          interactive=True)
    
    # Add instructions
    analyzer.fig.text(0.5, 0.02, 
                    "Select 2 ROIs by click-and-drag | Close window to continue",
                    ha='center', fontsize=10)
    
    plt.tight_layout()
    plt.show()
    
    # Print statistics after processing
    print(f"\nProcessed: {os.path.basename(file_path)}")
    if analyzer.stats:
        print(f"Combined Statistics:")
        print(f"- Mean: {analyzer.stats.get('mean', 'N/A'):.2f}")
        print(f"- Median: {analyzer.stats.get('median', 'N/A'):.2f}")
        print(f"- Std Dev: {analyzer.stats.get('std', 'N/A'):.2f}")
        print(f"- Total Pixels: {analyzer.stats.get('total_pixels', 'N/A'):,}")
    else:
        print("No valid statistics calculated")

def process_year(year):
    """Process all files in a year directory."""
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        print(f"Skipping {year_dir} - not found")
        return
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        analyze_file(file_path, year)

# Main execution
for year in range(2015, 2026):
    process_year(year)

# Main processing
all_data = []
for year in range(2015, 2026):
    year_result = process_year(year)
    if year_result:
        all_data.append(year_result)

# Create CSV structure
if all_data:
    # Collect all unique filenames
    filenames = set()
    for entry in all_data:
        filenames.update(entry.keys() - {'Year'})
    sorted_files = sorted(filenames)
    
    # Create column headers
    columns = ['Year']
    for fname in sorted_files:
        columns.extend([
            f"{fname}_mean",
            f"{fname}_median",
            f"{fname}_std",
            f"{fname}_total"
        ])
    
    # Build CSV rows
    csv_rows = []
    for entry in all_data:
        row = [entry['Year']]
        for fname in sorted_files:
            if fname in entry:
                stats = entry[fname]
                row.extend([
                    stats.get('mean', np.nan),
                    stats.get('median', np.nan),
                    stats.get('std', np.nan),
                    stats.get('total', np.nan)
                ])
            else:
                row.extend([np.nan]*4)
        csv_rows.append(row)
    
    # Create and save DataFrame
    df = pd.DataFrame(csv_rows, columns=columns)
    df.to_csv('analysis_results.csv', index=False)
    print("Results saved to analysis_results.csv")
else:
    print("No data processed")
    
print("\nProcessing completed. All files analyzed.")


Processed: EPWROSS_2w_527cw_150um.hdf
Combined Statistics:
- Mean: 660.18
- Median: 657.00
- Std Dev: 26.94
- Total Pixels: 518,379

Processed: EPW_2wFiber_200umPH_CCD.hdf
Combined Statistics:
- Mean: 604.26
- Median: 604.00
- Std Dev: 5.86
- Total Pixels: 489,657

Processed: IAWROSS_2w_532cw_150um.hdf
Combined Statistics:
- Mean: 634.83
- Median: 629.00
- Std Dev: 37.65
- Total Pixels: 506,520

Processed: IAW_2wFiber_200umPH_CCD.hdf
Combined Statistics:
- Mean: 605.73
- Median: 606.00
- Std Dev: 6.16
- Total Pixels: 516,310

Processed: EPW_ccd_o_tcc.hdf
Combined Statistics:
- Mean: 604.85
- Median: 605.00
- Std Dev: 5.99
- Total Pixels: 365,292

Processed: IAW_ccd_o_tcc.hdf
Combined Statistics:
- Mean: 605.58
- Median: 606.00
- Std Dev: 6.17
- Total Pixels: 454,181

Processed: epw_ross_4w_263p25cw.hdf
Combined Statistics:
- Mean: 738.07
- Median: 717.00
- Std Dev: 124.69
- Total Pixels: 564,112

Processed: iaw_ross_4w_263p25cw.hdf
Combined Statistics:
- Mean: 724.09
- Median: 704.00


KeyboardInterrupt: 

In [5]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

plt.switch_backend('TkAgg')
base_dir = 'historic_data'
results = []

class ROIAnalyzer:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax_image = None
        self.ax_hist = None
        self.stats = {}

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax_image.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.process_rois()

    def process_rois(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.background[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }
            
            self.ax_hist.clear()
            self.ax_hist.hist(combined_pixels, bins=50, color='blue', alpha=0.7, edgecolor='black')
            
            stats_text = (f"Combined Statistics:\n"
                         f"Mean: {self.stats['mean']:.2f}\n"
                         f"Median: {self.stats['median']:.2f}\n"
                         f"Std Dev: {self.stats['std']:.2f}\n"
                         f"Total Pixels: {self.stats['total']:,}")
            
            self.ax_hist.text(0.98, 0.98, stats_text,
                            transform=self.ax_hist.transAxes,
                            ha='right', va='top',
                            bbox=dict(facecolor='white', alpha=0.8))
            
            self.ax_hist.set_title("Combined Pixel Distribution")
            self.ax_hist.set_xlabel("Pixel Value")
            self.ax_hist.set_ylabel("Frequency")
            self.ax_hist.grid(True, alpha=0.3)
            
            self.fig.canvas.draw()

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None
    
    analyzer = ROIAnalyzer()
    analyzer.background = background
    
    analyzer.fig = plt.figure(figsize=(14, 6))
    gs = analyzer.fig.add_gridspec(1, 2, width_ratios=[1.2, 1])
    
    analyzer.ax_image = analyzer.fig.add_subplot(gs[0])
    analyzer.ax_image.imshow(background, cmap='gray')
    analyzer.ax_image.set_title(f"Select 2 ROIs\n{os.path.basename(file_path)} ({year})")
    
    analyzer.ax_hist = analyzer.fig.add_subplot(gs[1])
    analyzer.ax_hist.axis('off')
    
    RectangleSelector(analyzer.ax_image, analyzer.on_select,
                     useblit=True, button=[1],
                     minspanx=5, minspany=5,
                     spancoords='pixels', interactive=True)
    
    analyzer.fig.text(0.5, 0.02, 
                     "Select 2 ROIs by click-and-drag | Close window to continue",
                     ha='center', fontsize=10)
    
    plt.tight_layout()
    plt.show()
    plt.close()
    
    return os.path.basename(file_path), analyzer.stats

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_data = {'Year': year}
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats = analyze_file(file_path, year)
        if filename and stats:
            year_data[filename] = stats
    
    return year_data

# Main processing
all_data = []
for year in range(2015, 2026):
    year_result = process_year(year)
    if year_result:
        all_data.append(year_result)

# Create CSV structure
if all_data:
    # Collect all unique filenames
    filenames = set()
    for entry in all_data:
        filenames.update(entry.keys() - {'Year'})
    sorted_files = sorted(filenames)
    
    # Create column headers
    columns = ['Year']
    for fname in sorted_files:
        columns.extend([
            f"{fname}_mean",
            f"{fname}_median",
            f"{fname}_std",
            f"{fname}_total"
        ])
    
    # Build CSV rows
    csv_rows = []
    for entry in all_data:
        row = [entry['Year']]
        for fname in sorted_files:
            if fname in entry:
                stats = entry[fname]
                row.extend([
                    stats.get('mean', np.nan),
                    stats.get('median', np.nan),
                    stats.get('std', np.nan),
                    stats.get('total', np.nan)
                ])
            else:
                row.extend([np.nan]*4)
        csv_rows.append(row)
    
    # Create and save DataFrame
    df = pd.DataFrame(csv_rows, columns=columns)
    df.to_csv('analysis_results.csv', index=False)
    print("Results saved to analysis_results.csv")
else:
    print("No data processed")

print("Processing completed.")

Results saved to analysis_results.csv
Processing completed.


In [7]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

plt.switch_backend('TkAgg')  # Force interactive backend
base_dir = 'historic_data'
results = []

class ROIAnalyzer:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax_image = None
        self.ax_hist = None
        self.stats = {}
        self.selection_complete = False

    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        if len(self.rois) >= 2:
            return
        
        # Get coordinates
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                            linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax_image.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.process_rois()
            self.selection_complete = True
            plt.close(self.fig)

    def process_rois(self):
        """Calculate statistics and update histogram."""
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.background[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None
    
    analyzer = ROIAnalyzer()
    analyzer.background = background
    
    # Create figure with 2 subplots
    analyzer.fig = plt.figure(figsize=(14, 6))
    gs = analyzer.fig.add_gridspec(1, 2, width_ratios=[1.2, 1])
    
    # Image subplot
    analyzer.ax_image = analyzer.fig.add_subplot(gs[0])
    analyzer.ax_image.imshow(background, cmap='gray')
    analyzer.ax_image.set_title(f"Select 2 ROIs\n{os.path.basename(file_path)} ({year})")
    
    # Histogram subplot
    analyzer.ax_hist = analyzer.fig.add_subplot(gs[1])
    analyzer.ax_hist.axis('off')
    
    # Add ROI selector
    rs = RectangleSelector(analyzer.ax_image, analyzer.on_select,
                          useblit=True,
                          button=[1],
                          minspanx=5, minspany=5,
                          spancoords='pixels',
                          interactive=True)
    
    plt.tight_layout()
    plt.show(block=True)  # Block until window closes
    
    return os.path.basename(file_path), analyzer.stats

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_data = {'Year': year}
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats = analyze_file(file_path, year)
        if filename and stats:
            year_data[f"{filename}_mean"] = stats['mean']
            year_data[f"{filename}_median"] = stats['median']
            year_data[f"{filename}_std"] = stats['std']
            year_data[f"{filename}_total"] = stats['total']
    
    return year_data

# Main processing
all_data = []
for year in range(2015, 2026):
    year_result = process_year(year)
    if year_result:
        all_data.append(year_result)

# Create and save DataFrame
if all_data:
    df = pd.DataFrame(all_data)
    
    # Reorder columns: Year first, then alphabetical files with metrics
    columns = ['Year'] + sorted([c for c in df.columns if c != 'Year'])
    df = df[columns]
    
    df.to_csv('analysis_results.csv', index=False)
    print("Results saved to analysis_results.csv")
else:
    print("No data processed")

print("Processing completed.")

Results saved to analysis_results.csv
Processing completed.


In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

plt.switch_backend('TkAgg')
base_dir = 'historic_data'

# Category mapping function
def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw ccd'
        elif 'ross' in lower_name:
            return 'iaw ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw ccd'
        elif 'ross' in lower_name:
            return 'epw ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self):
        self.rois = []
        self.fig = None
        self.ax_image = None
        self.ax_hist = None
        self.stats = {}
        self.selection_complete = False

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax_image.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.process_rois()
            self.selection_complete = True
            plt.close(self.fig)

    def process_rois(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.background[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('ImageData')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None
    
    analyzer = ROIAnalyzer()
    analyzer.background = background
    
    analyzer.fig = plt.figure(figsize=(14, 6))
    gs = analyzer.fig.add_gridspec(1, 2, width_ratios=[1.2, 1])
    
    analyzer.ax_image = analyzer.fig.add_subplot(gs[0])
    analyzer.ax_image.imshow(background, cmap='gray')
    analyzer.ax_image.set_title(f"Select 2 ROIs\n{os.path.basename(file_path)} ({year})")
    
    analyzer.ax_hist = analyzer.fig.add_subplot(gs[1])
    analyzer.ax_hist.axis('off')
    
    RectangleSelector(analyzer.ax_image, analyzer.on_select,
                    useblit=True, button=[1],
                    minspanx=5, minspany=5,
                    spancoords='pixels', interactive=True)
    
    plt.tight_layout()
    plt.show(block=True)
    
    return os.path.basename(file_path), analyzer.stats

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_data = {'Year': year}
    
    # Initialize category storage
    categories = {
        'iaw ccd': {'means': [], 'medians': [], 'stds': [], 'totals': []},
        'iaw ross': {'means': [], 'medians': [], 'stds': [], 'totals': []},
        'epw ccd': {'means': [], 'medians': [], 'stds': [], 'totals': []},
        'epw ross': {'means': [], 'medians': [], 'stds': [], 'totals': []}
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats = analyze_file(file_path, year)
        
        if filename and stats:
            category = categorize_file(filename)
            if category in categories:
                categories[category]['means'].append(stats['mean'])
                categories[category]['medians'].append(stats['median'])
                categories[category]['stds'].append(stats['std'])
                categories[category]['totals'].append(stats['total'])
    
    # Calculate aggregated statistics for each category
    for category, data in categories.items():
        if data['means']:
            year_data[f"{category}_mean"] = np.mean(data['means'])
            year_data[f"{category}_median"] = np.mean(data['medians'])
            year_data[f"{category}_std"] = np.mean(data['stds'])
            year_data[f"{category}_total"] = np.sum(data['totals'])
    
    return year_data

# Main processing
all_data = []
for year in range(2015, 2026):
    year_result = process_year(year)
    if year_result:
        all_data.append(year_result)

# Create and save DataFrame
if all_data:
    df = pd.DataFrame(all_data)
    
    # Define column order
    columns = ['Year']
    for category in ['iaw ccd', 'iaw ross', 'epw ccd', 'epw ross']:
        columns += [
            f"{category}_mean",
            f"{category}_median", 
            f"{category}_std",
            f"{category}_total"
        ]
    
    df = df.reindex(columns=columns)
    df.to_csv('categorized_results.csv', index=False)
    print("Results saved to categorized_results.csv")
else:
    print("No data processed")

print("Processing completed.")

Results saved to categorized_results.csv
Processing completed.


In [4]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

# Category mapping function
def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self):
        self.rois = []
        self.stats = {}
        self.background = None
        self.fig = None
        self.ax = None

    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        if len(self.rois) >= 2:
            return
        
        # Get coordinates
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        plt.draw()
        
        if len(self.rois) == 2:
            self.process_rois()
            plt.close(self.fig)

    def process_rois(self):
        """Calculate statistics from selected ROIs."""
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.background[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                print(f"Error processing ROI: {str(e)}")
                continue
        
        if combined_pixels:
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }

def process_hdf4(file_path):
    """Read HDF4 file and verify data structure."""
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')  # Verify dataset name
        data = dataset.get()
        
        # Verify data shape
        if data.shape[0] < 2:
            raise ValueError("Invalid dataset structure")
            
        background = data[1, :, :]  # Second element is background
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    """Analyze a single file with interactive ROI selection."""
    background = process_hdf4(file_path)
    if background is None:
        return None, None
    
    analyzer = ROIAnalyzer()
    analyzer.background = background
    
    # Create figure
    analyzer.fig, analyzer.ax = plt.subplots(figsize=(10, 6))
    analyzer.ax.imshow(background, cmap='gray')
    analyzer.ax.set_title(f"Select 2 ROIs\n{os.path.basename(file_path)} ({year})")
    
    # Connect selector
    rs = RectangleSelector(analyzer.ax, analyzer.on_select,
                          useblit=True,
                          button=[1],
                          minspanx=5, minspany=5,
                          spancoords='pixels',
                          interactive=True)
    
    # Show plot and block execution
    plt.show(block=True)
    plt.close()  # Ensure figure is properly closed
    
    return os.path.basename(file_path), analyzer.stats

def process_year(year):
    """Process all files in a year directory."""
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        print(f"Directory not found: {year_dir}")
        return None
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_data = {'Year': year}
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        print(f"Processing: {file_path}")
        
        filename, stats = analyze_file(file_path, year)
        if not stats:
            continue
        
        # Categorize file
        category = categorize_file(filename)
        if category not in category_data:
            continue
            
        # Store stats
        category_data[category].append({
            'mean': stats['mean'],
            'median': stats['median'],
            'std': stats['std'],
            'total': stats['total']
        })
    
    # Aggregate category data
    for category, entries in category_data.items():
        if entries:
            year_data[f"{category}_mean"] = np.mean([e['mean'] for e in entries])
            year_data[f"{category}_median"] = np.mean([e['median'] for e in entries])
            year_data[f"{category}_std"] = np.mean([e['std'] for e in entries])
            year_data[f"{category}_total"] = np.sum([e['total'] for e in entries])
    
    return year_data

# Main processing
all_data = []
for year in range(2015, 2026):
    print(f"\nProcessing year {year}")
    year_result = process_year(year)
    if year_result:
        all_data.append(year_result)

# Create and save DataFrame
if all_data:
    df = pd.DataFrame(all_data)
    
    # Ensure column order
    columns = ['Year']
    for category in ['iaw_ccd', 'iaw_ross', 'epw_ccd', 'epw_ross']:
        columns += [f"{category}_mean", f"{category}_median", 
                   f"{category}_std", f"{category}_total"]
    
    df = df.reindex(columns=columns)
    df.to_csv('categorized_results.csv', index=False)
    print("\nResults successfully saved to categorized_results.csv")
else:
    print("\nNo data processed")

print("\nProcessing completed.")


Processing year 2015
Processing: historic_data\2015\EPWROSS_2w_527cw_150um.hdf
Processing: historic_data\2015\EPW_2wFiber_200umPH_CCD.hdf
Processing: historic_data\2015\IAWROSS_2w_532cw_150um.hdf
Processing: historic_data\2015\IAW_2wFiber_200umPH_CCD.hdf

Processing year 2016
Processing: historic_data\2016\EPW_ccd_o_tcc.hdf
Processing: historic_data\2016\IAW_ccd_o_tcc.hdf
Processing: historic_data\2016\epw_ross_4w_263p25cw.hdf
Processing: historic_data\2016\iaw_ross_4w_263p25cw.hdf

Processing year 2017

Processing year 2018
Directory not found: historic_data\2018

Processing year 2019
Directory not found: historic_data\2019

Processing year 2020
Directory not found: historic_data\2020

Processing year 2021
Directory not found: historic_data\2021

Processing year 2022
Directory not found: historic_data\2022

Processing year 2023
Directory not found: historic_data\2023

Processing year 2024
Directory not found: historic_data\2024

Processing year 2025
Directory not found: historic_data

In [6]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

plt.switch_backend('TkAgg')
base_dir = 'historic_data'

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self):
        self.rois = []
        self.stats = {}
        self.hist_data = {}
        self.background = None

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        plt.draw()
        
        if len(self.rois) == 2:
            self.process_rois()
            plt.close()

    def process_rois(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.background[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            # Calculate statistics
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }
            
            # Calculate histogram data
            counts, bins = np.histogram(combined_pixels, bins=50)
            self.hist_data = {
                'bin_starts': bins[:-1],
                'bin_ends': bins[1:],
                'counts': counts
            }

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None, None
    
    analyzer = ROIAnalyzer()
    analyzer.background = background
    
    fig, ax = plt.subplots(figsize=(10, 6))
    analyzer.ax = ax
    ax.imshow(background, cmap='gray')
    ax.set_title(f"Select 2 ROIs\n{os.path.basename(file_path)} ({year})")
    
    RectangleSelector(ax, analyzer.on_select,
                    useblit=True, button=[1],
                    minspanx=5, minspany=5,
                    spancoords='pixels', interactive=True)
    
    plt.show(block=True)
    plt.close()
    
    return (os.path.basename(file_path), 
           analyzer.stats, 
           analyzer.hist_data)

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_records = []
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats, hist = analyze_file(file_path, year)
        
        if stats and hist:
            category = categorize_file(filename)
            
            # Collect statistics
            if category in category_data:
                category_data[category].append(stats)
            
            # Collect histogram data
            hist_records.append({
                'Year': year,
                'Filename': filename,
                'Category': category,
                'Bin_Starts': hist['bin_starts'],
                'Bin_Ends': hist['bin_ends'],
                'Counts': hist['counts']
            })
    
    # Aggregate statistics
    for category, entries in category_data.items():
        if entries:
            year_stats[f"{category}_mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{category}_median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{category}_std"] = np.mean([e['std'] for e in entries])
            year_stats[f"{category}_total"] = np.sum([e['total'] for e in entries])
    
    return year_stats, hist_records

# Main processing
all_stats = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_stats.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# Create DataFrames
stats_df = pd.DataFrame(all_stats)
hist_df = pd.DataFrame(all_hists)

# Save to Excel with two sheets
with pd.ExcelWriter('analysis_results.xlsx') as writer:
    # Statistics sheet
    stats_df.to_excel(writer, sheet_name='Statistics', index=False)
    
    # Histogram data sheet
    hist_expanded = hist_df.explode(['Bin_Starts', 'Bin_Ends', 'Counts'])
    hist_expanded.to_excel(writer, sheet_name='Histograms', index=False)

print("Analysis results saved to analysis_results.xlsx")

Processing 2015
Processing 2016


KeyboardInterrupt: 

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

# Configure matplotlib backend
plt.switch_backend('TkAgg')  # Try 'Qt5Agg' if this doesn't work
base_dir = 'historic_data'


def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROISelector:
    def __init__(self, image, filename, year):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.rois = []
        self.stats = {}
        self.hist_data = {}

        # Setup plot
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs\n{filename} ({year})")
        
        # Create rectangle selector
        self.selector = RectangleSelector(
            self.ax,
            self.on_select,
            useblit=True,
            button=[1],
            minspanx=5,
            minspany=5,
            spancoords='pixels',
            interactive=True
        )
        
        # Connect event handler
        self.fig.canvas.mpl_connect('key_press_event', self.close_fig)
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        """Handle rectangle selection events."""
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        # Draw rectangle
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.calculate_stats()
            plt.close(self.fig)

    def close_fig(self, event):
        """Handle window closing."""
        if event.key == 'enter':
            plt.close(self.fig)

    def calculate_stats(self):
        """Calculate statistics from selected ROIs."""
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.image[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            # Calculate statistics
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }
            
            # Calculate histogram data
            counts, bins = np.histogram(combined_pixels, bins=50)
            self.hist_data = {
                'bin_starts': bins[:-1],
                'bin_ends': bins[1:],
                'counts': counts
            }

def process_hdf4(file_path):
    """Read HDF4 file and verify data structure."""
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        
        if data.shape[0] < 2:
            raise ValueError("Invalid dataset structure")
            
        background = data[1, :, :]  # Second element is background
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    """Analyze a single file with interactive ROI selection."""
    background = process_hdf4(file_path)
    if background is None:
        return None, None, None
    
    filename = os.path.basename(file_path)
    selector = ROISelector(background, filename, year)
    
    return filename, selector.stats, selector.hist_data


def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_records = []
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats, hist = analyze_file(file_path, year)
        
        if stats and hist:
            category = categorize_file(filename)
            
            # Collect statistics
            if category in category_data:
                category_data[category].append(stats)
            
            # Collect histogram data
            hist_records.append({
                'Year': year,
                'Filename': filename,
                'Category': category,
                'Bin_Starts': hist['bin_starts'],
                'Bin_Ends': hist['bin_ends'],
                'Counts': hist['counts']
            })
    
    # Aggregate statistics
    for category, entries in category_data.items():
        if entries:
            year_stats[f"{category}_mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{category}_median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{category}_std"] = np.mean([e['std'] for e in entries])
            year_stats[f"{category}_total"] = np.sum([e['total'] for e in entries])
    
    return year_stats, hist_records

# Main processing
all_stats = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_stats.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# Create DataFrames
stats_df = pd.DataFrame(all_stats)
hist_df = pd.DataFrame(all_hists)

# Save to Excel with two sheets
with pd.ExcelWriter('analysis_results.xlsx') as writer:
    # Statistics sheet
    stats_df.to_excel(writer, sheet_name='Statistics', index=False)
    
    # Histogram data sheet
    hist_expanded = hist_df.explode(['Bin_Starts', 'Bin_Ends', 'Counts'])
    hist_expanded.to_excel(writer, sheet_name='Histograms', index=False)

print("Analysis results saved to analysis_results.xlsx")

# Rest of the processing functions remain the same as previous version
# (process_year, main loop, Excel export, etc.)

Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025
Analysis results saved to analysis_results.xlsx


In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd
from io import BytesIO

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self, image, filename, year):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.rois = []
        self.stats = {}
        self.hist_image = None

        # Setup plot and selector
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs\n{filename} ({year})")
        self.selector = RectangleSelector(
            self.ax,
            self.on_select,
            useblit=True,
            button=[1],
            minspanx=5,
            minspany=5,
            spancoords='pixels',
            interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.process_data()
            plt.close()

    def process_data(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.image[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            # Calculate statistics
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }
            
            # Generate and save histogram plot
            fig, ax = plt.subplots(figsize=(6, 4))
            ax.hist(combined_pixels, bins=50, color='blue', alpha=0.7, edgecolor='black')
            ax.set_title(f"Histogram\n{self.filename}")
            ax.set_xlabel("Pixel Value")
            ax.set_ylabel("Frequency")
            
            # Save plot to Bytes buffer
            buf = BytesIO()
            plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
            plt.close(fig)
            self.hist_image = buf.getvalue()

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None, None
    
    filename = os.path.basename(file_path)
    analyzer = ROIAnalyzer(background, filename, year)
    return filename, analyzer.stats, analyzer.hist_image


    # Histogram sheet code remains the same...
def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_data = []
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats, hist_image = analyze_file(file_path, year)
        
        if stats and hist_image:
            category = categorize_file(filename)
            
            if category in category_data:
                category_data[category].append(stats)
            
            hist_data.append({
                'Year': year,
                'Filename': filename,
                'Category': category,
                'Histogram': hist_image
            })
    
    # Aggregate statistics
    for category, entries in category_data.items():
        if entries:
            year_stats[f"{category}_mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{category}_median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{category}_std"] = np.mean([e['std'] for e in entries])
            year_stats[f"{category}_total"] = np.sum([e['total'] for e in entries])
    
    return year_stats, hist_data

# Main processing
all_stats = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_stats.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# Create DataFrames
stats_df = pd.DataFrame(all_stats)
hist_df = pd.DataFrame(all_hists)

# Save to Excel with images
with pd.ExcelWriter('analysis_results.xlsx', engine='xlsxwriter') as writer:
    # Statistics sheet
    stats_df.to_excel(writer, sheet_name='Statistics', index=False)

    # Histograms sheet
    workbook = writer.book
    hist_sheet = workbook.add_worksheet('Histograms')
    
    # Write headers
    headers = ['Year', 'Filename', 'Category', 'Histogram']
    hist_sheet.write_row(0, 0, headers)
    
    # Set column widths
    hist_sheet.set_column(0, 2, 15)
    hist_sheet.set_column(3, 3, 30)
    
    for row_idx, hist in enumerate(hist_df.to_dict('records'), start=1):
        # Write metadata
        hist_sheet.write(row_idx, 0, hist['Year'])
        hist_sheet.write(row_idx, 1, hist['Filename'])
        hist_sheet.write(row_idx, 2, hist['Category'])
        
        # Insert histogram image
        img_stream = BytesIO(hist['Histogram'])
        hist_sheet.insert_image(
            row_idx, 3,
            hist['Filename'],
            {'image_data': img_stream, 'x_offset': 5, 'y_offset': 5}
        )
        
        # Set row height
        hist_sheet.set_row(row_idx, 100)

print("Analysis results saved to analysis_results.xlsx")

Processing 2015


KeyboardInterrupt: 

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd
from io import BytesIO

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self, image, filename, year):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.rois = []
        self.stats = {}
        self.hist_image = None

        # Setup plot and selector
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs\n{filename} ({year})")
        self.selector = RectangleSelector(
            self.ax,
            self.on_select,
            useblit=True,
            button=[1],
            minspanx=5,
            minspany=5,
            spancoords='pixels',
            interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.process_data()
            plt.close()

    def process_data(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.image[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            # Calculate statistics
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }
            
            # Generate and save histogram plot
            fig, ax = plt.subplots(figsize=(6, 4))
            ax.hist(combined_pixels, bins=50, color='blue', alpha=0.7, edgecolor='black')
            ax.set_title(f"Histogram\n{self.filename}")
            ax.set_xlabel("Pixel Value")
            ax.set_ylabel("Frequency")
            
            # Save plot to Bytes buffer
            buf = BytesIO()
            plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
            plt.close(fig)
            self.hist_image = buf.getvalue()

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None, None
    
    filename = os.path.basename(file_path)
    analyzer = ROIAnalyzer(background, filename, year)
    return filename, analyzer.stats, analyzer.hist_image


    # Histogram sheet code remains the same...
def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_data = []
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats, hist_image = analyze_file(file_path, year)
        
        if stats and hist_image:
            category = categorize_file(filename)
            
            if category in category_data:
                category_data[category].append(stats)
            
            hist_data.append({
                'Year': year,
                'Filename': filename,
                'Category': category,
                'Histogram': hist_image
            })
    
    # Aggregate statistics
    for category, entries in category_data.items():
        if entries:
            year_stats[f"{category}_mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{category}_median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{category}_std"] = np.mean([e['std'] for e in entries])
            year_stats[f"{category}_total"] = np.sum([e['total'] for e in entries])
    
    return year_stats, hist_data

# Main processing
all_stats = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_stats.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# Create DataFrames
stats_df = pd.DataFrame(all_stats)
hist_df = pd.DataFrame(all_hists)

# Save to Excel with images
with pd.ExcelWriter('analysis_results.xlsx', engine='xlsxwriter') as writer:
    # Statistics sheet
    stats_df.to_excel(writer, sheet_name='Statistics', index=False)

    # Histograms sheet
    workbook = writer.book
    hist_sheet = workbook.add_worksheet('Histograms')
    
    # Write headers
    headers = ['Year', 'Filename', 'Category', 'Histogram']
    hist_sheet.write_row(0, 0, headers)
    
    # Set column widths
    hist_sheet.set_column(0, 2, 15)
    hist_sheet.set_column(3, 3, 30)
    
    for row_idx, hist in enumerate(hist_df.to_dict('records'), start=1):
        # Write metadata
        hist_sheet.write(row_idx, 0, hist['Year'])
        hist_sheet.write(row_idx, 1, hist['Filename'])
        hist_sheet.write(row_idx, 2, hist['Category'])
        
        # Insert histogram image
        img_stream = BytesIO(hist['Histogram'])
        hist_sheet.insert_image(
            row_idx, 3,
            hist['Filename'],
            {'image_data': img_stream, 'x_offset': 5, 'y_offset': 5}
        )
        
        # Set row height
        hist_sheet.set_row(row_idx, 100)

print("Analysis results saved to analysis_results.xlsx")

Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025
Analysis results saved to analysis_results.xlsx


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pyhdf.SD import SD, SDC
from matplotlib.widgets import RectangleSelector

# List with paths to the files
names_list = [
    'CCD_gain/IAW_1000msSlowRamp_lighton_2.hdf',
    'CCD_gain/IAW_1000msSlowRamp_lighton_3.hdf',
    'CCD_gain/IAW_1000msSlowRamp_lightoff_3.hdf',
    'CCD_gain/IAW_1000msSlowRamp_lightoff_1.hdf'
]

# List with flat arrays after subtracting the dark current 
flats = []

# Open hdf files and process data
for fname in names_list:
    hdf = SD(fname, SDC.READ)
    dataset = hdf.select('Streak_array')
    data = dataset[:,:].astype(np.float64)
    hdf.end()
    flat = data[0] - data[1]
    flats.append(flat)

flats_add = []
flats_diff = []

# Process pairs of flats
for i in range(0, len(flats), 2):
    pair = flats[i:i+2]
    if len(pair) != 2:
        raise ValueError("Uneven number of flats for averaging")
    avg_flat = (pair[0] + pair[1]) / 2.0
    flats_add.append(avg_flat)
    diff = pair[0] - pair[1]
    flats_diff.append(diff)

diff_box = []
roi_box = []

# Function to handle ROI selection
def onselect(eclick, erelease, roi_coords):
    x1 = int(eclick.xdata)
    y1 = int(eclick.ydata)
    x2 = int(erelease.xdata)
    y2 = int(erelease.ydata)
    x_start, x_end = sorted([x1, x2])
    y_start, y_end = sorted([y1, y2])
    roi_coords.append((y_start, y_end, x_start, x_end))
    if len(roi_coords) >= 5:
        plt.close()

# Loop through each averaged flat to select ROIs
for idx in range(len(flats_add)):
    current_flat = flats_add[idx]
    current_diff = flats_diff[idx]
    roi_coords = []
    
    fig, ax = plt.subplots()
    ax.imshow(current_flat, cmap='prism')
    ax.set_title(f'Pair {idx+1}: Select 5 ROIs, then close window')
    
    # Connect the RectangleSelector
    rs = RectangleSelector(ax, lambda eclick, erelease: onselect(eclick, erelease, roi_coords),
                           useblit=True, button=[1],
                           minspanx=5, minspany=5,
                           spancoords='pixels', interactive=True)
    
    plt.show(block=True)
    
    # Ensure exactly 5 ROIs are selected
    if len(roi_coords) != 5:
        raise ValueError(f"Exactly 5 ROIs must be selected for pair {idx+1}, but {len(roi_coords)} were provided.")
    
    # Extract ROIs from current_flat and current_diff
    for coords in roi_coords:
        y_start, y_end, x_start, x_end = coords
        roi = current_flat[y_start:y_end, x_start:x_end]
        roi_box.append(roi)
        d_roi = current_diff[y_start:y_end, x_start:x_end]
        diff_box.append(d_roi)

# Calculate variances and means
variances_100 = []
means_100 = []
for i in range(len(diff_box)):
    stdev = np.std(diff_box[i])
    variance = (stdev ** 2) / 2
    variances_100.append(variance)
    mean = np.mean(roi_box[i])
    means_100.append(mean)

coordinates = list(zip(means_100, variances_100))
#print(coordinates)

# Plotting
x = [point[0] for point in coordinates]
y = [point[1] for point in coordinates]

plt.plot(x, y, 'o', color='blue', markersize=8)
plt.xlabel('Mean')
plt.ylabel('Variance')
plt.title('1000ms Lights on')
plt.grid(True)
plt.show()

# Linear regression and plot
if len(coordinates) < 2:
    print("Error: At least two points are required to compute a slope.")
else:
    x = np.array([point[0] for point in coordinates])
    y = np.array([point[1] for point in coordinates])
    
    if np.all(x == x[0]):
        print("The line is vertical; slope is undefined.")
    else:
        slope, intercept = np.polyfit(x, y, 1)
        print(f"The slope of the line is {slope:.2f}")
        
        plt.scatter(x, y, color='red', label='Data Points')
        plt.plot(x, slope * x + intercept, label=f'Line: y = {slope:.2f}x + {intercept:.2f}')
        plt.legend()
        plt.xlabel('Mean')
        plt.ylabel('Variance')
        plt.title('1000ms Lights on')
        plt.grid(True)
        plt.show()

In [None]:

with pd.ExcelWriter(
    'analysis_results.xlsx',
    engine='xlsxwriter',
    engine_kwargs={'options': {'nan_inf_to_errors': True}}
) as writer:
    # Write statistics data starting at row 3 (Excel row 4)
    df_stats.to_excel(writer, sheet_name='Statistics', index=False, startrow=3)
    
    workbook = writer.book
    stats_sheet = writer.sheets['Statistics']
    
    # Create header format
    header_format = workbook.add_format({
        'bold': True,
        'align': 'center',
        'valign': 'vcenter',
        'border': 1
    })
    
    # Write and merge headers
    # Year header (rows 1-2 in Excel)
    stats_sheet.merge_range('A1:A2', 'Year', header_format)
    
    # Category headers (rows 1-2)
    col_idx = 1
    for category in categories:
        stats_sheet.merge_range(0, col_idx, 0, col_idx+3, category, header_format)
        for i, metric in enumerate(['mean', 'median', 'stdv', 'total']):
            stats_sheet.write(1, col_idx+i, metric, header_format)
        col_idx += 4
    
    # Write data with proper alignment (starting at row 3)
    num_format = workbook.add_format({'num_format': '0.000'})
    total_format = workbook.add_format({'num_format': '#,##0'})
    
    for df_row in range(len(df_stats)):
        excel_row = df_row + 3  # Data starts at Excel row 4
        # Year column
        stats_sheet.write(excel_row, 0, df_stats.iloc[df_row, 0])
        
        # Data columns
        for col in range(1, len(df_stats.columns)):
            value = df_stats.iloc[df_row, col]
            if pd.isna(value):
                stats_sheet.write_blank(excel_row, col, None)
            elif (col % 4) == 0:  # Total columns
                stats_sheet.write(excel_row, col, value, total_format)
            else:
                stats_sheet.write(excel_row, col, value, num_format)
    
    # Set column widths
    stats_sheet.set_column(0, 0, 10)  # Year column
    for col in range(1, len(df_stats.columns)):
        stats_sheet.set_column(col, col, 15)


In [None]:

    # Histogram sheet code remains the same...
def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_data = []
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats, hist_image = analyze_file(file_path, year)
        
        if stats and hist_image:
            category = categorize_file(filename)
            
            if category in category_data:
                category_data[category].append(stats)
            
            hist_data.append({
                'Year': year,
                'Filename': filename,
                'Category': category,
                'Histogram': hist_image
            })
    
    # Aggregate statistics
    for category, entries in category_data.items():
        if entries:
            year_stats[f"{category}_mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{category}_median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{category}_std"] = np.mean([e['std'] for e in entries])
            year_stats[f"{category}_total"] = np.sum([e['total'] for e in entries])
    
    return year_stats, hist_data

# Main processing
all_stats = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_stats.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# Create DataFrames
stats_df = pd.DataFrame(all_stats)
hist_df = pd.DataFrame(all_hists)

# Save to Excel with images
with pd.ExcelWriter('analysis_results.xlsx', engine='xlsxwriter') as writer:
    # Statistics sheet
    stats_df.to_excel(writer, sheet_name='Statistics', index=False)
    

In [1]:

import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd
from io import BytesIO

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self, image, filename, year):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.rois = []
        self.stats = {}
        self.hist_image = None

        # Setup plot and selector
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs\n{filename} ({year})")
        self.selector = RectangleSelector(
            self.ax,
            self.on_select,
            useblit=True,
            button=[1],
            minspanx=5,
            minspany=5,
            spancoords='pixels',
            interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.process_data()
            plt.close()

    def process_data(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.image[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            # Calculate statistics
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }
            
            # Generate and save histogram plot
            fig, ax = plt.subplots(figsize=(6, 4))
            ax.hist(combined_pixels, bins=50, color='blue', alpha=0.7, edgecolor='black')
            ax.set_title(f"Histogram\n{self.filename}")
            ax.set_xlabel("Pixel Value")
            ax.set_ylabel("Frequency")
            
            # Save plot to Bytes buffer
            buf = BytesIO()
            plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
            plt.close(fig)
            self.hist_image = buf.getvalue()

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None, None
    
    filename = os.path.basename(file_path)
    analyzer = ROIAnalyzer(background, filename, year)
    return filename, analyzer.stats, analyzer.hist_image
def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_records = []
    
    category_data = {
        'iaw_ccd': [], 
        'iaw_ross': [],
        'epw_ccd': [], 
        'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats, hist = analyze_file(file_path, year)
        
        if stats and hist:
            category = categorize_file(filename)
            if category in category_data:
                category_data[category].append(stats)
    
    # Aggregate statistics with formatted column names
    category_order = ['iaw_ccd', 'iaw_ross', 'epw_ccd', 'epw_ross']
    for category in category_order:
        entries = category_data[category]
        formatted_category = category.replace('_', ' ')
        
        if entries:
            year_stats[f"{formatted_category} - mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{formatted_category} - median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{formatted_category} - std"] = np.mean([e['std'] for e in entries])
            year_stats[f"{formatted_category} - total"] = np.sum([e['total'] for e in entries])
        else:
            year_stats[f"{formatted_category} - mean"] = np.nan
            year_stats[f"{formatted_category} - median"] = np.nan
            year_stats[f"{formatted_category} - std"] = np.nan
            year_stats[f"{formatted_category} - total"] = 0

    return year_stats, hist_records

# Define column order for Excel output
column_order = ['Year']
categories = ['iaw_ccd', 'iaw_ross', 'epw_ccd', 'epw_ross']
for category in categories:
    formatted = category.replace('_', ' ')
    column_order += [
        f"{formatted} - mean",
        f"{formatted} - median",
        f"{formatted} - std",
        f"{formatted} - total"
    ]


# Main processing
all_stats = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_stats.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# After main processing loop:
stats_df = pd.DataFrame(all_stats).reindex(columns=column_order)
hist_df = pd.DataFrame(all_hists)
# Save to Excel with images
with pd.ExcelWriter('analysis_results.xlsx', engine='xlsxwriter') as writer:
    # Statistics sheet
    stats_df.to_excel(writer, sheet_name='Statistics', index=False)
    
    # Histograms sheet
    workbook = writer.book
    hist_sheet = workbook.add_worksheet('Histograms')
    
    # Write headers
    headers = ['Year', 'Filename', 'Category', 'Histogram']
    hist_sheet.write_row(0, 0, headers)
    
    # Set column widths
    hist_sheet.set_column(0, 2, 15)
    hist_sheet.set_column(3, 3, 30)
    
    for row_idx, hist in enumerate(hist_df.to_dict('records'), start=1):
        # Write metadata
        hist_sheet.write(row_idx, 0, hist['Year'])
        hist_sheet.write(row_idx, 1, hist['Filename'])
        hist_sheet.write(row_idx, 2, hist['Category'])
        
        # Insert histogram image
        img_stream = BytesIO(hist['Histogram'])
        hist_sheet.insert_image(
            row_idx, 3,
            hist['Filename'],
            {'image_data': img_stream, 'x_offset': 5, 'y_offset': 5}
        )
        
        # Set row height
        hist_sheet.set_row(row_idx, 100)

print("Analysis results saved to analysis_results.xlsx")

Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025
Analysis results saved to analysis_results.xlsx


In [10]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self, image, filename, year):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.rois = []
        self.stats = {}

        # Setup plot and selector
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs\n{filename} ({year})")
        self.selector = RectangleSelector(
            self.ax,
            self.on_select,
            useblit=True,
            button=[1],
            minspanx=5,
            minspany=5,
            spancoords='pixels',
            interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.calculate_stats()
            plt.close()

    def calculate_stats(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.image[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'stdv': np.std(combined_pixels),
                'total': len(combined_pixels)
            }

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None
    
    filename = os.path.basename(file_path)
    analyzer = ROIAnalyzer(background, filename, year)
    return filename, analyzer.stats

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_data = {'Year': year}
    
    categories = {
        'iaw_ccd': {'means': [], 'medians': [], 'stdvs': [], 'totals': []},
        'iaw_ross': {'means': [], 'medians': [], 'stdvs': [], 'totals': []},
        'epw_ccd': {'means': [], 'medians': [], 'stdvs': [], 'totals': []},
        'epw_ross': {'means': [], 'medians': [], 'stdvs': [], 'totals': []}
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats = analyze_file(file_path, year)
        
        if stats:
            category = categorize_file(filename)
            if category in categories:
                categories[category]['means'].append(stats['mean'])
                categories[category]['medians'].append(stats['median'])
                categories[category]['stdvs'].append(stats['stdv'])
                categories[category]['totals'].append(stats['total'])
    
    # Calculate averages and totals
    for cat, data in categories.items():
        prefix = cat.replace('_', ' ')
        year_data[f"{prefix} - mean"] = np.mean(data['means']) if data['means'] else np.nan
        year_data[f"{prefix} - median"] = np.mean(data['medians']) if data['medians'] else np.nan
        year_data[f"{prefix} - stdv"] = np.mean(data['stdvs']) if data['stdvs'] else np.nan
        year_data[f"{prefix} - total"] = np.sum(data['totals']) if data['totals'] else 0
    
    return year_data

# Main processing
all_data = []
for year in range(2015, 2026):
    print(f"Processing {year}")
    year_result = process_year(year)
    if year_result:
        all_data.append(year_result)

# Define categories and metrics
categories = ['iaw ccd', 'iaw ross', 'epw ccd', 'epw ross']
metrics = ['mean', 'median', 'stdv', 'total']

# Create column structure
columns = pd.MultiIndex.from_tuples(
    [('Year', '')] + 
    [(cat, metric) for cat in categories for metric in metrics],
    names=['Category', 'Metric']
)

# Prepare data rows
formatted_data = []
for entry in all_data:
    row = [entry['Year']]
    for cat in categories:
        row += [
            entry.get(f"{cat} - mean", np.nan),
            entry.get(f"{cat} - median", np.nan),
            entry.get(f"{cat} - stdv", np.nan),
            entry.get(f"{cat} - total", 0)
        ]
    formatted_data.append(row)

# Create DataFrame with correct column count
df = pd.DataFrame(formatted_data, columns=columns)

# Save to Excel with formatting
with pd.ExcelWriter('analysis_results.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, index=False, startrow=1)
    
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    
    # Merge category headers
    for col_idx, category in enumerate([''] + categories):
        if col_idx == 0:  # Skip Year column
            continue
        worksheet.merge_range(0, col_idx*4-3, 0, col_idx*4, category)
    
    # Write metric subheaders
    metric_headers = ['Year'] + metrics*len(categories)
    for col_idx, header in enumerate(metric_headers):
        worksheet.write(1, col_idx, header)

print("Analysis results saved to analysis_results.xlsx")

Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025


NotImplementedError: Writing to Excel with MultiIndex columns and no index ('index'=False) is not yet implemented.

In [12]:
# After processing all data, create a flat DataFrame
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self, image, filename, year):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.rois = []
        self.stats = {}

        # Setup plot and selector
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs\n{filename} ({year})")
        self.selector = RectangleSelector(
            self.ax,
            self.on_select,
            useblit=True,
            button=[1],
            minspanx=5,
            minspany=5,
            spancoords='pixels',
            interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.calculate_stats()
            plt.close()

    def calculate_stats(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.image[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'stdv': np.std(combined_pixels),
                'total': len(combined_pixels)
            }

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None
    
    filename = os.path.basename(file_path)
    analyzer = ROIAnalyzer(background, filename, year)
    return filename, analyzer.stats

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_data = {'Year': year}
    
    categories = {
        'iaw_ccd': {'means': [], 'medians': [], 'stdvs': [], 'totals': []},
        'iaw_ross': {'means': [], 'medians': [], 'stdvs': [], 'totals': []},
        'epw_ccd': {'means': [], 'medians': [], 'stdvs': [], 'totals': []},
        'epw_ross': {'means': [], 'medians': [], 'stdvs': [], 'totals': []}
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats = analyze_file(file_path, year)
        
        if stats:
            category = categorize_file(filename)
            if category in categories:
                categories[category]['means'].append(stats['mean'])
                categories[category]['medians'].append(stats['median'])
                categories[category]['stdvs'].append(stats['stdv'])
                categories[category]['totals'].append(stats['total'])
    
    # Calculate averages and totals
    for cat, data in categories.items():
        prefix = cat.replace('_', ' ')
        year_data[f"{prefix} - mean"] = np.mean(data['means']) if data['means'] else np.nan
        year_data[f"{prefix} - median"] = np.mean(data['medians']) if data['medians'] else np.nan
        year_data[f"{prefix} - stdv"] = np.mean(data['stdvs']) if data['stdvs'] else np.nan
        year_data[f"{prefix} - total"] = np.sum(data['total']) if data['total'] else 0
    
    return year_data

# Main processing
all_data = []
for year in range(2015, 2026):
    print(f"Processing {year}")
    year_result = process_year(year)
    if year_result:
        all_data.append(year_result)

flat_columns = ['Year']
for category in ['iaw ccd', 'iaw ross', 'epw ccd', 'epw ross']:
    flat_columns += [f"{category} - {metric}" for metric in ['mean', 'median', 'stdv', 'total']]

flat_data = []
for entry in all_data:
    row = [entry['Year']]
    for category in ['iaw ccd', 'iaw ross', 'epw ccd', 'epw ross']:
        row += [
            entry.get(f"{category} - mean", np.nan),
            entry.get(f"{category} - median", np.nan),
            entry.get(f"{category} - stdv", np.nan),
            entry.get(f"{category} - total", 0)
        ]
    flat_data.append(row)

df = pd.DataFrame(flat_data, columns=flat_columns)

# After creating the DataFrame:

with pd.ExcelWriter(
    'analysis_results.xlsx',
    engine='xlsxwriter',
    engine_kwargs={'options': {'nan_inf_to_errors': True}}
) as writer:
    df.to_excel(writer, sheet_name='Statistics', index=False, startrow=2)
    
    workbook = writer.book
    worksheet = writer.sheets['Statistics']
    
    # Category headers formatting
    category_headers = ['Year', 'iaw ccd', 'iaw ross', 'epw ccd', 'epw ross']
    category_format = workbook.add_format({
        'bold': True,
        'align': 'center',
        'valign': 'vcenter',
        'border': 1
    })
    
    # Merge and format headers
    worksheet.merge_range('A1:A2', 'Year', category_format)
    for idx, category in enumerate(category_headers[1:], start=1):
        start_col = 1 + (idx-1)*4
        end_col = start_col + 3
        worksheet.merge_range(
            0, start_col, 0, end_col,
            category,
            category_format
        )
    
    # Metric subheaders
    metric_format = workbook.add_format({
        'bold': True,
        'border': 1
    })
    metrics = ['mean', 'median', 'stdv', 'total']
    for col in range(1, len(df.columns)):
        if (col-1) % 4 == 0:
            worksheet.write(1, col, metrics[0], metric_format)
            worksheet.write(1, col+1, metrics[1], metric_format)
            worksheet.write(1, col+2, metrics[2], metric_format)
            worksheet.write(1, col+3, metrics[3], metric_format)
    
    # Data formatting
    num_format = workbook.add_format({'num_format': '0.000'})
    total_format = workbook.add_format({'num_format': '#,##0'})
    
    for row_idx in range(2, len(df)+2):
        # Year column
        worksheet.write(row_idx, 0, df.iloc[row_idx-2, 0])
        
        # Data columns
        for col_idx in range(1, len(df.columns)):
            value = df.iloc[row_idx-2, col_idx]
            if pd.isna(value):
                worksheet.write_blank(row_idx, col_idx, None)
            elif (col_idx % 4) == 0:  # Total columns
                worksheet.write(row_idx, col_idx, value, total_format)
            else:
                worksheet.write(row_idx, col_idx, value, num_format)
    
    # Column widths
    worksheet.set_column(0, 0, 10)  # Year column
    for col in range(1, len(df.columns)):
        worksheet.set_column(col, col, 12)

print("Analysis results successfully saved to analysis_results.xlsx")


Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025
Analysis results successfully saved to analysis_results.xlsx


In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd
from io import BytesIO

# Configure matplotlib backend
plt.switch_backend('TkAgg')
base_dir = 'historic_data'

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROIAnalyzer:
    def __init__(self, image, filename, year):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.rois = []
        self.stats = {}
        self.hist_image = None

        # Setup plot and selector
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs\n{filename} ({year})")
        self.selector = RectangleSelector(
            self.ax,
            self.on_select,
            useblit=True,
            button=[1],
            minspanx=5,
            minspany=5,
            spancoords='pixels',
            interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            self.process_data()
            plt.close()

    def process_data(self):
        combined_pixels = []
        for x, y, w, h in self.rois:
            try:
                roi_data = self.image[y:y+h, x:x+w]
                combined_pixels.extend(roi_data.flatten())
            except Exception as e:
                continue
        
        if combined_pixels:
            # Calculate statistics
            self.stats = {
                'mean': np.mean(combined_pixels),
                'median': np.median(combined_pixels),
                'std': np.std(combined_pixels),
                'total': len(combined_pixels)
            }
            
            # Generate and save histogram plot
            fig, ax = plt.subplots(figsize=(6, 4))
            ax.hist(combined_pixels, bins=50, color='blue', alpha=0.7, edgecolor='black')
            ax.set_title(f"Histogram\n{self.filename}")
            ax.set_xlabel("Pixel Value")
            ax.set_ylabel("Frequency")
            
            # Save plot to Bytes buffer
            buf = BytesIO()
            plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
            plt.close(fig)
            self.hist_image = buf.getvalue()

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year):
    background = process_hdf4(file_path)
    if background is None:
        return None, None, None
    
    filename = os.path.basename(file_path)
    analyzer = ROIAnalyzer(background, filename, year)
    return filename, analyzer.stats, analyzer.hist_image


    # Histogram sheet code remains the same...
def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_data = []
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename, stats, hist_image = analyze_file(file_path, year)
        
        if stats and hist_image:
            category = categorize_file(filename)
            
            if category in category_data:
                category_data[category].append(stats)
            
            hist_data.append({
                'Year': year,
                'Filename': filename,
                'Category': category,
                'Histogram': hist_image
            })
    
    # Aggregate statistics
    for category, entries in category_data.items():
        if entries:
            year_stats[f"{category}_mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{category}_median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{category}_std"] = np.mean([e['std'] for e in entries])
            year_stats[f"{category}_total"] = np.sum([e['total'] for e in entries])
    
    return year_stats, hist_data

# Main processing
all_stats = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_stats.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# Create DataFrames
stats_df = pd.DataFrame(all_stats)
hist_df = pd.DataFrame(all_hists)

# Save to Excel with images
with pd.ExcelWriter('analysis_results.xlsx', engine='xlsxwriter') as writer:
    # Statistics sheet
    stats_df.to_excel(writer, sheet_name='Statistics', index=False)

    # Histograms sheet
    workbook = writer.book
    hist_sheet = workbook.add_worksheet('Histograms')
    
    # Write headers
    headers = ['Year', 'Filename', 'Category', 'Histogram']
    hist_sheet.write_row(0, 0, headers)
    
    # Set column widths
    hist_sheet.set_column(0, 2, 15)
    hist_sheet.set_column(3, 3, 30)
    
    for row_idx, hist in enumerate(hist_df.to_dict('records'), start=1):
        # Write metadata
        hist_sheet.write(row_idx, 0, hist['Year'])
        hist_sheet.write(row_idx, 1, hist['Filename'])
        hist_sheet.write(row_idx, 2, hist['Category'])
        
        # Insert histogram image
        img_stream = BytesIO(hist['Histogram'])
        hist_sheet.insert_image(
            row_idx, 3,
            hist['Filename'],
            {'image_data': img_stream, 'x_offset': 5, 'y_offset': 5}
        )
        
        # Set row height
        hist_sheet.set_row(row_idx, 100)

print("Analysis results saved to analysis_results.xlsx")

Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025
Analysis results saved to analysis_results.xlsx


In [5]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd
from io import BytesIO

plt.switch_backend('TkAgg')
base_dir = 'historic_data'

class RossROIManager:
    def __init__(self):
        self.rois = {
            'iaw_ross': None,
            'epw_ross': None
        }
        self.first_files_processed = {
            'iaw_ross': False,
            'epw_ross': False
        }

    def get_rois(self, category):
        return self.rois[category]

    def set_rois(self, category, rois):
        self.rois[category] = rois
        self.first_files_processed[category] = True

    def needs_roi_selection(self, category):
        return not self.first_files_processed[category]

ross_manager = RossROIManager()

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROISelector:
    def __init__(self, image, filename, year, category):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.category = category
        self.rois = []
        
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs for {category}\n{filename} ({year})")
        
        self.selector = RectangleSelector(
            self.ax, self.on_select,
            useblit=True, button=[1],
            minspanx=5, minspany=5,
            spancoords='pixels', interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            plt.close()

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def analyze_file(file_path, year, category):
    background = process_hdf4(file_path)
    if background is None:
        return None

    filename = os.path.basename(file_path)
    
    if 'ccd' in category:
        # Use full frame for CCD categories
        combined_pixels = background.flatten()
    else:
        # Ross categories
        if ross_manager.needs_roi_selection(category):
            # First file in category - get ROIs
            selector = ROISelector(background, filename, year, category)
            ross_manager.set_rois(category, selector.rois)
        
        # Get stored ROIs
        rois = ross_manager.get_rois(category)
        combined_pixels = []
        for x, y, w, h in rois:
            roi_data = background[y:y+h, x:x+w]
            combined_pixels.extend(roi_data.flatten())

    if len(combined_pixels) > 0:
        return {
            'mean': np.mean(combined_pixels),
            'median': np.median(combined_pixels),
            'stdv': np.std(combined_pixels),
            'total': len(combined_pixels)
        }
    return None

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename = os.path.basename(file_path)
        category = categorize_file(filename)
        
        if category not in category_data:
            continue
            
        stats = analyze_file(file_path, year, category)
        if stats:
            category_data[category].append(stats)
    
    # Aggregate statistics
    for cat, entries in category_data.items():
        prefix = cat.replace('_', ' ')
        if entries:
            year_stats[f"{prefix} - mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{prefix} - median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{prefix} - stdv"] = np.mean([e['stdv'] for e in entries])
            year_stats[f"{prefix} - total"] = np.sum([e['total'] for e in entries])
        else:
            year_stats[f"{prefix} - mean"] = np.nan
            year_stats[f"{prefix} - median"] = np.nan
            year_stats[f"{prefix} - stdv"] = np.nan
            year_stats[f"{prefix} - total"] = 0
    
    return year_stats

# Main processing
all_data = []
for year in range(2015, 2026):
    print(f"Processing {year}")
    year_result = process_year(year)
    if year_result:
        all_data.append(year_result)

# Create and save Excel report
if all_data:
    df = pd.DataFrame(all_data)
    
    # Define column order
    categories = ['iaw ccd', 'iaw ross', 'epw ccd', 'epw ross']
    column_order = ['Year']
    for cat in categories:
        column_order += [f"{cat} - {metric}" for metric in ['mean', 'median', 'stdv', 'total']]
    
    df = df.reindex(columns=column_order)
    
    # Save to Excel with formatting
    with pd.ExcelWriter(
        'analysis_results.xlsx',
        engine='xlsxwriter',
        engine_kwargs={'options': {'nan_inf_to_errors': True}}
    ) as writer:
        df.to_excel(writer, sheet_name='Statistics', index=False, startrow=2)
        
        workbook = writer.book
        worksheet = writer.sheets['Statistics']
        
        # Create header format
        header_format = workbook.add_format({
            'bold': True,
            'align': 'center',
            'valign': 'vcenter',
            'border': 1
        })
        
        # Merge category headers
        worksheet.merge_range('A1:A2', 'Year', header_format)
        col_idx = 1
        for category in categories:
            worksheet.merge_range(0, col_idx, 0, col_idx+3, category, header_format)
            for i, metric in enumerate(['mean', 'median', 'stdv', 'total']):
                worksheet.write(1, col_idx+i, metric, header_format)
            col_idx += 4
        
        # Format numbers
        num_format = workbook.add_format({'num_format': '0.000'})
        total_format = workbook.add_format({'num_format': '#,##0'})
        
        for row in range(2, len(df)+2):
            worksheet.set_row(row, 20)
            for col in range(1, len(df.columns)):
                value = df.iloc[row-2, col]
                if pd.isna(value):
                    worksheet.write_blank(row, col, None)
                elif (col % 4) == 0:  # Total columns
                    worksheet.write(row, col, value, total_format)
                else:
                    worksheet.write(row, col, value, num_format)
        
        # Set column widths
        worksheet.set_column(0, 0, 10)  # Year column
        for col in range(1, len(df.columns)):
            worksheet.set_column(col, col, 15)

    print("Analysis results saved to analysis_results.xlsx")
else:
    print("No data processed")

Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025
Analysis results saved to analysis_results.xlsx


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from pyhdf.SD import SD, SDC


# Configuration parameters
ROI_SIZE = 30  # Size of square ROI in pixels
NUM_ROIS = 5    # Number of ROIs to select per image

# List of file paths
names_list = [
    '/IAW_1000msSlowRamp_lighton_2.hdf',
    'data/IAW_1000msSlowRamp_lighton_3.hdf',
    'data/IAW_1000msSlowRamp_lightoff_2.hdf',
    'data/IAW_1000msSlowRamp_lightoff_3.hdf'
]

# Load and process HDF files
flats = []
for fname in names_list:
    try:
        hdf = SD(fname, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset[:, :].astype(np.float64)
        hdf.end()
        flat = data[0] - data[1]
        flats.append(flat)
    except Exception as e:
        print(f"Error processing {fname}: {str(e)}")
        raise

# Create averaged and difference images
flats_add = []
flats_diff = []
for i in range(0, len(flats), 2):
    pair = flats[i:i+2]
    if len(pair) != 2:
        raise ValueError("Uneven number of flats for averaging")
    
    flats_add.append((pair[0] + pair[1]) / 2.0)
    flats_diff.append(pair[0] - pair[1])

# ROI selection and analysis
roi_add = []  # Stores ROIs from averaged images
roi_diff = []  # Stores corresponding ROIs from difference images
stats_add = []  # Statistics for averaged ROIs
stats_diff = []  # Statistics for difference ROIs

for idx in range(len(flats_add)):
    # Display image for ROI selection
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(flats_add[idx], cmap='gray', origin='upper')
    ax.set_title(f"Select {NUM_ROIS} ROIs (click centers) | Image {idx+1}/{len(flats_add)}")
    plt.tight_layout()
    
    # Get ROI centers from user clicks
    print(f"Click {NUM_ROIS} locations for image {idx+1}")
    centers = plt.ginput(NUM_ROIS, timeout=0)
    plt.close()
    
    # Process each selected ROI
    for i, (x, y) in enumerate(centers):
        # Convert click coordinates to array indices
        col = int(round(x))
        row = int(round(y))
        
        # Calculate ROI boundaries with edge protection
        height, width = flats_add[idx].shape
        half = ROI_SIZE // 2
        
        row_start = max(0, row - half)
        row_end = min(height, row + half + (ROI_SIZE % 2))
        col_start = max(0, col - half)
        col_end = min(width, col + half + (ROI_SIZE % 2))
        
        # Extract ROIs from both image types
        roi_a = flats_add[idx][row_start:row_end, col_start:col_end]
        roi_d = flats_diff[idx][row_start:row_end, col_start:col_end]
        
        # Store ROIs
        roi_add.append(roi_a)
        roi_diff.append(roi_d)
        
        # Calculate statistics
        stats_add.append({
            'mean': np.mean(roi_a),
            'median': np.median(roi_a),
            'std': np.std(roi_a),
            'var': np.var(roi_a)
        })
        
        stats_diff.append({
            'mean': np.mean(roi_d),
            'median': np.median(roi_d),
            'std': np.std(roi_d),
            'var': np.var(roi_d)
        })

# Print summary statistics
print("\nSummary Statistics:")
print(f"{'ROI #':<6} | {'Type':<6} | {'Mean':<10} | {'Std Dev':<10} | {'Variance':<10} | {'Median':<10}")
print("-"*75)
for i in range(len(roi_add)):
    print(f"{i+1:<6} | {'Add':<6} | {stats_add[i]['mean']:10.2f} | {stats_add[i]['std']:10.2f} | {stats_add[i]['var']:10.2f} | {stats_add[i]['median']:10.2f}")
    print(f"{i+1:<6} | {'Diff':<6} | {stats_diff[i]['mean']:10.2f} | {stats_diff[i]['std']:10.2f} | {stats_diff[i]['var']:10.2f} | {stats_diff[i]['median']:10.2f}")
    print("-"*75)

# Optional: Save ROIs and statistics
np.savez('roi_data.npz', 
         roi_add=roi_add,
         roi_diff=roi_diff,
         stats_add=stats_add,
         stats_diff=stats_diff)

Error processing data/IAW_1000msSlowRamp_lighton_2.hdf: SD: no such file


HDF4Error: SD: no such file

In [7]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd
from io import BytesIO

plt.switch_backend('TkAgg')
base_dir = 'historic_data'

class RossROIManager:
    def __init__(self):
        self.rois = {
            'iaw_ross': None,
            'epw_ross': None
        }
        self.first_files_processed = {
            'iaw_ross': False,
            'epw_ross': False
        }

    def get_rois(self, category):
        return self.rois[category]

    def set_rois(self, category, rois):
        self.rois[category] = rois
        self.first_files_processed[category] = True

    def needs_roi_selection(self, category):
        return not self.first_files_processed[category]

ross_manager = RossROIManager()

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROISelector:
    def __init__(self, image, filename, year, category):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.category = category
        self.rois = []
        
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs for {category}\n{filename} ({year})")
        
        self.selector = RectangleSelector(
            self.ax, self.on_select,
            useblit=True, button=[1],
            minspanx=5, minspany=5,
            spancoords='pixels', interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            plt.close()

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def generate_histogram(pixels, filename, year):
    fig = plt.figure(figsize=(6, 4))
    plt.hist(pixels, bins=50, color='blue', alpha=0.7, edgecolor='black')
    plt.title(f"Histogram\n{filename} ({year})")
    plt.xlabel("Pixel Value")
    plt.ylabel("Frequency")
    
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    plt.close(fig)
    return buf.getvalue()

def analyze_file(file_path, year, category):
    background = process_hdf4(file_path)
    if background is None:
        return None, None

    filename = os.path.basename(file_path)
    histogram = None
    
    if 'ccd' in category:
        # Full frame for CCD
        pixels = background.flatten()
        histogram = generate_histogram(pixels, filename, year)
        stats = {
            'mean': np.mean(pixels),
            'median': np.median(pixels),
            'stdv': np.std(pixels),
            'total': len(pixels)
        }
    else:
        # Ross categories
        if ross_manager.needs_roi_selection(category):
            selector = ROISelector(background, filename, year, category)
            ross_manager.set_rois(category, selector.rois)
        
        rois = ross_manager.get_rois(category)
        pixels = []
        for x, y, w, h in rois:
            roi_data = background[y:y+h, x:x+w]
            pixels.extend(roi_data.flatten())
        
        if pixels:
            histogram = generate_histogram(pixels, filename, year)
            stats = {
                'mean': np.mean(pixels),
                'median': np.median(pixels),
                'stdv': np.std(pixels),
                'total': len(pixels)
            }
        else:
            stats = None

    return stats, histogram

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_data = []
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename = os.path.basename(file_path)
        category = categorize_file(filename)
        
        if category not in category_data:
            continue
            
        stats, hist = analyze_file(file_path, year, category)
        if stats and hist:
            category_data[category].append(stats)
            hist_data.append({
                'Year': year,
                'Filename': filename,
                'Category': category.replace('_', ' '),
                'Histogram': hist
            })
    
    # Aggregate statistics
    for cat, entries in category_data.items():
        prefix = cat.replace('_', ' ')
        if entries:
            year_stats[f"{prefix} - mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{prefix} - median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{prefix} - stdv"] = np.mean([e['stdv'] for e in entries])
            year_stats[f"{prefix} - total"] = np.sum([e['total'] for e in entries])
        else:
            year_stats[f"{prefix} - mean"] = np.nan
            year_stats[f"{prefix} - median"] = np.nan
            year_stats[f"{prefix} - stdv"] = np.nan
            year_stats[f"{prefix} - total"] = 0
    
    return year_stats, hist_data

# Main processing
all_data = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_data.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# Create and save Excel report
if all_data:
    # Statistics sheet
    df_stats = pd.DataFrame(all_data)
    categories = ['iaw ccd', 'iaw ross', 'epw ccd', 'epw ross']
    column_order = ['Year'] + [
        f"{cat} - {metric}" for cat in categories 
        for metric in ['mean', 'median', 'stdv', 'total']
    ]
    df_stats = df_stats.reindex(columns=column_order)
    
    # Histograms sheet
    df_hists = pd.DataFrame(all_hists)
    
    # Save to Excel
    with pd.ExcelWriter(
        'analysis_results.xlsx',
        engine='xlsxwriter',
        engine_kwargs={'options': {'nan_inf_to_errors': True}}
    ) as writer:
        # Statistics sheet
        df_stats.to_excel(writer, sheet_name='Statistics', index=False, startrow=2)
        
        # Format statistics sheet
        workbook = writer.book
        stats_sheet = writer.sheets['Statistics']
        
        # Merge headers
        header_format = workbook.add_format({
            'bold': True, 'align': 'center', 'valign': 'vcenter', 'border': 1
        })
        stats_sheet.merge_range('A1:A2', 'Year', header_format)
        col_idx = 1
        for category in categories:
            stats_sheet.merge_range(0, col_idx, 0, col_idx+3, category, header_format)
            for i, metric in enumerate(['mean', 'median', 'stdv', 'total']):
                stats_sheet.write(1, col_idx+i, metric, header_format)
            col_idx += 4
        
        # Histograms sheet
        df_hists.to_excel(writer, sheet_name='Histograms', index=False)
        hist_sheet = writer.sheets['Histograms']
        
        # Insert images
        for idx, row in df_hists.iterrows():
            img_data = BytesIO(row['Histogram'])
            hist_sheet.insert_image(
                idx + 1, 3,  # Start from row 1, column D
                row['Filename'],
                {'image_data': img_data, 'x_offset': 5, 'y_offset': 5}
            )
        
        # Set column widths and row heights
        hist_sheet.set_column('A:A', 10)   # Year
        hist_sheet.set_column('B:B', 30)   # Filename
        hist_sheet.set_column('C:C', 15)   # Category
        hist_sheet.set_column('D:D', 60)   # Histogram
        
        for row in range(1, len(df_hists)+1):
            hist_sheet.set_row(row, 100)

        # Format numbers in statistics sheet
        num_format = workbook.add_format({'num_format': '0.000'})
        total_format = workbook.add_format({'num_format': '#,##0'})
        for row in range(2, len(df_stats)+2):
            for col in range(1, len(df_stats.columns)):
                value = df_stats.iloc[row-2, col]
                if pd.isna(value):
                    stats_sheet.write_blank(row, col, None)
                elif (col % 4) == 0:  # Total columns
                    stats_sheet.write(row, col, value, total_format)
                else:
                    stats_sheet.write(row, col, value, num_format)
        
        stats_sheet.set_column(0, 0, 10)  # Year column
        for col in range(1, len(df_stats.columns)):
            stats_sheet.set_column(col, col, 15)

    print("Analysis results saved to analysis_results.xlsx")
else:
    print("No data processed")

Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025
Analysis results saved to analysis_results.xlsx


In [8]:
import os
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import RectangleSelector
from pyhdf.SD import SD, SDC
import pandas as pd
from io import BytesIO

plt.switch_backend('TkAgg')
base_dir = 'historic_data'

class RossROIManager:
    def __init__(self):
        self.rois = {
            'iaw_ross': None,
            'epw_ross': None
        }
        self.first_files_processed = {
            'iaw_ross': False,
            'epw_ross': False
        }

    def get_rois(self, category):
        return self.rois[category]

    def set_rois(self, category, rois):
        self.rois[category] = rois
        self.first_files_processed[category] = True

    def needs_roi_selection(self, category):
        return not self.first_files_processed[category]

ross_manager = RossROIManager()

def categorize_file(filename):
    lower_name = filename.lower()
    if 'iaw' in lower_name:
        if 'ccd' in lower_name:
            return 'iaw_ccd'
        elif 'ross' in lower_name:
            return 'iaw_ross'
    elif 'epw' in lower_name:
        if 'ccd' in lower_name:
            return 'epw_ccd'
        elif 'ross' in lower_name:
            return 'epw_ross'
    return 'other'

class ROISelector:
    def __init__(self, image, filename, year, category):
        self.fig, self.ax = plt.subplots(figsize=(10, 6))
        self.image = image
        self.filename = filename
        self.year = year
        self.category = category
        self.rois = []
        
        self.ax.imshow(self.image, cmap='gray')
        self.ax.set_title(f"Select 2 ROIs for {category}\n{filename} ({year})")
        
        self.selector = RectangleSelector(
            self.ax, self.on_select,
            useblit=True, button=[1],
            minspanx=5, minspany=5,
            spancoords='pixels', interactive=True
        )
        plt.show(block=True)

    def on_select(self, eclick, erelease):
        if len(self.rois) >= 2:
            return
        
        x1 = int(min(eclick.xdata, erelease.xdata))
        x2 = int(max(eclick.xdata, erelease.xdata))
        y1 = int(min(eclick.ydata, erelease.ydata))
        y2 = int(max(eclick.ydata, erelease.ydata))
        self.rois.append((x1, y1, x2-x1, y2-y1))
        
        rect = plt.Rectangle((x1, y1), x2-x1, y2-y1,
                           linewidth=1.5, edgecolor='red', facecolor='none')
        self.ax.add_patch(rect)
        self.fig.canvas.draw()
        
        if len(self.rois) == 2:
            plt.close()

def process_hdf4(file_path):
    try:
        hdf = SD(file_path, SDC.READ)
        dataset = hdf.select('Streak_array')
        data = dataset.get()
        background = data[1, :, :]
        hdf.end()
        return background.astype(float)
    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

def generate_histogram(pixels, filename, year):
    fig = plt.figure(figsize=(6, 4))
    plt.hist(pixels, bins=50, color='blue', alpha=0.7, edgecolor='black')
    plt.title(f"Histogram\n{filename} ({year})")
    plt.xlabel("Pixel Value")
    plt.ylabel("Frequency")
    
    buf = BytesIO()
    plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
    plt.close(fig)
    return buf.getvalue()

def analyze_file(file_path, year, category):
    background = process_hdf4(file_path)
    if background is None:
        return None, None

    filename = os.path.basename(file_path)
    histogram = None
    
    if 'ccd' in category:
        # Use full frame for CCD categories
        pixels = background.flatten()
        histogram = generate_histogram(pixels, filename, year)
        stats = {
            'mean': np.mean(pixels),
            'median': np.median(pixels),
            'stdv': np.std(pixels),
            'total': len(pixels)
        }
    else:
        # Ross categories
        if ross_manager.needs_roi_selection(category):
            selector = ROISelector(background, filename, year, category)
            ross_manager.set_rois(category, selector.rois)
        
        rois = ross_manager.get_rois(category)
        pixels = []
        for x, y, w, h in rois:
            roi_data = background[y:y+h, x:x+w]
            pixels.extend(roi_data.flatten())
        
        if pixels:
            histogram = generate_histogram(pixels, filename, year)
            stats = {
                'mean': np.mean(pixels),
                'median': np.median(pixels),
                'stdv': np.std(pixels),
                'total': len(pixels)
            }
        else:
            stats = None

    return stats, histogram

def process_year(year):
    year_dir = os.path.join(base_dir, str(year))
    if not os.path.exists(year_dir):
        return None, []
    
    hdf_files = sorted([f for f in os.listdir(year_dir) if f.endswith('.hdf')])
    year_stats = {'Year': year}
    hist_data = []
    
    category_data = {
        'iaw_ccd': [], 'iaw_ross': [],
        'epw_ccd': [], 'epw_ross': []
    }
    
    for file_name in hdf_files:
        file_path = os.path.join(year_dir, file_name)
        filename = os.path.basename(file_path)
        category = categorize_file(filename)
        
        if category not in category_data:
            continue
            
        stats, hist = analyze_file(file_path, year, category)
        if stats and hist:
            category_data[category].append(stats)
            hist_data.append({
                'Year': year,
                'Filename': filename,
                'Category': category.replace('_', ' '),
                'Histogram': hist
            })
    
    # Aggregate statistics
    for cat, entries in category_data.items():
        prefix = cat.replace('_', ' ')
        if entries:
            year_stats[f"{prefix} - mean"] = np.mean([e['mean'] for e in entries])
            year_stats[f"{prefix} - median"] = np.mean([e['median'] for e in entries])
            year_stats[f"{prefix} - stdv"] = np.mean([e['stdv'] for e in entries])
            year_stats[f"{prefix} - total"] = np.sum([e['total'] for e in entries])
        else:
            year_stats[f"{prefix} - mean"] = np.nan
            year_stats[f"{prefix} - median"] = np.nan
            year_stats[f"{prefix} - stdv"] = np.nan
            year_stats[f"{prefix} - total"] = 0
    
    return year_stats, hist_data

# Main processing
all_data = []
all_hists = []

for year in range(2015, 2026):
    print(f"Processing {year}")
    year_stat, year_hists = process_year(year)
    if year_stat:
        all_data.append(year_stat)
    if year_hists:
        all_hists.extend(year_hists)

# Create and save Excel report
if all_data:
    # Prepare statistics DataFrame
    df_stats = pd.DataFrame(all_data)
    categories = ['iaw ccd', 'iaw ross', 'epw ccd', 'epw ross']
    column_order = ['Year'] + [
        f"{cat} - {metric}" for cat in categories 
        for metric in ['mean', 'median', 'stdv', 'total']
    ]
    df_stats = df_stats.reindex(columns=column_order)
    
    # Prepare histograms DataFrame
    df_hists = pd.DataFrame(all_hists)
    
    # Create Excel writer
    with pd.ExcelWriter(
        'analysis_results.xlsx',
        engine='xlsxwriter',
        engine_kwargs={'options': {'nan_inf_to_errors': True}}
    ) as writer:
        # Write statistics data starting at row 3 (Excel row 4)
        df_stats.to_excel(writer, sheet_name='Statistics', index=False, startrow=3)
        
        workbook = writer.book
        stats_sheet = writer.sheets['Statistics']
        
        # Create header format
        header_format = workbook.add_format({
            'bold': True,
            'align': 'center',
            'valign': 'vcenter',
            'border': 1
        })
        
        # Write and merge headers
        # Year header (Excel rows 1-2)
        stats_sheet.merge_range('A1:A2', 'Year', header_format)
        
        # Category headers (Excel row 1)
        col_idx = 1
        for category in categories:
            stats_sheet.merge_range(0, col_idx, 0, col_idx+3, category, header_format)
            col_idx += 4
        
        # Metric subheaders (Excel row 2)
        col_idx = 1
        for _ in categories:
            for metric in ['mean', 'median', 'stdv', 'total']:
                stats_sheet.write(1, col_idx, metric, header_format)
                col_idx += 1
        
        # Format data cells
        num_format = workbook.add_format({'num_format': '0.000'})
        total_format = workbook.add_format({'num_format': '#,##0'})
        
        for df_row in range(len(df_stats)):
            excel_row = df_row + 3  # Data starts at Excel row 4
            # Year column
            stats_sheet.write(excel_row, 0, df_stats.iloc[df_row, 0])
            
            # Data columns
            for col in range(1, len(df_stats.columns)):
                value = df_stats.iloc[df_row, col]
                if pd.isna(value):
                    stats_sheet.write_blank(excel_row, col, None)
                elif (col % 4) == 0:  # Total columns
                    stats_sheet.write(excel_row, col, value, total_format)
                else:
                    stats_sheet.write(excel_row, col, value, num_format)
        
        # Set column widths
        stats_sheet.set_column(0, 0, 10)  # Year column
        for col in range(1, len(df_stats.columns)):
            stats_sheet.set_column(col, col, 15)

        # Histograms sheet
        df_hists.to_excel(writer, sheet_name='Histograms', index=False)
        hist_sheet = writer.sheets['Histograms']
        
        # Insert images
        for idx, row in df_hists.iterrows():
            img_data = BytesIO(row['Histogram'])
            hist_sheet.insert_image(
                idx + 1, 3,  # Start from row 1, column D
                row['Filename'],
                {'image_data': img_data, 'x_offset': 5, 'y_offset': 5}
            )
        
        # Set column widths and row heights
        hist_sheet.set_column('A:A', 10)   # Year
        hist_sheet.set_column('B:B', 30)   # Filename
        hist_sheet.set_column('C:C', 15)   # Category
        hist_sheet.set_column('D:D', 60)   # Histogram
        
        for row in range(1, len(df_hists)+1):
            hist_sheet.set_row(row, 100)

    print("Analysis results saved to analysis_results.xlsx")
else:
    print("No data processed")

Processing 2015
Processing 2016
Processing 2017
Processing 2018
Processing 2019
Processing 2020
Processing 2021
Processing 2022
Processing 2023
Processing 2024
Processing 2025
Analysis results saved to analysis_results.xlsx
