# SEM-EDS zinc-soap dataset: data preparation

This notebook performs one-time setup:
- Extract .raw.gz files
- Load .rpl/.raw spectrum images with HyperSpy
- Apply crop + axis calibration + metadata
- Save cleaned datasets as .hspy for fast reload in the analysis notebook

After running this notebook once, proceed to '02_clustering_analysis.ipynb' for all analysis. 

In [2]:
# Imports
from pathlib import Path
import gzip, shutil
import hyperspy.api as hs
import numpy as np
import matplotlib.pyplot as plt

# Project folders, relative to the notebook's working directory
# (edit these if your folder structure differs)
DATA = Path("./data_raw")
OUT = Path("./data_processed")
FIGS = Path("./figures")

# Only the output folders are created here; DATA is left untouched.
for _folder in (OUT, FIGS):
    _folder.mkdir(exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
for _label, _folder in (("DATA", DATA), ("OUT", OUT), ("FIGS", FIGS)):
    print(f"{_label}:", _folder.resolve())

# Plot defaults
DEFAULT_CMAP = "magma"
plt.rcParams["figure.dpi"] = 120

DATA: C:\Users\elise\OneDrive\Desktop\ZINCSOAP\data_raw
OUT: C:\Users\elise\OneDrive\Desktop\ZINCSOAP\data_processed
FIGS: C:\Users\elise\OneDrive\Desktop\ZINCSOAP\figures


## 1) Extract .raw.gz (if needed)

In [3]:
def gunzip_if_needed(gz_path: Path) -> Path:
    """Extract a ``*.raw.gz`` archive to ``*.raw`` unless the target already exists.

    The output path is ``gz_path`` with its final suffix removed, so
    ``"Aerial 150x.raw.gz"`` becomes ``"Aerial 150x.raw"`` in the same folder.

    Parameters
    ----------
    gz_path : Path
        Location of the gzip-compressed file.

    Returns
    -------
    Path
        Path of the decompressed file (pre-existing or freshly extracted).

    Raises
    ------
    FileNotFoundError
        If neither the decompressed file nor ``gz_path`` exists.
    """
    raw_path = gz_path.with_suffix("")  # strips only the trailing ".gz"
    if raw_path.exists():
        print(f"[skip] {raw_path.name} already exists")
        return raw_path

    if not gz_path.exists():
        raise FileNotFoundError(gz_path)

    print(f"[do] Extracting {gz_path.name} -> {raw_path.name}")
    # Decompress into a sibling ".part" file and rename only on success.
    # Otherwise an interrupted run would leave a truncated .raw behind that
    # the "already exists" check above would silently accept next time.
    part_path = raw_path.with_name(raw_path.name + ".part")
    try:
        with gzip.open(gz_path, "rb") as f_in, open(part_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
        part_path.replace(raw_path)
    finally:
        # After a successful rename this is a no-op; after a failure it
        # removes the incomplete temporary file.
        if part_path.exists():
            part_path.unlink()
    return raw_path

# Decompress both spectrum-image payloads (Aerial first, then Cross-section);
# files that were already extracted are reported and skipped.
aerial_raw, cs_raw = (
    gunzip_if_needed(DATA / archive_name)
    for archive_name in ("Aerial 150x.raw.gz", "CS 148x.raw.gz")
)

print("Done.")

[skip] Aerial 150x.raw already exists
[skip] CS 148x.raw already exists
Done.


## 2) Load Aerial dataset, crop empty rows, set metadata, save .hspy

In [6]:
# Load the Aerial spectrum image through its .rpl header and keep only the
# first 600 rows along y (the remaining bottom rows are empty pixels).
aerial_rpl = DATA / "Aerial 150x.rpl"
sA = hs.load(aerial_rpl).inav[:, :600]

# Axes 0 and 1 are the navigation (spatial) axes, axis 2 is the spectrum axis.
aerial_x, aerial_y, aerial_energy = (sA.axes_manager[i] for i in range(3))

# Spatial calibration
# NOTE(review): x uses 0.859 while y uses 0.8859 µm/pixel — confirm the pixels
# are really non-square and that this is not a typo.
aerial_x.name, aerial_x.scale, aerial_x.units = "x", 0.859, "µm"
aerial_y.name, aerial_y.scale, aerial_y.units = "y", 0.8859, "µm"

# Energy calibration: 0.01 keV per channel, first channel at -0.955 keV
aerial_energy.name = "Energy"
aerial_energy.offset = -0.955
aerial_energy.scale = 0.01
aerial_energy.units = "keV"

# Mark the signal as SEM-EDS and register the elements of interest
sA.set_signal_type("EDS_SEM")
sA.add_elements(["C", "Cl", "Zn", "O", "Cd", "S"])
sA.metadata.General.title = "Aerial EDS Spectrum Image"

# Persist the calibrated dataset (replacing any previous export) and report it
aerial_hspy = OUT / "Aerial 150x.hspy"
sA.save(aerial_hspy, overwrite=True)
print("Saved:", aerial_hspy)
print(sA)

Saved: data_processed\Aerial 150x.hspy
<EDSSEMSpectrum, title: Aerial EDS Spectrum Image, dimensions: (1024, 600|3093)>


## 3) Load Cross-section dataset, crop empty rows, set metadata, save .hspy

In [7]:
# Load the Cross-section spectrum image through its .rpl header and drop the
# first 120 rows along y (indices 0-119), which are empty pixels.
# NOTE(review): this removes rows at the START of the y axis, whereas the
# Aerial crop trims the end — confirm against the raw image that this is the
# intended region.
cs_rpl = DATA / "CS 148x.rpl"
sC = hs.load(cs_rpl).inav[:, 120:]

# Axes 0 and 1 are the navigation (spatial) axes, axis 2 is the spectrum axis.
cs_x, cs_y, cs_energy = (sC.axes_manager[i] for i in range(3))

# Spatial calibration
# NOTE(review): x uses 0.859 while y uses 0.8859 µm/pixel — confirm the pixels
# are really non-square and that this is not a typo.
cs_x.name, cs_x.scale, cs_x.units = "x", 0.859, "µm"
cs_y.name, cs_y.scale, cs_y.units = "y", 0.8859, "µm"

# Energy calibration: 0.01 keV per channel, first channel at -0.955 keV
cs_energy.name = "Energy"
cs_energy.offset = -0.955
cs_energy.scale = 0.01
cs_energy.units = "keV"

# Mark the signal as SEM-EDS and register the elements of interest
sC.set_signal_type("EDS_SEM")
sC.add_elements(["C", "Cl", "Zn", "O", "Cd", "S"])
sC.metadata.General.title = "Cross-section EDS Spectrum Image"

# Persist the calibrated dataset (replacing any previous export) and report it
cs_hspy = OUT / "CS 148x.hspy"
sC.save(cs_hspy, overwrite=True)
print("Saved:", cs_hspy)
print(sC)

Saved: data_processed\CS 148x.hspy
<EDSSEMSpectrum, title: Cross-section EDS Spectrum Image, dimensions: (1024, 648|3144)>
