In [None]:
#|default_exp legacy 

# Pdz archaeology 

> How to read legacy pdz files... 

**If you did not develop a headache in the previous section, then this section is for you! We need to resolve one more problem of reading ancient pdz file formats.**  

As explained in the previous section, the currently prevalent pdz file format version **pdz25** contains multiple blocks of different types and variable size. The first two bytes of a **pdz25** file decode as `25`. This is not true for old 'legacy format' pdz files. In earlier days, pdz files were formatted as one single block of data. Legacy files always start with the two bytes `\x01\x01`, which decode as the integer `257`. I call this format **pdz11**. The exact structure of these legacy files is not fixed, because some instrument detectors are equipped with 1024 channels, while others have 2048 channels. To complicate matters further, the last part of pdz files can vary in length. 

To check the type of a pdz file, import the `check_pdz_type()` function and pass it a valid path to a pdz file. Listing pdz files is easily done with the standard library function `glob('*.pdz')`.

In [2]:
from read_pdz import check_pdz_type
from glob import glob
import os 

Let's check the pdz format for all demo data pdz files. 

In [36]:
# Folder holding the demo pdz files. Set the READ_PDZ_DEMODATA environment
# variable to point somewhere else instead of editing this cell; the default
# is the original author's local path.
DEMO_DATA_DIR = os.environ.get('READ_PDZ_DEMODATA', '/home/frank/Work/DATA/read-pdz-demodata/')

# Later cells open the files by bare file name, so the data folder has to be
# the current working directory.
os.chdir(DEMO_DATA_DIR)
pdz_files = sorted(glob('*.pdz'))

print('Checking pdz file format versions: \n ')
for i, f in enumerate(pdz_files): 
    pdz_type = check_pdz_type(f)
    print(f"[{i}] {f}\n    --> format: '{pdz_type}'\n")  

Checking pdz file format versions: 
 
[0] 00066-Spectrum Only.pdz
    --> format: 'pdz25'

[1] 00067-Spectrum Only.pdz
    --> format: 'pdz25'

[2] 00068-Alloys 2.pdz
    --> format: 'pdz25'

[3] 00069-Alloys 2.pdz
    --> format: 'pdz25'

[4] 00081-Precious Metals 2.pdz
    --> format: 'pdz25'

[5] 00085-Mudrock Dual.pdz
    --> format: 'pdz25'

[6] 00086-Mudrock Dual.pdz
    --> format: 'pdz25'

[7] 00181-Restricted Materials.pdz
    --> format: 'pdz25'

[8] 00182-Restricted Materials.pdz
    --> format: 'pdz25'

[9] 1-sky sample spot.pdz
    --> format: 'pdz11_1024_channels'

[10] 2-sky left of left tree.pdz
    --> format: 'pdz11_1024_channels'

[11] 3-brown left edge sample spot.pdz
    --> format: 'pdz11_1024_channels'

[12] 59.pdz
    --> format: 'pdz11_2048_channels'

[13] A1_a.pdz
    --> format: 'pdz11_2048_channels'

[14] A7_a.pdz
    --> format: 'pdz11_2048_channels'

[15] ANALYZE_EMP-10.pdz
    --> format: 'pdz11_2048_channels'

[16] ANALYZE_EMP-11.pdz
    --> format: 'pdz

Let's now see how to parse a pdz11 file. To do so, we need to import the `multiparse()` function and the extended format strings for pdz11 files, which are stored in `PDZ11_STRUCT_DICT`.  

In [30]:
from read_pdz import multiparse, file_to_bytes, PDZ11_STRUCT_DICT

Let's take a closer look at the structure of PDZ11 files. 

```python
PDZ11_STRUCT_DICT = {
    'pdz11_2048_channels' : {'xformat': '2X-4X-h-34X-2d-86X-2i-10X-2f-188X-Z-*X', 
                          'param_keys': ['pdz-version', '??', 'NumberOfChannels', '??', 'LiveTimeInSeconds', 
                                         'eVPerChannel', '??', 'RawCounts', 'ValidCounts', '??',  'XrayVoltageInkV', 
                                         'XrayFilamentCurrentInMicroAmps', '??', 'PhotonCounts (2048 channels)', '??']}, 
    'pdz11_1024_channels' : {'xformat': '2X-4X-h-34X-2d-86X-2i-10X-2f-24X-z-*X', 
                          'param_keys': ['pdz-version', '??', 'NumberOfChannels', '??', 'LiveTimeInSeconds', 
                                         'eVPerChannel', '??', 'RawCounts', 'ValidCounts', '??',  'XrayVoltageInkV', 
                                         'XrayFilamentCurrentInMicroAmps', '??', 'PhotonCounts (1024 channels)', '??']} 
}
```


Given the slightly different `xformat` strings for the two types of pdz11 legacy files, we can now check whether these files are also parsed correctly.  

In [34]:
#|code-fold: true
# Pick one demo file by its position in the sorted `pdz_files` list.
n = 13
pdz_file = pdz_files[n]

# Determine the file type and load the raw file content.
pdz_type = check_pdz_type(pdz_file)
arr = file_to_bytes(pdz_file)

# Look up the parsing recipe for this type once, then split it into its two parts.
struct_info = PDZ11_STRUCT_DICT[pdz_type]
xformat, param_keys = struct_info['xformat'], struct_info['param_keys']

print(f"file: '{pdz_file}'\n--> pdz type: '{pdz_type}'\n--> xformat string: '{xformat}'")

file: 'A1_a.pdz'
--> pdz type: 'pdz11_2048_channels'
--> xformat string: '2X-4X-h-34X-2d-86X-2i-10X-2f-188X-Z-*X'


In [35]:
# Parse the raw content `arr` according to `xformat`, labelling the parsed values with `param_keys`.
# NOTE(review): `tail` is presumably whatever remains unparsed after the last field - confirm in `multiparse()`.
parsed, tail = multiparse(xformat, arr, param_keys=param_keys)

Unnamed: 0,values,param_keys
0,b'\x01\x01',pdz-version
1,b'\x17',??
2,2048,NumberOfChannels
3,b'\xc8\xe6\xc8\xca\xd6\xd8\x02\x01?\x02\x01\x04',??
4,0.0,LiveTimeInSeconds
5,20.085341,eVPerChannel
6,b'\xa8\x91@\x10\x9d@\xe0j@n@\xf4\x01\x8c\x02',??
7,132513792,RawCounts
8,65538,ValidCounts
9,b'\x0e\t\x11\x07\xde\x02',??


In [None]:
#|code-fold: true
# Pick one demo file by its position in the sorted `pdz_files` list.
n = 11
pdz_file = pdz_files[n]

# Determine the file type and load the raw file content.
pdz_type = check_pdz_type(pdz_file)
arr = file_to_bytes(pdz_file)

# Look up the parsing recipe for this type once, then split it into its two parts.
struct_info = PDZ11_STRUCT_DICT[pdz_type]
xformat, param_keys = struct_info['xformat'], struct_info['param_keys']

print(f"file: '{pdz_file}'\n--> pdz type: '{pdz_type}'\n--> xformat string: '{xformat}'")

file: '3-brown left edge sample spot.pdz'
--> pdz type: 'pdz11_4454_bytes'
--> xformat string: '2s-4s-h-34s-d-d-86s-i-i-10s-f-f-24s-z-148s'


In [None]:
# Parse the raw content `arr` according to `xformat`, labelling the parsed values with `param_keys`.
# NOTE(review): `tail` is presumably whatever remains unparsed after the last field - confirm in `multiparse()`.
parsed, tail = multiparse(xformat, arr, param_keys=param_keys)

Unnamed: 0,values,param_keys
0,b'\x01\x01',pdz-version
1,b'\x14\x00\x00\x00',??
2,1024,NumberOfChannels
3,b'\xc8\x00\xe6\x00\xc8\x00\xca\x00\xd6\x00\xd8...,??
4,60.76,LiveTimeInSeconds
5,40.0,eVPerChannel
6,b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00...,??
7,1001652224,RawCounts
8,519897088,ValidCounts
9,b'\x00\x00\xc4w\r\x000Y\x07\x00',??


## FUNCTIONS

In [32]:
#|export 

from read_pdz import file_to_bytes, get_blocks, multiparse, parse 
import struct

In [33]:
#|export 

def _pdz11_entry(xformat, spectrum_key):
    '''Build the parsing recipe (`xformat` string plus matching `param_keys` list) for one pdz11 variant.'''

    # Both variants share every label except the one naming the spectrum field.
    # '??' marks fields whose meaning has not been identified.
    param_keys = [
        'pdz-version', '??', 'NumberOfChannels', '??',
        'LiveTimeInSeconds', 'eVPerChannel', '??',
        'RawCounts', 'ValidCounts', '??',
        'XrayVoltageInkV', 'XrayFilamentCurrentInMicroAmps', '??',
        spectrum_key, '??',
    ]

    return {'xformat': xformat, 'param_keys': param_keys}


# Parsing recipes for legacy pdz11 files, keyed by the type string returned by `check_pdz_type()`.
# Each `xformat` string is a '-' separated field specification that is handed to `multiparse()`;
# `param_keys` supplies one label per parsed value.
PDZ11_STRUCT_DICT = {
    'pdz11_2048_channels': _pdz11_entry('2X-4X-h-34X-2d-86X-2i-10X-2f-188X-Z-*X',
                                        'PhotonCounts (2048 channels)'),
    'pdz11_1024_channels': _pdz11_entry('2X-4X-h-34X-2d-86X-2i-10X-2f-24X-z-*X',
                                        'PhotonCounts (1024 channels)'),
}


def check_pdz_type(pdz_file, verbose=True): 
    '''Identify the pdz format version of `pdz_file` from its leading bytes.

    The first two bytes are decoded as a little-endian signed 16-bit integer:

    * 25 identifies the 'pdz25' format.
    * 257 (bytes 0x01 0x01) identifies the legacy pdz11 format. For these files the
      number of detector channels is read from bytes 6-7 to tell the variants apart.

    Files that are too short to contain the required bytes are reported with a
    descriptive type string instead of raising `struct.error`.

    Args:
        pdz_file: Path to the pdz file, passed on to `file_to_bytes()`.
        verbose: Accepted for interface compatibility; currently not used.

    Returns:
        One of the strings 'pdz25', 'pdz11_1024_channels', 'pdz11_2048_channels',
        'pdz11_with_unexpected_number_of_<n>_channels', 'pdz11_with_truncated_header',
        'pdz_type_unknown:<first two bytes as integer>' or 'pdz_type_unknown:file_too_short'.
    '''

    pdz_bytes = file_to_bytes(pdz_file)
    file_size = len(pdz_bytes)

    # An empty or one-byte file cannot carry the two-byte version marker.
    if file_size < 2:
        return 'pdz_type_unknown:file_too_short'

    first_two_bytes = struct.unpack('<h', pdz_bytes[0:2])[0]

    if first_two_bytes == 25:
        return 'pdz25'

    if first_two_bytes != 257:
        return f'pdz_type_unknown:{first_two_bytes}'

    # Legacy pdz11 file: the channel count is stored in bytes 6-7, so at least 8 bytes are needed.
    if file_size < 8:
        return 'pdz11_with_truncated_header'

    n_channels = struct.unpack('<h', pdz_bytes[6:8])[0]

    if n_channels == 1024:
        return 'pdz11_1024_channels'
    if n_channels == 2048:
        return 'pdz11_2048_channels'

    return f'pdz11_with_unexpected_number_of_{n_channels}_channels'

In [None]:
#|hide 
import pandas as pd 
import numpy as np 

In [None]:
#|hide
# Remove the row limit for displayed tables, so that they are shown in full instead of truncated.
pd.set_option('display.max_rows', None)