In [1]:
import numpy as np 
import pandas as pd
import glob
import matplotlib.pyplot as plt

#plt.style.use('ggplot')
plt.style.use('seaborn-colorblind')
plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['savefig.bbox'] = 'tight'

%matplotlib inline

import datetime
date = datetime.datetime.now().strftime('%Y%m%d')

# Read and build spectral dataset

## The normal scans

In [2]:
# Every core folder under data/, in alphabetical order.
core_list = sorted(glob.glob('data/SO264*'))
core_list

['data/SO264-09-2',
 'data/SO264-13-2',
 'data/SO264-14-1',
 'data/SO264-15-2',
 'data/SO264-16-2',
 'data/SO264-19-2',
 'data/SO264-22-2',
 'data/SO264-24-3',
 'data/SO264-26-2',
 'data/SO264-28-2',
 'data/SO264-32-2',
 'data/SO264-34-2',
 'data/SO264-41-2',
 'data/SO264-44-2',
 'data/SO264-44-3',
 'data/SO264-45-2',
 'data/SO264-46-5',
 'data/SO264-47-2',
 'data/SO264-49-2',
 'data/SO264-51-2',
 'data/SO264-52-2',
 'data/SO264-53-2',
 'data/SO264-54-2',
 'data/SO264-55-1',
 'data/SO264-56-2',
 'data/SO264-60-12',
 'data/SO264-62-2',
 'data/SO264-64-1',
 'data/SO264-66-2',
 'data/SO264-70-1',
 'data/SO264-76-1']

In [8]:
# SO264-54-2 has a section whose depth label is '10430B', which cannot be
# parsed with int(), so the core is left out here and handled separately later.
# Filtering (instead of list.remove) keeps this cell safe to re-run: remove()
# raises ValueError once the entry is already gone.
core_list = [core_dir for core_dir in core_list if core_dir != 'data/SO264-54-2']

In [35]:
# Accumulators: one entry per .spe spectrum, index-aligned across all lists.
file_name = []
spe_all = []
depth_all = []
cps_all = []
core_all = []
section_all = []
s_depth_all = []

# do it core by core
for core in core_list:
    i = 0            # offset of the current spectrum within its section (mm)
    section = 0      # running section counter for this core
    start_depth = 0  # top depth of the current section (mm)

    # only read the 10kV run, which has the better signal for the light elements
    spe_dir = glob.glob('{}/Run 1 at  10kV/*.spe'.format(core))

    # make sure the spe files are visited in file-name (depth) order
    spe_dir.sort()

    for spe in spe_dir:
        # section label: text after the last '_' of the 4th whitespace token
        check_depth = spe.split()[3].split('_')[-1]

        # Convert the label to mm BEFORE comparing it with start_depth.
        # Comparing the raw label (cm for 3-4 digit names) with a start depth
        # already stored in mm made every file of such a section look like a
        # new section.
        n_digits = len(check_depth)
        if n_digits in (5, 6):
            # 5 and 6 digits: the label is already in mm
            mm_per_unit = 1
        elif n_digits in (3, 4):
            # 3 and 4 digits: the label is in cm
            mm_per_unit = 10
        else:
            # anything else (including 7 digits or more) must be noticed
            mm_per_unit = None
        label_depth = int(check_depth) * (mm_per_unit or 1)

        # if the section changes, the start depth changes
        if start_depth != label_depth:
            # restart the within-section offset
            i = 0
            section += 1
            if mm_per_unit is None:
                # this shouldn't happen, but it did: report and keep the
                # previous start depth
                print('Weird digits: {}'.format(spe))
            else:
                start_depth = label_depth

        # Read the file first so that a failed read cannot leave the
        # accumulators with different lengths.
        with open(spe, 'r') as f:
            lines = f.readlines()
        # lines from index 49 onward hold the whitespace-separated counts;
        # split them in one pass instead of growing an array line by line
        content = np.array(' '.join(lines[49:]).split()).astype(int)

        file_name.append(spe.split('/')[-1])
        spe_all.append(content)
        depth_all.append(start_depth + i)
        # line index 28 is the value stored as 'cps'
        cps_all.append(int(lines[28]))
        # drop the leading 'data/' from the folder path
        core_all.append(core[5:])
        section_all.append(section)
        s_depth_all.append(i)

        # the scanning resolution is 10 mm
        i += 10
    print('core {} is done.'.format(core))

core data/SO264-09-2 is done.
core data/SO264-13-2 is done.
core data/SO264-14-1 is done.
core data/SO264-15-2 is done.
core data/SO264-16-2 is done.
core data/SO264-19-2 is done.
core data/SO264-22-2 is done.
core data/SO264-24-3 is done.
core data/SO264-26-2 is done.
core data/SO264-28-2 is done.
core data/SO264-32-2 is done.
core data/SO264-34-2 is done.
core data/SO264-41-2 is done.
core data/SO264-44-2 is done.
core data/SO264-44-3 is done.
core data/SO264-45-2 is done.
core data/SO264-46-5 is done.
core data/SO264-47-2 is done.
core data/SO264-49-2 is done.
core data/SO264-51-2 is done.
core data/SO264-52-2 is done.
core data/SO264-53-2 is done.
core data/SO264-55-1 is done.
core data/SO264-56-2 is done.
core data/SO264-60-12 is done.
core data/SO264-62-2 is done.
core data/SO264-64-1 is done.
core data/SO264-66-2 is done.
core data/SO264-70-1 is done.
core data/SO264-76-1 is done.


### Deal with the SO264-54-2
SO264-54-2 was excluded from the loop above because one of its sections is labelled `10430B`, which is not a purely numeric depth, so it is processed separately here.

#### Check sections

In [30]:
# Collect the section label embedded in every SO264-54-2 spectrum file name.
# The path has no placeholder, so no .format() call is needed; this also means
# the cell no longer depends on a variable left over from an earlier loop.
spe_dir = glob.glob('data/SO264-54-2/Run 1 at  10kV/*.spe')
# make sure the spe files are sorted by file name
spe_dir.sort()
# label = text after the last '_' of the 4th whitespace-separated token
test = [spe.split()[3].split('_')[-1] for spe in spe_dir]

In [31]:
# Distinct section labels found in the SO264-54-2 file names.
np.unique(test)

array(['00000', '00430', '01430', '02430', '03430', '04430', '05430',
       '06430', '07430', '08430', '09430', '10430B'], dtype='<U6')

#### Run

In [36]:
def read_spe(path):
    """Read one .spe file and return ``(counts, cps, position)``.

    counts   -- integer numpy array built from every whitespace-separated
                token found from line index 49 to the end of the file
    cps      -- ``int`` parsed from line index 28
    position -- the raw text of line index 13 (e.g. '1000.0\\n'); it is
                returned unparsed because only the '10430B' section needs it
    """
    with open(path, 'r') as f:
        lines = f.readlines()
    counts = np.array(' '.join(lines[49:]).split()).astype(int)
    return counts, int(lines[28]), lines[13]


i = 0            # offset of the current spectrum within its section (mm)
section = 0      # running section counter
start_depth = 0  # top depth of the current section (mm)
# The accumulator lists (file_name, spe_all, ...) are the ones created by the
# main per-core cell; this cell appends to them, so run it exactly once after
# that cell.

# only read the 10kV run, which has the better signal for the light elements
spe_dir = glob.glob('data/SO264-54-2/Run 1 at  10kV/*.spe')
# make sure the spe files are visited in file-name (depth) order
spe_dir.sort()

for spe in spe_dir:
    check_depth = spe.split()[3].split('_')[-1]

    # Read first so a failed read cannot leave the lists with unequal lengths.
    counts, cps, position = read_spe(spe)

    # deal with this section separately
    if check_depth == '10430B':
        start_depth = 10430
        # The position recorded in the file starts at 10.0 mm, so subtract 10
        # to get the offset within the section. int(float(...)) parses
        # '1000.0\n' without relying on the exact number of trailing characters.
        s_depth = int(float(position)) - 10

        file_name.append(spe.split('/')[-1])
        spe_all.append(counts)
        depth_all.append(start_depth + s_depth)
        cps_all.append(cps)
        core_all.append('SO264-54-2')
        s_depth_all.append(s_depth)
        # manually counted: it's section 11
        section_all.append(11)
        continue

    # Convert the label to mm before comparing it with start_depth (which is
    # kept in mm); comparing the raw label would misfire for cm-labelled names.
    n_digits = len(check_depth)
    if n_digits in (5, 6):
        # 5 and 6 digits: the label is already in mm
        mm_per_unit = 1
    elif n_digits in (3, 4):
        # 3 and 4 digits: the label is in cm
        mm_per_unit = 10
    else:
        # anything else (including 7 digits or more) must be noticed
        mm_per_unit = None
    label_depth = int(check_depth) * (mm_per_unit or 1)

    # if the section changes, the start depth changes
    if start_depth != label_depth:
        # restart the within-section offset
        i = 0
        section += 1
        if mm_per_unit is None:
            # this shouldn't happen, but it did: report and keep the
            # previous start depth
            print('Weird digits: {}'.format(spe))
        else:
            start_depth = label_depth

    file_name.append(spe.split('/')[-1])
    spe_all.append(counts)
    depth_all.append(start_depth + i)
    cps_all.append(cps)
    core_all.append('SO264-54-2')
    section_all.append(section)
    s_depth_all.append(i)

    # the scanning resolution is 10 mm
    i += 10

In [37]:
# Number of spectra collected so far (one list entry per .spe file read).
len(spe_all)

34316

In [38]:
# One row per spectrum: the count columns come from spe_all, followed by the
# per-spectrum metadata columns in the order given here.
data_df = pd.DataFrame(spe_all).assign(
    cps=cps_all,
    core=core_all,
    composite_depth_mm=depth_all,
    section=section_all,
    section_depth_mm=s_depth_all,
    file_name=file_name,
)

In [39]:
# (rows, columns) of the assembled table.
data_df.shape

(34316, 2054)

In [40]:
# Preview the first rows of the assembled table.
data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2044,2045,2046,2047,cps,core,composite_depth_mm,section,section_depth_mm,file_name
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,174740,SO264-09-2,0,0,0,SO264-09-2_0000 10.0mm 10s 10kV 150uA No-F...
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,176896,SO264-09-2,10,0,10,SO264-09-2_0000 20.0mm 10s 10kV 150uA No-F...
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,183148,SO264-09-2,20,0,20,SO264-09-2_0000 30.0mm 10s 10kV 150uA No-F...
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,185151,SO264-09-2,30,0,30,SO264-09-2_0000 40.0mm 10s 10kV 150uA No-F...
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,186255,SO264-09-2,40,0,40,SO264-09-2_0000 50.0mm 10s 10kV 150uA No-F...


In [41]:
# Preview the last rows of the assembled table.
data_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2044,2045,2046,2047,cps,core,composite_depth_mm,section,section_depth_mm,file_name
34311,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,110604,SO264-54-2,11380,11,950,SO264-54-2_10430B 960.0mm 10s 10kV 150uA No...
34312,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,117872,SO264-54-2,11390,11,960,SO264-54-2_10430B 970.0mm 10s 10kV 150uA No...
34313,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,120172,SO264-54-2,11400,11,970,SO264-54-2_10430B 980.0mm 10s 10kV 150uA No...
34314,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,122814,SO264-54-2,11410,11,980,SO264-54-2_10430B 990.0mm 10s 10kV 150uA No...
34315,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,74021,SO264-54-2,11420,11,990,SO264-54-2_10430B 1000.0mm 10s 10kV 150uA No...


In [42]:
# Write the assembled table to a date-stamped CSV file under data/.
csv_path = 'data/spe_dataset_{}.csv'.format(date)
data_df.to_csv(csv_path)

## The rescan

In [44]:
# List the rescan folders found inside the core folders (glob output is not sorted).
glob.glob('data/SO*/Rescan*')

['data/SO264-55-1/Rescan_00050&01120',
 'data/SO264-28-2/Rescan_06755',
 'data/SO264-28-2/Rescan_05755',
 'data/SO264-09-2/Rescan_0269&0369',
 'data/SO264-51-2/Rescan_0000&0085',
 'data/SO264-19-2/Rescan_03400',
 'data/SO264-53-2/Rescan_0000&0063&0163']