I select the core SO264-15-2 for the pilot work, the same core used in [the reference paper](https://www.nature.com/articles/s41598-022-25377-x). The purpose of this pilot is to walk through the steps of building the dataset and training the model on a small portion of the data; the final model will be trained on the whole dataset. This notebook is the first step of the pilot work: building the dataset.

# Build dataset
I intend to adopt the data of the reference paper because it has already been cleaned: duplicates, zeros and problematic machine heads have been dealt with. I am going to create a data format that fits the [template](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).

In [1]:
import numpy as np
import pandas as pd

# Date stamp (YYYYMMDD) used to version the exported annotations file.
# `date` stays a plain string because a later cell interpolates it into the
# annotations file name; going through the module avoids importing the
# `date` class only to overwrite it with that string on the next line.
import datetime
date = datetime.date.today().strftime('%Y%m%d')
print(date)

20231121


In [2]:
# Load the full spectra table; the first CSV column becomes the row index
spe_df = pd.read_csv('data/spe_dataset_20220629.csv', index_col=0)
spe_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2044,2045,2046,2047,cps,core,composite_depth_mm,section_depth_mm,filename,section
composite_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SO264-09-2_00010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,174740,SO264-09-2,10,10,SO264-09-2_0000 10.0mm 10s 10kV 150uA No-F...,0
SO264-09-2_00020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,176896,SO264-09-2,20,20,SO264-09-2_0000 20.0mm 10s 10kV 150uA No-F...,0
SO264-09-2_00030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,183148,SO264-09-2,30,30,SO264-09-2_0000 30.0mm 10s 10kV 150uA No-F...,0
SO264-09-2_00040,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,185151,SO264-09-2,40,40,SO264-09-2_0000 40.0mm 10s 10kV 150uA No-F...,0
SO264-09-2_00050,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,186255,SO264-09-2,50,50,SO264-09-2_0000 50.0mm 10s 10kV 150uA No-F...,0


In [3]:
# Count the Python types found in the first 2048 columns (the channels) of the first row
spe_df.iloc[0, :2048].apply(type).value_counts()

SO264-09-2_00010
<class 'numpy.int64'>    2048
Name: count, dtype: int64

The spectra are still raw integer counts (not yet transformed), so I can use them directly.

### Check the maximum value of channel values

In [4]:
# Largest value over all channels and all spectra (per-column max, then max of those)
spe_df.iloc[:, :2048].max().max()

218656

In [5]:
# Position of the channel column that holds that overall maximum
spe_df.iloc[:, :2048].max().argmax()

184

In [6]:
# Row position of the largest value in column 184 (the position returned by the previous cell)
spe_df.iloc[:, 184].argmax()

53691

In [7]:
# Last four (metadata) columns of row 53691 (the position returned by the previous cell)
spe_df.iloc[53691, -4:]

composite_depth_mm                                                 6410
section_depth_mm                                                    520
filename              PS97-092-1_0589  520.0mm  10s  10kV 150uA No-F...
section                                                               6
Name: PS97-092-1_06410, dtype: object

## Export spectra to single files
As mentioned at the beginning, only the core SO264-15-2 is selected for the pilot work. Each spectrum is exported to its own file in the folder `data/spe/`.

In [8]:
# Keep only the pilot core, then move the old index into a regular column so
# that the remaining rows get a fresh 0..n-1 integer index.
core = "SO264-15-2"
is_pilot_core = spe_df['core'] == core
spe_df = spe_df.loc[is_pilot_core].copy().reset_index(drop=False)
spe_df.head()

Unnamed: 0,composite_id,0,1,2,3,4,5,6,7,8,...,2044,2045,2046,2047,cps,core,composite_depth_mm,section_depth_mm,filename,section
0,SO264-15-2_00010,0,0,0,0,0,0,0,0,0,...,0,0,0,0,38706,SO264-15-2,10,10,SO264-15-2_0000 10.0mm 10s 10kV 150uA No-F...,0
1,SO264-15-2_00020,0,0,0,0,0,0,0,0,0,...,0,0,0,0,43223,SO264-15-2,20,20,SO264-15-2_0000 20.0mm 10s 10kV 150uA No-F...,0
2,SO264-15-2_00030,0,0,0,0,0,0,0,0,0,...,0,0,0,0,107525,SO264-15-2,30,30,SO264-15-2_0000 30.0mm 10s 10kV 150uA No-F...,0
3,SO264-15-2_00040,0,0,0,0,0,0,0,0,0,...,0,0,0,0,127305,SO264-15-2,40,40,SO264-15-2_0000 40.0mm 10s 10kV 150uA No-F...,0
4,SO264-15-2_00050,0,0,0,0,0,0,0,0,0,...,0,0,0,0,130242,SO264-15-2,50,50,SO264-15-2_0000 50.0mm 10s 10kV 150uA No-F...,0


In [9]:
# Write every spectrum of the selected core to its own single-column CSV file.
# The file name is the row's index label (0.csv, 1.csv, ...).
import os

os.makedirs('data/spe', exist_ok=True)  # to_csv does not create missing folders
for idx, spectrum in spe_df.iterrows():
    # Positions 1..2048 of the row are sliced out as the spectrum; .iloc makes
    # the positional selection explicit instead of relying on how [] treats an
    # integer slice on a label-indexed Series.
    spectrum.iloc[1:2049].to_csv('data/spe/{}.csv'.format(idx), index=False, header=False)

In [10]:
# Sanity check: reload the first exported spectrum and inspect its shape and dtype
arr = np.loadtxt('data/spe/0.csv', delimiter=',', dtype=int)
print(arr.shape)
print(arr.dtype)

(2048,)
int64


## Build masked tensors
I set the masking ratio to 40% of the channels, which falls within the 20%–50% range in our proposal. Each channel is masked independently with probability 0.4, so the masked fraction of an individual spectrum is only approximately 40%. Masked channels are replaced by 99999999, which is more than two orders of magnitude larger than the maximum value across all spectra (218656). The masked spectra are exported to single files in the folder `data/masked/`, and the corresponding masks to `data/mask/`.

In [12]:
# Trial run of the masking on the single spectrum held in `arr`, before
# applying it to every row.
rand = np.random.rand(arr.shape[0])
# Take the sentinel where the random draw falls below 0.4, the original value elsewhere
mask_arr = np.where(rand < 0.4, 99999999, arr)

print(mask_arr)
# Fraction of channels that ended up masked
print((mask_arr == 99999999).sum()/mask_arr.size)


[99999999        0 99999999 ... 99999999 99999999        0]
0.4033203125


In [13]:
# Mask every spectrum and export both the boolean mask and the masked spectrum
# to per-row CSV files named by the row's index label.
import os

MASK_RATIO = 0.4        # probability that any single channel is masked
MASK_VALUE = 99999999   # sentinel written into masked channels

# Fixed seed so the exported masks are reproducible between runs.
np.random.seed(24)

# savetxt does not create missing folders.
os.makedirs('data/mask', exist_ok=True)
os.makedirs('data/masked', exist_ok=True)

# Pull the 2048 channel columns (positions 1..2048) out once as one array
# instead of rebuilding an object-dtype row Series on every iteration.
spectra = spe_df.iloc[:, 1:2049].to_numpy()

for idx, spectrum in zip(spe_df.index, spectra):
    # Work on a copy: the extracted array may share memory with spe_df, and
    # the original spectra must stay untouched.
    arr = spectrum.copy()
    mask = np.random.rand(arr.shape[0]) < MASK_RATIO
    arr[mask] = MASK_VALUE
    np.savetxt('data/mask/{}.csv'.format(idx), mask, delimiter=',', fmt='%5i')
    np.savetxt('data/masked/{}.csv'.format(idx), arr, delimiter=',', fmt='%d')

In [14]:
# Sanity check: reload the first masked spectrum and inspect its shape and dtype
arr = np.loadtxt('data/masked/0.csv', delimiter=',', dtype=int)
print(arr.shape)
print(arr.dtype)

# Same check for the matching mask file
mask = np.loadtxt('data/mask/0.csv', delimiter=',', dtype=int)
print(mask.shape)
print(mask.dtype)

(2048,)
int64
(2048,)
int64


### Use apply()
This approach is deprecated because I also want to export the masks, which may be needed when calculating the loss: as in BERT, the loss only considers the masked channels.

In [None]:
def mask(arr, ratio=0.4, value=99999999):
    """Return a copy of `arr` with randomly chosen entries replaced by `value`.

    Each entry is masked independently with probability `ratio`, so the
    masked fraction is only approximately `ratio`. The draws come from
    NumPy's global random state; call np.random.seed beforehand for
    reproducible masks.

    arr: spectrum to mask (NumPy array or pandas Series, expected to be 1-D).
    ratio: probability of masking each entry.
    value: sentinel written into the masked entries.

    The input is left untouched. When this function is used with
    DataFrame.apply, the row passed in can share memory with the source
    frame, so writing into it in place could overwrite the original spectra.
    """
    rand = np.random.rand(arr.shape[0])
    masked = arr.copy()
    masked[rand < ratio] = value
    return masked

In [None]:
# Seed first so the masks are reproducible, then mask the spectrum columns row by row
np.random.seed(24)
spe_m_df = spe_df.iloc[:, 1:2049].apply(mask, axis=1)
spe_m_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,0,0,0,99999999,99999999,0,0,99999999,99999999,99999999,...,0,99999999,99999999,99999999,0,0,0,99999999,99999999,99999999
1,0,0,0,0,0,0,99999999,0,0,0,...,99999999,99999999,99999999,0,99999999,99999999,0,0,99999999,99999999
2,0,99999999,0,0,0,99999999,0,99999999,0,0,...,99999999,0,99999999,0,0,0,99999999,99999999,0,0
3,0,0,0,99999999,0,99999999,99999999,99999999,99999999,0,...,0,0,99999999,0,99999999,99999999,0,99999999,99999999,99999999
4,0,0,0,0,0,99999999,0,99999999,0,99999999,...,99999999,99999999,0,99999999,0,99999999,0,0,99999999,0


In [None]:
# Deprecated export of the apply()-based result. The write stays disabled
# because this route does not save the masks themselves.
for row in spe_m_df.iterrows():
    #row[1].to_csv('data/masked/{}.csv'.format(row[0]), index=False, header=False)
    pass  # keeps the cell syntactically valid while the export is commented out

## Build annotations file
This file lists, for every spectrum, the name of its exported file (derived from the row index) together with the spectrum's metadata.

In [15]:
# Despite its name, 'dirname' holds the per-spectrum file name: "<row index>.csv"
spe_df['dirname'] = [f'{row_label}.csv' for row_label in spe_df.index]
spe_df['dirname'].head()

0    0.csv
1    1.csv
2    2.csv
3    3.csv
4    4.csv
Name: dirname, dtype: object

In [16]:
# List the columns to confirm that 'dirname' is now part of the frame
spe_df.columns

Index(['composite_id', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '2045', '2046', '2047', 'cps', 'core', 'composite_depth_mm',
       'section_depth_mm', 'filename', 'section', 'dirname'],
      dtype='object', length=2056)

In [17]:
# Preview the metadata columns that make up the annotations file (file name first)
spe_df[['dirname', 'composite_id', 'cps', 'core', 
       'composite_depth_mm', 'section_depth_mm', 
       'filename', 'section']]

Unnamed: 0,dirname,composite_id,cps,core,composite_depth_mm,section_depth_mm,filename,section
0,0.csv,SO264-15-2_00010,38706,SO264-15-2,10,10,SO264-15-2_0000 10.0mm 10s 10kV 150uA No-F...,0
1,1.csv,SO264-15-2_00020,43223,SO264-15-2,20,20,SO264-15-2_0000 20.0mm 10s 10kV 150uA No-F...,0
2,2.csv,SO264-15-2_00030,107525,SO264-15-2,30,30,SO264-15-2_0000 30.0mm 10s 10kV 150uA No-F...,0
3,3.csv,SO264-15-2_00040,127305,SO264-15-2,40,40,SO264-15-2_0000 40.0mm 10s 10kV 150uA No-F...,0
4,4.csv,SO264-15-2_00050,130242,SO264-15-2,50,50,SO264-15-2_0000 50.0mm 10s 10kV 150uA No-F...,0
...,...,...,...,...,...,...,...,...
1335,1335.csv,SO264-15-2_14360,144280,SO264-15-2,14360,880,SO264-15-2_1348 880.0mm 10s 10kV 150uA No-F...,13
1336,1336.csv,SO264-15-2_14370,56052,SO264-15-2,14370,890,SO264-15-2_1348 890.0mm 10s 10kV 150uA No-F...,13
1337,1337.csv,SO264-15-2_14380,42094,SO264-15-2,14380,900,SO264-15-2_1348 900.0mm 10s 10kV 150uA No-F...,13
1338,1338.csv,SO264-15-2_14390,34444,SO264-15-2,14390,910,SO264-15-2_1348 910.0mm 10s 10kV 150uA No-F...,13


In [18]:
# Columns of the annotations file. 'dirname' is listed first, so the file name
# ends up in the first column of the written CSV.
info_columns = [
    'dirname', 'composite_id', 'cps', 'core',
    'composite_depth_mm', 'section_depth_mm',
    'filename', 'section',
]
annotations = spe_df[info_columns]
annotations.to_csv(f'data/info_{date}.csv', index=False)

## Build dataset class 

In [19]:
import os
import pandas as pd
from torch.utils.data import Dataset

class CustomImageDataset(Dataset):
    """Map-style dataset that serves a masked spectrum, its original and the mask.

    Labels and transforms are not needed for now, so neither is supported.
    """

    def __init__(self, annotations_file, input_dir, target_dir, mask_dir):
        """
        annotations_file: CSV file whose first column holds each sample's file name
        input_dir: directory with masked spe files
        target_dir: directory with original spe files
        mask_dir: directory with boolean mask files
        """
        self.spe_info = pd.read_csv(annotations_file)
        self.input_dir = input_dir
        self.target_dir = target_dir
        self.mask_dir = mask_dir

    def __len__(self):
        """Number of samples, i.e. rows in the annotations file."""
        return self.spe_info.shape[0]

    def __getitem__(self, idx):
        """Load sample `idx` as a dict of three integer NumPy arrays.

        The same file name (first column of the annotations row) is looked up
        in each of the three directories.
        """
        file_name = self.spe_info.iloc[idx, 0]
        # Insertion order fixes both the load order and the key order of the result.
        sources = {
            'input_spe': self.input_dir,
            'target_spe': self.target_dir,
            'mask': self.mask_dir,
        }
        return {
            key: np.loadtxt(os.path.join(folder, file_name), delimiter=',', dtype=int)
            for key, folder in sources.items()
        }

# Play around

In [20]:
from torch import Generator
from torch.utils.data import random_split

# Read the annotations file through the same `date` stamp that names it when
# it is written. A hard-coded date would point at a stale or missing file
# whenever the notebook is rerun on another day.
dataset = CustomImageDataset(f'data/info_{date}.csv', 'data/masked', 'data/spe', 'data/mask')
# 80/20 train/test split; the seeded generator keeps the split reproducible.
data_train, data_test = random_split(dataset, [0.8, 0.2], generator=Generator().manual_seed(24))

print(len(dataset))
print(dataset[0])
print(len(data_train))
print(len(data_test))

1340
{'input_spe': array([       0,        0,        0, ..., 99999999, 99999999, 99999999]), 'target_spe': array([0, 0, 0, ..., 0, 0, 0]), 'mask': array([0, 0, 0, ..., 1, 1, 1])}
1072
268


In [26]:
# Fraction of masked channels in the first training sample
data_train[0]['mask'].sum()/data_train[0]['mask'].size

0.4228515625

In the mask, 0 marks a channel that is not masked and 1 marks a masked channel.

In [138]:
from torch.utils.data import DataLoader

# Draw one shuffled batch of 64 training samples and inspect what the loader returns
train_dataloader = DataLoader(data_train, batch_size=64, shuffle=True)
output = next(iter(train_dataloader))
print(output['input_spe'].size())
print(output)

torch.Size([64, 2048])
{'input_spe': tensor([[99999999,        0, 99999999,  ...,        0, 99999999, 99999999],
        [99999999, 99999999, 99999999,  ...,        0, 99999999,        0],
        [99999999, 99999999, 99999999,  ...,        0,        0,        0],
        ...,
        [99999999,        0,        0,  ..., 99999999, 99999999, 99999999],
        [99999999,        0,        0,  ..., 99999999,        0, 99999999],
        [       0,        0,        0,  ..., 99999999,        0, 99999999]]), 'target_spe': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'mask': tensor([[1, 0, 1,  ..., 0, 1, 1],
        [1, 1, 1,  ..., 0, 1, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 0, 0,  ..., 1, 1, 1],
        [1, 0, 0,  ..., 1, 0, 1],
        [0, 0, 0,  ..., 1, 0, 1]])}
