In [1]:
from SpatialBiologyToolkit import preprocess

# Pre-processing of MCD files ready for denoising

### Extract .tiff stacks from MCD files
Put all the MCD files in the same directory. Within the output directory, each MCD file will have its own directory, with each ROI having its own stack of tiffs (i.e., one file per ROI) and an associated panel file which details the names and order of channels. Within the directory for the MCD file, there is also a log of errors (...errors.csv), and the metadata which includes the names of the ROIs (...meta.csv).

In [2]:
# Show the signature and docstring of export_mcd_folder (IPython `?` help), kept here as reference.
preprocess.export_mcd_folder?

[1;31mSignature:[0m
[0mpreprocess[0m[1;33m.[0m[0mexport_mcd_folder[0m[1;33m([0m[1;33m
[0m    [0mpath[0m[1;33m=[0m[1;34m'MCD_files'[0m[1;33m,[0m[1;33m
[0m    [0mexport_path[0m[1;33m=[0m[1;34m'tiff_stacks'[0m[1;33m,[0m[1;33m
[0m    [0mexport_panel[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mexport_meta[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mexport_errors[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Extracts every MCD file in a given directory into TIFF stacks, panel files, and a metadata table.

Parameters
----------
path : str or Path
    Path to the directory containing MCD files. Defaults to 'MCD_files'.
export_path : str or Path
    Path to the folder where to export the TIFF files. Defaults to 'tiff_stacks'.
export_panel : bool, optional
    If True, exports the panel files. Defaults to True.
export_meta : bool, optional
    If True, exports

In [3]:
# Extract every MCD file found in 'MCD_files' into per-ROI TIFF stacks
# (plus panel, metadata and error files) under 'tiff_stacks'.
preprocess.export_mcd_folder(
    path='MCD_files',
    export_path='tiff_stacks',
)

Exporting NF2_TMA4...


  warn(
  warn(
  warn(


Error in acquisition number 11: MCD file 'NF2_TMA4.mcd' corrupted: invalid acquisition image data size
1 errors encountered


### Unstack .tiffs into individual channel images
For denoising, and generally QCing the raw images, it's often easier to have the channels as individual images. This function will 'unstack' the stacks into individual images.

In [4]:
# Show the signature and docstring of unstack_tiffs (IPython `?` help), kept here as reference.
preprocess.unstack_tiffs?

[1;31mSignature:[0m
[0mpreprocess[0m[1;33m.[0m[0munstack_tiffs[0m[1;33m([0m[1;33m
[0m    [0minput_folder[0m[1;33m=[0m[1;34m'tiff_stacks'[0m[1;33m,[0m[1;33m
[0m    [0munstacked_output_folder[0m[1;33m=[0m[1;34m'tiffs'[0m[1;33m,[0m[1;33m
[0m    [0muse_panel_files[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0muse_metadata_file[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Unpack TIFF stacks into individual channel images with sensible names.

Parameters
----------
input_folder : str or Path, optional
    Folder containing the TIFF stacks. Defaults to 'tiff_stacks'.
unstacked_output_folder : str or Path, optional
    Folder where individual channel TIFFs will be saved. Defaults to 'tiffs'.
use_panel_files : bool, optional
    If True, use panel files created for each ROI. Defaults to True.
use_metadata_file : bool, optional
    If True, adds metadata for ROIs extracted from 

In [5]:
# Split each TIFF stack in 'tiff_stacks' into one image per channel under 'tiffs'.
# `channels_list` is reused by the QC and denoise-config cells further down.
(
    channel_df,
    channels_list,
    image_data,
    meta_data,
) = preprocess.unstack_tiffs(
    input_folder='tiff_stacks',
    unstacked_output_folder='tiffs',
)

Unpacking ROIs...


100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:37<00:00,  3.11s/it]

Metadata for MCD and ROIs saved to meta_data.csv in tiffs output folder

The following 56 channels were detected:

['ArAr80_80ArAr', 'Y89_89Y_SMA', 'In113_113In_CollagenV', 'In115_115In_IBA1', 'Sn120_120Sn', 'I127_127I', 'Xe131_131Xe', 'Cs133_133Cs', 'Xe134_134Xe', 'Ba138_138Ba', 'La139_139La_Ki67', 'Ce140_140Ce', 'Pr141_141Pr_S100B', 'Nd142_142Nd_Collagen1', 'Nd143_143Nd_Vimentin', 'Nd144_144Nd_CD14', 'Nd145_145Nd_NFKb', 'Nd146_146Nd_Neurofilament', 'Sm147_147Sm_Fibronectin', 'Nd148_148Nd_SOX10', 'Sm149_149Sm_LaminAC', 'Nd150_150Nd_CD74', 'Eu151_151Eu_CD31', 'Sm152_152Sm_Panlaminin', 'Eu153_153Eu_CD44', 'Sm154_154Sm_Aggrecan', 'Gd155_155Gd_HIF1a', 'Gd156_156Gd_CD4', 'Gd157_157Gd', 'Gd158_158Gd_TMEM119', 'Tb159_159Tb_CD49a', 'Gd160_160Gd_CollagenIV', 'Dy161_161Dy_Nidogen2', 'Dy162_162Dy_Versican', 'Dy163_163Dy_Glut1', 'Dy164_164Dy_Syndecan1', 'Ho165_165Ho_PD1', 'Er166_166Er_CollagentypeIII', 'Er167_167Er_PDL1', 'Er168_168Er_P2RY12', 'Tm169_169Tm_CA9', 'Er170_170Er_CD3', 'Yb171_171Yb_CD




### QC graphs for raw images
The best QC will always be manually reviewing the raw data. However, this function will plot metrics for all the different ROIs by PCA and heatmap. This is especially useful if you have hundreds of cores in a TMA, where there may be a couple of outliers that you can identify and then review manually.

In [6]:
# Show the signature and docstring of qc_heatmap (IPython `?` help), kept here as reference.
preprocess.qc_heatmap?

[1;31mSignature:[0m
[0mpreprocess[0m[1;33m.[0m[0mqc_heatmap[0m[1;33m([0m[1;33m
[0m    [0mdirectory[0m[1;33m=[0m[1;34m'tiffs'[0m[1;33m,[0m[1;33m
[0m    [0mquantile[0m[1;33m=[0m[1;36m0.95[0m[1;33m,[0m[1;33m
[0m    [0msave[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mchannels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mnormalize[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mfigsize[0m[1;33m=[0m[1;33m([0m[1;36m10[0m[1;33m,[0m [1;36m10[0m[1;33m)[0m[1;33m,[0m[1;33m
[0m    [0mdpi[0m[1;33m=[0m[1;36m75[0m[1;33m,[0m[1;33m
[0m    [0msave_dir[0m[1;33m=[0m[1;34m'qc_images'[0m[1;33m,[0m[1;33m
[0m    [0mdo_pca[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mannotate_pca[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mhide_figures[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Generate 

In [None]:
# Plot QC metrics across all ROIs for the channels detected during unstacking.
# figsize and dpi are spelled out (they match the defaults) so they are easy to tweak.
preprocess.qc_heatmap(
    channels=channels_list,
    normalize='zscore',
    figsize=(10, 10),
    dpi=75,
)

## Setup denoise config file
This function will create the config file that accompanies the denoising.py script that can be run on CSF3 (or locally, if you have a desktop with a GPU). 

### Deep SNF settings
An in-depth explanation of all the settings can be found on the IMC Denoise GitHub (https://github.com/PENGLU-WashU/IMC_Denoise), so I won't repeat here. However, here are some that I found useful to tweak:


**<font color='red'>train_batch_size </font>** This is the amount of data used for training in each iteration; increasing this will increase the speed overall, but you are limited by how much RAM your GPU has. The CSF3 free-at-point-of-access GPUs (V100s) have 16 GB of RAM, and I've found 200 works fine. If you are getting 'out of memory' errors you may need to reduce this to work on a smaller GPU. 

**<font color='red'>patch_step_size </font>** This is the frequency (in pixels) at which patches are taken from the dataset for training, e.g. 100 would mean that every 100 pixels a patch is taken for training from each image. In smaller datasets (e.g. fewer than 10 ROIs), the default of 50 should be fine, but as your dataset gets bigger (hundreds of ROIs), you will have to increase this to 100-200. There is an upper ceiling on how much training data actually benefits the performance, though there's not an easy way to identify the perfect number. If you have a channel with very few cells or sparse staining, you may also want to decrease this a lot, potentially to <50.

**<font color='red'>is_load_weights </font>** Once the training part of the denoise is done, then you can reuse the previously saved weights, which massively speeds things up.

**<font color='red'>n_neighbours, n_iter, window_size</font>** Have a look at the documentation for what these do. Rarely, if a channel is being particularly awkward and won't denoise, I may change these +/- 1 or 2.

**<font color='red'>train_epochs</font>** The number of epochs (iterations over the training data) used for training. We have had good results with 50 previously, as there is usually convergence at this point, but I think 100+ is probably advisable. This will have a big impact on the length of the processing.

### Selecting channels
It is important that you either edit the `channels_list` in Python or the created .json file to remove any channels that do not require denoising, or are empty channels (e.g. Argon). If you need to redo channels with tweaked settings, you can easily edit this file whilst it's on the CSF.

In [7]:
# These parameters worked fine for me on CSF3 on a V100 GPU with a small-to-medium
# sized dataset. With a bigger dataset, tweak patch_step_size first.
deep_snf_parameters = dict(
    train_epochs=100,          # number of training epochs; has a big impact on run time
    patch_step_size=100,       # spacing (in pixels) between training patches
    train_batch_size=200,      # limited by GPU RAM; reduce on 'out of memory' errors
    pixel_mask_percent=0.15,   # see the IMC_Denoise documentation
    is_load_weights=False,     # set True to reuse previously saved weights
    n_neighbours=4,            # see the IMC_Denoise documentation
    n_iter=3,                  # see the IMC_Denoise documentation
    window_size=3,             # see the IMC_Denoise documentation
)

# Write the JSON config that accompanies the denoising script.
preprocess.create_denoise_config(
    config_filename='denoise_config.json',
    raw_directory='tiffs',
    processed_output_dir='processed',
    channels=channels_list,
    parameters=deep_snf_parameters,
)

Configuration file 'denoise_config.json' created successfully.


# Denoising on the CSF
We use the Python package IMC Denoise for denoising. It’s a slightly annoying package to get working, mostly because it hasn’t been updated from some very specific version of Python, tensorflow and keras. It does work on a desktop PC with a GPU, but easiest to run on the CSF. The code for running the denoising can be found in **<font color='red'>denoise.py</font>**.


## Accessing CSF
To access the CSF you’ll need to contact Research IT and request an account. I had to wait a long time for the training course. The free-at-point-of-access accounts are available to everyone free of charge, though you also need to stipulate GPU access, which allows you access to a small selection of free GPUs.

## Installing miniconda
This first step may not be required, as you may be able to install packages whilst on the login node, but to be safe I accessed one of the nodes.

`qrsh -l short -l mem512`

Download and install the latest version of Miniconda. At some point it will ask if you want to modify the bash so that conda starts automatically, select yes for this.

`wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh`

`bash Miniconda3-latest-Linux-x86_64.sh`

Load conda, this should happen automatically when you relogin at later dates.

`source ~/.bashrc`

These next steps are all taken directly from the IMC_Denoise GitHub (https://github.com/PENGLU-WashU/IMC_Denoise/tree/main/IMC_Denoise)

`conda create -n IMC_Denoise python=3.6`

`conda activate IMC_Denoise`

`pip install tensorflow==2.2.0 keras==2.3.1`

`conda install -c anaconda cudnn=7.6.5 cudatoolkit=10.1.243`

This will download the latest code for IMC_Denoise directly from their GitHub, there is no version control so I can’t guarantee anything will work in the future.

`git clone https://github.com/PENGLU-WashU/IMC_Denoise.git`

`cd IMC_Denoise`

`pip install -e .`

Finally, we need to install pandas. By doing this last we should avoid installing the newer versions of pandas and numpy that will cause compatibility issues.

`conda install pandas`

## Submitting the denoise job on the CSF
The code for running the denoise is in **<font color='red'>denoise.py</font>**. I have a Jupyter Notebook for extracting the MCD files into **<font color='red'>.tiff files (into organised directories)</font>**, and generating **<font color='red'>denoise_config.json</font>**, which will supply the settings for the denoising. These files (denoise.py, .tiff file folders, and denoise_config.json) are all uploaded onto the CSF. We can then submit a CSF3 job, as follows:
#### *denoising_job.txt:*
```
#! /bin/bash --login
#$ -cwd

#$ -N imc_denoising
#$ -l v100

echo "Denoising job is using $NGPUS GPU(s) with ID(s) $CUDA_VISIBLE_DEVICES and $NSLOTS CPU core(s)"

conda activate IMC_Denoise
python denoising.py
```

Back on the CSF3 command line…

`qsub denoising_job.txt`

# QC denoised images
Once you've processed the images, and you've copied them back off the CSF, this function will plot a side-by-side of the before and after denoising for each ROI and channel. When reviewing these, look carefully at the scales, as they will be different for each image.

In [8]:
# Show the signature and docstring of qc_check_side_by_side (IPython `?` help), kept here as reference.
preprocess.qc_check_side_by_side?

[1;31mSignature:[0m
[0mpreprocess[0m[1;33m.[0m[0mqc_check_side_by_side[0m[1;33m([0m[1;33m
[0m    [0mchannels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcolourmap[0m[1;33m=[0m[1;34m'jet'[0m[1;33m,[0m[1;33m
[0m    [0mdpi[0m[1;33m=[0m[1;36m75[0m[1;33m,[0m[1;33m
[0m    [0msave[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0msave_dir[0m[1;33m=[0m[1;34m'qc_images'[0m[1;33m,[0m[1;33m
[0m    [0mhide_images[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mraw_directory[0m[1;33m=[0m[1;34m'tiffs'[0m[1;33m,[0m[1;33m
[0m    [0mprocessed_output_dir[0m[1;33m=[0m[1;34m'processed'[0m[1;33m,[0m[1;33m
[0m    [0mquiet[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compare raw and processed images side by side for quality control.

Parameters
----------
channels : list of str, optional
    List of channels to process. Defaults to No

In [None]:
# Plot raw vs denoised images side by side for every ROI and channel.
# The denoised images are read from 'processed', the processed_output_dir written
# into denoise_config.json earlier in this notebook. Pointing both directories at
# 'tiffs' would compare the raw images with themselves.
preprocess.qc_check_side_by_side(
    channels=channels_list,
    raw_directory='tiffs',
    processed_output_dir='processed',
)