## Connect to Google Drive and install packages 

In [None]:
#run once per session
from google.colab import drive
drive.mount('/content/drive')

!pip install backports.cached-property
!pip install imagecodecs

## Import packages

In [None]:
import sys
# Add the Drive folder that holds the nucID packages to the import path.
# This must run before the `nucid` import below, and requires Drive to be mounted.
sys.path.append('/content/drive/MyDrive/Colab/nucID_pkgs')
import glob
# Helper functions used by the sections of this notebook.
from nucid.Training_functions import FilterMask, MakeTrainingData, MakeValData, CheckBB, FindBalanceData, BalanceData

# **Filter Mask Files**

Sometimes (especially in 4x images) segmentation of nuclei can be noisy, generating objects that are clearly not cells. These objects can sometimes be removed using simple size exclusion, which is implemented by the function below.

In [None]:
# Paths to the mask TIF files to filter (one list entry per mask file).
MASKS = ['/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/20220829_151508_481__Plate000_Well1_ChannelDAPI_Seq0000_tissue_mask.tif']
# Minimum allowed area for a mask.
min_area = 0
# Maximum allowed area for a mask.
max_area = 300

# Set this to True if you would like to check the results of filtering
# (NOTE: this uses a lot of RAM if there are a lot of cells and can crash the session).
check_mask = True
# Paths to the images matching MASKS (only needed if check_mask is True).
IMAGES = ['/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/20220829_151508_481__Plate000_Well1_ChannelDAPI_Seq0000.tif']

# Accept a single path given as a plain string as well as a list of paths.
# A bare string has no .sort(), and len()/indexing would operate on characters.
if isinstance(MASKS, str):
  MASKS = [MASKS]
if isinstance(IMAGES, str):
  IMAGES = [IMAGES]

# Sort both lists so that masks and images pair up by position.
IMAGES.sort()
MASKS.sort()
print(IMAGES)
print(MASKS)

if len(MASKS) == len(IMAGES):
  # One image per mask: pass the matching image along with the check_mask flag.
  for mask_path, image_path in zip(MASKS, IMAGES):
    FilterMask(mask_path,min_area,max_area,image_path,check_mask)
else:
  # Masks and images cannot be paired, so filter without an image and without checking.
  print('Number of images does not match number of masks; filtering without checking results.')
  for mask_path in MASKS:
    FilterMask(mask_path,min_area,max_area,None,False)

# **Generate Training Data**

This section takes in a list of images and masks and generates training data in the correct format for training NucID.

In [None]:
# ---- Settings to edit ----
# Images to use as training data. The list is sorted so that it lines up
# position-by-position with MASKS.
TIFS = sorted(glob.glob('/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/*[0-9].tif'))
# Masks for the images above. Mask names must be similar enough to the image
# names that sorting orders both lists the same way; otherwise replace these
# two assignments with hand-written lists of paths in the correct order.
MASKS = sorted(glob.glob('/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/*_filtered_mask.tif'))
# Folder where the training data will be written.
OutPath = '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData_320_upSize2'
# Channel that contains the nuclear images.
NuclearChannel = 1
# Number of image/label pairs to include in the validation data set.
NumberOfValidation = 0


## YOU SHOULD NOT NEED TO CHANGE SETTINGS BELOW THIS UNLESS YOU ARE LOOKING FOR "ADVANCED" OPTIONS
# Tile size to train on (should match the tile size fed to the model) and the
# overlap between tiles (.1 is usually good).
tileSize, overlap = 320, .1

# Set to True to increase smaller tiles to 640x640.
upSize = True

## NO MORE VARIABLES TO CHANGE
# Show both lists so the image/mask pairing can be checked by eye.
print(TIFS)
print(MASKS)

['/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/20220829_151508_481__Plate000_Well1_ChannelDAPI_Seq0000.tif', '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/20220829_151508_481__Plate000_Well3_ChannelDAPI_Seq0002.tif', '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/Plate2022_08_11_DLS032_Plate2_Well5_ChannelDAPI_Seq0004.tif']
['/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/20220829_151508_481__Plate000_Well1_ChannelDAPI_Seq0000_tissue_min0_max300_filtered_mask.tif', '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/20220829_151508_481__Plate000_Well3_ChannelDAPI_Seq0002_tissue_min0_max300_filtered_mask.tif', '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/Plate2022_08_11_DLS032_Plate2_Well5_ChannelDAPI_Seq0004_tissue_min0_max300_filtered_mask.tif']


In [None]:
# Generate training data: one MakeTrainingData call per image/mask pair.
# Pairing is by position, so the two lists must be the same length; otherwise
# the loop would hit an IndexError or silently pair images with the wrong masks.
if len(TIFS) != len(MASKS):
  raise ValueError('Found ' + str(len(TIFS)) + ' images but ' + str(len(MASKS)) + ' masks; each image needs exactly one mask.')

for i, (tif_path, mask_path) in enumerate(zip(TIFS, MASKS)):
  # The file index i is passed through to MakeTrainingData unchanged.
  MakeTrainingData(tif_path,mask_path,OutPath,NuclearChannel,NumberOfValidation,tileSize,overlap,i,upSize)
  print('processed file: ' + tif_path)


# Take a subset of the data generated above and use it for validation.
base_path = OutPath + "/TrainingData"
MakeValData(base_path,NumberOfValidation)

processed file: /content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/20220829_151508_481__Plate000_Well1_ChannelDAPI_Seq0000.tif
processed file: /content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/20220829_151508_481__Plate000_Well3_ChannelDAPI_Seq0002.tif
processed file: /content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/Images_masks/Plate2022_08_11_DLS032_Plate2_Well5_ChannelDAPI_Seq0004.tif


In [None]:
# Move a subset of an existing training set into validation.
# NOTE(review): this path ends in '_upSize' while OutPath in the settings cell ends
# in '_upSize2' — confirm this is the data set you intend to split.
base_path = '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData_320_upSize/TrainingData'
# Number of image/label pairs to use for validation (overrides the earlier value of this global).
NumberOfValidation = 50
MakeValData(base_path,NumberOfValidation)

# **Check Training Data**

Before starting to train your model, you may want to check that your images and labels are correct. This section takes in an example tile and label from your training data and plots the ground-truth bounding boxes around nuclei.

In [None]:
# Path to the image tile the bounding boxes were generated for.
img_path = '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/TrainingData/val/images/020220829_151508_481__Plate000_Well1_ChannelDAPI_Seq0000_7_7.tif'
# Path to the bounding box file for that same tile (bounding boxes must be in YOLO format).
# The file name matches img_path, with a .txt extension under the labels folder.
bb_path = '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData/TrainingData/val/labels/020220829_151508_481__Plate000_Well1_ChannelDAPI_Seq0000_7_7.txt'

# Display the tile with its bounding boxes for a visual check.
CheckBB(img_path,bb_path,label_size=.5)

# **Balance Training Data**

If your data set has a lot of empty tiles or many very dense tiles, this may bias the model toward over- or under-counting nuclei. To avoid this, you can try balancing the data. This function tries to represent images with different numbers of bounding boxes more evenly.

In [None]:
# Paths to the label files of the data you want to balance.
PATHS = glob.glob('/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData_320_upSize_minmax/TrainingData/train/labels/*.txt')
# Number of different bins to split the files into.
# NOTE(review): presumably bins are by number of bounding boxes per file — confirm in FindBalanceData.
NumberOfBins = 4000
# Number of files to take per bin.
SamplesPerBin = 20

# Get the subset of paths selected by the binning; used by the next cell.
sub_PATHS = FindBalanceData(PATHS,NumberOfBins,SamplesPerBin)

The number of training files selected is: 2697


In [None]:
# Move the selected files (sub_PATHS from the previous cell) to the balanced data set.
# Requires the previous cell to have been run in this session.
BalanceData(sub_PATHS)

In [None]:
# Take a subset of the balanced data generated above and use it for validation.
base_path = '/content/drive/MyDrive/Guillaume_Shared/NucID/4X_trainingData_320_upSize_minmax/Balanced'
# Number of image/label pairs to use for validation.
NumberOfValidation = 100

MakeValData(base_path,NumberOfValidation)

# **Train your model**

Now that all the data is set up, train the model on your prepared data set.


In [None]:
# Train the model.
# Change into the yolov7 folder so the relative paths below (train.py, data/, cfg/) resolve.
%cd /content/drive/MyDrive/Colab/nucID_pkgs/yolov7
# Start from the '10X_ms.pt' weights; the run is named 'tenXbalancedMscale_dense'.
# NOTE(review): the data file is data/Nuclei_10X.yaml while the sections above prepare
# 4X data — confirm the yaml points at the data set you prepared.
!python train.py --device 0 --batch-size 16 --epochs 50 --img 640 --data data/Nuclei_10X.yaml --hyp data/hyp.scratch.custom.yaml --cfg cfg/training/yolov7_nuc_cfg.yaml --weights '10X_ms.pt' --name tenXbalancedMscale_dense --multi-scale 

In [None]:
# Restart training if it crashes.
%cd /content/drive/MyDrive/Colab/nucID_pkgs/yolov7
# 'path/to/last.pt' is a placeholder: replace it with the last checkpoint of the run to resume.
# NOTE(review): batch size, epochs, --data, --cfg and --name here differ from the
# training cell above — confirm they match the run being resumed.
!python train.py --device 0 --batch-size 8 --epochs 250 --img 640 --data data/custom_data.yaml --hyp data/hyp.scratch.custom.yaml --cfg cfg/training/yolov7-custom.yaml --name yolov7-custom-transfer-allData --multi-scale --resume 'path/to/last.pt'

In [None]:
# Take a subset of the 10X training data and use it for validation (70 image/label pairs).
# NOTE(review): this cell sits after the training cells; if the split is meant to
# feed training, run it before them.
base_path = '/content/drive/MyDrive/Guillaume_Shared/NucID/10X_trainingData/TrainingData'
MakeValData(base_path,70)

In [None]:
!/content/drive/MyDrive/Guillaume_Shared/NucID/10X_trainingData/TrainingData/train/images/*_Seq0002_0_0.tif

/bin/bash: /content/drive/MyDrive/Guillaume_Shared/NucID/10X_trainingData/TrainingData/train/images/*_Seq0002_0_0.tif: No such file or directory


In [None]:
!/content/drive/MyDrive/Guillaume_Shared/NucID/10X_trainingData/TrainingData/train/images/*_Seq0002_0_0.txt


/bin/bash: /content/drive/MyDrive/Guillaume_Shared/NucID/10X_trainingData/TrainingData/train/images/*_Seq0002_0_0.txt: No such file or directory


In [None]:
!/content/drive/MyDrive/Guillaume_Shared/NucID/10X_trainingData/TrainingData/train/labels/1020220713_111234_109__Plate000_Well2_ChannelDAPI_Seq0001_42_4.txt

/bin/bash: /content/drive/MyDrive/Guillaume_Shared/NucID/10X_trainingData/TrainingData/train/labels/1020220713_111234_109__Plate000_Well2_ChannelDAPI_Seq0001_42_4.txt: Permission denied


In [None]:
# NOTE(review): leftover scratch cell — a bare shell escape with no command; safe to delete.
!