# Preprocess timepoint images

1. Parse image paths into dataframe with timepoints, experimental conditions and other variables
    - Export dataframe for plotting reference
2. Move images into directories structured by a metadata pattern
    - (e.g. 'experiment/condition/timepoint/')
3. TODO: Preprocess image tensors for machine learning input
- TODO: add option to exclude cell lines
- TODO: add option to include/exclude timepoints

In [1]:
pwd()

'Z:\\StephanieLam\\Cortical\\FeatureAnalysis\\COR37-GEDI'

In [2]:
# File parsing
import os
import csv
import glob
import shutil
# Data interpretation
import pandas as pd
import cv2
import matplotlib.pyplot as plt
# Logging
from IPython.display import clear_output

In [8]:
## Stage 1
# the notebook is expected to be opened from the analysis root directory
currentDirectory = os.getcwd()
# directory that receives imageMetadata.csv and the sorted image folders
# (previously derived as: currentDirectory + "/clustering/COR24+25+27/")
workingDirectory = "Z:\\StephanieLam\\Cortical\\FeatureAnalysis\\COR37-GEDI"
print(f'working directory: {workingDirectory}')

# directories that are searched recursively for input images
rootPaths = [
    "Z:\\StephanieLam\\Cortical\\FeatureAnalysis\\COR37-GEDI/ObjectCropsFiltered/KeepObjectCrops",
]
# only files ending in this extension are collected
imageExtension = ".tif"

# position (0-based) of each metadata field in the "_"-delimited image filename;
# the key order also determines the column order of the metadata dataframe
metadataIndices = {
    "experiment": 1,
    "well": 4,
    "timepoint": 2,
    "morphology channel": 6,
}
# plate map CSV for each experiment name (as it appears in the image filename)
plateMapPaths = {"COR37-GEDI": "platelayout.csv"}

## Stage 2
# target directory layout: one metadata name per directory level
# (experiment, timepoint, well, condition, morphology channel, or anything else imported from plate)
targetDirectoryPattern = os.sep.join(("experiment", "condition", "timepoint"))

working directory: Z:\StephanieLam\Cortical\FeatureAnalysis\COR37-GEDI


### Load plate maps & image paths

In [9]:
def parsePlateMapCSV(csvPath: str) -> dict:
    """
    Return a mapping of well IDs to sample IDs read from a plate map CSV.

    The first column of each row is used as the well ID and the second
    column as the sample ID. Rows whose first column contains 'Sci_WellID'
    (the header row) are skipped, as are blank rows and rows with fewer
    than two columns. If a well ID occurs more than once, the last row wins.

    Raises OSError if the file cannot be opened.
    """
    # newline='' lets the csv module do its own line-ending handling
    with open(csvPath, 'r', newline='') as rawPlateLayout:
        reader = csv.reader(rawPlateLayout)
        # map well ID to sample ID, excluding header and incomplete rows
        return {
            row[0]: row[1]
            for row in reader
            if len(row) >= 2 and 'Sci_WellID' not in row[0]
        }

# parse the plate map CSV of every configured experiment
plateMapsByExperiment = {}
# NOTE: `experiment` is deliberately a plain loop variable; it stays bound after the loop
for experiment, plateMapPath in plateMapPaths.items():
    # TODO: replace with plate class?
    plateMapsByExperiment[experiment] = parsePlateMapCSV(plateMapPath)

# recursively collect every file with the image extension below each root path
imagePaths = []
for imageDir in rootPaths:
    searchPattern = os.sep.join([imageDir, '**', '*' + imageExtension])
    imagePaths.extend(
        os.path.realpath(matchedPath)
        for matchedPath in glob.iglob(searchPattern, recursive=True)
    )
print(f'number of images: {len(imagePaths)}')

number of images: 2877


In [10]:
# display the well ID -> sample ID map for `experiment`, i.e. the loop variable
# left bound by the plate-map loading loop (the last experiment in plateMapPaths)
plateMapsByExperiment[experiment]

{'I01': 'CS2518n2',
 'I02': 'CS2518n2',
 'I03': 'CS2518n2',
 'I04': 'CS2518n2',
 'I05': 'CS2518n2',
 'I06': 'CS2518n2',
 'I07': 'CS2518n2',
 'I08': 'CS2518n2',
 'I09': 'CS2518n2',
 'I10': 'CS2518n2',
 'I11': 'CS2518n2',
 'I12': 'CS2518n2',
 'I13': 'HD53n5',
 'I14': 'HD53n5',
 'I15': 'HD53n5',
 'I16': 'HD53n5',
 'I17': 'HD53n5',
 'I18': 'HD53n5',
 'I19': 'HD53n5',
 'I20': 'HD53n5',
 'I21': 'HD53n5',
 'I22': 'HD53n5',
 'I23': 'HD53n5',
 'I24': 'HD53n5',
 'J01': 'CS2518n2',
 'J02': 'CS2518n2',
 'J03': 'CS2518n2',
 'J04': 'CS2518n2',
 'J05': 'CS2518n2',
 'J06': 'CS2518n2',
 'J07': 'CS2518n2',
 'J08': 'CS2518n2',
 'J09': 'CS2518n2',
 'J10': 'CS2518n2',
 'J11': 'CS2518n2',
 'J12': 'CS2518n2',
 'J13': 'HD53n5',
 'J14': 'HD53n5',
 'J15': 'HD53n5',
 'J16': 'HD53n5',
 'J17': 'HD53n5',
 'J18': 'HD53n5',
 'J19': 'HD53n5',
 'J20': 'HD53n5',
 'J21': 'HD53n5',
 'J22': 'HD53n5',
 'J23': 'HD53n5',
 'J24': 'HD53n5',
 'K01': 'CS2518n2',
 'K02': 'CS2518n2',
 'K03': 'CS2518n2',
 'K04': 'CS2518n2',
 'K05': 

In [11]:
# extract metadata from image names into dataframe
# a condition can only be looked up when both "experiment" and "well" are parsed,
# because the plate maps are indexed by experiment first and well second
hasCondition = "well" in metadataIndices and "experiment" in metadataIndices

# Collect one record per image and build the dataframe in a single step.
# Assigning into a row obtained via imageMetadata.loc[name, :] is not guaranteed
# to reach the dataframe (the row may be a copy), so no rows are mutated in place.
metadataRecords = []
for imageName in imagePaths:
    # delimit image name to have indexable metadata
    currentMetadata = os.path.basename(imageName).split("_")
    # IndexError here means the filename has fewer "_"-delimited fields than expected
    currentRow = {
        metadataType: currentMetadata[index]
        for metadataType, index in metadataIndices.items()
    }

    # add experimental condition from 2D plate map dict;
    # KeyError here means the experiment or well is missing from the plate maps
    if hasCondition:
        currentRow["condition"] = plateMapsByExperiment[currentRow["experiment"]][currentRow["well"]]
    metadataRecords.append(currentRow)

# column order: parsed metadata fields first, then the derived condition
metadataColumns = list(metadataIndices)
if hasCondition and "condition" not in metadataColumns:
    metadataColumns.append("condition")

imageMetadata = pd.DataFrame(
    metadataRecords,
    index=pd.Index(imagePaths, name="image path"),
    columns=metadataColumns,
)

# export dataframe for plotting reference
imageMetadata.to_csv(workingDirectory + "/imageMetadata.csv")


In [12]:
imageMetadata.head()

Unnamed: 0_level_0,experiment,well,timepoint,morphology channel,condition
image path,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Z:\StephanieLam\Cortical\FeatureAnalysis\COR37-GEDI\ObjectCropsFiltered\KeepObjectCrops\I01\PID20210410_COR37-GEDI_T2_0-1_I01_0_FITC_0_1_0_MN_10.tif,COR37-GEDI,I01,T2,FITC,CS2518n2
Z:\StephanieLam\Cortical\FeatureAnalysis\COR37-GEDI\ObjectCropsFiltered\KeepObjectCrops\I01\PID20210410_COR37-GEDI_T2_0-1_I01_0_FITC_0_1_0_MN_16.tif,COR37-GEDI,I01,T2,FITC,CS2518n2
Z:\StephanieLam\Cortical\FeatureAnalysis\COR37-GEDI\ObjectCropsFiltered\KeepObjectCrops\I01\PID20210410_COR37-GEDI_T2_0-1_I01_0_FITC_0_1_0_MN_18.tif,COR37-GEDI,I01,T2,FITC,CS2518n2
Z:\StephanieLam\Cortical\FeatureAnalysis\COR37-GEDI\ObjectCropsFiltered\KeepObjectCrops\I01\PID20210410_COR37-GEDI_T2_0-1_I01_0_FITC_0_1_0_MN_21.tif,COR37-GEDI,I01,T2,FITC,CS2518n2
Z:\StephanieLam\Cortical\FeatureAnalysis\COR37-GEDI\ObjectCropsFiltered\KeepObjectCrops\I01\PID20210410_COR37-GEDI_T2_0-1_I01_0_FITC_0_1_0_MN_23.tif,COR37-GEDI,I01,T2,FITC,CS2518n2


In [13]:
# directory levels below the working directory, one metadata column per level;
# empty parts (e.g. from a trailing separator in the pattern) are dropped
metadataByDirectory = [part for part in targetDirectoryPattern.split(os.sep) if part]

# move each image according to directory pattern
# Plain tuples (name=None) are used instead of named tuples: named tuples rename
# columns that are not valid identifiers (such as "morphology channel"), which
# would make attribute lookups by column name fail.
directoryRows = imageMetadata[metadataByDirectory].itertuples(index=True, name=None)
for i, (sourcePath, *directoryLevels) in enumerate(directoryRows):
    # the dataframe index holds the source image path
    fileName = sourcePath.split(os.sep)[-1]
    targetPath = os.sep.join([workingDirectory, *directoryLevels])
    print(f"moving image ({i+1}/{len(imageMetadata)}): {os.sep.join([*directoryLevels, fileName])}")
    # wait=True defers clearing until the next output arrives, giving a single updating progress line
    clear_output(wait=True)
    # skip images that already exist at the target or whose source file is gone
    # (makes the cell safe to re-run after a partial move)
    if not os.path.isfile(os.sep.join([targetPath, fileName])) and os.path.isfile(sourcePath):
        # reset the process umask so it does not mask permissions of new directories
        os.umask(0)
        os.makedirs(targetPath, exist_ok=True)
        shutil.move(sourcePath, targetPath)

moving image (2877/2877): COR37-GEDI\HD53n5\T2\PID20210410_COR37-GEDI_T2_0-1_P16_0_FITC_0_1_0_MN_8.tif
