### Make Training Data (Movies/Images)

In [None]:
# This cell is not currently used

import os

# Collect every directory in the working directory whose name contains 'set'.
# Plain files are skipped because each match is passed to os.listdir() below.
setlst = os.listdir('./')
all_sets = [
    term for term in setlst
    if 'set' in term and os.path.isdir(os.path.join('.', term))
]

# The loop variable is called set_name (not `set`) so the builtin set type
# is not shadowed for the rest of the notebook session.
for set_name in all_sets:
    temp = os.listdir(os.path.join('.', set_name))
    direc_name = os.path.join('.', set_name, 'movie')
    output_path = os.path.join('.', set_name, 'final')
    # Without an 'annotations' entry, every entry of the set directory is
    # reported (presumably these are the set's parts - TODO confirm).
    partslst = []
    if 'annotations' not in temp:
        partslst = os.listdir(os.path.join('.', set_name))
    print(partslst)
    
    

In [4]:
"""
make_training_data_tracking.py - for multiple sets of data with multiple parts and montages

Executing functions for creating npz files containing the training data
Functions will create training data for either
    - Patchwise sampling
    - Fully convolutional training of single image conv-nets
    - Fully convolutional training of movie conv-nets

Files should be placed in training directories with each separate
dataset getting its own folder

@author: David Van Valen
"""

"""
Import packages
"""
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import glob
import os
import pathlib
import skimage as sk
import scipy as sp
from scipy import ndimage
from skimage import feature
from skimage import morphology as morph
from skimage.transform import resize
from sklearn.utils import class_weight
from deepcell import get_image
from deepcell import make_training_data
# from deepcell import format_coord as cf

# Load data
direc_name = '/data/data/cells/3T3/NIH/set2/part_2/movie'
output_directory = '/data/npz_data/cells/3T3/NIH/movie'
file_name_save = os.path.join(output_directory, 'nuclear_movie_3T3_S2P2_same.npz')
# Training directories are organized according to location within an image
num_x = 7  # Number of horizontal samples
num_y = 7  # Number of vertical samples
# Some movies/montages/samples do not contain cells or contain annotation errors
samples_to_drop = ['03_06', '04_02', '05_06', '06_06']
#samples_to_drop = []
# Build list of possible training directories (excluding those to be dropped).
# Indices are zero-padded to two digits so the names keep the same 'NN_NN'
# form as the entries of samples_to_drop even if num_x / num_y exceed 10.
training_direcs = [
    '{:02d}_{:02d}'.format(i, j)
    for i in range(num_x)
    for j in range(num_y)
    if '{:02d}_{:02d}'.format(i, j) not in samples_to_drop
]
channel_names = [""]  # Commonality in raw filenames

# Create output directory, if necessary
pathlib.Path(output_directory).mkdir(parents=True, exist_ok=True)

# Create the training data
make_training_data(
    direc_name=direc_name,
    file_name_save=file_name_save,
    channel_names=channel_names,
    dimensionality=3,
    training_direcs=training_direcs,
    raw_image_direc="raw",
    annotation_direc="annotated",
    annotation_name="",
    border_mode="same",
    output_mode="conv",
    num_frames=30,
    reshape_size=None,
    verbose=True)


In [8]:
"""
make_training_data_tracking.py - for a single directory of data

Import packages
"""
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import glob
import os
import pathlib
import skimage as sk
import scipy as sp
from scipy import ndimage
from skimage import feature
from skimage import morphology as morph
from skimage.transform import resize
from sklearn.utils import class_weight
from deepcell import get_image
from deepcell import make_training_data

# Where the raw/annotated movies live and where the compiled npz is written
direc_name = '/data/data/cells/HeLa/S3/set0/HeLaTrackingTests'
output_directory = '/data/data/cells/HeLa/S3/set0/HeLaTrackingTests/'
file_name_save = os.path.join(output_directory, 'nuclear_movie_HeLa_set0_large.npz')

# A single training directory is used for this dataset
training_direcs = ['set0']
# Substring shared by the raw filenames
channel_names = [""]

# Make sure the destination exists before anything is written to it
pathlib.Path(output_directory).mkdir(parents=True, exist_ok=True)

# Compile the movie training data into file_name_save
make_training_data(
    direc_name=direc_name,
    file_name_save=file_name_save,
    channel_names=channel_names,
    dimensionality=3,
    training_direcs=training_direcs,
    raw_image_direc="raw",
    annotation_direc="annotated",
    annotation_name="",
    output_mode="conv",
    num_frames=30,
    reshape_size=None,
    verbose=True,
)


In [9]:
# to compile multiple sets together

import numpy as np
import os

base_path = '/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_'
#num_of_sets = 2
set_list = ['S0P1','S2P2']

# Collect the pieces of the final training data. The zero-length seed arrays
# pin the expected per-movie shape (a mismatching archive makes the
# concatenation fail) and take part in the result dtype exactly as when
# they were used as the running accumulator.
X_parts = [np.empty((0, 30, 154, 182, 1))]
y_parts = [np.empty((0, 30, 154, 182, 1))]
for movie in set_list:
    path = base_path + str(movie) + '_same.npz'
    # The context manager closes the archive's file handle once both arrays
    # have been read into memory.
    with np.load(path) as data:
        print(data.keys())
        X_to_load, y_to_load = data['X'], data['y']
    print('X Shape:', X_to_load.shape)
    print('y Shape:', y_to_load.shape)
    X_parts.append(X_to_load)
    y_parts.append(y_to_load)

# Concatenate once along the batch axis instead of re-copying the growing
# array on every iteration.
X_full = np.concatenate(X_parts, axis=0)
y_full = np.concatenate(y_parts, axis=0)

# Save the result to a new npz
output_directory = '/data/npz_data/cells/3T3/NIH/movie/'
file_name_save = os.path.join(output_directory, 'nuclear_movie_3T3_S0P1andS2P2_same.npz')

np.savez(file_name_save, X=X_full, y=y_full)


['y', 'X']
X Shape: (44, 30, 154, 182, 1)
y Shape: (44, 30, 154, 182, 1)
['y', 'X']
X Shape: (45, 30, 154, 182, 1)
y Shape: (45, 30, 154, 182, 1)


In [10]:
# Verify the result: reload the combined 3T3 archive and report its contents.
# Each array is read from the archive once, and the file handle is released
# when the with-block exits.
with np.load('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_S0P1andS2P2_same.npz') as data:
    X_to_load, y_to_load = data['X'][()], data['y'][()]
    print(data.keys())

data_readable_X, data_readable_y = X_to_load, y_to_load
print('X Shape:', data_readable_X.shape)
print('y Shape:', data_readable_y.shape)

['y', 'X']
X Shape: (89, 30, 154, 182, 1)
y Shape: (89, 30, 154, 182, 1)


In [6]:
# Verify the result: reload the HeLa archive and report its contents.
# Each array is read from the archive once, and the file handle is released
# when the with-block exits.
with np.load('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same.npz') as data:
    X_to_load, y_to_load = data['X'][()], data['y'][()]
    print(data.keys())

data_readable_X, data_readable_y = X_to_load, y_to_load
print('X Shape:', data_readable_X.shape)
print('y Shape:', data_readable_y.shape)

['y', 'X']
X Shape: (180, 40, 216, 256, 1)
y Shape: (180, 40, 216, 256, 1)


In [50]:
# Test combining multiple different cell types/acquisition parameters (tested on HeLa + 3T3).
# These datasets differ in size (number of frames and pixel dimensions); to compensate, the smaller movies are zero-padded.

import os
import numpy as np

# Each archive is read from base_path + <name> + '_same.npz'.
# NOTE(review): base_path points at the 3T3 directory, so the 'hela0-7'
# archive is expected there as well - confirm a copy exists at that location.
base_path = '/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_'
list_to_comb = ['3T3_S0P1andS2P2','hela0-7']

#path = os.path.join(base_path + str(list_to_comb[0]) + '_same.npz')
#data = np.load(path)

# Simple container pairing the raw data of a dataset with its labels
class Dataset_Xy:
    """Hold the raw data (``X_to_load``) and labels (``y_to_load``) of one dataset.

    Both attributes are plain, reassignable instance attributes.
    """

    def __init__(self, X_to_load, y_to_load):
        self.X_to_load, self.y_to_load = X_to_load, y_to_load
#datasets = dict([ (d.name, d.X_to_load, d.y_to_load) for d in datasets ])

# Define a normalization function for the raw images that can be run before padding
def image_norm(original_image):
    """Return ``original_image`` shifted to zero mean and scaled to unit variance.

    NNs prefer input data that is 0 mean and unit variance. The mean and
    standard deviation are computed over the whole array.

    Args:
        original_image: array-like of numbers.

    Returns:
        The normalized array. When the input has zero standard deviation
        (for example a constant or blank frame) only the mean is removed,
        so the result is all zeros rather than NaN from a 0/0 division.
    """
    centered = original_image - np.mean(original_image)
    spread = np.std(original_image)
    if spread == 0:
        return centered
    return centered / spread

# Load each movie and track the largest frame count / image size for padding
max_x = 0
max_y = 0
max_frames = 0
datasets = {}
for dataset in list_to_comb:
    path = base_path + str(dataset) + '_same.npz'
    # Read both arrays into memory, then release the archive's file handle
    with np.load(path) as data:
        d = Dataset_Xy(data['X'], data['y'])
    datasets[dataset] = d
    max_frames = max(max_frames, d.X_to_load.shape[1])
    max_x = max(max_x, d.X_to_load.shape[2])
    max_y = max(max_y, d.X_to_load.shape[3])

# Instantiate arrays to hold the final training data
X_full = np.zeros((0, max_frames, max_x, max_y, 1))
y_full = np.zeros((0, max_frames, max_x, max_y, 1))

print(X_full.shape)
print(y_full.shape)

# Norm images and pad smaller movies in prep for combining - then combine
X_parts = [X_full]
y_parts = [y_full]
for dataset in list_to_comb:
    dtl = datasets[dataset]
    # The normalized values are written back in place; integer storage would
    # silently truncate them, so non-float data is converted first.
    if not np.issubdtype(dtl.X_to_load.dtype, np.floating):
        dtl.X_to_load = dtl.X_to_load.astype('float64')
    # Normalize the raw images (channel 0 of every frame, one frame at a time)
    for batch in range(dtl.X_to_load.shape[0]):
        for frame in range(dtl.X_to_load.shape[1]):
            dtl.X_to_load[batch, frame, :, :, 0] = image_norm(dtl.X_to_load[batch, frame, :, :, 0])
    # Image padding: frames are appended at the end of the movie; rows and
    # columns are split between both sides. When the difference is odd the
    # extra line goes after the image, so the padded size always reaches the
    # maximum and the concatenation below cannot fail on a one-pixel gap.
    n_frames, n_x, n_y = dtl.X_to_load.shape[1:4]
    x_before = (max_x - n_x) // 2
    y_before = (max_y - n_y) // 2
    padding = (
        (0, 0),
        (0, max_frames - n_frames),
        (x_before, max_x - n_x - x_before),
        (y_before, max_y - n_y - y_before),
        (0, 0),
    )
    if any(before or after for before, after in padding):
        # Labels get the same padding as the raw data
        # (assumes X and y share a shape - TODO confirm).
        dtl.X_to_load = np.pad(dtl.X_to_load, padding, mode='constant', constant_values=0)
        dtl.y_to_load = np.pad(dtl.y_to_load, padding, mode='constant', constant_values=0)
    # Add to final training data
    X_parts.append(dtl.X_to_load)
    y_parts.append(dtl.y_to_load)

X_full = np.concatenate(X_parts, axis=0)
y_full = np.concatenate(y_parts, axis=0)

print(X_full.shape)
print(y_full.shape)

# Save the result to a new npz
output_directory = '/data/npz_data/cells/3T3/NIH/movie/'
file_name_save = os.path.join(output_directory, 'nuclear_movie_3T3_and_HeLa.npz')

np.savez(file_name_save, X=X_full, y=y_full)

(0, 40, 216, 256, 1)
(0, 40, 216, 256, 1)
(269, 40, 216, 256, 1)
(269, 40, 216, 256, 1)


In [47]:
# Review data if necessary
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

def get_js_video(images, batch=0, channel=0):
    """Render one movie from ``images`` as an interactive JavaScript animation.

    Args:
        images: array indexed as ``images[batch, frame, :, :, channel]``;
            the number of frames is taken from ``images.shape[1]``.
        batch: index of the movie to display.
        channel: index of the channel to display.

    Returns:
        An ``IPython.display.HTML`` object wrapping the animation's JS/HTML.
    """
    fig = plt.figure()
    # One single-artist frame per time point; the colour range is fixed to
    # 0..15 for every frame.
    ims = [
        [plt.imshow(images[batch, i, :, :, channel], animated=True,
                    cmap='cubehelix', vmin=0, vmax=15)]
        for i in range(images.shape[1])
    ]
    # Build the animation once, after all frames are collected, instead of
    # re-creating it on every loop iteration.
    ani = animation.ArtistAnimation(fig, ims, interval=75, repeat_delay=1000)
    # Close this figure explicitly so the notebook does not also render it
    # as a static image.
    plt.close(fig)
    return HTML(ani.to_jshtml())

# Display the first movie of y_full as an animation (rendered as the cell output)
get_js_video(y_full, batch=0)

In [51]:
# Reload the archive that was just written and report what it contains
data = np.load(file_name_save)
print(data.keys())
data_readable_X = data['X'][()]
data_readable_y = data['y'][()]
print('X Shape:', data_readable_X.shape)
print('y Shape:', data_readable_y.shape)

['y', 'X']
X Shape: (269, 40, 216, 256, 1)
y Shape: (269, 40, 216, 256, 1)


### Make Training Data (Division Information)

In [21]:
import os
import sys
import numpy as np
import pandas as pd
import scipy as sp

csv_path = '/data/npz_data/cells/3T3/NIH/movie/divisions-3T3.csv'

# Open .csv file containing hand-curated cell division data
divisions_csv = pd.read_csv(csv_path)

# Convert missing (NaN) entries to 0, so that later cells can test for
# "no entry" with a plain comparison against 0
divisions_csv = divisions_csv.fillna(0)

In [22]:
# Drop every row whose montage was flagged as "dont use" for training
throw_away_indices = divisions_csv[r'dont use (true)'] == True
keep_indices = ~throw_away_indices
divisions_csv = divisions_csv[keep_indices]

# Show the remaining rows (up to 6201) as the cell output
divisions_csv.head(6201)

Unnamed: 0,number,cell_type,set,part,montage,label,daughter,frame_div (0-index),no cells (true),dont use (true),Notes
0,1,3T3,0,1,00_00,1,0,0.0,0.0,0,0
1,1,3T3,0,1,00_00,2,0,0.0,0.0,0,0
2,1,3T3,0,1,00_00,3,0,0.0,0.0,0,0
3,1,3T3,0,1,00_00,4,56,8.0,0.0,0,0
4,1,3T3,0,1,00_00,5,0,0.0,0.0,0,0
5,1,3T3,0,1,00_00,6,0,0.0,0.0,0,0
6,1,3T3,0,1,00_00,7,89,17.0,0.0,0,0
7,1,3T3,0,1,00_00,8,0,0.0,0.0,0,0
8,1,3T3,0,1,00_00,9,0,0.0,0.0,0,0
9,1,3T3,0,1,00_00,10,0,0.0,0.0,0,0


In [23]:
def division_per_montage(set_num, montage, max_cells=31):
    """Build the per-cell daughter lists for one montage of one set.

    Reads the module-level ``divisions_csv`` table and uses its ``set``,
    ``montage``, ``label`` and ``daughter`` columns.

    Args:
        set_num: value of the ``set`` column to select.
        montage: value of the ``montage`` column to select.
        max_cells: number of slots in the returned list (default 31).

    Returns:
        A list of ``max_cells`` numpy arrays. The slot at index ``label``
        holds the daughters of that parent; all other slots are empty arrays.

    Raises:
        IndexError: if a parent label does not fit into ``max_cells`` slots.
    """
    npz_arr = [np.array([]) for _ in range(max_cells)]

    for row in divisions_csv.itertuples():
        # A daughter entry of 0 means "no division recorded" for this cell
        if row.set == set_num and row.montage == montage and row.daughter != 0:
            if isinstance(row.daughter, str):
                # Comma-separated list such as '5,6'; empty tokens are skipped
                daughter_values = [int(x) for x in row.daughter.split(',') if x.strip()]
            else:
                # A single daughter may have been parsed as a number, not text
                daughter_values = [int(row.daughter)]

            ind = int(row.label)
            if ind >= max_cells:
                raise IndexError(
                    'parent label {} does not fit into {} slots'.format(ind, max_cells))
            # A later row for the same parent replaces the earlier one
            npz_arr[ind] = np.array(daughter_values)

    return npz_arr

In [24]:
# unique_montages contains a list of each montage grouped by set - index by unique_montages[set][montage = 00_0, 00_1]
unique_montages = divisions_csv['montage'].groupby(divisions_csv['set']).unique()

# One list of per-cell daughter arrays for every (set, montage) pair
daughter_rows = [
    division_per_montage(set_num, montage)
    for set_num in divisions_csv['set'].unique()
    for montage in unique_montages[set_num]
]

# Fill an object array explicitly. np.array() on nested arrays of differing
# lengths is ambiguous: depending on the data it either produces a different
# shape or raises, whereas this always yields (n_montages, n_cells) with one
# int32 array per entry.
n_cells = len(daughter_rows[0]) if daughter_rows else 0
children = np.empty((len(daughter_rows), n_cells), dtype=object)
for batch, row in enumerate(daughter_rows):
    for i, lst in enumerate(row):
        children[batch, i] = np.asarray(lst, dtype='int32')

np.savez('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_S0P1andS2P2_same_kids.npz', daughters=children)

In [25]:
# Display the compiled daughters array as the cell output
children

array([[array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       ...,
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), ..., array([], dtype=int32),
 

### Let's check against the original file for formatting

In [26]:
# Load the original (HeLa) kids npz as the formatting reference and list its keys
data = np.load('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same_kids.npz')
data.keys()

['daughters']

In [27]:
# Pull the 'daughters' array out of the archive and show its shape
data_readable = data['daughters']
data_readable.shape

(180, 31)

In [16]:
# Check the first two entries (rows) for structure
data_readable[0:2,:]

array([[array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([8, 9], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        arr

In [28]:
# Load the newly generated 3T3 kids npz and list its keys, for comparison with the HeLa file
data = np.load('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_S0P1andS2P2_same_kids.npz')
data.keys()

['daughters']

In [29]:
# Pull the 'daughters' array out of the archive and show its shape
data_readable = data['daughters']
data_readable.shape

(89, 31)

In [19]:
# Check the first two entries (rows) for structure
data_readable[0:2,:]

array([[array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([5, 6], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([8, 9], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32)],
       [array([], dtype=int32), array([], dtype=int32),
        array([], dtype=int32), array([], dtype=int32),
       

In [30]:
# Merge the 3T3 and HeLa daughter tables into one archive.
# 3T3 comes first, then HeLa.

# 3T3 daughters
_3T3_daughter = np.load('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_S0P1andS2P2_same_kids.npz')
_3T3_daughter_readable = _3T3_daughter['daughters']
print(_3T3_daughter_readable.shape)

# HeLa daughters
hela_daughter = np.load('/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_hela0-7_same_kids.npz')
hela_daughter_readable = hela_daughter['daughters']
print(hela_daughter_readable.shape)

# Stack along the first (movie) axis
all_daughters = np.concatenate(
    [_3T3_daughter_readable, hela_daughter_readable],
    axis=0,
)
print(all_daughters.shape)

np.savez(
    '/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_3T3_and_HeLa_kids.npz',
    daughters=all_daughters,
)


(89, 31)
(180, 31)
(269, 31)


In [4]:
import numpy as np

# Inspect a combined kids npz: print its keys and the shape of 'daughters'.
# NOTE(review): this reads 'nuclear_movie_HeLa_and_3T3_kids.npz', whereas the
# compile cell in this notebook writes 'nuclear_movie_3T3_and_HeLa_kids.npz' -
# confirm which file is meant to be checked here.
data = np.load('/data/npz_data/cells/3T3/NIH/movie/nuclear_movie_HeLa_and_3T3_kids.npz')
print(data.keys())
data_readable = data['daughters']
print(data_readable.shape)


['daughters']
(178, 31)


### Ending Checks

# EOF

In [None]:
# List of all montages, in sorted order
# NOTE(review): base_direc must already be defined by another cell before
# this one is run.
movies = sorted(os.listdir(base_direc))

children = []
for movie in movies:
    path = os.path.join(base_direc, movie, 'division.npz')
    # NOTE: allow_pickle is needed to read arrays of per-cell arrays (object
    # dtype) on current NumPy. Pickle can execute arbitrary code, so only
    # load files produced by this pipeline.
    with np.load(path, allow_pickle=True) as training_data:
        children.append(training_data['arr_0'].tolist())

# Pack into an object array of int32 arrays, one row per movie. The dtype is
# given as the string 'int32' (a bare int32 name is undefined here), and the
# array is filled explicitly because np.array() on nested arrays of differing
# lengths is ambiguous.
n_cells = len(children[0]) if children else 0
if any(len(row) != n_cells for row in children):
    raise ValueError('all division.npz files must hold the same number of cells')
packed = np.empty((len(children), n_cells), dtype=object)
for batch, row in enumerate(children):
    for i, lst in enumerate(row):
        packed[batch, i] = np.asarray(lst, dtype='int32')

children = packed
#np.savez(os.path.join(output_path, 'combined_daugthers.npz'), daughters=children)

In [None]:
def combine(base_direc, output_path):
    """Merge the per-movie ``division.npz`` files into one daughters archive.

    Every sub-directory of ``base_direc`` that contains a ``division.npz`` is
    read in sorted order; entries without that file are skipped. The first
    array of each file (``arr_0``) supplies that movie's per-cell daughter
    lists. The result is written to
    ``<output_path>/combined_daugthers.npz`` under the key ``daughters``.

    Args:
        base_direc: directory holding one sub-directory per movie.
        output_path: directory for the combined archive; created if missing.

    Raises:
        ValueError: if the movies do not all hold the same number of cells.
    """
    children = []
    for movie in sorted(os.listdir(base_direc)):
        path = os.path.join(base_direc, movie, 'division.npz')
        if not os.path.isfile(path):
            continue
        print(movie)
        # NOTE: allow_pickle is needed to read arrays of per-cell arrays
        # (object dtype) on current NumPy. Pickle can execute arbitrary code,
        # so only load files produced by this pipeline.
        with np.load(path, allow_pickle=True) as training_data:
            children.append(training_data['arr_0'].tolist())

    n_cells = len(children[0]) if children else 0
    if any(len(row) != n_cells for row in children):
        raise ValueError('all division.npz files must hold the same number of cells')

    # Fill an object array explicitly: np.array() on nested arrays of
    # differing lengths is ambiguous, while this always gives one int32 array
    # per (movie, cell) entry.
    packed = np.empty((len(children), n_cells), dtype=object)
    for batch, row in enumerate(children):
        for i, lst in enumerate(row):
            packed[batch, i] = np.asarray(lst, dtype='int32')

    os.makedirs(output_path, exist_ok=True)
    np.savez(os.path.join(output_path, 'combined_daugthers.npz'), daughters=packed)

In [None]:


# Select the rows of one montage whose chosen column is non-zero.
# NOTE(review): 'column_name' looks like a placeholder (presumably it should
# be 'daughter') - confirm before running.
divisions_csv.loc[(divisions_csv['column_name'] != 0) & (divisions_csv['montage'] == montage)]

In [None]:
# Count the rows that record a division, i.e. have a non-zero daughter entry.
# The column is named 'daughter' in the CSV.
count = sum(1 for row in divisions_csv.itertuples() if row.daughter != 0)
print('number of divisions:', count)

In [None]:
# Prefix shared by the per-set HeLa division archives, and how many sets exist
base_path = '/data/npz_data/cells/HeLa/S3/movie/nuclear_movie_HeLa_'
num_of_sets = 8

# Build the archive path of every set; only the last one stays bound to `path`
for movie in range(num_of_sets):
    path = os.path.join(''.join([base_path, str(movie), 'division.npz']))

In [None]:
# Collect every directory in the working directory whose name contains 'set'.
# Plain files are skipped because each match is passed to os.listdir() below.
setlst = os.listdir('./')
all_sets = [
    term for term in setlst
    if 'set' in term and os.path.isdir(os.path.join('.', term))
]

# The loop variable is called set_name (not `set`) so the builtin set type
# is not shadowed for the rest of the notebook session.
for set_name in all_sets:
    temp = os.listdir(os.path.join('.', set_name))
    base_direc = os.path.join('.', set_name, 'movie')
    output_path = os.path.join('.', set_name, 'final')
    # Without an 'annotations' entry, every entry of the set directory is
    # treated as a part with its own movie/final folders.
    partslst = []
    if 'annotations' not in temp:
        partslst = os.listdir(os.path.join('.', set_name))
    print(partslst)
    if not partslst:
        # No parts: combine the set's own movie directory
        print(base_direc, output_path)
        combine(base_direc, output_path)
    else:
        # Combine each part separately
        for part in partslst:
            base_direc = os.path.join('.', set_name, part, 'movie')
            output_path = os.path.join('.', set_name, part, 'final')
            combine(base_direc, output_path)

In [None]:
#x = np.empty([2, 31], dtype='int32')

# One blank montage: 31 empty daughter lists
npz_arr = [[] for _ in range(31)]

# Stack two blank montages into a single int32 array
x = np.array([np.array(npz_arr, dtype='int32') for _ in range(2)])
x.shape

# Save it as an npz file
# np.savez('/home/HeLa_output/set0_files/04_2/output.npz', npz_arr)

In [None]:
# Build division npz for each montage (movie)
set_num = 0
# unique_montages contains a list of each montage grouped by set - index by unique_montages[set][montage = 00_0, 00_1]
unique_montages = divisions_csv['montage'].groupby(divisions_csv['set']).unique()

# Collect the parent label and daughter list of every recorded division in
# montage '00_0' of the chosen set. The column is named 'daughter' in the CSV;
# an entry of 0 means no division was recorded.
parents = []
daughters = []
for row in divisions_csv.itertuples():
    if row.set == set_num and row.montage == '00_0' and row.daughter != 0:
        parents.append(row.label)
        daughters.append([int(x) for x in row.daughter.split(',')])

# 31 slots, one per cell label; slots without a division stay empty
npz_arr = [np.array([]) for _ in range(31)]
for parent, daughter_values in zip(parents, daughters):
    npz_arr[int(parent)] = np.array(daughter_values)

#np.savez(os.path.join(output_dir, 'division.npz'), npz_arr)
