In [6]:
# Try to add lineage patterns via morphological analyses
import os
from pickle import load,dump

import pandas as pd
from treelib import Tree
import numpy as np
import glob
import tqdm


In [4]:

def construct_celltree(nucleus_file, max_time):
    '''
    Construct cell tree structure with cell names
    :param nucleus_file:  the name list file to the tree initilization
    :param max_time: the maximum time point to be considered
    :return cell_tree: cell tree structure where each time corresponds to one cell (with specific name)
    '''

    ##  Construct cell
    #  Add unregulized naming
    cell_tree = Tree()
    cell_tree.create_node('P0', 'P0')
    cell_tree.create_node('AB', 'AB', parent='P0')
    cell_tree.create_node('P1', 'P1', parent='P0')
    cell_tree.create_node('EMS', 'EMS', parent='P1')
    cell_tree.create_node('P2', 'P2', parent='P1')
    cell_tree.create_node('P3', 'P3', parent='P2')
    cell_tree.create_node('C', 'C', parent='P2')
    cell_tree.create_node('P4', 'P4', parent='P3')
    cell_tree.create_node('D', 'D', parent='P3')
    cell_tree.create_node('Z2', 'Z2', parent='P4')
    cell_tree.create_node('Z3', 'Z3', parent='P4')

    # EMS
    cell_tree.create_node('E', 'E', parent='EMS')
    cell_tree.create_node('MS', 'MS', parent='EMS')

    # Read the name excel and construct the tree with complete segCell
    df_time = read_cd_file(nucleus_file)

    # read and combine all names from different acetrees
    ## Get cell number
    try:
        pd_number = pd.read_csv('./dataset/number_dictionary.csv', names=["name", "label"])
        number_dictionary = pd.Series(pd_number.label.values, index=pd_number.name).to_dict()
    except:
        raise Exception("Not find number dictionary at ./dataset")

    # =====================================
    # dynamic update the name dictionary
    # =====================================
    cell_in_dictionary = list(number_dictionary.keys())

    ace_pd = read_cd_file(os.path.join(nucleus_file))
    ace_pd = ace_pd[ace_pd.time <= max_time]
    cell_list = list(ace_pd.cell.unique())
    add_cell_list = list(set(cell_list) - set(cell_in_dictionary))

    if len(add_cell_list) != 0:
        assert len(add_cell_list) == 0, "Name dictionary should be updated"
        # print(colored("Name dictionary updated", "red"))

        # ================================= cancel dynamic updating ============
        # add_cell_list.sort()
        # print("Name dictionary updated !!!")
        # add_number_dictionary = dict(zip(add_cell_list, range(len(cell_in_dictionary) + 1, len(cell_in_dictionary) + len(add_cell_list) + 1)))
        # number_dictionary.update(add_number_dictionary)
        # pd_number_dictionary = pd.DataFrame.from_dict(number_dictionary, orient="index")
        # pd_number_dictionary.to_csv('./dataset/number_dictionary.csv', header=False)

    df_time = df_time[df_time.time <= max_time]
    all_cell_names = list(df_time.cell.unique())
    for cell_name in list(all_cell_names):
        if cell_name not in number_dictionary:
            continue
        times = list(df_time.time[df_time.cell==cell_name])
        cell_info = cell_node()
        cell_info.set_number(number_dictionary[cell_name])
        cell_info.set_time(times)
        if not cell_tree.contains(cell_name):
            if "Nuc" not in cell_name:
                parent_name = cell_name[:-1]
                cell_tree.create_node(cell_name, cell_name, parent=parent_name, data=cell_info)
        else:
            cell_tree.update_node(cell_name, data=cell_info)

    return cell_tree, max_time


class cell_node(object):
    # Node Data in cell tree
    def __init__(self):
        self.number = 0
        self.time = 0

    def set_number(self, number):
        self.number = number

    def get_number(self):

        return self.number

    def set_time(self, time):
        self.time = time

    def get_time(self):

        return self.time

class cell_node_info(object):
    # Node Data in cell tree
    def __init__(self, number=0, time=[], generation_num=0, position_x=0):
        self.number = number
        self.time = time
        self.generation = generation_num
        self.position_x = position_x

    def set_number(self, number):
        self.number = number

    def get_number(self):
        return self.number

    def set_generation(self, generation_num):
        self.generation = generation_num

    def get_generation(self):
        return self.generation

    def set_time(self, time):
        self.time = time

    def get_time(self):
        return self.time

    def set_position_x(self, position_x):
        self.position_x = position_x

    def get_position_x(self):
        return self.position_x

def read_cd_file(cd_file):
    df_nuc = pd.read_csv(cd_file, lineterminator="\n")
    df_nuc = df_nuc.astype({"x": float, "y": float, "z": float, "time": int})

    return df_nuc

def construct_basic_cell_name_tree(cell_div_files_path, max_time, tree_distance_num,name_dictionary_path):
    """
    Construct cell tree structure with cell names
    :param cell_div_files_path:  the name list file to the tree initilization
    :param max_time: the maximum time point to be considered
    :param name_dictionary_path: name dictionary of cell labels
    :return cell_tree: cell tree structure where each time corresponds to one cell (with specific name)
    """

    # read and combine all names from different acetrees
    # Get cell number by its name
    # number_cell_dict, cell_number_dict = get_cell_name_affine_table(path=name_dictionary_path)
    # name_dictionary_path = r'F:\packed membrane nucleus 3d niigz\name_dictionary_TUNETr.csv'
    label_name_dict = pd.read_csv(name_dictionary_path, index_col=0).to_dict()['0']
    name_label_dict = {value: key for key, value in label_name_dict.items()}
    #  Construct cell
    #  Add irregular naming
    # initialize the cell tree (basic cell tree -- ABa, E MS, C, Z3 Z2 ,etc)
    cell_tree = Tree()
    cell_tree.create_node('P0', 'P0',
                          data=cell_node_info(number=name_label_dict['P0'], time=[], generation_num=-1, position_x=0))
    cell_tree.create_node('AB', 'AB', parent='P0',
                          data=cell_node_info(number=name_label_dict['AB'], time=[], generation_num=0,
                                              position_x=-2 ** tree_distance_num))
    P1_CELL_NODE = cell_tree.create_node('P1', 'P1', parent='P0',
                                         data=cell_node_info(number=name_label_dict['P1'], time=[], generation_num=0,
                                                             position_x=2 ** tree_distance_num))
    EMS_CELL_NODE = cell_tree.create_node('EMS', 'EMS', parent='P1',
                                          data=cell_node_info(number=name_label_dict['EMS'], time=[], generation_num=1,
                                                              position_x=P1_CELL_NODE.data.get_position_x() - 2 ** (
                                                                      tree_distance_num - 1)))
    # MS,E daughters of EMS
    cell_tree.create_node('MS', 'MS', parent='EMS',
                          data=cell_node_info(number=name_label_dict['MS'], time=[], generation_num=2,
                                              position_x=EMS_CELL_NODE.data.get_position_x() - 2 ** (
                                                      tree_distance_num - 2)))
    cell_tree.create_node('E', 'E', parent='EMS',
                          data=cell_node_info(number=name_label_dict['E'], time=[], generation_num=2,
                                              position_x=EMS_CELL_NODE.data.get_position_x() + 2 ** (
                                                      tree_distance_num - 2)))
    # P2
    P2_CELL_NODE = cell_tree.create_node('P2', 'P2', parent='P1',
                                         data=cell_node_info(number=name_label_dict['P2'], time=[], generation_num=1,
                                                             position_x=P1_CELL_NODE.data.get_position_x() + 2 ** (
                                                                     tree_distance_num - 1)))

    # C,P3 daughters of P2
    cell_tree.create_node('C', 'C', parent='P2',
                          data=cell_node_info(number=name_label_dict['C'], time=[], generation_num=2,
                                              position_x=P2_CELL_NODE.data.get_position_x() - 2 ** (
                                                      tree_distance_num - 2)))
    P3_CELL_NODE = cell_tree.create_node('P3', 'P3', parent='P2',
                                         data=cell_node_info(number=name_label_dict['P3'], time=[], generation_num=2,
                                                             position_x=P2_CELL_NODE.data.get_position_x() + 2 ** (
                                                                     tree_distance_num - 2)))
    # D, P4 daughters of P3
    cell_tree.create_node('D', 'D', parent='P3',
                          data=cell_node_info(number=name_label_dict['D'], time=[], generation_num=3,
                                              position_x=P3_CELL_NODE.data.get_position_x() - 2 ** (
                                                      tree_distance_num - 3)))
    P4_CELL_NODE = cell_tree.create_node('P4', 'P4', parent='P3',
                                         data=cell_node_info(number=name_label_dict['P4'], time=[], generation_num=3,
                                                             position_x=P3_CELL_NODE.data.get_position_x() + 2 ** (
                                                                     tree_distance_num - 3)))
    # Z3 Z2 daughters of P4
    cell_tree.create_node('Z3', 'Z3', parent='P4',
                          data=cell_node_info(number=name_label_dict['Z3'], time=[], generation_num=4,
                                              position_x=P4_CELL_NODE.data.get_position_x() - 2 ** (
                                                      tree_distance_num - 4)))
    cell_tree.create_node('Z2', 'Z2', parent='P4',
                          data=cell_node_info(number=name_label_dict['Z2'], time=[], generation_num=4,
                                              position_x=P4_CELL_NODE.data.get_position_x() + 2 ** (
                                                      tree_distance_num - 4)))

    # Read the name excel and construct the tree with complete segCell
    df_cell_CD_file = read_cd_file(cell_div_files_path)

    # =====================================
    # dynamic update the name dictionary
    # =====================================
    cell_in_dictionary = list(name_label_dict.keys())

    # erase the cell excced max time
    ace_pd = df_cell_CD_file[df_cell_CD_file.time <= max_time]
    cell_list = list(ace_pd.cell.unique())
    add_cell_list = list(set(cell_list) - set(cell_in_dictionary))
    # if embryo CD files are different from cell name csv file(dictionary)
    assert len(add_cell_list) == 0, "Name dictionary should be updated"

    # ================================= cancel dynamic updating ============
    # add_cell_list.sort()
    # if len(add_cell_list) > 0:
    #     print("Name dictionary updated !!!")
    #     add_number_dictionary = dict(zip(add_cell_list, range(len(cell_in_dictionary) + 1, len(cell_in_dictionary) + len(add_cell_list) + 1)))
    #     number_dictionary.update(add_number_dictionary)
    #     pd_number_dictionary = pd.DataFrame.from_dict(number_dictionary, orient="index")
    #     pd_number_dictionary.to_csv('./dataset/number_dictionary.csv', header=False)

    df_cell_CD_file = df_cell_CD_file[df_cell_CD_file.time <= max_time]
    cells_this_CD_embryo = list(df_cell_CD_file.cell.unique())
    for cell_name in list(cells_this_CD_embryo):
        if cell_name not in name_label_dict:  # the cell with no nucleus or generated by acetree unknown
            continue

        if not cell_tree.contains(cell_name):  # this cell not yet in the cell tree
            if "Nuc" not in cell_name:  # normal name of the cell
                parent_name = cell_name[:-1]
                this_cell_node = cell_tree.create_node(cell_name, cell_name, parent=parent_name,
                                                       data=cell_node_info(number=name_label_dict[cell_name], time=[]))
                parent_node = cell_tree.parent(cell_name)
                this_cell_node.data.set_generation(parent_node.data.get_generation() + 1)
                if len(cell_tree.children(parent_name)) == 1:
                    this_cell_node.data.set_position_x(parent_node.data.get_position_x() - 2 ** (
                            tree_distance_num - this_cell_node.data.get_generation()))
                else:
                    # len ==2
                    this_cell_node.data.set_position_x(parent_node.data.get_position_x() + 2 ** (
                            tree_distance_num - this_cell_node.data.get_generation()))

    return cell_tree

In [7]:
tree_distance_num = 12
save_folder =  r'./Data/lineage_tree'
cd_files_root_path=r'C:\Users\zelinli6\OneDrive - City University of Hong Kong - Student\MembraneProjectData\RawCDFiles'
# embryo_nameing_wt20xx = {'WT_Sample1': 11, 'WT_Sample2': 3, 'WT_Sample3': 2, 'WT_Sample4': 13,
#                                             'WT_Sample5': 7, 'WT_Sample6': 12, 'WT_Sample7': 1, 'WT_Sample8': 3}
embryo_nameing_wt20xx = {'WT_Sample1': '200113plc1p2', 'WT_Sample2': '200322plc1p2', 'WT_Sample3': '200323plc1p1', 'WT_Sample4': '200326plc1p3',
                                            'WT_Sample5': '200326plc1p4', 'WT_Sample6': '191108plc1p1', 'WT_Sample7': '200109plc1p1', 'WT_Sample8': '200113plc1p3'}

cell_info_folder_root=r'C:\Users\zelinli6\OneDrive - City University of Hong Kong - Student\MembraneProjectData\CMapSubmission\Dataset Access\Dataset nucLoc\Dataset E'
# embryo_names = ['200710hmr1plc1p1','200710hmr1plc1p2','200710hmr1plc1p3']
name_dictionary_file_path = r"C:\Users\zelinli6\OneDrive - City University of Hong Kong - Student\Documents\06paper TUNETr TMI LSA NC\Tables\name_dictionary.csv"

cell_size = []

for formated_emb_name,origianl_embryo_name in embryo_nameing_wt20xx.items():
    cell_div_files_path = os.path.join(cd_files_root_path, origianl_embryo_name,"CD{}.csv".format(origianl_embryo_name))
    # Online folder
    cell_info_folder = os.path.join(cell_info_folder_root,formated_emb_name)
    save_name = None

    # number of tps
    # get all * .csv files list under the folder
    cell_info_files_path = sorted(glob.glob(os.path.join(cell_info_folder, "*.csv")))
    # print(cell_info_files)
    max_time = len(cell_info_files_path)
    # ================================
    # Construct cell label tree

    # ================================
    cell_tree = construct_basic_cell_name_tree(cell_div_files_path, max_time,tree_distance_num,name_dictionary_file_path)

    # cell_tree.show(key=False),

    # ================================
    # collect cell information (tps, volumns, surfaces)
    # ================================

    for tp, cell_info_file in enumerate(
            tqdm.tqdm(cell_info_files_path, desc="Collecting cell infos cshaper embryo{}".format(formated_emb_name)),
            start=1):
        if tp <= max_time:
            # go through each cell with time order
            df_cell_CD_and_note = pd.read_csv(cell_info_file, header=0).astype({"note": str, "nucleus_name": str})
            df_cell_CD_and_note = df_cell_CD_and_note[
                ~df_cell_CD_and_note.note.str.contains("lost")]  # delete all lost cells
            df_cell_CD_and_note = df_cell_CD_and_note[
                ~df_cell_CD_and_note.note.str.contains("child")]  # delete all children nucleus, but not cell.
            cells_at_this_tp = df_cell_CD_and_note["nucleus_name"].tolist()
            for cell_name in cells_at_this_tp:
                this_cell_node = cell_tree.get_node(cell_name)
                if this_cell_node.is_leaf():
                    this_cell_node.data.get_time().append(tp)
                else:
                    [child1, child2] = cell_tree.children(cell_name)
                    if this_cell_node.is_leaf():
                        this_cell_node.data.get_time().append(tp)
                    elif len(child1.data.get_time()) == 0 and len(child1.data.get_time()) == 0:
                        this_cell_node.data.get_time().append(tp)
                    elif tp >= child1.data.get_time()[0] or tp >= child2.data.get_time()[0]:
                        continue
                    else:
                        this_cell_node.data.get_time().append(tp)
        else:
            break
    # for node_id in cell_tree.expand_tree(sorting=False):
    #     # if len(cell_tree.get_node(node_id).data.get_time())==0:
    #     this_cell_node = cell_tree.get_node(node_id)
    #     print(embryo_name, node_id, 'living tp', this_cell_node.data.get_time(), 'Generation number',
    #           this_cell_node.data.get_generation(), 'position_x', this_cell_node.data.get_position_x())
    # cell_size.append(cell_tree.size())
    # cell_tree.to_graphviz(embryo_name)
    # save cell tree incoporated with life span. Accessible with cell_tree.get_node[<cell_name>].data.get_time()

    # if cell_tree.get_node('ABal').data.get_time()[0] != cell_tree.get_node('ABpl').data.get_time()[0]:
    #     print('ABal and ABpl don\'t split together')
    #     print(cell_tree.get_node('ABal').data.get_time()[0])
    #     print(cell_tree.get_node('ABpl').data.get_time()[0])

    if save_name is None:
        save_name = os.path.basename(cell_info_folder)
    save_file = os.path.join(save_folder, save_name + "_cell_life_tree")
    with open(save_file, "wb") as f:
        dump(cell_tree, f)  # treelib tree object need pickle to dump and load

    begen_cell = ''
    for node_id in cell_tree.expand_tree(sorting=False):
        if len(cell_tree.get_node(node_id).data.get_time()) != 0:
            begen_cell = node_id
            break
    print(formated_emb_name,'  begin cell', begen_cell, ' cell number', cell_tree.size(),'embryo max frame', max_time)

KeyError: 'P0'