# Annotation counts
In this notebook we will compile some of the numbers reported in Schlegel _et al._ (2023) straight from the data dump.

In [1]:
import pandas as pd
import numpy as np

# Read the annotation dump
annot = pd.read_csv(
    "../supplemental_files/Supplemental_file1_neuron_annotations.tsv",
    sep="\t",  # it's a tab-separated file
    low_memory=False,  # this is just to avoid a datatype warning
)
annot.head()

Unnamed: 0,supervoxel_id,root_id,pos_x,pos_y,pos_z,soma_x,soma_y,soma_z,nucleus_id,flow,...,ito_lee_hemilineage,hartenstein_hemilineage,morphology_group,top_nt,top_nt_conf,side,nerve,vfb_id,fbbt_id,status
0,78112261444987077,720575940628857210,109306,50491,3960,104904.0,47464.0,5461.0,2453924.0,intrinsic,...,SMPpv2_ventral,CP1_ventral,,acetylcholine,0.917977,left,,fw138205,FBbt_20001935,
1,82475466912542440,720575940626838909,172029,55635,1592,177472.0,56936.0,1429.0,7393349.0,intrinsic,...,VLPl2_medial,BLAv2_medial,VLPl2_medial__0,acetylcholine,0.645442,right,,fw000001,,
2,83038623024880664,720575940626046919,180632,58664,1925,180632.0,58664.0,1925.0,7415038.0,intrinsic,...,,,,acetylcholine,0.844781,right,,fw000002,FBbt_20000538,
3,79801523353604463,720575940630311383,133800,56063,1847,180728.0,61008.0,1630.0,7415013.0,intrinsic,...,putative_primary,putative_primary,,acetylcholine,0.760387,right,,fw000003,FBbt_20000260,
4,83038554439606237,720575940633370649,180496,57448,2989,180496.0,57448.0,2989.0,7415848.0,intrinsic,...,,,,acetylcholine,0.895458,right,,fw000004,FBbt_00051248,


Some overall stats:

In [2]:
print("Number of neurons:")
print(f'{annot[(annot.super_class != "not_a_neuron")].shape[0]:,}')

Number of neurons:
139,255


In [3]:
print("A breakdown by superclass:")
print(annot.super_class.value_counts())

A breakdown by superclass:
optic                 77505
central               32384
sensory               16934
visual_projection      8057
ascending              2362
descending             1303
visual_centrifugal      524
motor                   106
endocrine                80
Name: super_class, dtype: int64


In [4]:
print("Sensory neurons in the central brain (i.e. excluding photoreceptor neurons):")
print(
    f'{annot[(annot.super_class == "sensory") & (annot.cell_class != "visual")].shape[0]:,}'
)

Sensory neurons in the central brain (i.e. excluding photoreceptor neurons):
5,516


In [9]:
print("The number of total annotations across all neurons:")
print(
    annot[
        [
            "flow",
            "super_class",
            "cell_class",
            "cell_sub_class",
            "cell_type",
            "hemibrain_type",
            "side",
            "nerve",
            "ito_lee_hemilineage",
            "morphology_group",
            "soma_x",  # whether a soma position has been recorded
        ]
    ]
    .notnull()
    .sum()
    .sum()
)

The number of total annotations across all neurons:
866060


In [10]:
print("The number of total annotations per field:")
print(
    annot[
        [
            "flow",
            "super_class",
            "cell_class",
            "cell_sub_class",
            "cell_type",
            "hemibrain_type",
            "side",
            "nerve",
            "ito_lee_hemilineage",
            "morphology_group",
        ]
    ]
    .notnull()
    .sum()
)

The number of total annotations per field:
flow                   139255
super_class            139255
cell_class             107364
cell_sub_class          16259
cell_type              109210
hemibrain_type          29467
side                   139253
nerve                    9639
ito_lee_hemilineage     36896
morphology_group        21355
dtype: int64


In [11]:
# Subset to all neurons intrinsic to the central brain
cent_int = annot[
    annot.super_class.isin(
        (
            "central",
            "descending",
            "motor",
            "endocrine",
            "visual_centrifugal",
        )
    )
    & (annot.cell_class != "visual")
]

print("Left vs right counts for central brain-intrinsic neurons:")
print(cent_int.side.value_counts().to_string(), "\n")

print("Left vs right differences for central brain-intrinsic neurons:")
lrdiff = (cent_int.side == "right").sum() - (cent_int.side == "left").sum()
print(f"{lrdiff} of {cent_int.shape[0]} ({lrdiff / cent_int.shape[0]:.2%})")

Left vs right counts for central brain-intrinsic neurons:
right     17163
left      17137
center       97 

Left vs right differences for central brain-intrinsic neurons:
26 of 34397 (0.08%)


Stats on cell types

In [12]:
# Add a column where we backfill cell type with hemibrain type
# (i.e. `cell_type` is considered to be the "final" type)
annot["type_combined"] = annot.cell_type.fillna(annot.hemibrain_type)

print("Number of neurons with a type (cell- or hemibrain type):")
print(
    f"{annot.type_combined.notnull().sum():,} of {annot.shape[0]:,} ({annot.type_combined.notnull().sum()/annot.shape[0]:.1%})"
)

Number of neurons with a type (cell- or hemibrain type):
129,894 of 139,255 (93.3%)


In [13]:
# Subset to all neurons with arbour in the central brain
cent = annot[
    annot.super_class.isin(
        (
            "central",
            "descending",
            "motor",
            "endocrine",
            "sensory",
            "ascending",
            "visual_projection",
            "visual_centrifugal",
        )
    )
    & (
        annot.cell_class != "visual"
    )  # we included all sensory above; here we exclude visual sensory
]
print("Number of central brain neurons with a type:")
print(
    f"{cent.type_combined.notnull().sum():,} of {cent.shape[0]:,} ({cent.type_combined.notnull().sum()/cent.shape[0]:.1%})"
)

Number of central brain neurons with a type:
47,067 of 50,332 (93.5%)


In [14]:
# Redefine central brain intrinsic neurons because we added a new column to the
# original annotation table
cent_int = annot[
    annot.super_class.isin(
        (
            "central",
            "descending",
            "motor",
            "endocrine",
            "visual_centrifugal",
        )
    )
    & (annot.cell_class != "visual")
]

print("Number of central brain-intrinsic neurons with a type:")
print(
    f"{cent_int.type_combined.notnull().sum():,} of {cent_int.shape[0]:,} ({cent_int.type_combined.notnull().sum()/cent_int.shape[0]:.1%})"
)

Number of central brain-intrinsic neurons with a type:
33,817 of 34,397 (98.3%)


In [15]:
print("Total number of unique cell types:")
print(f"{annot.type_combined.nunique():,}")

Total number of unique cell types:
7,846


In [16]:
# How many final cell types do we have that are based on three hemispheres worth of data?
print(
    "Number of cell types based on three hemispheres worth of data (high confidence):"
)
print(f"{annot[annot.hemibrain_type.notnull()].type_combined.nunique():,}")

print(
    "Number of FlyWire neurons with cell types based on three hemispheres worth of data:"
)
print(
    f"{annot[annot.hemibrain_type.notnull() & annot.type_combined.notnull()].shape[0]:,}"
)

Number of cell types based on three hemispheres worth of data (high confidence):
3,884
Number of FlyWire neurons with cell types based on three hemispheres worth of data:
29,467


In [13]:
print("Cell types based on FlyWire + light-level data")
hb_matched = np.unique(
    annot.loc[
        annot.hemibrain_type.notnull(), ["type_combined", "hemibrain_type"]
    ].values.flatten()
)
is_hb_matched = annot.hemibrain_type.isin(hb_matched) | annot.type_combined.isin(
    hb_matched
)
has_cb_type = annot.cell_type.str.startswith("CB", na=False)
is_new_dn = (  # find new DNs that have no match in the hemibrain
    (annot.super_class == "descending")
    & annot.cell_type.str.contains("DN[a-z]e.*?", regex=True, na=False)
    & annot.hemibrain_type.isnull()
)
print(annot[~has_cb_type & ~is_hb_matched & ~is_new_dn].type_combined.nunique())

print("Number of de-novo cell types based on just FlyWire left and right:")
# i.e. for how many final cell types do we have that are based on just two
# hemispheres
print(annot[(has_cb_type | is_new_dn) & ~is_hb_matched].type_combined.nunique())

print(
    "Number of de-novo cell types based on FlyWire left + right and untyped hemibrain fragment:"
)
# i.e. for how many final cell types do we have that include a previously untyped
# (fragment of) a hemibrain neuron
print(annot[(has_cb_type | is_new_dn) & is_hb_matched].nunique().shape[0])

Cell types based on FlyWire + light-level data
168
Number of de-novo cell types based on just FlyWire left and right:
1105
Number of de-novo cell types based on FlyWire left + right and untyped hemibrain fragment:
25


In [17]:
# How many hemibrain cell types did we manage to match?
# Keep in mind that hemibrain_type contains merges such as "SIP078,SIP080"
# Here, we unpack them into ['SIP078', 'SIP080', ...]
matched_types = np.unique(
    [
        c.strip()
        for l in annot[annot.hemibrain_type.notnull()].hemibrain_type.unique()
        for c in l.split(",")
    ]
)

# Load the hemibrain meta data we compiled
# Each row in this file is a hemibrain neuron
hb_meta = pd.read_csv("../supplemental_files/Supplemental_file5_hemibrain_meta.csv")

# `matched_types` contains a mix of mostly morphology types but also a few connectivity types
hb_meta["found"] = hb_meta.type.isin(matched_types) | hb_meta.morphology_type.isin(
    matched_types
)

In [20]:
hb_types = hb_meta[hb_meta.type.notnull()].drop_duplicates("morphology_type")
print("Number of hemibrain types found in FlyWire:")
print(
    f"{hb_types.found.sum():,} of {hb_types.shape[0]:,} ({hb_types.found.sum() / hb_types.shape[0]:.1%})"
)

print(
    "Number of neurons for which we have corresponding labels in hemibrain and in FlyWire:"
)
print(
    f"{hb_meta.found.sum() + annot[annot.hemibrain_type.notnull() & annot.type_combined.notnull()].shape[0]:,}"
)

Number of hemibrain types found in FlyWire:
3,539 of 5,235 (67.6%)
Number of neurons for which we have corresponding labels in hemibrain and in FlyWire:
47,176


In [21]:
# How often have we been able to map down to the connectivity-type level?
hb_cn_types = hb_types[hb_types.type != hb_types.morphology_type]
print("Number of hemibrain connectivity types found in FlyWire:")
print(
    f"{hb_cn_types.type.isin(matched_types).sum()} of {hb_cn_types.shape[0]} ({hb_cn_types.type.isin(matched_types).sum() / hb_cn_types.shape[0]:.1%})"
)

Number of hemibrain connectivity types found in FlyWire:
38 of 258 (14.7%)


In [22]:
print("Number of types representing merges of two or more hemibrain types (many:1):")
merges = annot[
    annot.hemibrain_type.str.contains(",", na=False)
    & ~annot.hemibrain_type.str.contains("hb", na=False)
]
print(
    f"{merges.hemibrain_type.unique().shape[0]} ({merges.hemibrain_type.unique().shape[0]/annot[annot.hemibrain_type.notnull()].hemibrain_type.unique().shape[0]:.1%})"
)

Number of types representing merges of two or more hemibrain types (many:1):
195 (5.6%)


In [23]:
print("Number of hemibrain types contained in those many:1 cell types:")
in_merge = np.unique(
    [t.strip() for l in merges.hemibrain_type.unique() for t in l.split(",")]
)
print(
    f"{in_merge.shape[0]} ({in_merge.shape[0] / matched_types.shape[0]:.1%} of all matched hemibrain types)"
)

Number of hemibrain types contained in those many:1 cell types:
451 (12.0% of all matched hemibrain types)


In [24]:
# Find cases where we further split the hemibrain type
splits = np.unique(
    [
        v
        for l in annot.loc[
            annot.hemibrain_type.notnull()  # needs to have a hemibrain type
            & annot.cell_type.notnull()  # needs to have a cell type
            & ~annot.cell_type.str.contains(
                ",", na=False
            )  # cell type must not be compound type
            & ~annot.hemibrain_type.str.contains(
                "hb", na=False
            )  # hemibrain type must not be untyped neurons
            & np.array(
                [
                    b in a
                    for a, b in zip(
                        annot.cell_type.fillna("").values,
                        annot.hemibrain_type.fillna("").values,
                    )
                ]
            )  # hemibrain type must be in cell type (e.g. "SMP001" -> "SMP001a")
            & (
                annot.cell_type != annot.hemibrain_type
            ),  # cell and hemibrain type must not be the same
            "hemibrain_type",
        ].unique()
        for v in l.split(",")
    ]
)
was_split = (
    hb_meta[hb_meta.type.isin(splits) | hb_meta.morphology_type.isin(splits)]
    .drop_duplicates("type")
    .copy()
)
split_dict = (
    annot[annot.cell_type.notnull()]
    .groupby("hemibrain_type")
    .cell_type.apply(lambda x: ",".join(list(set(x))))
)
was_split["fw_type"] = was_split.type.map(split_dict)

print("Number of hemibrain types that were split (1:many):")
print(was_split.shape[0])

Number of hemibrain types that were split (1:many):
234


We hope the above gave you an overview of the annotations provided. Please 
also see the [tutorials](https://fafbseg-py.readthedocs.io/en/latest/source/gallery.html)
on annotations for `fafbseg`.