In [457]:
import pandas as pd
import numpy as np

In [458]:
# importing data
all_cell_markers_df = pd.read_csv('../data/all_cell_markers.txt', sep = r'\t')
all_human_markers_df = pd.read_csv('../data/Human_cell_markers.txt', sep = r'\t')
all_mouse_markers_df = pd.read_csv('../data/Mouse_cell_markers.txt', sep = r'\t')
all_singleCell_markers_df = pd.read_csv('../data/Single_cell_markers.txt', sep = r'\t')

  return func(*args, **kwargs)


# Data Schema
- **speciesType**: the species from which the data originates
    - there are only two data type, either `Human` or `Mouse`
- **tissueType**: the type of tissues from which data originates
    - in total 181 different kinds of cells
    - a lot of them are undefined
- **UberonOntologyID**: The universal unique identifier of the anatomy structure found in animals 
    - needs to confirm with the team
    - contain missing value
- **cancerType**: the association of the cell marker with the cancer name
    - no cancer cell are named as `Normal`
- **cellName**: the English name of the cell that marker belong to
- **CellOntologyID**: The universal unique identifier of the cell that marker belong to
    - contain missing value
- **cellMarker**: a marker molecule of the cell
    - in string list, can be convert to list
- **geneSymbol**: gene expression of the cell marker
- **geneID**: The universal unique identifier of the gene
    - contain missing value
- **proteinName**: name of the protein
    - contain missing value
- **proteinID**: The universal unique identifier of the protein
- **markerResource**: the type of resouce or methologoy used to identify the marker
    - there are only four data type, either `Experiment` or `Single-cell sequencing` or `Company` or `Review`
- **PMID**: The PudMed ID for the publication or study where the marker data was reported
    - abnormal value contain `company`
- **Company**: the company associated with the resouces

# Exploration on all_cell_markers
Notes: all codes a modulerized, which means you can directly jump to a specific section without running previous sections

EDA outlines:
- Confirming hypothesis: `all_cell_markers` contain all the data from `all_human_markers`, `all_mouse_markers`
- Confirming hypothesis: None of the data from `all_singleCell_markers` exist in `all_cell_markers`
- Revealing trend: assoication between missing `UberonOntologyID` and `tissueType`
- Revealing trend: assoication between missing `CellOntologyID` and `cellName`
- Confirming hypothesis: if `geneSymbol` is missing, then `geneID` is not missing
- Confirming hypothesis: if `proteinName` is missing, then `proteinID` is not missing
- Confirming all the nested list are correspond to each other element

## Confirm Hypothesis
`all_cell_markers` contain all the data from `all_human_markers`, `all_mouse_markers`

In [459]:
all_cell_marker_tuple = all_cell_markers_df.apply(tuple, axis=1)
all_human_marker_tuple = all_human_markers_df.apply(tuple, axis=1)
all_mouse_markers_tuple = all_mouse_markers_df.apply(tuple, axis=1)

# check if all value in human_marker exist in cell_marker
if all(all_human_marker_tuple.isin(all_cell_marker_tuple)):
    print("all value in all_human_marker dataset exist in all_cell_marker dataset")
else:
    mismatch_value = sum(all_human_marker_tuple.isin(all_cell_marker_tuple) == False)
    print(f"there are {mismatch_value} from all_human_marker that does not exist in all_cell_marker dataset")

# check if all value in mouse_marker exist in cell_marker
if all(all_mouse_markers_tuple.isin(all_cell_marker_tuple)):
    print("all value in all_mouse_markers dataset exist in all_cell_marker dataset")
else:
    mismatch_value = sum(all_mouse_markers_tuple.isin(all_cell_marker_tuple) == False)
    print(f"there are {mismatch_value} from all_mouse_markers that does not exist in all_cell_marker dataset")



all value in all_human_marker dataset exist in all_cell_marker dataset
all value in all_mouse_markers dataset exist in all_cell_marker dataset


## Confirm Hypothesis
None of the data from `all_singleCell_markers` exist in `all_cell_markers`

In [460]:
all_cell_marker_tuple = all_cell_markers_df.apply(tuple, axis=1)
all_singleCell_markers_tuple = all_singleCell_markers_df.apply(tuple, axis=1)

# check if all value in singleCell_marker exist in cell_marker
if all(all_singleCell_markers_tuple.isin(all_cell_marker_tuple)):
    print("all value in all_singleCell_markers dataset exist in all_cell_marker dataset")
else:
    mismatch_value = sum(all_singleCell_markers_tuple.isin(all_cell_marker_tuple) == False)
    print(f'there are in total {mismatch_value} records in all_singleCell_markers')
    print(f"there are {mismatch_value} from all_singleCell_markers that does not exist in all_cell_marker dataset")

there are in total 535 records in all_singleCell_markers
there are 535 from all_singleCell_markers that does not exist in all_cell_marker dataset


## Revealing trend
assoication between missing `UberonOntologyID` and `tissueType`

most of the tissue with missing ID is undefined tissue

In [461]:
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)
cell_markers_df[['UberonOntologyID', 'tissueType']][cell_markers_df['UberonOntologyID'].isna()].tissueType.value_counts()

Undefined                      701
Fetal liver                     29
Fetal gonad                     26
Embryonic prefrontal cortex     12
Bladder                          5
Sinonasal mucosa                 2
Fetal brain                      1
Osteoarthritic cartilage         1
Name: tissueType, dtype: int64

# Revealing trend
assoication between missing `CellOntologyID` and `cellName`

In [462]:
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)
cell_markers_df[['CellOntologyID', 'cellName']][cell_markers_df['CellOntologyID'].isna()].cellName.value_counts()

Cancer stem cell                    596
Progenitor cell                      53
Cancer stem-like cell                 9
Neural progenitor cell                8
Cardiac progenitor cell               8
                                   ... 
Foxp3+IL-17+ T cell                   1
Definitive zone cell                  1
Bone marrow stem cell                 1
PDX1+ pancreatic progenitor cell      1
Adipogenic progenitor cell            1
Name: cellName, Length: 142, dtype: int64

# Confirming hypothesis
if `geneSymbol` is missing, then `geneID` is not missing

In [463]:
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)
check_same = cell_markers_df[['geneSymbol', 'geneID']].apply(
    lambda row: all(row.isna()) or all(~row.isna()), axis = 1 
)
if all(check_same):
    print('if there is a missing proteinSymbol, then there must be missing proteinID')
else:
    print('if there is a missing symbol then there might exist a ID')

if there is a missing proteinSymbol, then there must be missing proteinID


# Confirming hypothesis
if `proteinName` is missing, then `proteinID` is not missing

In [464]:
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)
check_same = cell_markers_df[['proteinName', 'proteinID']].apply(
    lambda row: all(row.isna()) or all(~row.isna()), axis = 1 
)
if all(check_same):
    print('if there is a missing proteinSymbol, then there must be missing proteinID')
else:
    print('if there is a missing symbol then there might exist a ID')

if there is a missing proteinSymbol, then there must be missing proteinID


## Confirming hypothesis
each existed cellMarkers have a gene name assoicated it

In [499]:
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)
def str_to_list(listLikeStr):
    if not isinstance(listLikeStr, str): return np.NaN
    cleaned_listLikeStr = listLikeStr.replace("'", "")

    split_element = cleaned_listLikeStr.split(',')

    is_nested = False
    result = []
    nested_list = []
    for element in split_element:
        if is_nested:
            if "]" in element:
                is_nested = False
                nested_list.append(element.replace(']', "").strip())
                result.append(nested_list)
                nested_list = []
            else:
                if element.strip():
                    nested_list.append(element.strip())
        else:
            if "[" in element:
                is_nested = True
                nested_list.append(element.replace('[', "").strip())
            else:
                if element.strip():
                    result.append(element.strip())

    return result
        

to_convert_col  = ['cellMarker', 'geneSymbol', 'geneID', 'proteinName', 'proteinID']
for col in to_convert_col:
    cell_markers_df[col] = cell_markers_df[col].apply(str_to_list)

len_count = cell_markers_df[to_convert_col].applymap(lambda x : len(x) if isinstance(x, list) else 0)


if cell_markers_df[to_convert_col].applymap(lambda x: len(x) if isinstance(x, list) else np.NaN).dropna().shape[0] == 0:
    print("each existed cellMarkers have a gene name assoicated it")
else:
    print('some gene has mis-match sequence')
    mistamch_index = len_count[len_count.nunique(axis = 1) != 1].index
    print(f"for these following index: {mistamch_index}")

some gene has mis-match sequence
for these following index: Int64Index([  64,   65,   66,  101,  214,  332,  412,  483,  490,  498,  524,
             525,  579,  672,  688,  689,  725,  727,  795,  815,  868,  872,
             922,  945, 1042, 1045, 1070, 1086, 1215, 1260, 1366, 1394, 1496,
            1524, 1525, 1779, 1850, 1854, 1856, 1883, 1993, 2187, 2253, 2254,
            2255, 2256, 2381, 2458, 2523, 2537, 2753, 3135, 3151, 3226, 3227,
            3228, 3243, 3350, 3557, 3558, 3632, 3710, 3733, 3741, 3844, 3908,
            3918, 3932, 3973, 3977, 3983, 4005, 4099, 4167, 4245, 4268, 4276,
            4379, 4443, 4453, 4467, 4508, 4512, 4518, 4540, 4634],
           dtype='int64')


# Parsing procedure

Data Parsing Outlines
- concat `all_cell_markers` and `all_singleCell_markers`
- replace all the `undefined` tissue to NaN value
- convert all the strLikelist into list

In [504]:
# import data
all_cell_markers_df = pd.read_csv('../data/all_cell_markers.txt', sep = r'\t')
all_human_markers_df = pd.read_csv('../data/Human_cell_markers.txt', sep = r'\t')
all_mouse_markers_df = pd.read_csv('../data/Mouse_cell_markers.txt', sep = r'\t')
all_singleCell_markers_df = pd.read_csv('../data/Single_cell_markers.txt', sep = r'\t')

  return func(*args, **kwargs)


In [505]:
def str_to_list(listLikeStr):
    if not isinstance(listLikeStr, str): return np.NaN
    cleaned_listLikeStr = listLikeStr.replace("'", "")

    split_element = cleaned_listLikeStr.split(',')

    is_nested = False
    result = []
    nested_list = []
    for element in split_element:
        if is_nested:
            if "]" in element:
                is_nested = False
                nested_list.append(element.replace(']', "").strip())
                result.append(nested_list)
                nested_list = []
            else:
                if element.strip():
                    nested_list.append(element.strip())
        else:
            if "[" in element:
                is_nested = True
                nested_list.append(element.replace('[', "").strip())
            else:
                if element.strip():
                    result.append(element.strip())

    return result

In [506]:
# concat all_cell_markers and all_singleCell_markers
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)

# change all the Undefined value in tissueType column to NaN
cell_markers_df['tissueType'] = cell_markers_df['tissueType'].replace('Undefined', np.NaN)

# convert specific columns into list
to_convert_col  = ['cellMarker', 'geneSymbol', 'geneID', 'proteinName', 'proteinID']
for col in to_convert_col:
    cell_markers_df[col] = cell_markers_df[col].apply(str_to_list)

In [507]:
cell_markers_df

Unnamed: 0,speciesType,tissueType,UberonOntologyID,cancerType,cellType,cellName,CellOntologyID,cellMarker,geneSymbol,geneID,proteinName,proteinID,markerResource,PMID,Company
0,Human,Kidney,UBERON_0002113,Normal,Normal cell,Proximal tubular cell,,[Intestinal Alkaline Phosphatase],[ALPI],[248],[PPBI],[P09923],Experiment,9263997,
1,Human,Liver,UBERON_0002107,Normal,Normal cell,Ito cell (hepatic stellate cell),CL_0000632,[Synaptophysin],[SYP],[6855],[SYPH],[P08247],Experiment,10595912,
2,Human,Endometrium,UBERON_0001295,Normal,Normal cell,Trophoblast cell,CL_0000351,[CEACAM1],[CEACAM1],[634],[CEAM1],[P13688],Experiment,10751340,
3,Human,Germ,UBERON_0000923,Normal,Normal cell,Primordial germ cell,CL_0000670,[VASA],[DDX4],[54514],[DDX4],[Q9NQI0],Experiment,10920202,
4,Human,Corneal epithelium,UBERON_0001772,Normal,Normal cell,Epithelial cell,CL_0000066,[KLF6],[KLF6],[1316],[KLF6],[Q99612],Experiment,12407152,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4653,Human,Embryo,UBERON_0000922,Normal,Normal cell,8-cell stage cell (Blastomere),CL_0000353,"[C11orf48, C19orf53, DHX9, DIABLO, EIF1AD, EIF...","[LBHD1, C19orf53, DHX9, DIABLO, EIF1AD, EIF4G1...","[79081, 28974, 1660, 56616, 84285, 1981, 26017...","[LBHD1, L10K, DHX9, DBLOH, EIF1A, IF4G1, FA32A...","[Q9BQE6, Q9UNZ5, Q08211, Q9NR28, Q8N9N8, Q0463...",Single-cell sequencing,23892778,
4654,Mouse,Embryo,UBERON_0000922,Normal,Normal cell,8-cell stage cell (Blastomere),CL_0000353,"[Abcf1, Cdc37, Diablo, Eif1ad, Eif4g1, Fam32a,...","[Abcf1, Cdc37, Diablo, Eif1ad, Eif4g1, Fam32a,...","[224742, 12539, 66593, 69860, 208643, 67922, 7...","[ABCF1, CDC37, DBLOH, EIF1A, IF4G1, FA32A, KDM...","[Q6P542, Q61081, Q9JIQ3, Q3THJ3, Q6NZJ6, Q9CR8...",Single-cell sequencing,23892778,
4655,Human,Embryo,UBERON_0000922,Normal,Normal cell,Morula cell (Blastomere),CL_0000360,"[ADCK1, AGL, AIMP1, AKAP12, ARPC3, ATP1B3, ATP...","[ADCK1, AGL, AIMP1, AKAP12, ARPC3, ATP1B3, NA,...","[57143, 178, 9255, 9590, 10094, 483, NA, 586, ...","[ADCK1, GDE, AIMP1, AKA12, ARPC3, AT1B3, AT5F1...","[Q86TW2, P35573, Q12904, Q02952, O15145, P5470...",Single-cell sequencing,23892778,
4656,Mouse,Embryo,UBERON_0000922,Normal,Normal cell,Morula cell (Blastomere),CL_0000360,"[Aimp1, Atp5f1, Atp5h, Bcat1, Bin1, Ccbl2, Cct...","[Aimp1, Atp5f1, Atp5h, Bcat1, Bin1, Kyat3, Cct...","[13722, 11950, 71679, 12035, 30948, 229905, 12...","[AIMP1, AT5F1, ATP5H, BCAT1, BIN1, KAT3, TCPE,...","[P31230, Q9CQQ7, Q9DCX2, P24288, O08539, Q71RI...",Single-cell sequencing,23892778,


## Data distribition

In [473]:
all_cell_markers_df.speciesType.value_counts()

Human    2868
Mouse    1255
Name: speciesType, dtype: int64

In [474]:
all_cell_markers_df.tissueType.value_counts()

Undefined           695
Brain               254
Bone marrow         254
Lung                249
Liver               240
                   ... 
Fetal brain           1
Vocal fold            1
Nerve                 1
Primitive streak      1
Premolar              1
Name: tissueType, Length: 181, dtype: int64

In [475]:
all_cell_markers_df.UberonOntologyID.value_counts()

UBERON_0000955    254
UBERON_0002371    254
UBERON_0002048    249
UBERON_0002107    240
UBERON_0005408    215
                 ... 
UBERON_0003215      1
UBERON_0006530      1
UBERON_0001052      1
UBERON_0003922      1
UBERON_0005384      1
Name: UberonOntologyID, Length: 172, dtype: int64

In [476]:
all_cell_markers_df.cancerType.value_counts()

Normal                                      3315
Breast Cancer                                100
Colorectum Cancer                             36
Prostate Cancer                               35
Colon Cancer                                  34
                                            ... 
Non-Hodgkin's Lymphoma                         1
Non-small Cell Lung Cancer (circulating)       1
Gonadoblastoma                                 1
Vascular Tumour                                1
oligodendroglioma                              1
Name: cancerType, Length: 133, dtype: int64

In [477]:
all_cell_markers_df.cellType.value_counts()

Normal cell    3315
Cancer cell     808
Name: cellType, dtype: int64

In [478]:
all_cell_markers_df.cellName.value_counts()

Cancer stem cell                                             593
Stem cell                                                    186
Endothelial cell                                             157
Macrophage                                                   139
Mesenchymal stem cell                                        137
                                                            ... 
Neuron-restricted precursor                                    1
Limbal mesenchymal cell                                        1
Quiescent small intestinal stem cell                           1
Adipose multilineage-differentiating stress-enduring cell      1
CD4+ T follicular helper cell                                  1
Name: cellName, Length: 681, dtype: int64

In [479]:
all_cell_markers_df.CellOntologyID.value_counts()

CL_0000034    308
CL_0000134    181
CL_0000115    168
CL_0000235    141
CL_0000084    132
             ... 
CL_0000166      1
CL_0008020      1
CL_0000058      1
CL_0000210      1
CL_0000850      1
Name: CellOntologyID, Length: 281, dtype: int64

In [480]:
all_cell_markers_df.cellMarker.value_counts()

CD133                                                                 109
CD44                                                                   58
CD31                                                                   50
CD68                                                                   47
CD3                                                                    39
                                                                     ... 
BDCA1, CD11c                                                            1
BDCA2, CD123                                                            1
CLEC9A                                                                  1
TAGLN2                                                                  1
ASCL1, BOC, CCND2, CD24, CHD7, EGFR, NFIB, SOX11, SOX2, SOX4, TCF4      1
Name: cellMarker, Length: 2517, dtype: int64

In [481]:
all_cell_markers_df.geneSymbol.value_counts()

PROM1                                                                 109
CD44                                                                   54
PECAM1                                                                 45
CD68                                                                   39
CD34                                                                   30
                                                                     ... 
KRT18, KRT19                                                            1
LIN28A                                                                  1
PROM1, NCAM1                                                            1
CLEC4F, VSIG4                                                           1
ASCL1, BOC, CCND2, CD24, CHD7, EGFR, NFIB, SOX11, SOX2, SOX4, TCF4      1
Name: geneSymbol, Length: 2485, dtype: int64

In [482]:
all_cell_markers_df.geneID.value_counts()

8842                                                                     109
960                                                                       54
5175                                                                      45
968                                                                       39
947                                                                       30
                                                                        ... 
8842, 4684                                                                 1
165530, 11326                                                              1
59, 2670                                                                   1
Albumin family, 4311, 3875, 3856                                           1
429, 91653, 894, 100133941, 55636, 1956, 4781, 6664, 6657, 6659, 6925      1
Name: geneID, Length: 2487, dtype: int64

In [483]:
all_cell_markers_df.proteinName.value_counts()

PROM1                                                                 115
CD44                                                                   61
PECA1                                                                  57
CD68                                                                   47
[CD3D, CD3E, CD3G]                                                     39
                                                                     ... 
Albumin family, NEP, K1C18, K2C8                                        1
Albumin family, K2C8                                                    1
Albumin family, K1C18                                                   1
FETA, FBF1, HNF4A                                                       1
ASCL1, BOC, CCND2, CD24, CHD7, EGFR, NFIB, SOX11, SOX2, SOX4, ITF2      1
Name: proteinName, Length: 2350, dtype: int64

In [484]:
all_cell_markers_df.proteinID.value_counts()

O43490                                                                                    109
P16070                                                                                     54
P16284                                                                                     45
P34810                                                                                     39
P28906                                                                                     30
                                                                                         ... 
Q8N1N0, Q9Y279                                                                              1
P62736, P14136                                                                              1
Albumin family, P08473, P05783, P05787                                                      1
Albumin family, P05787                                                                      1
P50553, Q9BWV1, P30279, P25063, Q9P2D1, P00533, O00712, P357

In [485]:
all_cell_markers_df.markerResource.value_counts()

Experiment                3064
Single-cell sequencing     535
Company                    355
Review                     169
Name: markerResource, dtype: int64

In [486]:
all_cell_markers_df.PMID.value_counts()

Company     355
30018341     41
30093597     37
29802404     27
29545511     21
           ... 
28869524      1
17714779      1
18701045      1
20060164      1
16849681      1
Name: PMID, Length: 2379, dtype: int64

In [487]:
all_cell_markers_df.Company.value_counts()

ebioscience           77
miltenyibiotec        60
abcam                 53
biolegend             52
bio-rad-antibodies    47
rndsystems            35
labome                16
bdbiosciences         15
Name: Company, dtype: int64

## Missing value

In [488]:
all_cell_markers_df.isna().mean().sort_values(ascending=False)

Company             0.913898
CellOntologyID      0.210526
UberonOntologyID    0.180936
proteinName         0.013582
proteinID           0.013582
geneSymbol          0.012127
geneID              0.012127
speciesType         0.000000
tissueType          0.000000
cancerType          0.000000
cellType            0.000000
cellName            0.000000
cellMarker          0.000000
markerResource      0.000000
PMID                0.000000
dtype: float64

In [489]:
test = all_cell_markers_df.copy().dropna(subset=['geneSymbol', 'proteinID'])    #[['cellMarker', 'geneSymbol', 'geneID']]
test['cellMarker'] = test['cellMarker'].apply(lambda x: x.split(', '))
test['geneSymbol'] = test['geneSymbol'].apply(lambda x: x.split(', '))
test['geneID'] = test['geneID'].apply(lambda x: x.split(', '))
test['proteinID'] = test['proteinID'].apply(lambda x: x.split(', '))

test['len_cellMarker'] = test['cellMarker'].apply(len)
test['len_geneSymbol'] = test['geneSymbol'].apply(len)
test['len_geneID'] = test['geneID'].apply(len)
test['len_proteinID'] = test['proteinID'].apply(len)

In [490]:
x = test[~((test['len_cellMarker'] == test['len_geneSymbol']) & (test['len_geneSymbol'] == test['len_geneID']))]

In [491]:
x.iloc[0]['geneSymbol']

['ITGAM', 'CD14', 'ITGB2', '[FCGR2A', 'FCGR2B', 'FCGR2C]', 'CD68']

In [492]:
x.iloc[0]['cellMarker']

['CD11b', 'CD14', 'CD18', 'CD32', 'CD68']

In [493]:
x.iloc[0]['geneID']

['3684', '929', '3689', '[2212', '2213', '9103]', '968']

In [494]:
# the problem of spliting for the cell maker

In [495]:
test

Unnamed: 0,speciesType,tissueType,UberonOntologyID,cancerType,cellType,cellName,CellOntologyID,cellMarker,geneSymbol,geneID,proteinName,proteinID,markerResource,PMID,Company,len_cellMarker,len_geneSymbol,len_geneID,len_proteinID
0,Human,Kidney,UBERON_0002113,Normal,Normal cell,Proximal tubular cell,,[Intestinal Alkaline Phosphatase],[ALPI],[248],PPBI,[P09923],Experiment,9263997,,1,1,1,1
1,Human,Liver,UBERON_0002107,Normal,Normal cell,Ito cell (hepatic stellate cell),CL_0000632,[Synaptophysin],[SYP],[6855],SYPH,[P08247],Experiment,10595912,,1,1,1,1
2,Human,Endometrium,UBERON_0001295,Normal,Normal cell,Trophoblast cell,CL_0000351,[CEACAM1],[CEACAM1],[634],CEAM1,[P13688],Experiment,10751340,,1,1,1,1
3,Human,Germ,UBERON_0000923,Normal,Normal cell,Primordial germ cell,CL_0000670,[VASA],[DDX4],[54514],DDX4,[Q9NQI0],Experiment,10920202,,1,1,1,1
4,Human,Corneal epithelium,UBERON_0001772,Normal,Normal cell,Epithelial cell,CL_0000066,[KLF6],[KLF6],[1316],KLF6,[Q99612],Experiment,12407152,,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4118,Human,Embryo,UBERON_0000922,Normal,Normal cell,8-cell stage cell (Blastomere),CL_0000353,"[C11orf48, C19orf53, DHX9, DIABLO, EIF1AD, EIF...","[LBHD1, C19orf53, DHX9, DIABLO, EIF1AD, EIF4G1...","[79081, 28974, 1660, 56616, 84285, 1981, 26017...","LBHD1, L10K, DHX9, DBLOH, EIF1A, IF4G1, FA32A,...","[Q9BQE6, Q9UNZ5, Q08211, Q9NR28, Q8N9N8, Q0463...",Single-cell sequencing,23892778,,19,19,19,19
4119,Mouse,Embryo,UBERON_0000922,Normal,Normal cell,8-cell stage cell (Blastomere),CL_0000353,"[Abcf1, Cdc37, Diablo, Eif1ad, Eif4g1, Fam32a,...","[Abcf1, Cdc37, Diablo, Eif1ad, Eif4g1, Fam32a,...","[224742, 12539, 66593, 69860, 208643, 67922, 7...","ABCF1, CDC37, DBLOH, EIF1A, IF4G1, FA32A, KDM5...","[Q6P542, Q61081, Q9JIQ3, Q3THJ3, Q6NZJ6, Q9CR8...",Single-cell sequencing,23892778,,12,12,12,12
4120,Human,Embryo,UBERON_0000922,Normal,Normal cell,Morula cell (Blastomere),CL_0000360,"[ADCK1, AGL, AIMP1, AKAP12, ARPC3, ATP1B3, ATP...","[ADCK1, AGL, AIMP1, AKAP12, ARPC3, ATP1B3, NA,...","[57143, 178, 9255, 9590, 10094, 483, NA, 586, ...","ADCK1, GDE, AIMP1, AKA12, ARPC3, AT1B3, AT5F1,...","[Q86TW2, P35573, Q12904, Q02952, O15145, P5470...",Single-cell sequencing,23892778,,112,112,112,112
4121,Mouse,Embryo,UBERON_0000922,Normal,Normal cell,Morula cell (Blastomere),CL_0000360,"[Aimp1, Atp5f1, Atp5h, Bcat1, Bin1, Ccbl2, Cct...","[Aimp1, Atp5f1, Atp5h, Bcat1, Bin1, Kyat3, Cct...","[13722, 11950, 71679, 12035, 30948, 229905, 12...","AIMP1, AT5F1, ATP5H, BCAT1, BIN1, KAT3, TCPE, ...","[P31230, Q9CQQ7, Q9DCX2, P24288, O08539, Q71RI...",Single-cell sequencing,23892778,,37,37,37,37


In [496]:
_id : geneid


elastic_search: (integer field) number of participants

SyntaxError: invalid syntax (3694850896.py, line 4)