In [523]:
import pandas as pd
import numpy as np

In [524]:
# Load the four tab-separated marker tables.
# The separator is a literal tab: the previous raw string r'\t' is two characters
# long, which pandas interprets as a regular expression and therefore parses with
# the slower Python engine (emitting a ParserWarning).
all_cell_markers_df = pd.read_csv('../data/all_cell_markers.txt', sep='\t')
all_human_markers_df = pd.read_csv('../data/Human_cell_markers.txt', sep='\t')
all_mouse_markers_df = pd.read_csv('../data/Mouse_cell_markers.txt', sep='\t')
all_singleCell_markers_df = pd.read_csv('../data/Single_cell_markers.txt', sep='\t')

  return func(*args, **kwargs)


# Data Schema
- **speciesType**: the species from which the data originates
    - only two values occur: `Human` or `Mouse`
- **tissueType**: the type of tissue from which the data originates
    - 181 distinct tissue types in total
    - many records have the value `Undefined`
- **UberonOntologyID**: the universal unique identifier of the anatomical structure found in animals
    - to be confirmed with the team
    - contains missing values
- **cancerType**: the cancer the cell marker is associated with
    - non-cancer cells are labelled `Normal`
- **cellName**: the English name of the cell that the marker belongs to
- **CellOntologyID**: the universal unique identifier of the cell that the marker belongs to
    - contains missing values
- **cellMarker**: a marker molecule of the cell
    - stored as a list-like string; can be converted to a list
- **geneSymbol**: gene expression of the cell marker
- **geneID**: the universal unique identifier of the gene
    - contains missing values
- **proteinName**: name of the protein
    - contains missing values
- **proteinID**: the universal unique identifier of the protein
- **markerResource**: the type of resource or methodology used to identify the marker
    - only four values occur: `Experiment`, `Single-cell sequencing`, `Company` or `Review`
- **PMID**: the PubMed ID of the publication or study where the marker data was reported
    - contains the non-numeric placeholder `Company`
- **Company**: the company associated with the resource

# Exploration on all_cell_markers
Note: all code is modularized, so you can jump directly to a specific section without running the previous sections

EDA outlines:
- Confirming hypothesis: `all_cell_markers` contains all the data from `all_human_markers` and `all_mouse_markers`
- Confirming hypothesis: none of the data from `all_singleCell_markers` exists in `all_cell_markers`
- Revealing trend: association between missing `UberonOntologyID` and `tissueType`
- Revealing trend: association between missing `CellOntologyID` and `cellName`
- Confirming hypothesis: if `geneSymbol` is missing, then `geneID` is missing as well (and vice versa)
- Confirming hypothesis: if `proteinName` is missing, then `proteinID` is missing as well (and vice versa)
- Confirming hypothesis: the list-valued columns correspond to each other element by element
- Confirming hypothesis: if `markerResource` is `Company`, then `PMID` is `Company` and `Company` is not missing

## Confirm Hypothesis
`all_cell_markers` contain all the data from `all_human_markers`, `all_mouse_markers`

In [525]:
# Collapse every row into a tuple so whole records can be compared across tables.
all_cell_marker_tuple = all_cell_markers_df.apply(tuple, axis=1)
all_human_marker_tuple = all_human_markers_df.apply(tuple, axis=1)
all_mouse_markers_tuple = all_mouse_markers_df.apply(tuple, axis=1)

# Human table: report whether each record is also present in the combined table.
if all_human_marker_tuple.isin(all_cell_marker_tuple).all():
    print("all value in all_human_marker dataset exist in all_cell_marker dataset")
else:
    # Count the records that were NOT found.
    mismatch_value = (~all_human_marker_tuple.isin(all_cell_marker_tuple)).sum()
    print(f"there are {mismatch_value} from all_human_marker that does not exist in all_cell_marker dataset")

# Mouse table: same membership test against the combined table.
if all_mouse_markers_tuple.isin(all_cell_marker_tuple).all():
    print("all value in all_mouse_markers dataset exist in all_cell_marker dataset")
else:
    mismatch_value = (~all_mouse_markers_tuple.isin(all_cell_marker_tuple)).sum()
    print(f"there are {mismatch_value} from all_mouse_markers that does not exist in all_cell_marker dataset")



all value in all_human_marker dataset exist in all_cell_marker dataset
all value in all_mouse_markers dataset exist in all_cell_marker dataset


## Confirm Hypothesis
None of the data from `all_singleCell_markers` exist in `all_cell_markers`

In [526]:
# Collapse every row into a tuple so whole records can be compared across tables.
all_cell_marker_tuple = all_cell_markers_df.apply(tuple, axis=1)
all_singleCell_markers_tuple = all_singleCell_markers_df.apply(tuple, axis=1)

# Membership of each single-cell record in the combined table (computed once).
singleCell_found = all_singleCell_markers_tuple.isin(all_cell_marker_tuple)

if singleCell_found.all():
    print("all value in all_singleCell_markers dataset exist in all_cell_marker dataset")
else:
    mismatch_value = (~singleCell_found).sum()
    # Report the real table size; previously the mismatch count was printed here,
    # so the two lines could never differ and the comparison was meaningless.
    print(f'there are in total {len(all_singleCell_markers_tuple)} records in all_singleCell_markers')
    print(f"there are {mismatch_value} from all_singleCell_markers that does not exist in all_cell_marker dataset")

there are in total 535 records in all_singleCell_markers
there are 535 from all_singleCell_markers that does not exist in all_cell_marker dataset


## Revealing trend
association between missing `UberonOntologyID` and `tissueType`

most of the records with a missing ID have an `Undefined` tissue type

In [527]:
# Stack the combined table and the single-cell table under a fresh 0..n-1 index.
cell_markers_df = pd.concat(
    [all_cell_markers_df, all_singleCell_markers_df],
    axis=0,
    ignore_index=True,
)

# Tissue types of the rows without an Uberon ID, most frequent first.
cell_markers_df.loc[cell_markers_df['UberonOntologyID'].isna(), 'tissueType'].value_counts()

Undefined                      701
Fetal liver                     29
Fetal gonad                     26
Embryonic prefrontal cortex     12
Bladder                          5
Sinonasal mucosa                 2
Fetal brain                      1
Osteoarthritic cartilage         1
Name: tissueType, dtype: int64

## Revealing trend
association between missing `CellOntologyID` and `cellName`

In [528]:
# Stack the combined table and the single-cell table under a fresh 0..n-1 index.
cell_markers_df = pd.concat(
    [all_cell_markers_df, all_singleCell_markers_df],
    axis=0,
    ignore_index=True,
)

# Cell names of the rows without a Cell Ontology ID, most frequent first.
cell_markers_df.loc[cell_markers_df['CellOntologyID'].isna(), 'cellName'].value_counts()

Cancer stem cell                    596
Progenitor cell                      53
Cancer stem-like cell                 9
Neural progenitor cell                8
Cardiac progenitor cell               8
                                   ... 
Foxp3+IL-17+ T cell                   1
Definitive zone cell                  1
Bone marrow stem cell                 1
PDX1+ pancreatic progenitor cell      1
Adipogenic progenitor cell            1
Name: cellName, Length: 142, dtype: int64

## Confirming hypothesis
if `geneSymbol` is missing, then `geneID` is missing as well (and vice versa)

In [529]:
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)

# True for a row when geneSymbol and geneID are either both missing or both present.
# Comparing the two NaN masks is the vectorised equivalent of the row-wise check.
check_same = cell_markers_df['geneSymbol'].isna() == cell_markers_df['geneID'].isna()

# The messages name the gene columns that are actually tested here
# (they previously mentioned the protein columns).
if check_same.all():
    print('if there is a missing geneSymbol, then there must be missing geneID')
else:
    print('if there is a missing geneSymbol then there might exist a geneID')

if there is a missing proteinSymbol, then there must be missing proteinID


## Confirming hypothesis
if `proteinName` is missing, then `proteinID` is missing as well (and vice versa)

In [530]:
# Stack the combined table and the single-cell table under a fresh 0..n-1 index.
cell_markers_df = pd.concat(
    [all_cell_markers_df, all_singleCell_markers_df],
    axis=0,
    ignore_index=True,
)

# A row passes when proteinName and proteinID are both missing or both present,
# i.e. when their NaN flags agree.
check_same = cell_markers_df['proteinName'].isna() == cell_markers_df['proteinID'].isna()

if check_same.all():
    print('if there is a missing proteinSymbol, then there must be missing proteinID')
else:
    print('if there is a missing symbol then there might exist a ID')

if there is a missing proteinSymbol, then there must be missing proteinID


## Confirming hypothesis
each existing cellMarker has a gene name associated with it

In [531]:
# Rebuild the combined frame (all_cell_markers rows followed by the single-cell rows,
# index renumbered) so the columns overwritten further down in this cell start from
# the raw strings every time the cell is run.
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)
def str_to_list(listLikeStr):
    """Parse a comma-separated, optionally bracket-grouped string into a list.

    Apostrophes are removed, the text is split on commas and every item is
    stripped. Items wrapped in ``[...]`` are collected into a nested list, so
    ``"A, [B, C], D"`` becomes ``["A", ["B", "C"], "D"]``. Empty items outside
    of a bracket boundary are skipped.

    Parameters
    ----------
    listLikeStr : object
        The raw cell value. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    list or float
        The parsed list, or ``np.nan`` when the input is not a string.
    """
    # np.nan (lowercase) is the spelling that exists in every NumPy version;
    # the np.NaN alias was removed in NumPy 2.0.
    if not isinstance(listLikeStr, str):
        return np.nan

    cleaned_listLikeStr = listLikeStr.replace("'", "")

    is_nested = False
    result = []
    nested_list = []
    for element in cleaned_listLikeStr.split(','):
        token = element.strip()
        if is_nested:
            if "]" in token:
                # Closing item of the current group.
                nested_list.append(token.replace(']', "").strip())
                result.append(nested_list)
                nested_list = []
                is_nested = False
            elif token:
                nested_list.append(token)
        elif "[" in token:
            opened = token.replace('[', "").strip()
            if "]" in opened:
                # One-item group such as "[A]": it opens and closes in the same
                # token, so emit it right away instead of staying in nested mode
                # (which used to swallow or drop every following item).
                result.append([opened.replace(']', "").strip()])
            else:
                nested_list.append(opened)
                is_nested = True
        elif token:
            result.append(token)

    # An unterminated "[..." group is kept rather than silently discarded.
    if is_nested:
        result.append(nested_list)

    return result
        

# Columns stored as list-like strings; parse each one in place
# (str_to_list leaves non-string cells as NaN).
to_convert_col  = ['cellMarker', 'geneSymbol', 'geneID', 'proteinName', 'proteinID']
for col in to_convert_col:
    cell_markers_df[col] = cell_markers_df[col].apply(str_to_list)

# Number of top-level entries per cell; a missing value counts as 0.
# Series.map per column avoids the deprecated DataFrame.applymap.
len_count = cell_markers_df[to_convert_col].apply(
    lambda column: column.map(lambda x: len(x) if isinstance(x, list) else 0)
)

# A row is consistent when all five columns hold the same number of entries.
mistamch_index = len_count[len_count.nunique(axis = 1) != 1].index

# Branch on the mismatch itself. The previous condition tested whether *no* row
# had all five columns populated, which is unrelated to the lengths agreeing.
if len(mistamch_index) == 0:
    print("each existed cellMarkers have a gene name assoicated it")
else:
    print('some gene has mis-match sequence')
    print(f"for these following index: {mistamch_index}")

some gene has mis-match sequence
for these following index: Int64Index([  64,   65,   66,  101,  214,  332,  412,  483,  490,  498,  524,
             525,  579,  672,  688,  689,  725,  727,  795,  815,  868,  872,
             922,  945, 1042, 1045, 1070, 1086, 1215, 1260, 1366, 1394, 1496,
            1524, 1525, 1779, 1850, 1854, 1856, 1883, 1993, 2187, 2253, 2254,
            2255, 2256, 2381, 2458, 2523, 2537, 2753, 3135, 3151, 3226, 3227,
            3228, 3243, 3350, 3557, 3558, 3632, 3710, 3733, 3741, 3844, 3908,
            3918, 3932, 3973, 3977, 3983, 4005, 4099, 4167, 4245, 4268, 4276,
            4379, 4443, 4453, 4467, 4508, 4512, 4518, 4540, 4634],
           dtype='int64')


In [532]:
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)

cell_marker_iscompany = cell_markers_df[['markerResource', 'PMID', 'Company']].assign(is_company = cell_markers_df['markerResource'] == 'Company')

# Expected format:
#   company rows     -> PMID is the literal "Company" and Company is filled in
#   non-company rows -> PMID is anything else and Company is NaN
# Columns are addressed by label; positional access such as row[-1] on a labelled
# Series is deprecated in recent pandas.
pmid_is_company = cell_marker_iscompany['PMID'] == "Company"
has_company = cell_marker_iscompany['Company'].notna()
matches_format = (
    (cell_marker_iscompany['is_company'] & pmid_is_company & has_company)
    | (~cell_marker_iscompany['is_company'] & ~pmid_is_company & ~has_company)
)

# Only claim success when nothing deviates; the success line used to be printed
# unconditionally, even after offending rows had been listed.
unexpected_rows = cell_marker_iscompany[~matches_format]
if unexpected_rows.empty:
    print('all row match the expected format')
else:
    print(unexpected_rows)
    print(f'{len(unexpected_rows)} rows do not match the expected format')

all row match the expected format


# Parsing procedure

Data Parsing Outlines
- concatenate `all_cell_markers` and `all_singleCell_markers`
- replace every `Undefined` tissue type with NaN
- convert all list-like strings into lists

In [533]:
# Load the four tab-separated marker tables.
# The separator is a literal tab: the previous raw string r'\t' is two characters
# long, which pandas interprets as a regular expression and therefore parses with
# the slower Python engine (emitting a ParserWarning).
all_cell_markers_df = pd.read_csv('../data/all_cell_markers.txt', sep='\t')
all_human_markers_df = pd.read_csv('../data/Human_cell_markers.txt', sep='\t')
all_mouse_markers_df = pd.read_csv('../data/Mouse_cell_markers.txt', sep='\t')
all_singleCell_markers_df = pd.read_csv('../data/Single_cell_markers.txt', sep='\t')

In [534]:
def str_to_list(listLikeStr):
    """Parse a comma-separated, optionally bracket-grouped string into a list.

    Apostrophes are removed, the text is split on commas and every item is
    stripped. Items wrapped in ``[...]`` are collected into a nested list, so
    ``"A, [B, C], D"`` becomes ``["A", ["B", "C"], "D"]``. Empty items outside
    of a bracket boundary are skipped.

    Parameters
    ----------
    listLikeStr : object
        The raw cell value. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    list or float
        The parsed list, or ``np.nan`` when the input is not a string.
    """
    # np.nan (lowercase) is the spelling that exists in every NumPy version;
    # the np.NaN alias was removed in NumPy 2.0.
    if not isinstance(listLikeStr, str):
        return np.nan

    cleaned_listLikeStr = listLikeStr.replace("'", "")

    is_nested = False
    result = []
    nested_list = []
    for element in cleaned_listLikeStr.split(','):
        token = element.strip()
        if is_nested:
            if "]" in token:
                # Closing item of the current group.
                nested_list.append(token.replace(']', "").strip())
                result.append(nested_list)
                nested_list = []
                is_nested = False
            elif token:
                nested_list.append(token)
        elif "[" in token:
            opened = token.replace('[', "").strip()
            if "]" in opened:
                # One-item group such as "[A]": it opens and closes in the same
                # token, so emit it right away instead of staying in nested mode
                # (which used to swallow or drop every following item).
                result.append([opened.replace(']', "").strip()])
            else:
                nested_list.append(opened)
                is_nested = True
        elif token:
            result.append(token)

    # An unterminated "[..." group is kept rather than silently discarded.
    if is_nested:
        result.append(nested_list)

    return result

In [535]:
# Stack the combined table and the single-cell table under a fresh 0..n-1 index.
cell_markers_df = pd.concat([all_cell_markers_df, all_singleCell_markers_df], axis = 0, ignore_index=True)

# Turn the 'Undefined' placeholder in tissueType into a real missing value.
# np.nan (lowercase) is used because the np.NaN alias was removed in NumPy 2.0.
cell_markers_df['tissueType'] = cell_markers_df['tissueType'].replace('Undefined', np.nan)

# Parse the list-like string columns into Python lists (non-string cells become NaN).
to_convert_col  = ['cellMarker', 'geneSymbol', 'geneID', 'proteinName', 'proteinID']
for col in to_convert_col:
    cell_markers_df[col] = cell_markers_df[col].apply(str_to_list)

In [536]:
# Show the parsed frame as this cell's output.
cell_markers_df

Unnamed: 0,speciesType,tissueType,UberonOntologyID,cancerType,cellType,cellName,CellOntologyID,cellMarker,geneSymbol,geneID,proteinName,proteinID,markerResource,PMID,Company
0,Human,Kidney,UBERON_0002113,Normal,Normal cell,Proximal tubular cell,,[Intestinal Alkaline Phosphatase],[ALPI],[248],[PPBI],[P09923],Experiment,9263997,
1,Human,Liver,UBERON_0002107,Normal,Normal cell,Ito cell (hepatic stellate cell),CL_0000632,[Synaptophysin],[SYP],[6855],[SYPH],[P08247],Experiment,10595912,
2,Human,Endometrium,UBERON_0001295,Normal,Normal cell,Trophoblast cell,CL_0000351,[CEACAM1],[CEACAM1],[634],[CEAM1],[P13688],Experiment,10751340,
3,Human,Germ,UBERON_0000923,Normal,Normal cell,Primordial germ cell,CL_0000670,[VASA],[DDX4],[54514],[DDX4],[Q9NQI0],Experiment,10920202,
4,Human,Corneal epithelium,UBERON_0001772,Normal,Normal cell,Epithelial cell,CL_0000066,[KLF6],[KLF6],[1316],[KLF6],[Q99612],Experiment,12407152,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4653,Human,Embryo,UBERON_0000922,Normal,Normal cell,8-cell stage cell (Blastomere),CL_0000353,"[C11orf48, C19orf53, DHX9, DIABLO, EIF1AD, EIF...","[LBHD1, C19orf53, DHX9, DIABLO, EIF1AD, EIF4G1...","[79081, 28974, 1660, 56616, 84285, 1981, 26017...","[LBHD1, L10K, DHX9, DBLOH, EIF1A, IF4G1, FA32A...","[Q9BQE6, Q9UNZ5, Q08211, Q9NR28, Q8N9N8, Q0463...",Single-cell sequencing,23892778,
4654,Mouse,Embryo,UBERON_0000922,Normal,Normal cell,8-cell stage cell (Blastomere),CL_0000353,"[Abcf1, Cdc37, Diablo, Eif1ad, Eif4g1, Fam32a,...","[Abcf1, Cdc37, Diablo, Eif1ad, Eif4g1, Fam32a,...","[224742, 12539, 66593, 69860, 208643, 67922, 7...","[ABCF1, CDC37, DBLOH, EIF1A, IF4G1, FA32A, KDM...","[Q6P542, Q61081, Q9JIQ3, Q3THJ3, Q6NZJ6, Q9CR8...",Single-cell sequencing,23892778,
4655,Human,Embryo,UBERON_0000922,Normal,Normal cell,Morula cell (Blastomere),CL_0000360,"[ADCK1, AGL, AIMP1, AKAP12, ARPC3, ATP1B3, ATP...","[ADCK1, AGL, AIMP1, AKAP12, ARPC3, ATP1B3, NA,...","[57143, 178, 9255, 9590, 10094, 483, NA, 586, ...","[ADCK1, GDE, AIMP1, AKA12, ARPC3, AT1B3, AT5F1...","[Q86TW2, P35573, Q12904, Q02952, O15145, P5470...",Single-cell sequencing,23892778,
4656,Mouse,Embryo,UBERON_0000922,Normal,Normal cell,Morula cell (Blastomere),CL_0000360,"[Aimp1, Atp5f1, Atp5h, Bcat1, Bin1, Ccbl2, Cct...","[Aimp1, Atp5f1, Atp5h, Bcat1, Bin1, Kyat3, Cct...","[13722, 11950, 71679, 12035, 30948, 229905, 12...","[AIMP1, AT5F1, ATP5H, BCAT1, BIN1, KAT3, TCPE,...","[P31230, Q9CQQ7, Q9DCX2, P24288, O08539, Q71RI...",Single-cell sequencing,23892778,
