In [1]:
import pandas as pd
import zipfile
import os
import json

In [2]:
# Load the dataset
ftu_file_path = 'ref_data/FTU Cell Count Table - Cell_Type_Count.csv'
ftu_data = pd.read_csv(ftu_file_path)

# Filter the data where 'CT ID in CL' starts with 'CL'
ftu_data_clean = ftu_data.dropna(subset=['CT ID in CL'])

In [3]:
filtered_ftu_data = ftu_data_clean[ftu_data_clean['CT ID in CL'].str.startswith('CL')]
filtered_ftu_data

Unnamed: 0,Organ,FTU Label in Uberon,FTU ID in Uberon,CT Label in CL,CT ID in CL,CT Label in 2D Object,#AS,#CT
0,Kidney,Cortical Collecting Duct,UBERON:0004203,kidney cortex collecting duct intercalated cell,CL:1000715,Cortical Collecting Duct Intercalated Cell Type B,0,2
1,Kidney,Cortical Collecting Duct,UBERON:0004203,kidney collecting duct alpha-intercalated cell,CL:4030015,Cortical Collecting Duct Intercalated Cell Type A,0,4
2,Kidney,Cortical Collecting Duct,UBERON:0004203,peritubular capillary endothelial cell,CL:1001033,Endothelium Peritubular Capillary,0,5
3,Kidney,Cortical Collecting Duct,UBERON:0004203,kidney cortex collecting duct principal cell,CL:1000714,Cortical Collecting Duct Principal Cell,0,9
4,Kidney,Outer Medullary Collecting Duct,UBERON:0004204,kidney outer medulla collecting duct intercala...,CL:1000717,Outer Medullary Collecting Duct Intercalated C...,0,5
...,...,...,...,...,...,...,...,...
149,Thymus,Thymus Lobule,UBERON:0002125,memory B cell,CL:0000787,B Cell,0,8
150,Thymus,Thymus Lobule,UBERON:0002125,corticomedullary thymic epithelial cell,CL:0009070,Corticomedullary Thymic Epithelial Cell,0,45
151,Thymus,Thymus Lobule,UBERON:0002125,leukocyte,CL:0000738,White Blood Cell,0,10
152,Thymus,Thymus Lobule,UBERON:0002125,thymic medullary macrophage,CL:0000882,Thymic Medullary Macrophage,0,4


In [4]:
filtered_ftu_data['CT ID in CL'].nunique()

114

In [5]:
# Step 4: Extract unique organs from the FTU data
unique_organs = filtered_ftu_data['Organ'].unique()
unique_organs

array(['Kidney', 'Large Intestine', 'Liver', 'Lung', 'Pancreas',
       'Prostate', 'Skin', 'Small Intestine', 'Spleen', 'Thymus'],
      dtype=object)

In [21]:
organ_files = [
    'ref_data/ASCT_B/Kidney_v1.5 - Kidney_v1.5.csv',
    'ref_data/ASCT_B/Large_Intestine_v1.3 - Large_Intestine_v1.3.csv',
    'ref_data/ASCT_B/Lung_v1.5 - Lung_v1.5.csv',
    'ref_data/ASCT_B/Pancreas_v1.3 - Pancreas_v1.3.csv',
    'ref_data/ASCT_B/Prostate_v1.1 - Prostate_v1.1.csv',
    'ref_data/ASCT_B/Skin_v1.3 - Skin_v1.3.csv',
    'ref_data/ASCT_B/Small_Intestine_v1.1 - Small_Intestine_v1.1.csv',
    'ref_data/ASCT_B/Spleen_v1.3 - Spleen_v1.3.csv',
    'ref_data/ASCT_B/Thymus_v1.4 - Thymus_v1.4.csv',
    'ref_data/ASCT_B/Liver_v1.3 - Liver_v1.3.csv'
]

# Initialize an empty list to store the rows for the final CSV
refined_rows = []

# Iterate through each organ file
for organ_file in organ_files:
    organ_name = str(organ_file).split('/')[-1].split(' - ')[0].split('_v')[0].replace("_", " ")
    print(f'Processing data for organ: {organ_name}')
    organ_df = pd.read_csv(organ_file)
    
    # Strip any trailing spaces from the relevant columns
    if 'CT/1/ID' in organ_df.columns:
        organ_df['CT/1/ID'] = organ_df['CT/1/ID'].astype(str).str.strip()
    if 'CT/2/ID' in organ_df.columns:
        organ_df['CT/2/ID'] = organ_df['CT/2/ID'].astype(str).str.strip()
    
    # Normalize the organ name in the FTU data to match the organ file name format
    organ_specific_ftu_data = filtered_ftu_data[filtered_ftu_data['Organ'] == organ_name]

    # Iterate through each row in the organ-specific FTU data
    for _, row in organ_specific_ftu_data.iterrows():
        ct_id = row['CT ID in CL'].replace(' ', '')
        
        # Initialize an empty dataframe for matching rows
        matching_rows = pd.DataFrame()
        
        # Check if 'CT/1/ID' and 'CT/2/ID' columns exist in the organ dataframe and match the CT ID
        if 'CT/1/ID' in organ_df.columns:
            matching_rows = pd.concat([matching_rows, organ_df[organ_df['CT/1/ID'] == ct_id]], ignore_index=True)
        if 'CT/2/ID' in organ_df.columns:
            matching_rows = pd.concat([matching_rows, organ_df[organ_df['CT/2/ID'] == ct_id]], ignore_index=True)
        
        # Iterate through matching rows and extract required data
        for _, match in matching_rows.iterrows():
            # Concatenate the AS parts with '-->'
            as_parts = []
            for i in range(1, 7):
                as_label_col = f'AS/{i}/LABEL'
                if as_label_col in match and pd.notna(match[as_label_col]):
                    as_parts.append(match[as_label_col])
            ftu_partonomy = '-->'.join(as_parts)
            
            # Determine the correct cell ID and label
            cell_id = match['CT/1/ID'] if 'CT/1/ID' in match and match['CT/1/ID'] == ct_id else (match['CT/2/ID'] if 'CT/2/ID' in match and match['CT/2/ID'] == ct_id else "")
            cell_label = match['CT/1'] if 'CT/1/ID' in match and match['CT/1/ID'] == ct_id else (match['CT/2'] if 'CT/2/ID' in match and match['CT/2/ID'] == ct_id else "")
            
            # Concatenate gene biomarkers
            bgene = ';'.join([match[f'BGene/{i}'] for i in range(1, 9) if f'BGene/{i}' in match and pd.notna(match[f'BGene/{i}'])])
            
            # Concatenate protein biomarkers
            bprotein = ';'.join([match[f'BProtein/{i}'] for i in range(1, 5) if f'BProtein/{i}' in match and pd.notna(match[f'BProtein/{i}'])])
            
            # Create a row for the final CSV
            refined_row = [
                ftu_partonomy,  # FTU Partonomy
                row['CT Label in CL'],  # Cell Type
                cell_id,        # Cell ID
                bgene,          # Biomarker - Gene
                bprotein        # Biomarker - Protein
            ]
            
            refined_rows.append(refined_row)

# Convert the list of rows into a DataFrame
ftu_partonomy_content = pd.DataFrame(refined_rows, columns=[
    'FTU Partonomy', 'Cell Type', 'Cell ID', 'Biomarker - Gene', 'Biomarker - Protein'
])

# Display the first few rows of the refined DataFrame
ftu_partonomy_content.head()

Processing data for organ: Kidney
Processing data for organ: Large Intestine
Processing data for organ: Lung
Processing data for organ: Pancreas
Processing data for organ: Prostate
Processing data for organ: Skin
Processing data for organ: Small Intestine
Processing data for organ: Spleen
Processing data for organ: Thymus
Processing data for organ: Liver


Unnamed: 0,FTU Partonomy,Cell Type,Cell ID,Biomarker - Gene,Biomarker - Protein
0,kidney-->renal collecting system-->collecting ...,kidney cortex collecting duct intercalated cell,CL:1000715,SLC26A7;SLC4A1;KIT;AQP6,
1,kidney-->renal collecting system-->collecting ...,kidney cortex collecting duct intercalated cell,CL:1000715,SLC4A9;SLC26A4,
2,kidney-->renal collecting system-->collecting ...,kidney collecting duct alpha-intercalated cell,CL:4030015,SLC4A1;SLC26A7;TMEM213;KIT;AQP6,
3,kidney-->kidney vasculature-->kidney capillary...,peritubular capillary endothelial cell,CL:1001033,RAMP3;PLVAP;DNASE1L3,CD31
4,kidney-->cortex of kidney-->medullary ray-->co...,kidney cortex collecting duct principal cell,CL:1000714,AQP2;AQP3;FXYD4,AQP2


In [24]:
ftu_partonomy_content.shape

(340, 5)

In [19]:
# Get the unique CT IDs from the refined data and filtered FTU data
refined_ct_ids = set(ftu_partonomy_content['Cell ID'].unique())
filtered_ct_ids = set(filtered_ftu_data['CT ID in CL'].unique())

# Find the difference
# CT IDs present in filtered FTU data but not in refined FTU partonomy data
ct_ids_in_filtered_not_in_refined = filtered_ct_ids - refined_ct_ids

# CT IDs present in refined FTU partonomy data but not in filtered FTU data
ct_ids_in_refined_not_in_filtered = refined_ct_ids - filtered_ct_ids

# Display the results
ct_ids_in_filtered_not_in_refined, ct_ids_in_refined_not_in_filtered

({'CL:0000084', 'CL:0000738', 'CL:0000787', 'CL:0002144'}, set())

In [22]:
# Step 1: Extract and load the JSON file
zip_file_path = 'ref_data/hra-asctb-all.v2.1.zip'
extracted_path = 'ref_data/hra-asctb-all.v2.1'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

json_file_path = os.path.join(extracted_path, 'hra-asctb-all.v2.1.json')

# Specify the encoding to avoid UnicodeDecodeError
with open(json_file_path, 'r', encoding='utf-8') as json_file:
    json_data = json.load(json_file)

# Step 4: Extract unique organs from the FTU data
unique_organs = filtered_ftu_data['Organ'].unique()

# Initialize a list to store the refined rows
refined_rows_ct_only = []

# Step 5: Process only the matching organs from the JSON data
# Step 5: Process only the matching organs from the JSON data
for organ in unique_organs:
    organ_key = organ.lower().replace(" ", "-")  # Adjust the organ name to match JSON keys
    if organ_key in json_data:
        organ_data_list = json_data[organ_key].get('data', [])
        
        # Subset the filtered FTU data for the current organ
        organ_specific_ftu_data = filtered_ftu_data[filtered_ftu_data['Organ'] == organ]

        # Process each entry in the organ data list
        for entry in organ_data_list:
            # Process each cell type in the entry
            for cell_type in entry['cell_types']:
                cell_id = cell_type.get('id', '').replace(' ', '').strip()
                cell_label = cell_type.get('name', '').strip()

                # Match with organ-specific filtered FTU data
                for _, ftu_row in organ_specific_ftu_data.iterrows():
                    if cell_id == ftu_row['CT ID in CL'].replace(' ', '').strip():
                        # Build the FTU Partonomy using all AS labels
                        anatomical_structures = [as_entry['name'] for as_entry in entry['anatomical_structures'] if 'name' in as_entry]
                        ftu_partonomy = '-->'.join(anatomical_structures)
                                
                        # Concatenate gene biomarkers
                        bgene = ';'.join([bm['name'].strip() for bm in entry['biomarkers_gene'] if bm.get('b_type') == 'gene'])

                        # Concatenate protein biomarkers
                        bprotein = ';'.join([bm['name'].strip() for bm in entry['biomarkers_protein'] if bm.get('b_type') == 'protein'])

                        # Create the refined row
                        refined_row = [
                            ftu_partonomy,  # FTU Partonomy
                            ftu_row['CT Label in CL'].strip(),  # Cell Type
                            cell_id,        # CT/<x> ID
                            cell_label,     # CT/<x>/Label
                            bgene,          # Biomarker - Gene
                            bprotein        # Biomarker - Protein
                        ]
                                
                        refined_rows_ct_only.append(refined_row)

# Convert the list of rows into a DataFrame
refined_ftu_partonomy_data_ct_only = pd.DataFrame(refined_rows_ct_only, columns=[
    'FTU Partonomy', 'Cell Type', 'CT ID', 'CT Label', 'Biomarker - Gene', 'Biomarker - Protein'
])

refined_ftu_partonomy_data_ct_only.head()

Unnamed: 0,FTU Partonomy,Cell Type,CT ID,CT Label,Biomarker - Gene,Biomarker - Protein
0,kidney-->cortex of kidney-->outer cortex of ki...,glomerular visceral epithelial cell,CL:0000653,Podocyte,NPHS2;PODXL;NPHS1;PTPRQ,
1,kidney-->cortex of kidney-->juxtamedullary cor...,glomerular visceral epithelial cell,CL:0000653,Podocyte,NPHS2;PODXL;NPHS1;PTPRQ,PODXL
2,kidney-->cortex of kidney-->medullary ray-->Co...,kidney cortex collecting duct principal cell,CL:1000714,Cortical Collecting Duct Principal Cell,AQP2;AQP3;FXYD4,AQP2
3,kidney-->cortex of kidney-->medullary ray-->Co...,kidney cortex collecting duct principal cell,CL:1000714,Cortical Collecting Duct Principal Cell,AQP2;AQP3;FXYD4,AQP2
4,kidney-->renal medulla-->inner medulla of kidn...,vasa recta descending limb cell,CL:1001285,Descending Vasa Recta Endothelial Cell,SERPINE2;TM4SF1;PALMD;ADAMTS6,UMOD


In [23]:
refined_ftu_partonomy_data_ct_only.shape

(340, 6)

In [25]:
# Get the unique CT IDs from the refined data and filtered FTU data
refined_ct_ids = set(refined_ftu_partonomy_data_ct_only['CT ID'].unique())
filtered_ct_ids = set(filtered_ftu_data['CT ID in CL'].unique())

# Find the difference
# CT IDs present in filtered FTU data but not in refined FTU partonomy data
ct_ids_in_filtered_not_in_refined = filtered_ct_ids - refined_ct_ids

# CT IDs present in refined FTU partonomy data but not in filtered FTU data
ct_ids_in_refined_not_in_filtered = refined_ct_ids - filtered_ct_ids

# Display the results
ct_ids_in_filtered_not_in_refined, ct_ids_in_refined_not_in_filtered

({'CL:0000084', 'CL:0000738', 'CL:0000787', 'CL:0002144'}, set())

In [26]:
# Filter the filtered_ftu_data DataFrame to get rows where 'CT ID in CL' is in ct_ids_in_filtered_not_in_refined
data_filtered_not_in_refined = ftu_partonomy_content[ftu_partonomy_content['Cell ID'].isin(ct_ids_in_filtered_not_in_refined)]

# Display the resulting DataFrame
data_filtered_not_in_refined

Unnamed: 0,FTU Partonomy,Cell Type,Cell ID,Biomarker - Gene,Biomarker - Protein


In [27]:
ftu_partonomy_content['Cell ID'].nunique(), refined_ftu_partonomy_data_ct_only['CT ID'].nunique()

(110, 110)

In [28]:
diff = set(filtered_ftu_data['CT ID in CL']) - set(ftu_partonomy_content['Cell ID'])
diff

{'CL:0000084', 'CL:0000738', 'CL:0000787', 'CL:0002144'}

In [29]:
diff = set(filtered_ftu_data['CT ID in CL']) - set(refined_ftu_partonomy_data_ct_only['CT ID'])
diff

{'CL:0000084', 'CL:0000738', 'CL:0000787', 'CL:0002144'}

In [31]:
# Save the DataFrames to a CSV file
output_path_files = 'output/trios/asctb_ftu-partonomy-ct-info.csv'
output_path_json = 'output/trios/asctb_ftu-partonomy-ct-info_v1.csv'

ftu_partonomy_content.to_csv(output_path_files, index=False, sep='\t')
refined_ftu_partonomy_data_ct_only.to_csv(output_path_json, index=False, sep='\t')

print(f"Final CSV file saved.")

Final CSV file saved.


In [34]:
# Load the dataset
ftu_trios_by_bruce = 'ref_data/asctb-trios-by-ftu.csv'
ftu_trios_by_bruce_data = pd.read_csv(ftu_trios_by_bruce)
ftu_trios_by_bruce_data.head()

Unnamed: 0,ftu,table,ftu_label,as_label,ct_label,bm_label,ftu_as,as,ct,bm,bmType
0,https://purl.humanatlas.io/2d-ftu/kidney-ascen...,https://purl.humanatlas.io/asct-b/kidney,kidney ascending thin loop of henle,,,,http://purl.obolibrary.org/obo/UBERON_0004193,,,,
1,https://purl.humanatlas.io/2d-ftu/kidney-corti...,https://purl.humanatlas.io/asct-b/kidney,kidney cortical collecting duct,,,,http://purl.obolibrary.org/obo/UBERON_0004203,,,,
2,https://purl.humanatlas.io/2d-ftu/kidney-desce...,https://purl.humanatlas.io/asct-b/kidney,kidney descending thin loop of henle,descending thin limb of loop of Henle,Descending Thin Limb Cell,SPP1,http://purl.obolibrary.org/obo/UBERON_0001289,http://purl.obolibrary.org/obo/UBERON_0005096,http://purl.obolibrary.org/obo/CL_1001111,http://identifiers.org/hgnc/11255,gene
3,https://purl.humanatlas.io/2d-ftu/kidney-desce...,https://purl.humanatlas.io/asct-b/kidney,kidney descending thin loop of henle,descending thin limb of loop of Henle,Descending Thin Limb Cell,CDH1,http://purl.obolibrary.org/obo/UBERON_0001289,http://purl.obolibrary.org/obo/UBERON_0005096,http://purl.obolibrary.org/obo/CL_1001111,http://identifiers.org/hgnc/1748,gene
4,https://purl.humanatlas.io/2d-ftu/kidney-desce...,https://purl.humanatlas.io/asct-b/kidney,kidney descending thin loop of henle,descending thin limb of loop of Henle,Descending Thin Limb Cell,CRYAB,http://purl.obolibrary.org/obo/UBERON_0001289,http://purl.obolibrary.org/obo/UBERON_0005096,http://purl.obolibrary.org/obo/CL_1001111,http://identifiers.org/hgnc/2389,gene


In [38]:
ftu_trios_by_bruce_data['ct_label'].unique()

array([nan, 'Descending Thin Limb Cell',
       'Descending Thin Limb Cell Type 1',
       'Descending Thin Limb Cell Type 2',
       'Descending Thin Limb Cell Type 3', 'Podocyte',
       'Parietal Epithelial Cell', 'Mesangial Cell',
       'Glomerular Capillary Endothelial Cell',
       'Proximal Tubule Epithelial Cell',
       'Medullary Thick Ascending Limb Cell',
       'Distal Convoluted Tubule Cell Type 1', 'Ascending Thin Limb Cell',
       'Connecting Tubule Principal Cell', 'Connecting Tubule Cell',
       'Connecting Tubule Intercalated Cell Type A',
       'Distal Convoluted Tubule Cell',
       'Proximal Tubule Epithelial Cell Segment 2',
       'Cortical Thick Ascending Limb Cell', 'Intercalated Cell Type B',
       'Proximal Tubule Cell Epithelial Segment 3',
       'Thick Ascending Limb Cell', 'Connecting Tubule Intercalated Cell',
       'Macula Densa cell', 'Distal Convoluted Tubule Cell Type 2',
       'Proximal Tubule Epithelial Cell Segment 1', 'Neutrophil',
      