In [25]:
import pandas as pd
import numpy as np
from difflib import get_close_matches

In [26]:
ftu = '../ref_data/FTU Cell Count Table - Cell_Type_Count.csv'

In [27]:
ftu_df = pd.read_csv(ftu, sep = '\t')

In [28]:
ftu_df.head(1)

Unnamed: 0,Organ,Organ_ID,FTU Label in Uberon,FTU ID in Uberon,CT Label in CL,CT ID in CL,CT Label in 2D Object
0,Kidney,http://purl.obolibrary.org/obo/UBERON_0002113,Cortical Collecting Duct,UBERON:0004203,kidney collecting duct beta-intercalated cell,CL:4030021,Cortical Collecting Duct Intercalated Cell Type B


In [29]:
# File paths
file_paths = {
    "Kidney" : '../ref_data/Anatomogram/ExpDesign-E-CURD-119.tsv',
    "Liver" : '../ref_data/Anatomogram/ExpDesign-E-MTAB-10553.tsv',
    "Lung" : '../ref_data/Anatomogram/ExpDesign-E-GEOD-130148.tsv',
    "Pancreas" : '../ref_data/Anatomogram/ExpDesign-E-MTAB-5061.tsv'
}

In [30]:
# Inspect column names in each file
file_columns = {}
for organ, file_path in file_paths.items():  # Corrected iteration
    try:
        df = pd.read_csv(file_path, sep="\t", nrows=5)  # Read only first few rows for inspection
        file_columns[organ] = df.columns.tolist()  # Store column names under organ key
    except Exception as e:
        file_columns[organ] = f"Error reading file: {e}"

# Display the column names for inspection
file_columns

{'Kidney': ['Assay',
  'Sample Characteristic[organism]',
  'Sample Characteristic Ontology Term[organism]',
  'Sample Characteristic[individual]',
  'Sample Characteristic Ontology Term[individual]',
  'Sample Characteristic[ethnic group]',
  'Sample Characteristic Ontology Term[ethnic group]',
  'Sample Characteristic[sex]',
  'Sample Characteristic Ontology Term[sex]',
  'Sample Characteristic[age]',
  'Sample Characteristic Ontology Term[age]',
  'Sample Characteristic[developmental stage]',
  'Sample Characteristic Ontology Term[developmental stage]',
  'Sample Characteristic[disease]',
  'Sample Characteristic Ontology Term[disease]',
  'Sample Characteristic[organism part]',
  'Sample Characteristic Ontology Term[organism part]',
  'Sample Characteristic[clinical information]',
  'Sample Characteristic Ontology Term[clinical information]',
  'Factor Value[sex]',
  'Factor Value Ontology Term[sex]',
  'Factor Value[inferred cell type - ontology labels]',
  'Factor Value Ontology 

In [31]:
# Updated mapping of columns to extract
column_mapping = {
    "assay": "Assay",
    "organism": "Sample Characteristic[organism]",
    "donor": "Sample Characteristic[individual]",
    "as_label": "Sample Characteristic[organism part]",
    "as_id": "Sample Characteristic Ontology Term[organism part]",
    "sex": "Sample Characteristic[sex]",
    "ethnicity": "Sample Characteristic[ethnic group]",
    "age": "Sample Characteristic[age]",
    "bmi": "Sample Characteristic[body mass index]", 
    "disease": "Sample Characteristic[disease]",
    "cell_id": "Factor Value Ontology Term[inferred cell type - ontology labels]",
    "cell_type": [
        "Factor Value[inferred cell type - ontology labels]",
        "Factor Value[cell type]"
    ]  
}

# Read and extract relevant columns
dataframes = []
for organ, file_path in file_paths.items():  # Include organ name in iteration
    try:
        df = pd.read_csv(file_path, sep="\t", low_memory=False)
        
        # Select available columns based on mapping
        selected_columns = {key: col for key, col in column_mapping.items() if isinstance(col, str) and col in df.columns}
        
        # Handle cell type separately (use the first available column)
        cell_type_col = next((col for col in column_mapping["cell_type"] if col in df.columns), None)
        if cell_type_col:
            selected_columns["cell_type"] = cell_type_col

        # Check if cell_id column exists and process it correctly
        if "cell_id" in selected_columns and selected_columns["cell_id"] in df.columns:
            df["cell_id"] = df[selected_columns["cell_id"]].astype(str).apply(lambda x: x.split('/')[-1].replace('_', ':') if pd.notna(x) else x)
            selected_columns["cell_id"] = "cell_id"  # Update to processed column

        if selected_columns:
            extracted_df = df[list(selected_columns.values())].copy()
            extracted_df = extracted_df.rename(columns={v: k for k, v in selected_columns.items()})
            
            # Add source file and organ columns
            extracted_df["source_file"] = file_path.split("/")[-1]
            extracted_df["organ"] = organ  # Assign organ name

            # Reset index to avoid conflicts
            extracted_df = extracted_df.reset_index(drop=True)

            dataframes.append(extracted_df)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

dataframes


[                               assay      organism     donor  \
 0      SAMN15040593-AAACCTGAGGACATTA  Homo sapiens  Healthy5   
 1      SAMN15040593-AAACCTGCAGCTCGAC  Homo sapiens  Healthy5   
 2      SAMN15040593-AAACCTGCAGTATAAG  Homo sapiens  Healthy5   
 3      SAMN15040593-AAACCTGCATGCCTAA  Homo sapiens  Healthy5   
 4      SAMN15040593-AAACCTGGTATAGTAG  Homo sapiens  Healthy5   
 ...                              ...           ...       ...   
 30584  SAMN15040597-TTTGTCATCACAGGCC  Homo sapiens  Healthy1   
 30585  SAMN15040597-TTTGTCATCACCAGGC  Homo sapiens  Healthy1   
 30586  SAMN15040597-TTTGTCATCACCGTAA  Homo sapiens  Healthy1   
 30587  SAMN15040597-TTTGTCATCCTACAGA  Homo sapiens  Healthy1   
 30588  SAMN15040597-TTTGTCATCGGCGCTA  Homo sapiens  Healthy1   
 
                as_label                                          as_id  \
 0      cortex of kidney  http://purl.obolibrary.org/obo/UBERON_0001225   
 1      cortex of kidney  http://purl.obolibrary.org/obo/UBERON_0001

In [32]:
# Verify and standardize column names before merging
standard_columns = set()

# Collect all unique column names across all extracted DataFrames
for df in dataframes:
    standard_columns.update(df.columns)

# Ensure all DataFrames have the same columns
for i in range(len(dataframes)):
    for col in standard_columns:
        if col not in dataframes[i].columns:
            dataframes[i][col] = None  
    dataframes[i] = dataframes[i][sorted(standard_columns)]  

In [33]:
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True, sort=False)
    print(combined_df.head())
    combined_df.to_csv('../output/EBI-Anatomogram-metadata.csv', index=False)
else:
    print("No relevant data found in the provided files.")

       age                                          as_id          as_label  \
0  52 year  http://purl.obolibrary.org/obo/UBERON_0001225  cortex of kidney   
1  52 year  http://purl.obolibrary.org/obo/UBERON_0001225  cortex of kidney   
2  52 year  http://purl.obolibrary.org/obo/UBERON_0001225  cortex of kidney   
3  52 year  http://purl.obolibrary.org/obo/UBERON_0001225  cortex of kidney   
4  52 year  http://purl.obolibrary.org/obo/UBERON_0001225  cortex of kidney   

                           assay   bmi     cell_id  \
0  SAMN15040593-AAACCTGAGGACATTA  None  CL:1001106   
1  SAMN15040593-AAACCTGCAGCTCGAC  None         nan   
2  SAMN15040593-AAACCTGCAGTATAAG  None         nan   
3  SAMN15040593-AAACCTGCATGCCTAA  None         nan   
4  SAMN15040593-AAACCTGGTATAGTAG  None         nan   

                                           cell_type disease     donor  \
0  kidney loop of Henle thick ascending limb epit...  normal  Healthy5   
1                                                NaN

In [34]:
combined_df.columns

Index(['age', 'as_id', 'as_label', 'assay', 'bmi', 'cell_id', 'cell_type',
       'disease', 'donor', 'ethnicity', 'organ', 'organism', 'sex',
       'source_file'],
      dtype='object')

In [35]:
combined_df['number_of_cells_total'] = combined_df.groupby('source_file')['cell_type'].transform('count')
combined_df['number_of_unique_cell_types'] = combined_df.groupby('source_file')['cell_type'].transform('nunique')

In [36]:
combined_df['number_of_cells_total'].unique()

array([19974, 15000,  3202,  2207])

In [45]:
source_file_summary = combined_df.groupby('source_file').agg(
    number_of_cells_total=('cell_type', 'count'),
    number_of_unique_cell_types=('cell_type', lambda x: x.nunique())
).reset_index()

print(source_file_summary)

                   source_file  number_of_cells_total  \
0     ExpDesign-E-CURD-119.tsv                  19974   
1  ExpDesign-E-GEOD-130148.tsv                   3202   
2   ExpDesign-E-MTAB-10553.tsv                  15000   
3    ExpDesign-E-MTAB-5061.tsv                   2207   

   number_of_unique_cell_types  
0                           13  
1                           13  
2                           13  
3                           13  


In [38]:
combined_df[combined_df['organ'] == 'Kidney']['cell_type'].unique()

array(['kidney loop of Henle thick ascending limb epithelial cell', nan,
       'parietal epithelial cell', 'kidney capillary endothelial cell',
       'renal alpha-intercalated cell', 'fibroblast',
       'kidney distal convoluted tubule epithelial cell',
       'epithelial cell of proximal tubule',
       'glomerular visceral epithelial cell',
       'kidney connecting tubule epithelial cell', 'renal principal cell',
       'renal beta-intercalated cell', 'mesangial cell', 'leukocyte'],
      dtype=object)

In [39]:
combined_df[combined_df['organ'] == 'Lung']['cell_type'].unique()

array(['lung macrophage', 'type II pneumocyte', 'T cell',
       'type I pneumocyte', 'lung secretory cell', nan, 'mast cell',
       'lung ciliated cell', 'natural killer cell',
       'transformed epithelial cell', 'lymphocyte',
       'lung endothelial cell', 'fibroblast of lung', 'B cell'],
      dtype=object)

In [40]:
combined_df[combined_df['organ'] == 'Pancreas']['cell_type'].unique()

array([nan, 'pancreatic D cell', 'pancreatic A cell',
       'pancreatic PP cell', 'pancreatic ductal cell', 'acinar cell',
       'type B pancreatic cell', 'pancreatic endocrine cell',
       'co-expression cell', 'professional antigen presenting cell',
       'pancreatic stellate cell', 'endothelial cell',
       'pancreatic epsilon cell', 'mast cell'], dtype=object)

In [41]:
combined_df[combined_df['organ'] == 'Liver']['cell_type'].unique()

array([nan, 'hepatocyte', 'macrophage', 'T cell',
       'endothelial cell of hepatic sinusoid', 'natural killer cell',
       'cycling cell', 'plasma cell', 'B cell', 'cholangiocyte',
       'Kupffer cell', 'endothelial cell of vascular tree',
       'hematopoietic stem cell', 'mesothelial cell'], dtype=object)

In [42]:
# Perform a left join on both conditions (organ and cell_id)
merged_df = combined_df.merge(
    ftu_df[['Organ', 'CT ID in CL', 'FTU Label in Uberon', 'FTU ID in Uberon']],
    left_on=['organ', 'cell_id'],
    right_on=['Organ', 'CT ID in CL'],
    how='left'
)

# Drop redundant columns after merge
merged_df.drop(columns=['Organ', 'CT ID in CL'], inplace=True)

# Rename FTU columns for clarity
merged_df.rename(columns={'FTU Label in Uberon': 'FTU_Label', 'FTU ID in Uberon': 'FTU_ID'}, inplace=True)

In [43]:
filtered_df = merged_df.dropna(subset=['cell_id', 'FTU_Label'])

In [44]:
filtered_df.to_csv('../output/EBI-Anatomogram-metadata.csv', index=False)