### Workflow

1. Compare FTU's cell type information between Azimuth, PopV and Celltypist
2. Get the list of dataset for FTU.
    1. Cell summary from the atlas-as-cell-summaries.jsonld
    2. add organ data from the cell annotation crosswalk files
3. Any dataset that is been shared among two FTUs
4. Additonal work

### Get the Cell annotation tool information for cell types in FTUs

In [4]:
import pandas as pd
import requests

# Function to download CSV files from GitHub
def download_csv_from_github(url, local_path):
    response = requests.get(url)
    response.raise_for_status()  # Check if the request was successful
    with open(local_path, 'wb') as file:
        file.write(response.content)

# URLs of the CSV files in the GitHub repository
# github_base_url = 'https://github.com/hubmapconsortium/hra-workflows-runner/blob/main/crosswalking-tables/'
# popv_url = github_base_url + 'popv.csv'
# celltypist_url = github_base_url + 'celltypist.csv'
# azimuth_url = github_base_url + 'azimuth.csv'
# ftu_cell_count_url = github_base_url + 'FTU_Cell_Count_Table_Cell_Type_Count.csv'

# Local paths to save the downloaded files
popv_local_path = 'C:\\Users\\Supriya\\Downloads\\popv.csv'
celltypist_local_path = 'C:\\Users\\Supriya\\Downloads\\celltypist.csv'
azimuth_local_path = 'C:\\Users\\Supriya\\Downloads\\azimuth.csv'
ftu_cell_count_path = 'C:\\Users\\Supriya\\Downloads\\FTU Cell Count Table - Cell_Type_Count.csv'


# Download the files
#download_csv_from_github(popv_url, popv_local_path)
#download_csv_from_github(celltypist_url, celltypist_local_path)
#download_csv_from_github(azimuth_url, azimuth_local_path)

# Load the downloaded files
popv = pd.read_csv(popv_local_path)
celltypist = pd.read_csv(celltypist_local_path)
azimuth = pd.read_csv(azimuth_local_path)
ftu_cell_count = pd.read_csv(ftu_cell_count_path)

# Define the mapping dictionary
organ_to_uberon = {
    'Kidney': 'UBERON:0002113',
    'Lung': 'UBERON:0002048',
    'Pancreas': 'UBERON:0001264',
    'Large Intestine': 'UBERON:0002107',
    'Skin': 'UBERON:0002097',
    'Liver': 'UBERON:0002108',
    'Prostate': 'UBERON:0002367',
    'Thymus': 'UBERON:0002371',
    'Spleen': 'UBERON:0002370',
    'Small Intestine': 'UBERON:0002106'
}

# Filter FTU cell count to include only rows where `CT ID in CL` contains `CL`
ftu_cell_count_filtered = ftu_cell_count[ftu_cell_count['CT ID in CL'].str.contains('CL', na=False)]

# Map the organ names in FTU cell count table to UBERON IDs
ftu_cell_count_filtered.loc[:, 'Organ_ID'] = ftu_cell_count_filtered['Organ'].map(organ_to_uberon)

# Prepare a common structure to hold the results
result = []

# Function to dynamically generate match info
def generate_match_info(tool_name, tool_data):
    levels = set()
    for organ_level in tool_data['Organ_Level']:
        level = organ_level.split('_')[-1]
        levels.add(level)
    
    match_info = {f'{tool_name}_{level}': 0 for level in levels}
    return match_info, levels

# Dynamically generate the match info for each tool
celltypist_info, celltypist_levels = generate_match_info('celltypist', celltypist)
azimuth_info, azimuth_levels = generate_match_info('azimuth', azimuth)
popv_info, popv_levels = generate_match_info('popv', popv)

# Iterate over the filtered FTU cell count data
for idx, ftu_row in ftu_cell_count_filtered.iterrows():
    organ = ftu_row['Organ_ID']
    ftu_label = ftu_row['FTU Label in Uberon']
    ftu_id = ftu_row['FTU ID in Uberon']
    cl_id = ftu_row['CT ID in CL']
    cl_label = ftu_row['CT Label in CL']
    
    # Initialize match columns
    match_info = {
        'Organ': ftu_row['Organ'],
        'FTU Label in Uberon': ftu_label,
        'FTU ID in Uberon': ftu_id,
        'CL_id': cl_id,
        'CT Label in CL': cl_label,
    }
    match_info.update({key: 0 for key in celltypist_info.keys()})
    match_info.update({key: 0 for key in azimuth_info.keys()})
    match_info.update({key: 0 for key in popv_info.keys()})
    
    # Check in celltypist
    celltypist_matches = celltypist[(celltypist['CL_ID'] == cl_id) & (celltypist['Organ_ID'] == organ)]
    for _, ct_match in celltypist_matches.iterrows():
        level = ct_match['Organ_Level'].split('_')[-1]  # Extract the level number
        match_info[f'celltypist_{level}'] = 1

    # Check in azimuth
    azimuth_matches = azimuth[(azimuth['CL_ID'] == cl_id) & (azimuth['Organ_ID'] == organ)]
    for _, az_match in azimuth_matches.iterrows():
        level = az_match['Organ_Level'].split('_')[-1]  # Extract the level number
        match_info[f'azimuth_{level}'] = 1

    # Check in popv
    popv_matches = popv[(popv['CL_ID'] == cl_id) & (popv['Organ_ID'] == organ)]
    if not popv_matches.empty:
        for level in popv_levels:
            match_info[f'popv_{level}'] = 1

    # Append to results
    result.append(match_info)

# Create a dataframe from the results
result_df = pd.DataFrame(result)

# Save the result to a CSV file
output_path = 'FTU_Cell_Count_Annotated.csv'
result_df.to_csv(output_path, index=False)

print(f"File saved to {output_path}")


File saved to FTU_Cell_Count_Annotated.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ftu_cell_count_filtered.loc[:, 'Organ_ID'] = ftu_cell_count_filtered['Organ'].map(organ_to_uberon)


In [8]:
import pandas as pd
import json

# Load all the files into dataframes
atlas_as_cell_summaries_path = 'C:\\Users\\Supriya\\Downloads\\atlas-as-cell-summaries.jsonld'

# Load and display the JSON-LD file
with open(atlas_as_cell_summaries_path, 'r') as file:
    atlas_as_cell_summaries_data = json.load(file)

# Print a sample of the JSON-LD data to understand its structure
sample_data = json.dumps(atlas_as_cell_summaries_data, indent=2)
print(sample_data)

{
  "@context": {
    "CL": {
      "@id": "http://purl.obolibrary.org/obo/CL_",
      "@prefix": true
    },
    "ASCTB-TEMP": {
      "@id": "https://purl.org/ccf/ASCTB-TEMP_",
      "@prefix": true
    },
    "ctpop": {
      "@id": "https://purl.humanatlas.io/graph/hra-pop#",
      "@prefix": true
    },
    "as_3d_id": {
      "@type": "@id"
    },
    "as_id": {
      "@type": "@id"
    },
    "all_collisions": {
      "@id": "ccf:has_collision_summary"
    },
    "collision_source": {
      "@reverse": "ccf:has_collision_summary",
      "@type": "@id"
    },
    "collisions": {
      "@id": "ccf:has_collision_item"
    },
    "corridor_source": {
      "@reverse": "ccf:has_corridor",
      "@type": "@id"
    },
    "corridor": {
      "@id": "ccf:has_corridor"
    },
    "summaries": {
      "@id": "ccf:has_cell_summary"
    },
    "cell_source": {
      "@reverse": "ccf:has_cell_summary",
      "@type": "@id"
    },
    "aggregated_summaries": {
      "@id": "ccf:aggregates",
 

ModuleNotFoundError: No module named 'ace_tools'

In [12]:
import pandas as pd
import json

# Load the provided CSV and JSON files
csv_file_path = 'ref_data\\FTU Cell Count Table - Cell_Type_Count.csv'
json_file_path = 'ref_data\\atlas-as-cell-summaries.jsonld'

# Read the CSV file
csv_data = pd.read_csv(csv_file_path)

# Load the JSON file
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Extract the relevant data from the JSON file
json_graph = json_data['@graph']

# Convert the JSON graph to a DataFrame for easier manipulation
json_df = pd.json_normalize(json_graph)

# Extract and expand the summaries from the JSON DataFrame
expanded_json_df = json_df.explode('summary').reset_index(drop=True)
summary_df = pd.json_normalize(expanded_json_df['summary'])

# Merge the expanded JSON DataFrame with the summary DataFrame
merged_json_df = pd.concat([expanded_json_df.drop(columns=['summary']), summary_df], axis=1)

# Extract relevant columns from the CSV data
csv_relevant_columns = csv_data[['CT ID in CL', 'Organ', 'FTU Label in Uberon', 'FTU ID in Uberon', 'CT Label in CL']]

# Merge the CSV and JSON DataFrames based on matching cell_id and CT ID in CL
merged_data = pd.merge(
    merged_json_df,
    csv_relevant_columns,
    left_on='cell_id',
    right_on='CT ID in CL',
    how='inner'
)

# Create the final DataFrame with the desired columns
final_df = merged_data[[
    'CT ID in CL',
    'cell_id',
    'cell_label',
    'annotation_method',
    'cell_source_label',
    'sex',
    'aggregated_summaries'
]]

# Rename the columns as specified
final_df.columns = [
    'CT ID in CL',
    'cell_id',
    'cell_label',
    'annotation_method',
    'cell_source_label',
    'sex',
    'aggregated_summaries'
]

# Save the final DataFrame to a CSV file
final_csv_path = 'C:\\Users\\Supriya\\Downloads\\CTs-with-datasets.csv'
final_df.to_csv(final_csv_path, index=False)

# Display the first few rows of the final DataFrame
print(final_df.head())
print(f"Final CSV saved to: {final_csv_path}")


  CT ID in CL     cell_id            cell_label annotation_method  \
0  CL:0009080  CL:0009080  intestinal tuft cell              popv   
1  CL:0000057  CL:0000057            fibroblast              popv   
2  CL:0000775  CL:0000775            neutrophil              popv   
3  CL:0000775  CL:0000775            neutrophil              popv   
4  CL:0000097  CL:0000097             mast cell              popv   

  cell_source_label     sex                               aggregated_summaries  
0           jejunum  Female  [https://entity.api.hubmapconsortium.org/entit...  
1           jejunum  Female  [https://entity.api.hubmapconsortium.org/entit...  
2           jejunum  Female  [https://entity.api.hubmapconsortium.org/entit...  
3           jejunum  Female  [https://entity.api.hubmapconsortium.org/entit...  
4           jejunum  Female  [https://entity.api.hubmapconsortium.org/entit...  
Final CSV saved to: C:\Users\Supriya\Downloads\final_output.csv


In [7]:
# Load the uploaded files
final_output_df = pd.read_csv('C:\\Users\\Supriya\\Downloads\\final_output.csv')
popv_df = pd.read_csv('C:\\Users\\Supriya\\Downloads\\popv.csv')
celltypist_df = pd.read_csv('C:\\Users\\Supriya\\Downloads\\celltypist.csv')
azimuth_df = pd.read_csv('C:\\Users\\Supriya\\Downloads\\azimuth.csv')

# Rename the column in final_output_df
final_output_df.rename(columns={'CT ID in CL': 'CL_ID'}, inplace=True)

# Merge with azimuth_df
final_output_azimuth = final_output_df[final_output_df['annotation_method'] == 'azimuth'].merge(
    azimuth_df[['CL_ID', 'Organ_Level']], on='CL_ID', how='left')

# Merge with celltypist_df
final_output_celltypist = final_output_df[final_output_df['annotation_method'] == 'celltypist'].merge(
    celltypist_df[['CL_ID', 'Organ_Level']], on='CL_ID', how='left')

# Merge with popv_df
final_output_popv = final_output_df[final_output_df['annotation_method'] == 'popv'].merge(
    popv_df[['CL_ID', 'Organ_Level']], on='CL_ID', how='left')

# Combine all the dataframes
final_output_combined = pd.concat([final_output_azimuth, final_output_celltypist, final_output_popv], ignore_index=True)

# Define the organ levels for each annotation method
azimuth_organs = ['Kidney_L3', 'Lung_v2_finest_level', 'Liver_L2', 'Liver_L1', 'Kidney_L1',  'Kidney_L2', 
                  'Lung_v2_L1', 'Lung_v2_L2', 'Lung_v2_L3', 'Lung_v2_L4', 'Lung_v2_L5', 'Pancreas_L1']
celltypist_organs = ['intestine_L1', 'kidney_L1', 'liver_L1', 'lung_L1', 'pancreas_L1', 'spleen_L1', 
                     'Adult_Human_Skin_pkl', 'Healthy_Human_Liver_pkl', 'Adult_Human_PancreaticIslet_pkl', 
                     'Human_Lung_Atlas_pkl']
popv_organs = ['large intestine', 'liver', 'lung', 'male reproductive system', 'pancreas', 'prostate gland', 
               'respiratory system', 'skin', 'small intestine', 'spleen', 'thymus']

# Filter the combined dataframe for each annotation method and their corresponding organ levels
filtered_azimuth = final_output_combined[(final_output_combined['annotation_method'] == 'azimuth') & 
                                         (final_output_combined['Organ_Level'].isin(azimuth_organs))]

filtered_celltypist = final_output_combined[(final_output_combined['annotation_method'] == 'celltypist') & 
                                            (final_output_combined['Organ_Level'].isin(celltypist_organs))]

filtered_popv = final_output_combined[(final_output_combined['annotation_method'] == 'popv') & 
                                      (final_output_combined['Organ_Level'].isin(popv_organs))]

# Combine the filtered dataframes
final_filtered_combined = pd.concat([filtered_azimuth, filtered_celltypist, filtered_popv], ignore_index=True)

# Save the combined dataframe to a CSV file
output_path = 'C:\\Users\\Supriya\\Downloads\\filtered-CTs-with-datasets-with-organ'
final_filtered_combined.to_csv(output_path, index=False)

# Display the path to the user
output_path


'C:\\Users\\Supriya\\Downloads\\final_filtered_combined.csv'

###  Additonal work

In [4]:
# Load the files
azimuth = pd.read_csv("C:\\Users\\Supriya\\Downloads\\azimuth.csv")
celltypist = pd.read_csv("C:\\Users\\Supriya\\Downloads\\celltypist.csv")
popv = pd.read_csv("C:\\Users\\Supriya\\Downloads\\popv.csv")

# Get the unique organ levels for each file
azimuth_organs = azimuth['Organ_Level'].unique().tolist()
celltypist_organs = celltypist['Organ_Level'].unique().tolist()
popv_organs = popv['Organ_Level'].unique().tolist()

print("Azimuth Organs:", azimuth_organs)
print()
print("Celltypist Organs:", celltypist_organs)
print()
print("Popv Organs:", popv_organs)

Azimuth Organs: ['Heart_L2', 'Kidney_L3', 'Lung_v2_finest_level', 'Liver_L2', 'Liver_L1', 'Heart_L1', 'Kidney_L1', 'Human_PBMC_L3', 'Human_PBMC_L1', 'Kidney_L2', 'Lung_v2_L1', 'Lung_v2_L2', 'Lung_v2_L3', 'Lung_v2_L4', 'Lung_v2_L5', 'Pancreas_L1', 'Human_PBMC_L2', 'Bone_marrow_L1', 'Bone_marrow_L2', 'Adipose_L1', 'Adipose_L2', 'Tonsil_v2_L2', 'Tonsil_v2_L1']

Celltypist Organs: ['blood_L1', 'bone marrow_L1', 'heart_L1', 'hippocampus_L1', 'intestine_L1', 'kidney_L1', 'liver_L1', 'lung_L1', 'lymph node_L1', 'pancreas_L1', 'skeletal muscle_L1', 'spleen_L1', 'Adult_Human_Skin_pkl', 'Healthy_Human_Liver_pkl', 'Adult_Human_PancreaticIslet_pkl', 'Human_Lung_Atlas_pkl', 'Healthy_Adult_Heart_pkl', 'Human_AdultAged_Hippocampus_pkl']

Popv Organs: ['blood', 'blood vasculature', 'bone marrow', 'eye', 'heart', 'large intestine', 'liver', 'lung', 'lymph node', 'male reproductive system', 'mammary gland', 'mesenteric lymph node', 'pancreas', 'prostate gland', 'respiratory system', 'skin', 'small intes

In [4]:
import json

def print_structure(data, indent=0):
    """
    Recursively prints the structure of the JSON data.
    """
    indent_str = "  " * indent
    if isinstance(data, dict):
        for key, value in data.items():
            print(f"{indent_str}{key}: ({type(value).__name__})")
            print_structure(value, indent + 1)
    elif isinstance(data, list):
        print(f"{indent_str}List of {len(data)} items: ({type(data).__name__})")
        if len(data) > 0:
            print_structure(data[0], indent + 1)
    else:
        print(f"{indent_str}{data} ({type(data).__name__})")

# Load the JSON-LD file
json_file_path = "C:\\Users\\Supriya\\Downloads\\atlas-enriched-dataset-graph.jsonld"
with open(json_file_path, 'r') as file:
    json_data = json.load(file)

# Print the structure of the JSON data
print_structure(json_data)


@context: (dict)
  CL: (dict)
    @id: (str)
      http://purl.obolibrary.org/obo/CL_ (str)
    @prefix: (bool)
      True (bool)
  ASCTB-TEMP: (dict)
    @id: (str)
      https://purl.org/ccf/ASCTB-TEMP_ (str)
    @prefix: (bool)
      True (bool)
  ctpop: (dict)
    @id: (str)
      https://purl.humanatlas.io/graph/hra-pop# (str)
    @prefix: (bool)
      True (bool)
  as_3d_id: (dict)
    @type: (str)
      @id (str)
  as_id: (dict)
    @type: (str)
      @id (str)
  all_collisions: (dict)
    @id: (str)
      ccf:has_collision_summary (str)
  collision_source: (dict)
    @reverse: (str)
      ccf:has_collision_summary (str)
    @type: (str)
      @id (str)
  collisions: (dict)
    @id: (str)
      ccf:has_collision_item (str)
  corridor_source: (dict)
    @reverse: (str)
      ccf:has_corridor (str)
    @type: (str)
      @id (str)
  corridor: (dict)
    @id: (str)
      ccf:has_corridor (str)
  summaries: (dict)
    @id: (str)
      ccf:has_cell_summary (str)
  cell_source: (dict)

In [None]:
import json
import pandas as pd

# Load the JSON file
file_path = 'C:\\Users\\Supriya\\Downloads\\atlas-enriched-dataset-graph.jsonld'

with open(file_path, 'r') as file:
    json_data = json.load(file)

# Provided CT Labels and CT IDs
ct_labels_ids = {
    "glomerular capillary endothelial cell": "CL:1001005",
    "efferent arteriole endothelial cell": "CL:1001099",
    "afferent arteriole endothelial cell": "CL:1001096",
    "peritubular capillary endothelial cell": "CL:1001033",
    "vasa recta ascending limb cell": "CL:1001131",
    "vasa recta descending limb cell": "CL:1001285",
    "alveolar capillary type 1 endothelial cell": "CL:4028002",
    "capillary endothelial cell": "CL:0002144",
    "blood vessel smooth muscle cell": "CL:0019018",
    "endothelial cell of artery": "CL:1000413",
    "vein endothelial cell": "CL:0002543",
    "endothelial cell of hepatic sinusoid": "CL:1000398",
    "prostate gland microvascular endothelial cell": "CL:2000059",
    "blood vessel endothelial cell": "CL:0000071",
    "splenic endothelial cell": "CL:2000053"
}

# Identifying the structure within the "@graph" key to process accordingly
graph_data = json_data["@graph"]

In [29]:
# Grouping ensembl_ids by cell_id and adding cell_label
grouped_matches = {}
for item in graph_data:
    samples = item.get("samples", [])
    for sample in samples:
        sections = sample.get("sections", [])
        for section in sections:
            datasets = section.get("datasets", [])
            for dataset in datasets:
                summaries = dataset.get("summaries", [])
                for summary in summaries:
                    sum_details = summary.get("summary", [])
                    for sum_detail in sum_details:
                        cell_id = sum_detail.get("cell_id")
                        gene_expr_list = sum_detail.get("gene_expr", [])
                        if cell_id in ct_labels_ids.values():
                            if isinstance(gene_expr_list, list):
                                for gene_expr in gene_expr_list:
                                    if isinstance(gene_expr, dict):
                                        ensembl_id = gene_expr.get("ensembl_id")
                                        if cell_id not in grouped_matches:
                                            grouped_matches[cell_id] = {"cell_label": "", "ensembl_ids": []}
                                        grouped_matches[cell_id]["cell_label"] = [label for label, id in ct_labels_ids.items() if id == cell_id][0]
                                        grouped_matches[cell_id]["ensembl_ids"].append(ensembl_id)

# Preparing the final dataframe
final_matches = []
for cell_id, details in grouped_matches.items():
    final_matches.append({
        "cell_id": cell_id,
        "cell_label": details["cell_label"],
        "ensembl_ids": ", ".join(details["ensembl_ids"])
    })

final_matches_df = pd.DataFrame(final_matches)

# Displaying the final dataframe
final_matches_df.to_csv('Biomarker_for_Vasculature_CTs.csv')

In [28]:
final_matches_df['cell_id'].unique()

array(['CL:1001131', 'CL:1001005', 'CL:0002144', 'CL:1000413',
       'CL:0002543', 'CL:1000398'], dtype=object)

In [13]:
import json
import pandas as pd

# Load the JSON file
file_path = 'ref_data/atlas-enriched-dataset-graph.jsonld'

with open(file_path, 'r') as file:
    json_data = json.load(file)

# Provided CT Labels and CT IDs
ct_labels_ids = {
    "glomerular capillary endothelial cell": "CL:1001005",
    "efferent arteriole endothelial cell": "CL:1001099",
    "afferent arteriole endothelial cell": "CL:1001096",
    "peritubular capillary endothelial cell": "CL:1001033",
    "vasa recta ascending limb cell": "CL:1001131",
    "vasa recta descending limb cell": "CL:1001285",
    "alveolar capillary type 1 endothelial cell": "CL:4028002",
    "capillary endothelial cell": "CL:0002144",
    "blood vessel smooth muscle cell": "CL:0019018",
    "endothelial cell of artery": "CL:1000413",
    "vein endothelial cell": "CL:0002543",
    "endothelial cell of hepatic sinusoid": "CL:1000398",
    "prostate gland microvascular endothelial cell": "CL:2000059",
    "blood vessel endothelial cell": "CL:0000071",
    "splenic endothelial cell": "CL:2000053"
}

# Identifying the structure within the "@graph" key to process accordingly
graph_data = json_data["@graph"]

# Grouping ensembl_ids by cell_id and organ_id, adding cell_label and mean_gene_expr_value
grouped_matches = {}
for item in graph_data:
    samples = item.get("samples", [])
    for sample in samples:
        sections = sample.get("sections", [])
        for section in sections:
            datasets = section.get("datasets", [])
            for dataset in datasets:
                summaries = dataset.get("summaries", [])
                for summary in summaries:
                    sum_details = summary.get("summary", [])
                    for sum_detail in sum_details:
                        cell_id = sum_detail.get("cell_id")
                        gene_expr_list = sum_detail.get("gene_expr", [])
                        organ_id = dataset.get("organ_id", "Unknown")
                        if cell_id in ct_labels_ids.values():
                            if isinstance(gene_expr_list, list) and gene_expr_list:
                                for gene_expr in gene_expr_list:
                                    if isinstance(gene_expr, dict):
                                        ensembl_id = gene_expr.get("ensembl_id")
                                        mean_expr_value = gene_expr.get("mean_gene_expr_value")
                                        key = (cell_id, organ_id)
                                        if key not in grouped_matches:
                                            grouped_matches[key] = {"cell_label": "", "ensembl_ids": [], "mean_expr_values": []}
                                        grouped_matches[key]["cell_label"] = [label for label, id in ct_labels_ids.items() if id == cell_id][0]
                                        grouped_matches[key]["ensembl_ids"].append(ensembl_id)
                                        grouped_matches[key]["mean_expr_values"].append(mean_expr_value)
                            else:
                                # Handle entries with empty gene_expr lists
                                key = (cell_id, organ_id)
                                if key not in grouped_matches:
                                    grouped_matches[key] = {"cell_label": "", "ensembl_ids": [], "mean_expr_values": []}
                                grouped_matches[key]["cell_label"] = [label for label, id in ct_labels_ids.items() if id == cell_id][0]

# Preparing the final dataframe
final_matches = []
for (cell_id, organ_id), details in grouped_matches.items():
    final_matches.append({
        "cell_id": cell_id,
        "cell_label": details["cell_label"],
        "organ_id": organ_id,
        "ensembl_ids": ", ".join(details["ensembl_ids"]),
        "mean_expr_values": ", ".join(map(str, details["mean_expr_values"]))
    })

final_matches_df = pd.DataFrame(final_matches)

# Expanding the dataframe to have each row for each ensembl_id
expanded_matches = []
for _, row in final_matches_df.iterrows():
    cell_id = row["cell_id"]
    cell_label = row["cell_label"]
    organ_id = row["organ_id"]
    ensembl_ids = row["ensembl_ids"].split(", ")
    mean_expr_values = row["mean_expr_values"].split(", ")
    
    for ensembl_id, mean_expr_value in zip(ensembl_ids, mean_expr_values):
        try:
            mean_expr_value_float = float(mean_expr_value)
        except ValueError:
            mean_expr_value_float = None  # Handle non-numeric values
        expanded_matches.append({
            "cell_id": cell_id,
            "cell_label": cell_label,
            "organ_id": organ_id,
            "ensembl_id": ensembl_id,
            "mean_expr_value": mean_expr_value_float
        })

expanded_matches_df = pd.DataFrame(expanded_matches)

# Displaying the expanded dataframe
expanded_matches_df.head()  # Displaying only the first few rows

ModuleNotFoundError: No module named 'ace_tools'