# Code to generate 2 CSV files for UMAPs (by cell summary and by intersection percentage with anatomical structures)

## Import statements

In [311]:
import pandas as pd
import json
import re 

## Global variables

In [194]:
hra_pop_version = "0.8.3" 

## Global methods

In [345]:
def strip_reference_organ(reference_organ, as_label):
    """removes a set of predefined strings form a reference organ and returns a column header

    Args:
        reference_organ (string): an IRI for an organ
        as_label (_type_): a human readable label for an anatomical structure

    Returns:
        header (string): a column header
    """
    to_remove = ['VHF', 'VHM']
    pattern = "|".join(map(re.escape, to_remove))
    organ = re.sub(pattern,'', reference_organ)
    header = f'{organ} - {as_label}'
    return header

def get_partial_match_key(header, partial_string):
    """Returns a key given a match with a partial string

    Args:
        dict (string): a header
        partial_string (string): a partial key

    Returns:
        key (if match found): a full key that matches the partial string argument
    """
    if partial_string in header:
        return header
    return None  # Return None if no partial match is found

def remove_version(input_string):
    """Removes the version from a reference organ

    Args:
        input_string (string): the reference organ with version

    Returns:
        string : the reference organ without version
    """
    return re.sub(r'V\d+(\.\d+)?', '', input_string)

## Load enriched-dataset-graph

In [196]:
 # Link to atlas-enriched-dataset-graph
atlas_enriched_dataset_graph_file = open("../../hra-pop/output-data/v"+hra_pop_version+ "/atlas-enriched-dataset-graph.jsonld") # biomarkers and cell types

# Opening Jthe JSON-LD file
dataset_graph = json.load(atlas_enriched_dataset_graph_file)

## Main

In [354]:
# initialize result, to be converted to pandas data farme at the end and exported as CSV
result = {
'dataset_id' : [],
'dataset_combinations' : [],
'organ' : [],
'as_item_combinations' : []
}

# construct columns for output
for donor in dataset_graph['@graph']:
  for sample in donor['samples']:
    for collision_summary in sample['rui_location']['all_collisions']:
      for collision_item in collision_summary['collisions']:
        as_label = collision_item['as_label']
        
        # construct AS column header
        header = strip_reference_organ(remove_version(collision_item['reference_organ'].split('#')[1]), as_label)
        
        # add header if not present yet
        if header not in result:
          result[header] = []



# fill in columns
for donor in dataset_graph['@graph']:
  for sample in donor['samples']:
    
    datasets_in_sample = []
    
    # get dataset_id from sample
    for dataset in sample['datasets']:
      result['dataset_id'].append(dataset['@id'])
      datasets_in_sample.append(dataset['@id'])
    
    # get dataset_id from section
    for section in sample['sections']:
      for dataset in section['datasets']:
        result['dataset_id'].append(dataset['@id'])
        datasets_in_sample.append(dataset['@id'])

    # get organ
    result['organ'].append(remove_version(sample['rui_location']['placement']['target'].split('#')[1]))
    
    # get AS percentages
    for collision_summary in sample['rui_location']['all_collisions']:
      result['dataset_combinations'].append(datasets_in_sample)
      
      # get simplified collision items
      as_items = []
      
      for collision_item in collision_summary['collisions']:
        as_items.append(
          {
            'label': collision_item['as_label'],
            'percentage': collision_item['percentage']
          }          
        )
    result['as_item_combinations'].append(as_items)
      # define columns to ignore

    ignore_columns = ["dataset_id", "organ", "dataset_combinations", "as_item_combinations"]

    keys_matching_condition = []
    for key in result:
      if key in ignore_columns:
       continue
      if item['label'] in key:
        keys_matching_condition.append(key)
    
    for key in result:
      if key in ignore_columns:
       continue
      

    
      if key in keys_matching_condition:
        
        # print(f"Key {key} was found in {keys_matching_condition}")
        for item in as_items:
          
          # print(f"{item['label']} in {key}: {item['label'] in key}")
          if item['label'] in key:
            result[key].append(item['percentage'])
      else:
        result[key].append(0)

# print result for checking
for key in result:
  print(f"Key {key}:\n Length: {len(result[key])},\n Values: {result[key]}")

IndentationError: unexpected indent (798189813.py, line 78)

## Export to CSV

In [306]:
# remove columns before export
result.pop('dataset_id')

# # Convert dict to DataFrame
df = pd.DataFrame.from_dict(result)

# # Export DataFrame to CSV
df.to_csv('output/umap_as_percentage.csv', index=False)

ValueError: All arrays must be of the same length