# Code to generate 1 CSV file for UMAPs (by intersection percentage with anatomical structures)

## Import statements

In [9]:
import pandas as pd
import json
import re 

## Global variables

In [10]:
# HRA-pop release; used to build the path of the input data
hra_pop_version = "0.9.0"

# base name (without extension) of the exported CSV file
# alt: umap_as_percentage
export_version = "umap_datasets_as_percentage"

# columns that do not hold anatomical structure (AS) percentages
ignore_columns = [
    "dataset_id",
    "organ",
    "dataset_combinations",
    "as_item_combinations",
]

## Global methods

In [11]:
def strip_reference_organ(reference_organ, as_label, *, include_organ=False):
    """Builds the column header for an anatomical structure.

    By default the header is only the anatomical structure label. When
    include_organ is True, the reference organ (with the strings 'VHF' and
    'VHM' removed) is prepended as '<organ> - <as_label>'.

    Args:
        reference_organ (string): identifier of a reference organ; only read
            when include_organ is True
        as_label (string): a human readable label for an anatomical structure
        include_organ (bool, optional): whether to prefix the header with the
            stripped reference organ. Defaults to False.

    Returns:
        header (string): a column header
    """
    if not include_organ:
        return f'{as_label}'

    # remove every occurrence of the predefined strings from the organ
    to_remove = ['VHF', 'VHM']
    pattern = "|".join(map(re.escape, to_remove))
    organ = re.sub(pattern, '', reference_organ)
    return f'{organ} - {as_label}'

def get_partial_match_key(header, partial_string):
    """Returns the header if it contains the given partial string

    Args:
        header (string): a header
        partial_string (string): a substring to look for in the header

    Returns:
        string or None: the unchanged header if partial_string occurs in it,
            otherwise None
    """
    return header if partial_string in header else None

def remove_version(input_string):
    """Strips version tags from a reference organ

    A version tag is a 'V' followed by digits, optionally followed by a dot
    and more digits (e.g. 'V1' or 'V1.2'). Every occurrence is removed.

    Args:
        input_string (string): the reference organ with version

    Returns:
        string : the reference organ without version
    """
    version_pattern = re.compile(r'V\d+(\.\d+)?')
    return version_pattern.sub('', input_string)

## Load enriched-dataset-graph

In [12]:
# Link to atlas-enriched-dataset-graph
# path of the JSON-LD file for the configured HRA-pop version (biomarkers and cell types)
atlas_enriched_dataset_graph_path = (
    "../../hra-pop/output-data/v" + hra_pop_version + "/atlas-enriched-dataset-graph.jsonld"
)

# Open the JSON-LD file and parse it; the context manager closes the file
# handle once parsing is done (also when parsing fails)
with open(atlas_enriched_dataset_graph_path, encoding="utf-8") as atlas_enriched_dataset_graph_file:
    dataset_graph = json.load(atlas_enriched_dataset_graph_file)

## Extract new dictionary from atlas-enriched-dataset-graph to capture AS collision information per dataset

In [13]:
# initialize result, to be converted to pandas data frame at the end and exported as CSV
# 'dataset_id' holds one entry per dataset; every other column holds one entry
# per collision summary, so those columns share a common row index
result = {
    'dataset_id': [],
    'dataset_combinations': [],
    'organ': [],
    'as_item_combinations': [],
}

# dynamically construct columns for output, where each AS is one column
for donor in dataset_graph['@graph']:
    for sample in donor['samples']:
        for collision_summary in sample['rui_location']['all_collisions']:
            for collision_item in collision_summary['collisions']:
                # construct AS column header
                reference_organ = remove_version(collision_item['reference_organ'].split('#')[1])
                header = strip_reference_organ(reference_organ, collision_item['as_label'])

                # add header if not present yet
                result.setdefault(header, [])

# fill in rows
for donor in dataset_graph['@graph']:
    for sample in donor['samples']:

        # get all datasets in sample: first those listed on the sample itself,
        # then those listed on its sections
        datasets_in_sample = [dataset['@id'] for dataset in sample['datasets']]
        for section in sample['sections']:
            datasets_in_sample.extend(dataset['@id'] for dataset in section['datasets'])
        result['dataset_id'].extend(datasets_in_sample)

        # get organ
        organ = remove_version(sample['rui_location']['placement']['target'].split('#')[1])

        # one row per collision summary; organ and the AS defaults are appended
        # here (not once per sample) so that all row-wise columns stay aligned
        # even if a sample has no or several collision summaries
        for collision_summary in sample['rui_location']['all_collisions']:
            result['dataset_combinations'].append(datasets_in_sample)
            result['organ'].append(organ)

            # get simplified collision items
            result['as_item_combinations'].append([
                {
                    'label': collision_item['as_label'],
                    'percentage': collision_item['percentage'],
                }
                for collision_item in collision_summary['collisions']
            ])

            # add 0 as default AS collision percentage to AS columns
            for key in result:
                if key not in ignore_columns:
                    result[key].append(0)

# identify as_item_combinations with the same value
# rows are lists of dicts and therefore unhashable, so their string form is the key
indices_by_combination = {}
for row_index, row in enumerate(result['as_item_combinations']):
    indices_by_combination.setdefault(str(row), []).append(row_index)

# keep only combinations that occur more than once
multiple_as_item_combinations = {
    combination: {
        'occurences': len(indices),
        'current_index': 0,
        'indices_in_rows': indices,
    }
    for combination, indices in indices_by_combination.items()
    if len(indices) > 1
}

# fill in cells with percentages (overwriting default: 0)
# cells are addressed by row position, so identical combinations in different
# rows each get their own values and unique combinations are filled as well
for row_index, row in enumerate(result['as_item_combinations']):
    for item in row:
        label = item['label']
        # only write into existing AS columns
        if label in ignore_columns or label not in result:
            continue
        result[label][row_index] = item['percentage']

## Select dataset and AS percentages

In [14]:
# create new dict for export with one row per dataset:
# the two fixed columns first, followed by one column per AS
export = {'dataset_id': [], 'organ': []}
export.update({key: [] for key in result if key not in ignore_columns})

# AS columns of the export (everything except the ignored columns)
as_columns = [col for col in export if col not in ignore_columns]

# get unique dataset IDs and AS percentages
# for every dataset, visit each dataset_combination that contains it and copy
# the organ and the AS values found at that combination's row index
for dataset in result['dataset_id']:
    for dataset_combination in result['dataset_combinations']:
        if dataset not in dataset_combination:
            continue

        export['dataset_id'].append(dataset)

        # get index of matching dataset_combinations
        # NOTE(review): list.index returns the position of the first combination
        # equal to this one, which is not necessarily the one being visited
        index = result['dataset_combinations'].index(dataset_combination)

        # get organ at index
        export['organ'].append(result['organ'][index])

        # get AS values at index
        for col in as_columns:
            export[col].append(result[col][index])

df = pd.DataFrame.from_dict(export)
print(df)

                                            dataset_id              organ  \
0    https://entity.api.hubmapconsortium.org/entiti...  VHFSmallIntestine   
1    https://entity.api.hubmapconsortium.org/entiti...           VHFColon   
2    https://entity.api.hubmapconsortium.org/entiti...  VHFSmallIntestine   
3    https://entity.api.hubmapconsortium.org/entiti...  VHFSmallIntestine   
4    https://entity.api.hubmapconsortium.org/entiti...           VHFColon   
..                                                 ...                ...   
548  https://doi.org/10.1126/science.abl4290#GTEX-1...            VHMLung   
549  https://doi.org/10.1126/science.abl4290#GTEX-1...        VHMProstate   
550  https://doi.org/10.1126/science.abl4290#GTEX-1...        VHMProstate   
551  https://doi.org/10.1126/science.abl4290#GTEX-1...        VHMProstate   
552  https://doi.org/10.1126/science.abl4290#GTEX-1...            VHMSkin   

     jejunum  descending colon  superior part of duodenum  \
0      0.103  

## Export to CSV

In [15]:
# optional: remove columns before export (keep if exporting by sample/tissue block!)
# result.pop('dataset_id')

# build the data frame from the export dict (one list per column)
df = pd.DataFrame.from_dict(export)

# Export DataFrame to CSV, named after export_version, without the index column
csv_path = 'output/' + export_version + '.csv'
df.to_csv(csv_path, index=False)