# Code to generate 2 CSV files for UMAPs (by cell summary and by intersection percentage with anatomical structures)

## Import statements

In [130]:
import pandas as pd
import json
import re 

## Global variables

In [131]:
hra_pop_version = "0.8.3" 

## Global methods

In [132]:
def strip_reference_organ(reference_organ, as_label):
    """removes a set of predefined strings form a reference organ and returns a column header

    Args:
        reference_organ (string): an IRI for an organ
        as_label (_type_): a human readable label for an anatomical structure

    Returns:
        header (string): a column header
    """
    to_remove = ['VHF', 'VHM']
    pattern = "|".join(map(re.escape, to_remove))
    organ = re.sub(pattern,'', reference_organ)
    # header = f'{organ} - {as_label}'
    header = f'{as_label}'
    return header

def get_partial_match_key(header, partial_string):
    """Returns a key given a match with a partial string

    Args:
        dict (string): a header
        partial_string (string): a partial key

    Returns:
        key (if match found): a full key that matches the partial string argument
    """
    if partial_string in header:
        return header
    return None  # Return None if no partial match is found

def remove_version(input_string):
    """Removes the version from a reference organ

    Args:
        input_string (string): the reference organ with version

    Returns:
        string : the reference organ without version
    """
    return re.sub(r'V\d+(\.\d+)?', '', input_string)

## Load enriched-dataset-graph

In [133]:
 # Link to atlas-enriched-dataset-graph
atlas_enriched_dataset_graph_file = open("../../hra-pop/output-data/v"+hra_pop_version+ "/atlas-enriched-dataset-graph.jsonld") # biomarkers and cell types

# Opening Jthe JSON-LD file
dataset_graph = json.load(atlas_enriched_dataset_graph_file)

## Main

In [220]:
# initialize result, to be converted to pandas data farme at the end and exported as CSV
result = {
'dataset_id' : [],
'dataset_combinations' : [],
'organ' : [],
'as_item_combinations' : []
}

# construct columns for output
for donor in dataset_graph['@graph']:
  for sample in donor['samples']:
    for collision_summary in sample['rui_location']['all_collisions']:
      for collision_item in collision_summary['collisions']:
        as_label = collision_item['as_label']
        
        # construct AS column header
        header = strip_reference_organ(remove_version(collision_item['reference_organ'].split('#')[1]), as_label)
        
        # add header if not present yet
        if header not in result:
          result[header] = []



# fill in columns
for donor in dataset_graph['@graph']:
  for sample in donor['samples']:
    
    datasets_in_sample = []
    
    # get dataset_id from sample
    for dataset in sample['datasets']:
      result['dataset_id'].append(dataset['@id'])
      datasets_in_sample.append(dataset['@id'])
    
    # get dataset_id from section
    for section in sample['sections']:
      for dataset in section['datasets']:
        result['dataset_id'].append(dataset['@id'])
        datasets_in_sample.append(dataset['@id'])

    # get organ
    result['organ'].append(remove_version(sample['rui_location']['placement']['target'].split('#')[1]))
    
    # get AS percentages
    for collision_summary in sample['rui_location']['all_collisions']:
      result['dataset_combinations'].append(datasets_in_sample)
      
      # get simplified collision items
      as_items = []
      
      for collision_item in collision_summary['collisions']:
        as_items.append(
          {
            'label': collision_item['as_label'],
            'percentage': collision_item['percentage']
          }
        )
        # as_items['label'] = collision_item['as_label']
        # as_items['percentage'] = collision_item['percentage']

      result['as_item_combinations'].append(as_items)
      # define columns to ignore

    ignore_columns = ["dataset_id", "organ", "dataset_combinations", "as_item_combinations"]

    keys_matching_condition = []
    for key in result:
      if key in ignore_columns:
        continue

    # add 0 as default AS collision percentage
    for key in result:
      if key in ignore_columns:
       continue
      else:
        result[key].append(0)

# identify as_item_combinations with the same value
multiple_as_item_combinations = {}

for row in result['as_item_combinations']:
 if str(row) not in multiple_as_item_combinations:
   multiple_as_item_combinations[str(row)] = {
     'occurences' : 1,
     'current_index' : 0,
     'indices_in_rows' : []
     }
 else:
   multiple_as_item_combinations[str(row)]['occurences'] = multiple_as_item_combinations[str(row)]['occurences'] + 1
   multiple_as_item_combinations[str(row)]['indices_in_rows'] = [index for index, value in enumerate(result['as_item_combinations']) if value == row]

for row in list(multiple_as_item_combinations.keys()):
  if multiple_as_item_combinations[str(row)]['occurences'] == 1:
    del multiple_as_item_combinations[str(row)]

# for item in multiple_as_item_combinations:
#   print(f'{item} has: {multiple_as_item_combinations[item]}')

# fill in percentages based on column for all AS intersections per sample/tissue block
for row in result["as_item_combinations"]:
  for key in result:
    if key in ignore_columns:
      continue
    for item in row:
      if item['label'] == key:

      # ALWAYS RETURNS INDEX OF FIRST OCCURENCE! NEED TO ACCOUNT FOR ROWS WITH SAME VALUES!
      # check if row occurs multiple times
        for multiple_combo in multiple_as_item_combinations:
          if str(row) == multiple_combo:
            
            current = multiple_as_item_combinations[str(row)]['current_index']
            print(f'For {str(row)} with {multiple_as_item_combinations[str(row)]}')
            print(f'\t The current index is {current} for {multiple_as_item_combinations[str(row)]}')
            print(f'\t This will return ' + str(multiple_as_item_combinations[str(row)]['indices_in_rows'][current]))
            print()
            i_in_row = multiple_as_item_combinations[str(row)]['indices_in_rows'][current]
   
  
            # print(f'index before ++: {i_in_row}')
      
            multiple_as_item_combinations[str(row)]['current_index'] = multiple_as_item_combinations[str(row)]['current_index'] + 1

            # print(f'index after ++: {i_in_row}')
            
            # print(f"found combo for {str(row)}: {multiple_as_item_combinations[str(row)]}")
            # print(f'index after IF: {index}')
          else:
            i_in_row = result['as_item_combinations'].index(row)
            # print(f'index after ELSE: {index}')
        
        # print(f'post:{i_in_row}')
        result[key][i_in_row] = item['percentage']



print(f"Total datasets: {total}")
print()
# print result for checking
for key in result:
  print(f"Key {key}:\n Length: {len(result[key])},\n Values: {result[key]}")

For [{'label': 'jejunum', 'percentage': 0.103}] with {'occurences': 2, 'current_index': 0, 'indices_in_rows': [0, 98]}
	 The current index is 0 for {'occurences': 2, 'current_index': 0, 'indices_in_rows': [0, 98]}
	 This will return 0

For [{'label': 'descending colon', 'percentage': 0.883}] with {'occurences': 3, 'current_index': 0, 'indices_in_rows': [1, 92, 113]}
	 The current index is 0 for {'occurences': 3, 'current_index': 0, 'indices_in_rows': [1, 92, 113]}
	 This will return 1

For [{'label': 'superior part of duodenum', 'percentage': 0.026}, {'label': 'descending part of duodenum', 'percentage': 0.032}] with {'occurences': 3, 'current_index': 0, 'indices_in_rows': [2, 97, 114]}
	 The current index is 0 for {'occurences': 3, 'current_index': 0, 'indices_in_rows': [2, 97, 114]}
	 This will return 2

For [{'label': 'superior part of duodenum', 'percentage': 0.026}, {'label': 'descending part of duodenum', 'percentage': 0.032}] with {'occurences': 3, 'current_index': 1, 'indices_i

IndexError: list index out of range

## Prepare for export

In [131]:
# remove columns 
remove = ['samples', 'rui_location_id', 'datasets']
for key in remove:
  result.pop(key)

## Export to CSV

In [219]:
# remove columns before export
result.pop('dataset_id')

df = pd.DataFrame.from_dict(result)

# # Export DataFrame to CSV
df.to_csv('output/umap_as_percentage.csv', index=False)