# Code to generate 2 CSV files for UMAPs (by cell summary and by intersection percentage with anatomical structures)

## Import statements

In [5]:
import pandas as pd
import json
import re 

## Global variables

In [6]:
# Version string of the hra-pop release to read; it is interpolated into the
# "../../hra-pop/output-data/v<version>/" input path when the graph is loaded.
hra_pop_version = "0.8.3"

## Global methods

In [26]:
def strip_reference_organ(reference_organ, as_label):
    """Build a column header from a reference organ name and an AS label.

    Every occurrence of the predefined strings ``VHF`` and ``VHM`` is deleted
    from ``reference_organ`` in a single left-to-right pass; what remains is
    joined to ``as_label`` with ``' - '``.

    Args:
        reference_organ (string): an organ identifier (the caller in this
            file passes the part of the organ IRI that follows ``#``)
        as_label (string): a human readable label for an anatomical structure

    Returns:
        header (string): a column header of the form ``'<organ> - <as_label>'``
    """
    unwanted = ('VHF', 'VHM')
    # Escape each literal so it is matched verbatim, then OR them together.
    unwanted_regex = re.compile("|".join(re.escape(text) for text in unwanted))
    stripped_organ = unwanted_regex.sub('', reference_organ)
    return f'{stripped_organ} - {as_label}'

def get_partial_match_key(header, partial_string):
    """Return ``header`` when it contains ``partial_string``.

    The check is a plain, case-sensitive substring test.

    Args:
        header (string): the full key (column header) to test
        partial_string (string): the substring to look for inside ``header``

    Returns:
        header (string) if ``partial_string`` occurs in it, otherwise None
    """
    return header if partial_string in header else None

## Load enriched-dataset-graph

In [8]:
 # Link to atlas-enriched-dataset-graph
# Open the atlas-enriched-dataset-graph JSON-LD file (biomarkers and cell types)
# for the version selected by hra_pop_version and parse it.
# The context manager closes the file handle as soon as parsing is done; the
# encoding is given explicitly so the result does not depend on the platform
# default.
with open(
    "../../hra-pop/output-data/v" + hra_pop_version + "/atlas-enriched-dataset-graph.jsonld",
    encoding="utf-8",
) as atlas_enriched_dataset_graph_file:
    dataset_graph = json.load(atlas_enriched_dataset_graph_file)

## Main

In [99]:
# Initialize result; it is converted to a pandas DataFrame at the end and
# exported as CSV. Besides the three fixed keys below, one key per
# "<organ> - <AS label>" column header is added by the first pass.
result = {
    'samples': [],
    'rui_location_id': [],
    'organ': [],
}

# Keys of `result` that are not anatomical-structure (AS) percentage columns.
ignore_columns = ["samples", "rui_location_id", "organ"]

# First pass: construct the output columns, one empty list per distinct header.
for donor in dataset_graph['@graph']:
    for sample in donor['samples']:
        for collision_summary in sample['rui_location']['all_collisions']:
            for collision_item in collision_summary['collisions']:
                # construct AS column header
                header = strip_reference_organ(
                    collision_item['reference_organ'].split('#')[1],
                    collision_item['as_label'],
                )
                # add header if not present yet
                result.setdefault(header, [])

# The set of AS columns is fixed from here on, so compute it once instead of
# re-filtering the keys for every collision item.
as_columns = [key for key in result if key not in ignore_columns]

# Second pass: fill in the columns.
for donor in dataset_graph['@graph']:
    for sample in donor['samples']:
        # Dataset ids attached to the sample itself, followed by those
        # attached to each of its sections.
        associated_dataset_ids = [dataset['@id'] for dataset in sample['datasets']]
        associated_dataset_ids.extend(
            dataset['@id']
            for section in sample['sections']
            for dataset in section['datasets']
        )
        result['samples'].append({
            'sample_id': sample['@id'],
            'associated_dataset_ids': associated_dataset_ids,
            'count': len(associated_dataset_ids),
        })

        # get rui location id
        result['rui_location_id'].append(sample['rui_location']['@id'])

        # One output row per collision item: its organ plus a value for
        # every AS column.
        for collision_summary in sample['rui_location']['all_collisions']:
            for collision_item in collision_summary['collisions']:
                # get organ
                result['organ'].append(collision_item['reference_organ'].split('#')[1])

                as_label = collision_item['as_label']

                # NOTE(review): a column receives the percentage whenever its
                # header merely contains as_label as a substring, so the same
                # value can land in columns of other organs that share the
                # label. Confirm this is intended; an exact comparison against
                # the header built by strip_reference_organ would be stricter.
                for key in as_columns:
                    if get_partial_match_key(key, as_label) is not None:
                        result[key].append(collision_item['percentage'])
                    else:
                        # Non-matching columns get 0 so all columns stay the
                        # same length.
                        result[key].append(0)


## Print results

In [100]:
# Add up the per-sample dataset counts
total = sum(entry['count'] for entry in result['samples'])

print(f"Total datasets: {total}")
print()
# Dump every column together with its length for a visual sanity check
for key, values in result.items():
    print(f"Key {key}:\n Length: {len(values)},\n Values: {values}")

Total datasets: 553

Key samples:
 Length: 282,
 Values: [{'sample_id': 'https://entity.api.hubmapconsortium.org/entities/0b43d8d0dbbc5e3923a8b963650ab8e3', 'associated_dataset_ids': ['https://entity.api.hubmapconsortium.org/entities/3de525fe3e5718f297e8d62e037a042d'], 'count': 1}, {'sample_id': 'https://entity.api.hubmapconsortium.org/entities/3838f4672dd3d3d1ee6cb0809cf1ea03', 'associated_dataset_ids': ['https://entity.api.hubmapconsortium.org/entities/7edbff53248f2a2c8e74f5f955681734'], 'count': 1}, {'sample_id': 'https://entity.api.hubmapconsortium.org/entities/507bb20f36adc8a513fe31a913bcfde6', 'associated_dataset_ids': ['https://entity.api.hubmapconsortium.org/entities/11678046b4e78f95762acc47d3074dc5'], 'count': 1}, {'sample_id': 'https://entity.api.hubmapconsortium.org/entities/750691876232b043dd3b6e2b0098bdbf', 'associated_dataset_ids': ['https://entity.api.hubmapconsortium.org/entities/f6eb890063d13698feb11d39fa61e45a'], 'count': 1}, {'sample_id': 'https://entity.api.hubmapco

## Prepare for export

In [101]:
# Remove the per-sample bookkeeping lists before export: they hold one entry
# per sample, whereas the remaining columns hold one entry per collision item.
# pop() is given a default so that re-running this cell after the keys are
# already gone does not raise KeyError.
remove = ['samples', 'rui_location_id']
for key in remove:
    result.pop(key, None)

## Export to CSV

In [105]:
# Build a DataFrame whose columns are the keys of result
df = pd.DataFrame(data=result)

# Write the DataFrame to CSV without the row index
output_csv = 'output/umap_as_percentage.csv'
df.to_csv(output_csv, index=False)