# Data Processing: Harmonize Cell Types

> Process all files in the `data/data-processed-nodes` directory to harmonize cell types using the two cell type mapping crosswalks in the `data/mapping_files` directory. Processed files will be saved in the `data/data-processed-nodes-with-harmonized-cell-types` directory.

In [74]:
import numpy as np
import pandas as pd
import os
import json
import requests
import shutil

import warnings

# Silence all warnings for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

In [75]:
# Root data directory. Defaults to the original author's location; set the
# HRA_DATA_DIR environment variable to run the notebook against another
# checkout without editing this cell.
basepath = os.environ.get("HRA_DATA_DIR", "/u/yashjain/hra-cell-distance-analysis/data")

# Crosswalk workbooks, located in <basepath>/<mapping_files_dir>.
mapping_files_dir = "mapping_files"
mapping_file_1 = "cell_type_mappings_1.xlsx"
mapping_file_2 = "cell_type_mappings_2.xlsx"

# Input (orig) and output (dest) folders, both relative to basepath.
orig_filedir = "data-processed-nodes"
dest_filedir = "data-processed-nodes-with-harmonized-cell-types"

In [76]:
def load_data(path):
    """Read the CSV file at ``path`` and return its contents as a pandas DataFrame."""
    return pd.read_csv(path)

In [77]:
def create_directory(directory):
    """Create ``directory`` (with any missing parents) unless it already exists.

    A one-line status message is printed in both cases.
    """
    if os.path.exists(directory):
        print(f"Directory '{directory}' already exists.")
        return

    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")

In [78]:
# Show how many entries the source directory holds, followed by their names.
source_entries = os.listdir(os.path.join(basepath, orig_filedir))
print(len(source_entries), source_entries)

14 ['intestine-codex-stanford', 'tonsil-codex-stanford', 'esophagus-codex-stanford', 'colon-cycif-sorgerlab', 'colon-xenium-stanford', 'lymphnode-codex-yale', 'maternalfetalinterface-mibitof-stanford', 'oralcavity-codex-czi', 'pancreas-geomx-ufl', 'skin-celldive-ge', 'skin-confocal-sorgerlab', 'spleen-codex-ufl', 'lung-codex-urmc', 'bonemarrow-codex-chop']


In [79]:
# Start every run from an empty destination directory: any output left by a
# previous run is deleted before the directory is created again.
dest_root = os.path.join(basepath, dest_filedir)
dest_already_present = os.path.exists(dest_root)

if not dest_already_present:
    print(f"Directory '{dest_filedir}' does not exist and will be created.")
else:
    shutil.rmtree(dest_root)
    print(f"Directory '{dest_filedir}' already exists and has been removed. New directory will be created.")

# exist_ok=False: fail loudly if the directory is somehow still there.
os.makedirs(dest_root, exist_ok=False)

Directory 'data-processed-nodes-with-harmonized-cell-types' already exists and has been removed. New directory will be created.


## Mapping the First Cell Type Crosswalk

In [80]:
# sheet_name=None loads every worksheet of the workbook, giving a dict that
# maps each sheet name to its DataFrame.
primary_crosswalk_path = os.path.join(basepath, mapping_files_dir, mapping_file_1)
crosswalk_data = pd.read_excel(primary_crosswalk_path, sheet_name=None)
print(crosswalk_data.keys())

dict_keys(['intestine-codex-stanford', 'esophagus-codex-stanford', 'tonsil-codex-stanford', 'colon-xenium-stanford', 'spleen-codex-ufl', 'colon-cycif-sorgerlab', 'lung-codex-urmc', 'lymphnode-codex-yale', 'skin-celldive-ge', 'skin-confocal-sorgerlab', 'maternalfetalinterface-mibitof', 'pancreas-geomx-ufl', 'BuccalMuscosaMinorSalivaryGland', 'Parotid Major Salivary Gland Su', 'Gingiva-PhenocyclerFusion', 'Tongue-PhenocyclerFusion', 'bonemarrow-codex-chop'])


In [81]:
# Combine the four oral cavity sheets into a single crosswalk stored under
# "oralcavity-codex-czi". When the same 'Cell Type' occurs in several sheets,
# only its first occurrence is kept.
oral_cavity_sheets = [
    'BuccalMuscosaMinorSalivaryGland',
    'Parotid Major Salivary Gland Su',
    'Gingiva-PhenocyclerFusion',
    'Tongue-PhenocyclerFusion',
]
oral_cavity_frames = [crosswalk_data[sheet] for sheet in oral_cavity_sheets]
crosswalk_data["oralcavity-codex-czi"] = pd.concat(oral_cavity_frames).drop_duplicates(subset=['Cell Type'])

# Number of rows in the merged crosswalk (displayed as the cell output).
len(crosswalk_data["oralcavity-codex-czi"])

39

In [82]:
# Keep one row per 'Cell Type' in the lung crosswalk (the first occurrence wins).
lung_crosswalk = crosswalk_data["lung-codex-urmc"]
crosswalk_data["lung-codex-urmc"] = lung_crosswalk.drop_duplicates(subset=['Cell Type'])

In [83]:
# Dataset folder name -> crosswalk sheet name.
# Almost every dataset uses a sheet named exactly like its folder, so the
# mapping is built as an identity map and the single exception is patched in.
dataset_crosswalk_mapping = {
    name: name
    for name in (
        "colon-cycif-sorgerlab",
        "esophagus-codex-stanford",
        "intestine-codex-stanford",
        "lung-codex-urmc",  # NOTE: Structure of the lung-codex-urmc crosswalk is different from other crosswalks.
        "maternalfetalinterface-mibitof-stanford",
        "skin-celldive-ge",
        "skin-confocal-sorgerlab",
        "spleen-codex-ufl",
        "tonsil-codex-stanford",
        "colon-xenium-stanford",
        "lymphnode-codex-yale",
        "oralcavity-codex-czi",  # Merged crosswalk built in an earlier cell, not one of the individual oral cavity sheets.
        "pancreas-geomx-ufl",
        "bonemarrow-codex-chop",
    )
}

# The only sheet whose name differs from the dataset folder name.
dataset_crosswalk_mapping["maternalfetalinterface-mibitof-stanford"] = "maternalfetalinterface-mibitof"

In [84]:
# Cleanup crosswalk data: normalize the 'CL Label' column of every sheet.

def _unique_values_in_labelled_sheets(sheets, column):
    """Collect the distinct non-null values of ``column`` across all sheets.

    Only sheets that have a 'CL Label' column are considered.

    Parameters:
        sheets: dict mapping sheet name -> DataFrame.
        column: name of the column whose values are collected.

    Returns:
        set of the distinct non-null values found.
    """
    values = set()
    for sheet in sheets.values():
        if 'CL Label' in sheet.columns:
            values.update(sheet[column].dropna().unique())
    return values


# NOTE(review): the "before" total counts raw 'Cell Type' names while the
# "after" total counts cleaned 'CL Label' values, so the two figures are not a
# like-for-like measure of what the cleanup changed. Kept as-is so the reported
# numbers stay comparable with earlier runs; confirm which metric is intended.
print("Before cleanup:")
all_cell_types = _unique_values_in_labelled_sheets(crosswalk_data, 'Cell Type')
print(f"Total unique cell types: {len(all_cell_types)}")

for key in crosswalk_data.keys():
    sheet = crosswalk_data[key]

    if 'CL Label' in sheet.columns:
        # Treat missing labels as empty strings so the string methods below
        # only ever run on the rows selected by the mask.
        sheet['CL Label'] = sheet['CL Label'].fillna('')

        has_label = sheet['CL Label'] != ''
        if has_label.any():
            # Lowercase, trim surrounding whitespace, and replace 'cells' with 'cell'.
            sheet.loc[has_label, 'CL Label'] = (
                sheet.loc[has_label, 'CL Label']
                .str.lower()
                .str.strip()
                .str.replace('cells', 'cell', regex=False)
            )

    # A second pass used to follow here under the comment "Clean Cell Type
    # values", but it only stripped 'CL Label' again and changed nothing, so it
    # was removed. NOTE(review): 'Cell Type' itself is never normalized; confirm
    # whether it should be, since it is the merge key used on the node files.

    # Convert empty strings back to NaN (applies to every column of the sheet).
    crosswalk_data[key] = sheet.replace('', np.nan)

print("\nAfter cleanup:")
all_cell_types = _unique_values_in_labelled_sheets(crosswalk_data, 'CL Label')
print(f"Total unique cell types: {len(all_cell_types)}")

Before cleanup:
Total unique cell types: 294

After cleanup:
Total unique cell types: 159


In [85]:

# Apply the first crosswalk to every "*-nodes.csv" file under the source
# directory and write the result to the same dataset folder name under the
# destination directory.
for dirpath, dirnames, filenames in os.walk(os.path.join(basepath, orig_filedir)):
    node_files = [f for f in filenames if f.endswith("-nodes.csv")]
    if not node_files:
        continue

    # The dataset name is the folder that directly contains the node files.
    dataset_name = os.path.basename(dirpath)

    if dataset_name not in dataset_crosswalk_mapping:
        # These files used to be left out of the output without any message.
        print(f"Skipping dataset '{dataset_name}': no crosswalk mapping defined; {len(node_files)} node file(s) not harmonized.")
        continue

    crosswalk = crosswalk_data[dataset_crosswalk_mapping[dataset_name]]
    dest_dir = os.path.join(basepath, dest_filedir, dataset_name)

    for filename in node_files:
        nodes_df = load_data(os.path.join(dirpath, filename))
        rows_before_merge = nodes_df.shape[0]

        # Keep the untouched name in a new column ("Original Cell Type").
        nodes_df['Original Cell Type'] = nodes_df['Cell Type']

        # Left-merge on 'Cell Type': every node row is kept and all crosswalk
        # columns are attached. Where a 'CL Label' exists it replaces the cell
        # type name; otherwise the original name is kept.
        nodes_df = pd.merge(nodes_df, crosswalk, on='Cell Type', how='left')
        nodes_df['Cell Type'] = np.where(nodes_df['CL Label'].isnull(), nodes_df['Cell Type'], nodes_df['CL Label'])

        # 'CL Label' now lives in 'Cell Type'; the other crosswalk columns stay.
        nodes_df = nodes_df.drop(columns=['CL Label'])

        # Same normalization as the crosswalk cleanup: lowercase, trim
        # whitespace, and replace "cells" with "cell".
        nodes_df['Cell Type'] = (
            nodes_df['Cell Type']
            .str.lower()
            .str.strip()
            .str.replace("cells", "cell", regex=False)
        )

        # A left merge adds rows when the crosswalk lists a 'Cell Type' more
        # than once, so report any change in row count and name the file.
        if nodes_df.shape[0] != rows_before_merge:
            print(f"Row count changed for '{dataset_name}/{filename}':")
            print(f"Number of rows before merge: {rows_before_merge}")
            print(f"Number of rows after merge: {nodes_df.shape[0]}")

        # Create destination directory for the dataset if it doesn't exist.
        create_directory(dest_dir)

        # Write the updated nodes_df to a new file.
        nodes_df.to_csv(os.path.join(dest_dir, filename), index=False)

Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes-with-harmonized-cell-types/intestine-codex-stanford' created successfully.
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes-with-harmonized-cell-types/intestine-codex-stanford' already exists.
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes-with-harmonized-cell-types/intestine-codex-stanford' already exists.
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes-with-harmonized-cell-types/intestine-codex-stanford' already exists.
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes-with-harmonized-cell-types/intestine-codex-stanford' already exists.
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes-with-harmonized-cell-types/intestine-codex-stanford' already exists.
Directory '/u/yashjain/hra-cell-distance-analysis/data/data-processed-nodes-with-harmonized-cell-types/intestine-codex-stanf

## Mapping the Second Cell Type Crosswalk

In [86]:
# Load the second crosswalk workbook (pandas reads the first worksheet by default).
secondary_crosswalk_path = os.path.join(basepath, mapping_files_dir, mapping_file_2)
secondary_crosswalk_data = pd.read_excel(secondary_crosswalk_path)

In [87]:
# Tidy the secondary crosswalk: drop the 'Tissue Type' and 'Organ_ID' columns,
# then keep a single row per 'Original Cell Type', reporting the unique-value
# and row counts before and after the de-duplication.
secondary_crosswalk_data = secondary_crosswalk_data.drop(columns=['Tissue Type', 'Organ_ID'])

# Counts before de-duplication.
secondary_unique_before = secondary_crosswalk_data['Original Cell Type'].nunique()
secondary_rows_before = secondary_crosswalk_data.shape[0]
print(f"Number of unique cell types in secondary crosswalk data: {secondary_unique_before}")
print(f"Number of rows in secondary crosswalk data: {secondary_rows_before}")

# The first row seen for each 'Original Cell Type' is the one that is kept.
secondary_crosswalk_data = secondary_crosswalk_data.drop_duplicates(subset=['Original Cell Type'])

# Counts after de-duplication.
secondary_unique_after = secondary_crosswalk_data['Original Cell Type'].nunique()
secondary_rows_after = secondary_crosswalk_data.shape[0]
print(f"Number of unique cell types in secondary crosswalk data after removing duplicates: {secondary_unique_after}")
print(f"Number of rows in secondary crosswalk data after removing duplicates: {secondary_rows_after}")

Number of unique cell types in secondary crosswalk data: 239
Number of rows in secondary crosswalk data: 309
Number of unique cell types in secondary crosswalk data after removing duplicates: 239
Number of rows in secondary crosswalk data after removing duplicates: 239


In [88]:
# Bring in secondary crosswalk data with multiple merge levels.
# Each harmonized "*-nodes.csv" file is read, merged with the secondary
# crosswalk, and written back to the same path (the file is overwritten).
for dirpath, dirnames, filenames in os.walk(os.path.join(basepath, dest_filedir)):
    for filename in [f for f in filenames if f.endswith("-nodes.csv")]:
        file_path = os.path.join(dirpath, filename)
        nodes_df = load_data(file_path)

        # This step drops 'Cell Type' and overwrites the file, so a file that
        # no longer has the column has already been through it. Skip it
        # instead of failing with a KeyError when the cell is run again.
        if 'Cell Type' not in nodes_df.columns:
            print(f"Skipping '{file_path}': no 'Cell Type' column found (secondary crosswalk already applied?).")
            continue

        rows_before_merge = nodes_df.shape[0]

        # Match the harmonized 'Cell Type' of each node against the
        # 'Original Cell Type' key of the secondary crosswalk. Both frames have
        # an 'Original Cell Type' column, so pandas suffixes them with _x
        # (nodes) and _y (crosswalk).
        nodes_df = pd.merge(nodes_df, secondary_crosswalk_data, left_on='Cell Type', right_on='Original Cell Type', how='left')

        # Restore the name of the nodes' own 'Original Cell Type' column.
        nodes_df = nodes_df.rename(columns={'Original Cell Type_x': 'Original Cell Type'})

        # Remove the crosswalk key and the intermediate 'Cell Type' column;
        # errors='ignore' skips any of them that is not present.
        nodes_df = nodes_df.drop(columns=['Original Cell Type_y', 'Cell Type'], errors='ignore')

        # Check if number of rows before and after merge are same, and name
        # the file when they are not.
        if nodes_df.shape[0] != rows_before_merge:
            print(f"Row count changed for '{file_path}':")
            print(f"Number of rows before merge: {rows_before_merge}")
            print(f"Number of rows after merge: {nodes_df.shape[0]}")

        # Write the updated nodes_df back to the same file.
        nodes_df.to_csv(file_path, index=False)

In [89]:
# Announce that both crosswalk passes have finished.
completion_message = "All datasets processed and saved with harmonized cell types."
print(completion_message)

All datasets processed and saved with harmonized cell types.


## Data Validation

In [90]:
# Data Validation: for each column of interest, gather the distinct non-null
# values found across all harmonized node files and print how many there are.
columns = ["Original Cell Type", "Level Three Cell Type", "Level Two Cell Type", "Level One Cell Type"]

unique_counts = {}
for col in columns:
    unique_counts[col] = set()

harmonized_root = os.path.join(basepath, dest_filedir)
for dirpath, dirnames, filenames in os.walk(harmonized_root):
    for filename in filenames:
        if not filename.endswith("-nodes.csv"):
            continue
        nodes_df = load_data(os.path.join(dirpath, filename))
        # Columns that a file does not have are simply skipped.
        present_columns = [col for col in columns if col in nodes_df.columns]
        for col in present_columns:
            unique_counts[col].update(nodes_df[col].dropna().unique())

for col, seen_values in unique_counts.items():
    print(f"Total unique values in '{col}': {len(seen_values)}")

Total unique values in 'Original Cell Type': 295
Total unique values in 'Level Three Cell Type': 162
Total unique values in 'Level Two Cell Type': 57
Total unique values in 'Level One Cell Type': 8


In [91]:
# Data Validation: total number of rows over all "*-nodes.csv" files in the
# original directory versus the harmonized directory. Both totals should match.
orig_nodes_count = sum(
    load_data(os.path.join(dirpath, filename)).shape[0]
    for dirpath, _dirnames, filenames in os.walk(os.path.join(basepath, orig_filedir))
    for filename in filenames
    if filename.endswith("-nodes.csv")
)
harmonized_nodes_count = sum(
    load_data(os.path.join(dirpath, filename)).shape[0]
    for dirpath, _dirnames, filenames in os.walk(os.path.join(basepath, dest_filedir))
    for filename in filenames
    if filename.endswith("-nodes.csv")
)

print(f"Total number of nodes in data-processed-nodes: {orig_nodes_count}")
print(f"Total number of nodes in data-processed-nodes-with-harmonized-cell-types: {harmonized_nodes_count}")

# Harmonized minus original.
node_count_difference = harmonized_nodes_count - orig_nodes_count
print(f"Difference in number of nodes: {node_count_difference}")

Total number of nodes in data-processed-nodes: 47349496
Total number of nodes in data-processed-nodes-with-harmonized-cell-types: 47349496
Difference in number of nodes: 0


In [92]:
# Data Validation: compare the row count of every "*-nodes.csv" file in the
# original directory with its counterpart in the harmonized directory, and
# report each file whose count differs or that exists on one side only.

def _count_rows_per_file(root):
    """Map every "*-nodes.csv" file under ``root`` to its number of rows.

    Keys are file paths relative to ``root`` (dataset folder + file name), so
    files with the same name in different dataset folders stay distinct rather
    than overwriting one another.

    Parameters:
        root: directory that is walked recursively.

    Returns:
        dict mapping relative file path -> row count.
    """
    counts = {}
    for dirpath, dirnames, filenames in os.walk(root):
        for filename in filenames:
            if filename.endswith("-nodes.csv"):
                full_path = os.path.join(dirpath, filename)
                counts[os.path.relpath(full_path, root)] = load_data(full_path).shape[0]
    return counts


orig_nodes_count_per_file = _count_rows_per_file(os.path.join(basepath, orig_filedir))
harmonized_nodes_count_per_file = _count_rows_per_file(os.path.join(basepath, dest_filedir))

# Files present in the original directory: the row count must match.
for relative_path, orig_rows in orig_nodes_count_per_file.items():
    if relative_path not in harmonized_nodes_count_per_file:
        print(f"File '{relative_path}' not found in harmonized directory.")
        continue
    harmonized_rows = harmonized_nodes_count_per_file[relative_path]
    if orig_rows != harmonized_rows:
        print(f"File '{relative_path}' has different number of rows: {orig_rows} (original) vs {harmonized_rows} (harmonized)")

# Files that exist only in the harmonized directory.
for relative_path in harmonized_nodes_count_per_file:
    if relative_path not in orig_nodes_count_per_file:
        print(f"File '{relative_path}' not found in original directory.")

# Print final message
print("Data validation completed. Check the output for any discrepancies in row counts between original and harmonized datasets.")

Data validation completed. Check the output for any discrepancies in row counts between original and harmonized datasets.


In [93]:
# Scan every harmonized node file and report the dataset name once for each
# file that contains at least one missing (null) value in any column.
harmonized_dir = os.path.join(basepath, dest_filedir)
for dirpath, dirnames, filenames in os.walk(harmonized_dir):
    for filename in filenames:
        if not filename.endswith("-nodes.csv"):
            continue
        dataset_name = dirpath.split("/")[-1]
        nodes_df = load_data(os.path.join(dirpath, filename))
        contains_nulls = nodes_df.isnull().values.any()
        if contains_nulls:
            print(f"Missing values found in {dataset_name}")

print("NOTE: Bone marrow dataset is expected to be flagged here since the cell type mutant blast:npm1 does not have a corresponding value in CL Label/ID/Match columns for Level Three Cell Type.\nThis should not impact any downstream analysis.\nIf any other datasets are flagged, then investigate further.")

Missing values found in bonemarrow-codex-chop
Missing values found in bonemarrow-codex-chop
Missing values found in bonemarrow-codex-chop
Missing values found in bonemarrow-codex-chop
Missing values found in bonemarrow-codex-chop
NOTE: Bone marrow dataset is expected to be flagged here since the cell type mutant blast:npm1 does not have a corresponding value in CL Label/ID/Match columns for Level Three Cell Type.
This should not impact any downstream analysis.
If any other datasets are flagged, then investigate further.
