### Workflow

1. FTU segmentation
2. Compare FTU's cell type information between Azimuth, PopV and Celltypist

In [None]:
#!pip install opencv-python

In [None]:
import cv2
import numpy as np
from skimage import filters, measure, color, morphology
import matplotlib.pyplot as plt

# Load image
image_path = "D:\\vostro\\Degree\\Masters\\CNS_Project\\Release_4\\issue-231\\PNG\\2d-ftu-small-intestine-villus.png"
image = cv2.imread(image_path)
gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Apply Gaussian Blur
blurred = cv2.GaussianBlur(gray_image, (5, 5), 0)

# Edge detection
edges = cv2.Canny(blurred, 500, 150)

# Thresholding to create a binary image
thresh = filters.threshold_otsu(blurred)
binary = gray_image > thresh

# Remove small objects
cleaned = morphology.remove_small_objects(binary, min_size=500)

# Label the image
labeled = measure.label(cleaned)
image_label_overlay = color.label2rgb(labeled, image=image, bg_label=0)

# Count the number of unique labels (cells)
num_cells = np.max(labeled)

# Plot the images
fig, ax = plt.subplots(1, 4, figsize=(20, 5))  # Adjust subplot size
ax[0].imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
ax[0].set_title('Original Image')
ax[0].axis('off')

ax[1].imshow(edges, cmap='gray')
ax[1].set_title('Edges')
ax[1].axis('off')

ax[2].imshow(image_label_overlay)
ax[2].set_title('Labeled Cells')
ax[2].axis('off')

# Display counts on a new subplot
ax[3].text(0.5, 0.5, f'Total Cells: {num_cells}', horizontalalignment='center', verticalalignment='center', fontsize=12, color='red')
ax[3].axis('off')

plt.show()

In [None]:
import cv2
import numpy as np
from skimage import measure, color
import matplotlib.pyplot as plt

def segment_and_count_cells(image_path):
    # Load image
    image = cv2.imread(image_path)
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Enhance contrast using CLAHE
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_enhanced = clahe.apply(gray_image)

    # Apply Gaussian Blur to smooth out the image
    blurred = cv2.GaussianBlur(contrast_enhanced, (5, 5), 0)

    # Adaptive Thresholding to create a binary image
    thresh = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                   cv2.THRESH_BINARY_INV, 11, 2)

    # Morphological operations to clean the image
    kernel = np.ones((3, 3), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=2)

    # Create sure background area by dilating the opening
    sure_bg = cv2.dilate(opening, kernel, iterations=3)

    # Finding sure foreground area using the distance transform and thresholding
    dist_transform = cv2.distanceTransform(opening, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(dist_transform, 0.2 * dist_transform.max(), 255, 0)

    # Finding unknown region (borders)
    sure_fg = np.uint8(sure_fg)
    unknown = cv2.subtract(sure_bg, sure_fg)

    # Label markers for Watershed
    _, markers = cv2.connectedComponents(sure_fg)
    markers = markers + 1
    markers[unknown == 255] = 0

    # Apply the Watershed algorithm
    markers = cv2.watershed(image, markers)
    image[markers == -1] = [255, 0, 0]  # Mark boundaries in red

    # Count cells
    unique_markers = np.unique(markers)
    cell_count = len(unique_markers) - 1  # exclude background

    # Display results
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title('Original Image')
    plt.axis('off')

    plt.subplot(1, 2, 2)
    plt.imshow(color.label2rgb(markers, image=image, bg_label=0), cmap='jet')
    plt.title(f'Segmented Image - Cells Counted: {cell_count}')
    plt.axis('off')

    plt.show()

    return cell_count

# Path to your image
image_path = "D:\\vostro\\Degree\\Masters\\CNS_Project\\Release_4\\issue-231\\PNG\\2d-ftu-kidney-renal-corpuscle.png"
total_cells = segment_and_count_cells(image_path)
print(f"Total cells counted: {total_cells}")


### Get the Cell annotation tool for cell types in FTUs

In [None]:
import pandas as pd
import requests

# Function to fetch and extract data from JSON link
def fetch_json_data(url):
    response = requests.get(url)
    json_data = response.json()
    return json_data

# Load the CSV
data = pd.read_csv("C:\\Users\\Supriya\\Downloads\\Query-23.csv")

# Prepare columns for 'origin', 'dataset_type', and 'dataset_info'
data['origin'] = ''
data['dataset_type'] = ''
data['dataset_info'] = ''

# Iterate over the rows and fetch JSON data
for index, row in data.iterrows():
    print(f"Processing dataset {index+1}/{len(data)}: {row['dataset']}")
    try:
        json_data = fetch_json_data(row['dataset'])

        # Extract required fields
        origins = [d['origin'] for d in json_data.get('ingest_metadata', {}).get('dag_provenance_list', [])]
        data.at[index, 'origin'] = ', '.join(origins)
        data.at[index, 'dataset_type'] = json_data.get('dataset_type', '')
        data.at[index, 'dataset_info'] = json_data.get('dataset_info', '')
    except Exception as e:
        print(f"Failed to process dataset at {row['dataset']} due to an error: {e}")

# Save the modified dataframe to a new CSV file
data.to_csv('C:\\Users\\Supriya\\Downloads\\updated.csv', index=False)


In [None]:
import pandas as pd
import json

# Load JSON data
with open("C:\\Users\\Supriya\\Downloads\\crosswalks.jsonld", 'r') as file:
    json_data = json.load(file)

print(json_data.keys())
# Assuming json_data is structured as a dictionary with necessary keys
json_df = pd.DataFrame(json_data['@graph'])

# Convert the provided cell ID list into a DataFrame
cell_ids_list = [
    "CL:1001107", "CL:1001111", "CL:1000768", "CL:1001106", "CL:0002306",
    "CL:1000850", "CL:1000742", "CL:1000850", "CL:0000653", "CL:0002306",
    "CL:1001005", "CL:1001099", "CL:1000452", "CL:1001096", "CL:1000714",
    "CL:4030021", "CL:4030020", "CL:1001033", "CL:1000718", "CL:1001033",
    "CL:1000717", "CL:1000716", "CL:1001033", "CL:1001107", "CL:1001131",
    "CL:1001111", "CL:1001285", "CL:1001107", "CL:1001131", "CL:4033024",
    "CL:4033022", "CL:4033023", "CL:0000313", "CL:4033003", "CL:0002063",
    "CL:0009089", "CL:0002062", "CL:4028004", "CL:4033016", "CL:4028002",
    "CL:1001433", "CL:0002064", "CL:0002080", "CL:0019032", "CL:1000322",
    "CL:1001433", "CL:0002080", "CL:1000321", "CL:0002071", "CL:0000165",
    "CL:0009016", "CL:0019032", "CL:0000171", "CL:0000169", "CL:0000173",
    "CL:0002275", "CL:0002144", "CL:0000235", "CL:0000115", "CL:0000649",
    "CL:0002457", "CL:0000646", "CL:0000242", "CL:1000458", "CL:0009100",
    "CL:0019018", "CL:1000413", "CL:0000241", "CL:0000075", "CL:0002543",
    "CL:1000398", "CL:0000632", "CL:0000091", "CL:0000182", "CL:1000301",
    "CL:0000669", "CL:1000487", "CL:1000299", "CL:2000059", "CL:0002340",
    "CL:0002341", "CL:0000359", "CL:0000809", "CL:0000942", "CL:0000883",
    "CL:0009077", "CL:0001070", "CL:0009071", "CL:0000071", "CL:0000787",
    "CL:0009070", "CL:0000942", "CL:0000738", "CL:0000882", "CL:0000788",
    "CL:0001055", "CL:0000442", "CL:0000980", "CL:0000775", "CL:0000877",
    "CL:0000872", "CL:0000874", "CL:0001056", "CL:0000037", "CL:0001055",
    "CL:0000442", "CL:0000980", "CL:0000775", "CL:0000876", "CL:0000192",
    "CL:2000053", "CL:0000499", "CL:1000489", "CL:0000623", "CL:0000057",
    "CL:0009080", "CL:0000584", "CL:0000097", "CL:1000324", "CL:0000186",
    "CL:0000786", "CL:0000451", "CL:0000236", "CL:0000771", "CL:0000235",
    "CL:0001065", "CL:0000163"
]

# Create DataFrame for the cell IDs
cell_ids_df = pd.DataFrame(cell_ids_list, columns=['CT ID in CL'])

# Match these IDs with the JSON DataFrame we previously extracted and parsed
matched_df = cell_ids_df.merge(json_df, left_on='CT ID in CL', right_on='cell_id', how='left')

print(matched_df.head())

# Create new columns for 'azimuth', 'popv', and 'celltypist'
# Populate 'azimuth', 'popv', and 'celltypist' based on the 'tool' from the JSON
# Here, we assume any valid 'tool' entry means a match (adjust as necessary based on actual tool names)
matched_df['azimuth'] = matched_df['tool'].apply(lambda x: 1 if x == 'azimuth' else 0)
matched_df['popv'] = matched_df['tool'].apply(lambda x: 1 if x == 'popv' else 0)
matched_df['celltypist'] = matched_df['tool'].apply(lambda x: 1 if x == 'celltypist' else 0)

In [None]:
# Selecting relevant columns for the final CSV
final_output_df = matched_df[['organ_level', 'organ_id', 'cell_label', 'CT ID in CL', 'azimuth', 'popv', 'celltypist']].fillna(0)

print(final_output_df.head())

# Show the first few rows of the final DataFrame to verify the content before saving it
output_csv_path = 'C:\\Users\\Supriya\\Downloads\\Matched_Cell_IDs.csv'
final_output_df.to_csv(output_csv_path, index=False)

In [None]:
import pandas as pd

# Load the CSV files from the uploaded paths
azimuth_df = pd.read_csv('C:\\Users\\Supriya\\Downloads\\azimuth.csv')
popv_df = pd.read_csv('C:\\Users\\Supriya\\Downloads\\popv.csv')
celltypist_df = pd.read_csv('C:\\Users\\Supriya\\Downloads\\celltypist.csv')
ftu_cell_count_df = pd.read_csv('C:\\Users\\Supriya\\Downloads\\FTU Cell Count Table - Cell_Type_Count.csv')

# Display the first few rows of each dataframe to inspect their structure
print("Azimuth DataFrame Sample:")
print(azimuth_df.head())

print("\nPOPV DataFrame Sample:")
print(popv_df.head())

print("\nCelltypist DataFrame Sample:")
print(celltypist_df.head())

print("\nFTU Cell Count DataFrame Sample:")
print(ftu_cell_count_df.head())


In [None]:
# Function to extract unique levels from the organ_level column
def extract_levels(df, tool):
    levels = df['Organ_Level'].str.split('_').apply(lambda x: x[-1] if isinstance(x, list) else None).unique()
    if tool == 'popv':
        levels_with_suffix = [f'{tool}_{level}' for level in levels if level and 'L' in level]
        has_suffix = df['Organ_Level'].str.contains('L', na=False).any()
        return levels_with_suffix, has_suffix
    return [f'{tool}_{level}' for level in levels if level], True

azimuth_levels, _ = extract_levels(azimuth_df, 'azimuth')
popv_levels, popv_has_suffix = extract_levels(popv_df, 'popv')
celltypist_levels, _ = extract_levels(celltypist_df, 'celltypist')


# Print the extracted levels for debugging
print("\nAzimuth Levels:", azimuth_levels)
print("\nPOPV Levels:", popv_levels)
print("\nCelltypist Levels:", celltypist_levels)


In [None]:
# Add 'popv' if no _Lx suffix is found in any organ_level for popv
if not popv_has_suffix:
    popv_levels.append('popv')

# Combine all levels into a set to avoid duplicates
all_levels = set(azimuth_levels + popv_levels + celltypist_levels)

# Initialize the result dataframe with the fixed columns
result_df = pd.DataFrame(columns=['Organ', 'FTU Label in Uberon', 'FTU ID in Uberon', 'CL_id'] + list(all_levels))

# Iterate through the rows of the FTU Cell Count Table
for _, row in ftu_cell_count_df.iterrows():
    organ = row['Organ']
    ftu_label = row['FTU Label in Uberon']
    ftu_id = row['FTU ID in Uberon']
    cell_id = row['CT ID in CL']

    # Find matching organ_level in azimuth, popv, celltypist
    azimuth_row = azimuth_df[azimuth_df['Organ_Level'].str.contains(organ, na=False)]
    popv_row = popv_df[popv_df['Organ_Level'].str.contains(organ, na=False)]
    celltypist_row = celltypist_df[celltypist_df['Organ_Level'].str.contains(organ, na=False)]

    # Check if cell_id matches in azimuth, popv, celltypist
    azimuth_cell = azimuth_row[azimuth_row['CL_ID'] == cell_id]
    popv_cell = popv_row[popv_row['CL_ID'] == cell_id]
    celltypist_cell = celltypist_row[celltypist_row['CL_ID'] == cell_id]

    # Create a new row dictionary with all levels set to 0
    new_row = {level: 0 for level in all_levels}
    new_row.update({
        'Organ': organ,
        'FTU Label in Uberon': ftu_label,
        'FTU ID in Uberon': ftu_id,
        'CL_id': cell_id
    })

    # Populate 0 or 1 based on match
    for level in azimuth_levels:
        if not azimuth_cell.empty and any(azimuth_cell['Organ_Level'].str.contains(level.split('_')[-1])):
            new_row[level] = 1
    for level in popv_levels:
        if level == 'popv':
            if not popv_cell.empty and all(~popv_cell['Organ_Level'].str.contains('_')):
                new_row[level] = 1
        else:
            if not popv_cell.empty and any(popv_cell['Organ_Level'].str.contains(level.split('_')[-1])):
                new_row[level] = 1
    for level in celltypist_levels:
        if not celltypist_cell.empty and any(celltypist_cell['Organ_Level'].str.contains(level.split('_')[-1])):
            new_row[level] = 1

    # Append the row to the result dataframe using pd.concat
    result_df = pd.concat([result_df, pd.DataFrame([new_row])], ignore_index=True)

# Save the result dataframe to a CSV file
result_path = 'C:\\Users\\Supriya\\Downloads\\combined_results_NEW.csv'
result_df.to_csv(result_path, index=False)

# Return the result dataframe for inspection
result_df.head() #, result_path

In [None]:
# Get the CL_IDs in each dataframe
azimuth_cl_ids = set(azimuth_df['CL_ID'])
celltypist_cl_ids = set(celltypist_df['CL_ID'])
popv_cl_ids = set(popv_df['CL_ID'])
ftu_cl_ids = set(ftu_cell_count_df['CT ID in CL'])

# Find the common CL_IDs in each dataframe with ftu_cell_count_df
common_azimuth = ftu_cl_ids.intersection(azimuth_cl_ids)
common_celltypist = ftu_cl_ids.intersection(celltypist_cl_ids)
common_popv = ftu_cl_ids.intersection(popv_cl_ids)

# Print the number of common CL_IDs in each case
num_common_azimuth = len(common_azimuth)
num_common_celltypist = len(common_celltypist)
num_common_popv = len(common_popv)

print(f"Number of common CL_IDs in azimuth: {num_common_azimuth}")
print(f"Number of common CL_IDs in celltypist: {num_common_celltypist}")
print(f"Number of common CL_IDs in popv: {num_common_popv}")

In [10]:
import pandas as pd

# Load the provided files
popv = pd.read_csv('C:\\Users\\Supriya\\Downloads\\popv.csv')
celltypist = pd.read_csv('C:\\Users\\Supriya\\Downloads\\celltypist.csv')
azimuth = pd.read_csv('C:\\Users\\Supriya\\Downloads\\azimuth.csv')
ftu_cell_count = pd.read_csv('C:\\Users\\Supriya\\Downloads\\FTU Cell Count Table - Cell_Type_Count.csv')

# Define the mapping dictionary
organ_to_uberon = {
    'Kidney': 'UBERON:0002113',
    'Lung': 'UBERON:0002048',
    'Pancreas': 'UBERON:0001264',
    'Large Intestine': 'UBERON:0002107',
    'Skin': 'UBERON:0002097',
    'Liver': 'UBERON:0002108',
    'Prostate': 'UBERON:0002367',
    'Thymus': 'UBERON:0002371',
    'Spleen': 'UBERON:0002370',
    'Small Intestine': 'UBERON:0002106'
}

# Filter FTU cell count to include only rows where `CT ID in CL` contains `CL`
ftu_cell_count_filtered = ftu_cell_count[ftu_cell_count['CT ID in CL'].str.contains('CL', na=False)]

# Map the organ names in FTU cell count table to UBERON IDs
ftu_cell_count_filtered.loc[:, 'Organ_ID'] = ftu_cell_count_filtered['Organ'].map(organ_to_uberon)

# Prepare a common structure to hold the results
result = []

# Iterate over the filtered FTU cell count data
for idx, ftu_row in ftu_cell_count_filtered.iterrows():
    organ = ftu_row['Organ_ID']
    ftu_label = ftu_row['FTU Label in Uberon']
    ftu_id = ftu_row['FTU ID in Uberon']
    cl_id = ftu_row['CT ID in CL']
    cl_label = ftu_row['CT Label in CL']
    
    # Initialize match columns
    match_info = {
        'Organ': ftu_row['Organ'],
        'FTU Label in Uberon': ftu_label,
        'FTU ID in Uberon': ftu_id,
        'CL_id': cl_id,
        'CT Label in CL': cl_label,
        'popv': 0,
        'celltypist_L1': 0,
        'celltypist_pkl': 0,
        'azimuth_L1': 0,
        'azimuth_L2': 0,
        'azimuth_L3': 0,
        'azimuth_L4': 0,
        'azimuth_L5': 0,
        'azimuth_level' : 0
    }
    
    # Check in celltypist
    celltypist_matches = celltypist[(celltypist['CL_ID'] == cl_id) & (celltypist['Organ_ID'] == organ)]
    for _, ct_match in celltypist_matches.iterrows():
        level = ct_match['Organ_Level'].split('_')[-1]  # Extract the level number
        match_info[f'celltypist_{level}'] = 1

    # Check in azimuth
    azimuth_matches = azimuth[(azimuth['CL_ID'] == cl_id) & (azimuth['Organ_ID'] == organ)]
    for _, az_match in azimuth_matches.iterrows():
        level = az_match['Organ_Level'].split('_')[-1]  # Extract the level number
        match_info[f'azimuth_{level}'] = 1

    # Check in popv
    popv_matches = popv[(popv['CL_ID'] == cl_id) & (popv['Organ_ID'] == organ)]
    if not popv_matches.empty:
        match_info['popv'] = 1

    # Append to results
    result.append(match_info)

# Create a dataframe from the results
result_df = pd.DataFrame(result)

# Save the result to a CSV file
output_path = 'C:\\Users\\Supriya\\Downloads\\FTU_Cell_Count_Annotated_1.csv'
result_df.to_csv(output_path, index=False)

print(f"File saved to {output_path}")


File saved to C:\Users\Supriya\Downloads\FTU_Cell_Count_Annotated_1.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ftu_cell_count_filtered.loc[:, 'Organ_ID'] = ftu_cell_count_filtered['Organ'].map(organ_to_uberon)


In [6]:
import pandas as pd
import json

# Load the JSON-LD file manually to inspect its structure
with open("C:\\Users\\Supriya\\Downloads\\atlas-as-cell-summaries.jsonld", 'r') as file:
    atlas_as_cell_summaries = json.load(file)

# Convert the relevant part of the JSON-LD file to a DataFrame
atlas_as_cell_summaries_graph = atlas_as_cell_summaries['@graph']
atlas_as_cell_summaries_df = pd.json_normalize(atlas_as_cell_summaries_graph, 'summary', ['sex', 'cell_source_label', 'annotation_method', 'aggregated_summaries'])

# Load the FTU Cell Count Table
ftu_cell_count_table = pd.read_csv("C:\\Users\\Supriya\\Downloads\\FTU Cell Count Table - Cell_Type_Count.csv")

# Extract relevant columns
ftu_relevant_columns = ftu_cell_count_table[['CT ID in CL', 'Organ']]
atlas_relevant_columns = atlas_as_cell_summaries_df[['cell_id', 'cell_label', 'annotation_method', 'cell_source_label', 'sex', 'aggregated_summaries']]

# Merge dataframes on the 'cell_id' and 'CT ID in CL' assuming they are the same
merged_df = pd.merge(atlas_relevant_columns, ftu_relevant_columns, left_on='cell_id', right_on='CT ID in CL', how='inner')

# Save the merged DataFrame to a CSV file
merged_df.to_csv("C:\\Users\\Supriya\\Downloads\\merged_dataframe_1.csv", index=False)

# Find cell_labels that share the same values in aggregated_summaries
shared_summary_cells = atlas_relevant_columns.explode('aggregated_summaries')
shared_summary_cells = shared_summary_cells.groupby('aggregated_summaries')['cell_label'].unique().reset_index()

# Filter for rows where more than one cell_label share the same aggregated_summary
shared_summary_cells = shared_summary_cells[shared_summary_cells['cell_label'].apply(lambda x: len(x) > 1)]

# Modify the shared_summary_cells DataFrame to include cell_source_label
shared_summary_cells_with_source = shared_summary_cells.explode('cell_label')
shared_summary_cells_with_source = pd.merge(
    shared_summary_cells_with_source,
    atlas_as_cell_summaries_df[['cell_label', 'cell_source_label']],
    on='cell_label',
    how='left'
).drop_duplicates()

# Saving the shared_summary_cells_with_source DataFrame to a CSV file
shared_summary_cells_with_source.to_csv("C:\\Users\\Supriya\\Downloads\\shared_summary_cells_with_source_1.csv", index=False)
