In [None]:
import os
import pandas as pd
import scanpy as sc  # Required to load adata
# NOTE(review): `path_to_adata_file` is a bare name that is not defined anywhere in the
# visible notebook; it must be assigned (path to the .h5ad file) before this line runs,
# otherwise a fresh-kernel "Run All" stops here with a NameError.
adata = sc.read_h5ad(path_to_adata_file)

In [None]:
def compute_total_cells_per_annotation(adata, annotation, loc="Region", status="ConditionID",
                                       sample_id="imageid", patient_id="SubjectID",
                                       exclude_samples=['H1', 'P4', 'P2b'], #Excluding samples H1, P4 (small detached tissues) and P2b (poor orientation with a bias for epithelium)
                                       output_dir='path_to_output_directory'):
    """Count cells per patient / sample / annotation / status and save the table as CSV.

    Parameters
    ----------
    adata : object exposing an ``obs`` DataFrame (one row per cell).
    annotation : str
        Name of the ``obs`` column holding the annotation level to count.
    loc : str
        Unused; kept so existing calls keep working.
    status, sample_id, patient_id : str
        Names of the ``obs`` columns holding the condition, sample and patient IDs.
    exclude_samples : iterable or None
        Sample IDs removed from the result. Values are compared as strings;
        ``None`` disables the exclusion. The default list is never mutated.
    output_dir : str
        Directory that receives ``<annotation>_total_cells_filtered.csv``.
        It is created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        Columns ``patient``, ``sample``, ``annotation``, ``status`` (all strings)
        and ``total_cells_pt`` (number of cells in that group), sorted by the
        four key columns, with excluded samples removed.
    """
    output_names = ['patient', 'sample', 'annotation', 'status']

    # Work on a copy restricted to the four key columns, with fixed output names.
    cells = adata.obs[[patient_id, sample_id, annotation, status]].copy()
    cells.columns = output_names

    # Cells with a missing key cannot be assigned to a group.
    n_all = len(cells)
    cells = cells.dropna()
    if len(cells) < n_all:
        print(f"Skipping {n_all - len(cells)} cells with missing values in {annotation} or ID columns.")

    # Cast every key to str *before* grouping so that numeric or categorical IDs
    # are counted correctly and the output dtypes are stable.
    cells = cells.astype(str)

    # One pass over the data: each cell is counted exactly once in its own group.
    grouped = cells.groupby(output_names).size().reset_index(name='total_cells_pt')

    excluded = [] if exclude_samples is None else [str(s) for s in exclude_samples]
    grouped_filtered = grouped[~grouped['sample'].isin(excluded)]

    filename = f"{annotation}_total_cells_filtered.csv"
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # os.path.join inserts the path separator that plain concatenation omitted.
    output_path = os.path.join(output_dir, filename)
    grouped_filtered.to_csv(output_path, index=False)

    print(f"Saved: {output_path}")
    return grouped_filtered


# Build one filtered cell-count table per annotation level, keyed by level name
annotations_to_run = ['Lvl1', 'Lvl2.5', 'niche_merged', 'Lvl4']
results = {
    annot: compute_total_cells_per_annotation(adata, annotation=annot)
    for annot in annotations_to_run
}


In [None]:
# Directory containing the CSV files which are the individual exports from the Fiji automated area calculation
directory = 'path_to_region_area_calculation'

# Output base directory (the merged files are written next to the inputs)
output_dir = directory

# Suffix shared by every merged file this cell writes. Because the outputs live in the
# input directory, they must be skipped when listing inputs so the cell can be re-run.
merged_suffix = '_area_merged_calculations.csv'

# List all per-sample .csv files, sorted so the merged row order is reproducible
all_files = sorted(
    os.path.join(directory, f)
    for f in os.listdir(directory)
    if f.endswith('.csv') and not f.endswith(merged_suffix)
)

# Store all parsed DataFrames, and remember which files could not be parsed
dfs = []
failed_files = []

for file in all_files:
    # The sample ID is the file name without its extension
    sample_id = os.path.splitext(os.path.basename(file))[0]

    try:
        df = pd.read_csv(file)

        # Prepend the sample ID (raises ValueError if the file already has a 'sample' column)
        df.insert(0, 'sample', sample_id)

        # Drop what is now the 2nd column, i.e. the first column of the file as read
        # (presumably an unnamed row index from the Fiji export -- TODO confirm)
        df = df.drop(df.columns[1], axis=1)

        # Rename the column now in third position to 'Region' (unconditional, by position)
        df = df.rename(columns={df.columns[2]: 'Region'})

        # Rename 'Area' to 'Region_Area'
        df = df.rename(columns={'Area': 'Region_Area'})

        dfs.append(df)

    except Exception as e:
        # Best effort: report the file and continue with the remaining ones
        failed_files.append(file)
        print(f"Error processing {file}: {e}")

if not dfs:
    raise ValueError(
        f"No area CSV files could be loaded from {directory!r} "
        f"({len(all_files)} candidate files, {len(failed_files)} failed)"
    )

# Combine all cleaned DataFrames
merged_df = pd.concat(dfs, ignore_index=True)

# Export filtered files by Region
for region in ['CT', 'Epi', 'Total']:
    region_df = merged_df[merged_df['Region'] == region]
    region_df.to_csv(os.path.join(output_dir, f"{region.lower()}_area_merged_calculations.csv"), index=False)

In [None]:
# Calculate cells per area in each sample
df1 = pd.read_csv('path_to_annotation_total_cell_count.csv')
df2 = pd.read_csv('path_to_region_area_calculation.csv')

# Attach each sample's area to its cell counts. validate='many_to_one' raises a
# MergeError if the area table has more than one row per sample, which would
# otherwise silently duplicate every count row for that sample.
df_merged = pd.merge(
    df1,
    df2[['sample', 'Region_Area']],
    on='sample',
    how='left',
    validate='many_to_one',
)

# Report samples that have cell counts but no area (their density will be NaN)
samples_without_area = df_merged.loc[df_merged['Region_Area'].isna(), 'sample'].unique()
if len(samples_without_area) > 0:
    print(f"Warning: no Region_Area found for samples: {sorted(map(str, samples_without_area))}")

# Convert Region_Area to mm² (divide by 1,000,000; assumes the area is in µm² -- TODO confirm)
df_merged['Region_Area_mm2'] = df_merged['Region_Area'] / 1000000
df_merged['cells_per_mm2'] = df_merged['total_cells_pt'] / df_merged['Region_Area_mm2']

# index=False keeps the file free of a stray unnamed index column, like the other exports
df_merged.to_csv('path_to_total_cells_per_area_per_sample.csv', index=False)

# Check the first few rows to confirm
df_merged.head()


In [None]:
# Aggregate cells per area for each patient
df1 = pd.read_csv('path_to_annotation_total_cell_count.csv')
df2 = pd.read_csv('path_to_region_area_calculation.csv')

# Total cells per patient, annotation and status, summed over that patient's samples
cells_per_patient = (
    df1.groupby(['patient', 'annotation', 'status'])['total_cells_pt']
    .sum()
    .reset_index()
)

# One row per imaged sample with its area. The area is attached to the unique samples,
# not to the per-annotation count rows: a sample in which an annotation has no cells has
# no row for that annotation, so summing areas over count rows would leave that sample
# out of the denominator and overestimate the density.
sample_areas = pd.merge(
    df1[['patient', 'status', 'sample']].drop_duplicates(),
    df2[['sample', 'Region_Area']],
    on='sample',
    how='left',
    validate='many_to_one',  # more than one area row per sample would double-count area
)

# Report samples that have cell counts but no area (they contribute nothing to the total)
samples_without_area = sample_areas.loc[sample_areas['Region_Area'].isna(), 'sample'].unique()
if len(samples_without_area) > 0:
    print(f"Warning: no Region_Area found for samples: {sorted(map(str, samples_without_area))}")

# Total imaged area per patient and status; min_count=1 keeps the total NaN (instead of 0)
# when none of the samples has an area, so the density becomes NaN rather than inf
area_per_patient = (
    sample_areas.groupby(['patient', 'status'])['Region_Area']
    .sum(min_count=1)
    .reset_index()
)

# Combine cell totals with the patient-level area
df_merged2 = pd.merge(cells_per_patient, area_per_patient, on=['patient', 'status'], how='left')

# Convert Region_Area to mm² (divide by 1,000,000; assumes the area is in µm² -- TODO confirm)
df_merged2['Region_Area_mm2'] = df_merged2['Region_Area'] / 1000000

# Cells per area calculation
df_merged2['cells_per_mm2'] = df_merged2['total_cells_pt'] / df_merged2['Region_Area_mm2']

# Save the result to a CSV file
df_merged2.to_csv('path_to_total_cells_per_area_per_patient.csv', index=False)

# Display the merged DataFrame
df_merged2
