In [None]:
import os
import re 
from pathlib import Path

import pandas as pd
import numpy as np
import scanpy as sc
import scanpy.external as sce
import anndata as ad
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import matplotlib as mpl
from statannotations.Annotator import Annotator

# Keep text as real text (not outlines) so labels stay editable in exported PDF/SVG figures.
mpl.rcParams.update({
    'pdf.fonttype': 42,
    'svg.fonttype': 'none',
})

# Work from the T-cell analysis folder: relative output paths used in later cells resolve
# against it, and scanpy's `save=` figures are written to the same folder.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/T_NK/alla_Tcells')
sc.settings.figdir = "/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/T_NK/alla_Tcells"

In [None]:
# Load the annotated AnnData object (T_NK folder) that the following cells analyse.
merged = ad.read_h5ad(
    Path('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/T_NK/annotated.h5ad')
)


In [None]:
# Sample-level clinical table (tab-separated).
clintable = pd.read_csv(
    '/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/sample_clin_data/sample_table_DI.txt',
    sep='\t',
)
# Preview the two columns that are joined onto the per-cell metadata further down.
clintable.loc[:, ['Sample', 'MRD']]



In [None]:
# Inspect the per-cell metadata table of the loaded AnnData object.
merged.obs

In [None]:
# Attach the sample-level MRD status to every cell.
#
# A per-sample lookup + map is used instead of reset_index/merge/set_index because:
#   * it does not depend on the obs index being unnamed (set_index('index') fails otherwise),
#   * it keeps the obs index, row order and the dtype of `Sample` untouched,
#   * re-running the cell simply overwrites `MRD` instead of creating MRD_x / MRD_y.
mrd_lookup = (
    clintable[['Sample', 'MRD']]
    .dropna(subset=['Sample'])
    .drop_duplicates()          # exact duplicate rows are harmless
)
if mrd_lookup['Sample'].duplicated().any():
    # Conflicting MRD values for one sample cannot be resolved automatically.
    conflicting = mrd_lookup.loc[mrd_lookup['Sample'].duplicated(keep=False), 'Sample'].unique()
    raise ValueError(f"Conflicting MRD entries in clinical table for samples: {list(conflicting)}")

# Samples absent from the clinical table get NaN, as with the previous left merge.
merged.obs['MRD'] = (
    merged.obs['Sample']
    .astype(object)             # plain values so the lookup works for categorical columns too
    .map(mrd_lookup.set_index('Sample')['MRD'])
)


In [None]:
# Plot, for every T-cell subset, its percentage of each sample's cells across collection timepoints.

# savefig does not create missing folders, so make sure the output folder exists.
os.makedirs("cell_type_proportions", exist_ok=True)

# Per-cell metadata needed for counting cells and for labelling samples
df = merged.obs[['Sample', 'Collection', 'UPN', 'subset']].copy()

# Count cells per Sample and subset. Sample-level columns (Collection, UPN) are kept out of
# the group keys on purpose: groupby drops rows whose key is NaN, so a sample with a missing
# UPN would lose all of its cells and end up with a 0/0 proportion.
counts = df.groupby(['Sample', 'subset'], observed=True).size().reset_index(name='count')

# One metadata row per sample
sample_meta = df[['Sample', 'Collection', 'UPN']].drop_duplicates(subset=['Sample'])

# Every Sample x subset combination, so subsets absent from a sample get an explicit row
all_combos = (
    pd.MultiIndex.from_product([df['Sample'].unique(), df['subset'].unique()],
        names=['Sample', 'subset']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts, on=['Sample', 'subset'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)

# Total cells per Sample, aligned row-by-row with full_counts
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's cells that fall in each subset
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_timepoints = ["NBM", "NDMM", "PT", "Relapse"]
order_celltypes = sorted(full_counts["subset"].unique())  # or custom order
timepoint_palette = {"NBM": "forestgreen", "NDMM": "deeppink", "PT": "plum", "Relapse": 'purple'}
# Timepoints of one patient that are joined by a line, with their left-to-right rank
longitudinal_rank = {"NDMM": 1, "PT": 2, "Relapse": 3}
# Groups compared on every panel
pairs = [("NBM", "NDMM"), ("NDMM", "PT")]

g = sns.catplot(
    data=full_counts,
    x="Collection",
    y="proportion",
    hue="Collection",
    col="subset",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["subset"] == cell_type]

    # Individual samples on top of the boxplots
    sns.stripplot(
        data=sub,
        x="Collection",
        y="proportion",
        hue="Collection",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of T cells")
    ax.set_title(cell_type)
    ax.set_ylim(-1, None)  # leave room below 0 so points at 0% are fully visible

    # --- Add lines joining samples of the same patient ---
    for _, grp in sub.groupby("UPN", observed=True):
        # Only the longitudinal timepoints take part in the line
        serial = grp[grp["Collection"].isin(list(longitudinal_rank))]
        if len(serial) < 2:
            continue
        serial = serial.sort_values(
            "Collection", key=lambda col: col.astype(str).map(longitudinal_rank)
        )
        ax.plot(
            [order_timepoints.index(tp) for tp in serial["Collection"]],
            serial["proportion"],
            color="gray",
            alpha=0.5,
            linewidth=1
        )

    # --- Add significance tests ---
    # Skip pairs where one group has no samples; the test cannot be computed for them.
    testable_pairs = [
        pair for pair in pairs
        if all((sub["Collection"] == tp).any() for tp in pair)
    ]
    if testable_pairs:
        annotator = Annotator(ax, testable_pairs, data=sub, x="Collection", y="proportion", order=order_timepoints)
        annotator.configure(
            test="Mann-Whitney",        # non-parametric, unpaired
            text_format="simple",       # p-value text; no multiple-testing correction is configured here
            verbose=False
        )
        annotator.apply_and_annotate()

# Clean up legend and layout
g.set_titles("{col_name}")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")
g.fig.savefig("cell_type_proportions/Tcell_celltype_boxplots_Mann-Whitney.pdf", dpi=300, bbox_inches="tight")

plt.show()



In [None]:
# Mann-Whitney U tests per subset and timepoint pair, FDR-corrected across all tests.
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection

pairs = [("NBM", "NDMM"), ("NDMM", "PT")]

results = []

# observed=True: do not iterate over subset categories that have no rows
for cell_type, sub in full_counts.groupby("subset", observed=True):

    for cond1, cond2 in pairs:

        # NaN proportions (a sample with zero counted cells) would turn the p-value into NaN
        x = sub.loc[sub["Collection"] == cond1, "proportion"].dropna()
        y = sub.loc[sub["Collection"] == cond2, "proportion"].dropna()

        # If either group is empty, skip
        if len(x) == 0 or len(y) == 0:
            continue

        # Mann-Whitney U test (unpaired Wilcoxon)
        stat, pval = mannwhitneyu(x, y, alternative="two-sided")

        results.append({
            "subset": cell_type,
            "comparison": f"{cond2} vs {cond1}",
            "cond1": cond1,
            "cond2": cond2,
            "n_cond1": len(x),
            "n_cond2": len(y),
            "stat": stat,
            "p_raw": pval,
        })

# Convert to DataFrame; explicit columns keep the layout stable when nothing was testable
res_df = pd.DataFrame(
    results,
    columns=["subset", "comparison", "cond1", "cond2", "n_cond1", "n_cond2", "stat", "p_raw"],
)

# Apply FDR correction across all tests
if res_df.empty:
    res_df["p_fdr"] = pd.Series(dtype=float)
    res_df["significant_fdr_0.05"] = pd.Series(dtype=bool)
else:
    reject, p_fdr = fdrcorrection(res_df["p_raw"], alpha=0.05, method="indep")
    res_df["p_fdr"] = p_fdr
    res_df["significant_fdr_0.05"] = reject

# Save to CSV (to_csv does not create missing folders)
out_path = "cell_type_proportions/Tcell_celltype_boxplots_Mann-Whitney_FDR_results.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
res_df.to_csv(out_path, index=False)

print("Saved results to:", out_path)
res_df

In [None]:
# Plot, for every T-cell subset, its percentage of each sample's cells,
# with post-treatment (PT) samples split by MRD status.

# savefig does not create missing folders, so make sure the output folder exists.
os.makedirs("cell_type_proportions", exist_ok=True)

# Per-cell metadata needed for counting cells and for labelling samples
df = merged.obs[['Sample', 'Collection', 'UPN', 'MRD', 'subset']].copy()

# Count cells per Sample and subset. Sample-level columns (Collection, UPN, MRD) are kept
# out of the group keys on purpose: groupby drops rows whose key is NaN, so every cell of
# a sample without an MRD value would be discarded and the sample would get 0/0 proportions.
counts = df.groupby(['Sample', 'subset'], observed=True).size().reset_index(name='count')

# One metadata row per sample
sample_meta = df[['Sample', 'Collection', 'MRD', 'UPN']].drop_duplicates(subset=['Sample'])

# Every Sample x subset combination, so subsets absent from a sample get an explicit row
all_combos = (
    pd.MultiIndex.from_product([df['Sample'].unique(), df['subset'].unique()],
        names=['Sample', 'subset']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts, on=['Sample', 'subset'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)

# Split the PT timepoint by MRD status; every other timepoint keeps its Collection label.
is_pt = full_counts["Collection"].str.contains("PT", na=False)
collection_label = full_counts["Collection"].astype(str)
full_counts["Collection_new"] = np.select(
    [
        is_pt & (full_counts["MRD"] == 'Positive'),
        is_pt & (full_counts["MRD"] == 'Negative'),
        is_pt & (full_counts["MRD"] == 'Unk'),
    ],
    [
        collection_label + " MRD+",
        collection_label + " MRD-",
        collection_label + " MRD??",
    ],
    default=full_counts["Collection"],
)

# Drop PT samples with unknown MRD. .copy() makes the result an independent frame so the
# column assignment below does not write into a slice (SettingWithCopyWarning).
full_counts = full_counts[full_counts['Collection_new'] != "PT MRD??"].copy()

# Total cells per Sample, aligned row-by-row with full_counts
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's cells that fall in each subset
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_timepoints = ["NBM", "NDMM", "PT MRD+", "PT MRD-", "Relapse"]
order_celltypes = sorted(full_counts["subset"].unique())  # or custom order
timepoint_palette = {"NBM": "forestgreen", "NDMM": "deeppink", "PT MRD+": "plum", "PT MRD-": "plum", "Relapse": 'purple'}
# Groups compared on every panel
pairs = [
    ("NBM", "NDMM"),
    ("NDMM", "PT MRD-"),
    ("NDMM", "PT MRD+"),
    ("PT MRD-", "PT MRD+"),
    ("NDMM", "Relapse")
]

g = sns.catplot(
    data=full_counts,
    x="Collection_new",
    y="proportion",
    hue="Collection_new",
    col="subset",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["subset"] == cell_type]

    # Individual samples on top of the boxplots
    sns.stripplot(
        data=sub,
        x="Collection_new",
        y="proportion",
        hue="Collection_new",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of T cells")
    ax.set_title(cell_type)
    ax.set_ylim(-1, None)  # leave room below 0 so points at 0% are fully visible

    # --- Add significance tests ---
    # Skip pairs where one group has no samples; the test cannot be computed for them.
    testable_pairs = [
        pair for pair in pairs
        if all((sub["Collection_new"] == tp).any() for tp in pair)
    ]
    if testable_pairs:
        annotator = Annotator(ax, testable_pairs, data=sub, x="Collection_new", y="proportion", order=order_timepoints)
        annotator.configure(
            test="Mann-Whitney",        # non-parametric, unpaired
            text_format="simple",       # p-value text; no multiple-testing correction is configured here
            show_test_name=False,
            verbose=False
        )
        annotator.apply_and_annotate()

# Clean up legend and layout
g.set_titles("{col_name}")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")
g.fig.savefig("cell_type_proportions/Tcell_celltype_boxplots_MRD_Mann-Whitney.pdf", dpi=300, bbox_inches="tight")

plt.show()



In [None]:
# Mann-Whitney U tests per subset and MRD-split timepoint pair, FDR-corrected across all tests.
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection

pairs = [
    ("NBM", "NDMM"),
    ("NDMM", "PT MRD-"),
    ("NDMM", "PT MRD+"),
    ("PT MRD-", "PT MRD+"),
    ("NDMM", "Relapse")
]

results = []

# observed=True: do not iterate over subset categories that have no rows
for cell_type, sub in full_counts.groupby("subset", observed=True):

    for cond1, cond2 in pairs:

        # NaN proportions (a sample with zero counted cells) would turn the p-value into NaN
        x = sub.loc[sub["Collection_new"] == cond1, "proportion"].dropna()
        y = sub.loc[sub["Collection_new"] == cond2, "proportion"].dropna()

        # If either group is empty, skip
        if len(x) == 0 or len(y) == 0:
            continue

        # Mann-Whitney U test (unpaired Wilcoxon)
        stat, pval = mannwhitneyu(x, y, alternative="two-sided")

        results.append({
            "subset": cell_type,
            "comparison": f"{cond2} vs {cond1}",
            "cond1": cond1,
            "cond2": cond2,
            "n_cond1": len(x),
            "n_cond2": len(y),
            "stat": stat,
            "p_raw": pval,
        })

# Convert to DataFrame; explicit columns keep the layout stable when nothing was testable
res_df = pd.DataFrame(
    results,
    columns=["subset", "comparison", "cond1", "cond2", "n_cond1", "n_cond2", "stat", "p_raw"],
)

# Apply FDR correction across all tests
if res_df.empty:
    res_df["p_fdr"] = pd.Series(dtype=float)
    res_df["significant_fdr_0.05"] = pd.Series(dtype=bool)
else:
    reject, p_fdr = fdrcorrection(res_df["p_raw"], alpha=0.05, method="indep")
    res_df["p_fdr"] = p_fdr
    res_df["significant_fdr_0.05"] = reject

# Save to CSV (to_csv does not create missing folders)
out_path = "cell_type_proportions/Tcell_celltype_boxplots_MRD_Mann-Whitney_FDR_results.csv"
os.makedirs(os.path.dirname(out_path), exist_ok=True)
res_df.to_csv(out_path, index=False)

print("Saved results to:", out_path)
res_df

In [None]:
# UMAP coloured by the `subset` annotation; scanpy writes the image into sc.settings.figdir.
sc.pl.umap(
    merged,
    color="subset",
    save="subset_umap.png",
)




In [None]:
# Plot, for every `subset2` population, its percentage of each sample's cells,
# with post-treatment (PT) samples split by MRD status.

# savefig does not create missing folders, so make sure the output folder exists.
os.makedirs("cell_type_proportions", exist_ok=True)

# Per-cell metadata needed for counting cells and for labelling samples
df = merged.obs[['Sample', 'Collection', 'UPN', 'MRD', 'subset2']].copy()

# Count cells per Sample and subset2. Sample-level columns (Collection, UPN, MRD) are kept
# out of the group keys on purpose: groupby drops rows whose key is NaN, so every cell of
# a sample without an MRD value would be discarded and the sample would get 0/0 proportions.
counts = df.groupby(['Sample', 'subset2'], observed=True).size().reset_index(name='count')

# One metadata row per sample
sample_meta = df[['Sample', 'Collection', 'MRD', 'UPN']].drop_duplicates(subset=['Sample'])

# Every Sample x subset2 combination, so populations absent from a sample get an explicit row
all_combos = (
    pd.MultiIndex.from_product([df['Sample'].unique(), df['subset2'].unique()],
        names=['Sample', 'subset2']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts, on=['Sample', 'subset2'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)

# Split the PT timepoint by MRD status; every other timepoint keeps its Collection label.
is_pt = full_counts["Collection"].str.contains("PT", na=False)
collection_label = full_counts["Collection"].astype(str)
full_counts["Collection_new"] = np.select(
    [
        is_pt & (full_counts["MRD"] == 'Positive'),
        is_pt & (full_counts["MRD"] == 'Negative'),
        is_pt & (full_counts["MRD"] == 'Unk'),
    ],
    [
        collection_label + " MRD+",
        collection_label + " MRD-",
        collection_label + " MRD??",
    ],
    default=full_counts["Collection"],
)

# Drop PT samples with unknown MRD. .copy() makes the result an independent frame so the
# column assignment below does not write into a slice (SettingWithCopyWarning).
full_counts = full_counts[full_counts['Collection_new'] != "PT MRD??"].copy()

# Total cells per Sample, aligned row-by-row with full_counts
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's cells that fall in each population
full_counts['proportion'] = full_counts['count'] * 100 / totals

sns.set(style="whitegrid", font_scale=1.2)
order_timepoints = ["NBM", "NDMM", "PT MRD+", "PT MRD-", "Relapse"]
order_celltypes = sorted(full_counts["subset2"].unique())  # or custom order
timepoint_palette = {"NBM": "forestgreen", "NDMM": "deeppink", "PT MRD+": "plum", "PT MRD-": "plum", "Relapse": 'purple'}
# Groups compared on every panel
pairs = [
    ("NBM", "NDMM"),
    ("NDMM", "PT MRD-"),
    ("NDMM", "PT MRD+"),
    ("PT MRD-", "PT MRD+"),
    ("NDMM", "Relapse")
]

g = sns.catplot(
    data=full_counts,
    x="Collection_new",
    y="proportion",
    hue="Collection_new",
    col="subset2",
    kind="box",
    order=order_timepoints,
    col_order=order_celltypes,
    palette=timepoint_palette,
    dodge=False,
    sharey=False,
    col_wrap=3,
    height=4,
    aspect=0.9,
    showfliers=False
)

for cell_type, ax in g.axes_dict.items():
    sub = full_counts[full_counts["subset2"] == cell_type]

    # Individual samples on top of the boxplots
    sns.stripplot(
        data=sub,
        x="Collection_new",
        y="proportion",
        hue="Collection_new",
        order=order_timepoints,
        palette=timepoint_palette,
        dodge=False,
        size=5,
        ax=ax,
        linewidth=0.5,
        edgecolor="black",
        legend=False
    )
    ax.set_xlabel("")  # cleaner look
    ax.set_ylabel("% of T cells")
    ax.set_title(cell_type)
    ax.set_ylim(-1, None)  # leave room below 0 so points at 0% are fully visible

    # --- Add significance tests ---
    # Skip pairs where one group has no samples; the test cannot be computed for them.
    testable_pairs = [
        pair for pair in pairs
        if all((sub["Collection_new"] == tp).any() for tp in pair)
    ]
    if testable_pairs:
        annotator = Annotator(ax, testable_pairs, data=sub, x="Collection_new", y="proportion", order=order_timepoints)
        annotator.configure(
            test="Mann-Whitney",        # non-parametric, unpaired
            text_format="full",         # p-value text; no multiple-testing correction is configured here
            show_test_name=False,
            verbose=False
        )
        annotator.apply_and_annotate()

# Clean up legend and layout
g.set_titles("{col_name}")
g.fig.subplots_adjust(top=0.9)
g.fig.suptitle("Cell-type proportions by Timepoint", fontsize=16)
g.add_legend(title="Timepoint")
g.fig.savefig("cell_type_proportions/Tcell_celltype2_boxplots_MRD_Mann-Whitney.pdf", dpi=300, bbox_inches="tight")

plt.show()



In [None]:
# Tab-separated gene-signature table; displayed for inspection.
genesets = pd.read_csv(
    '/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/analysis/organized/TNK/pancan_T_genesigs.txt',
    sep='\t',
)
genesets


In [None]:
# Label every cell as "<subset>_<Collection>", then keep the three CD8T groups to compare.
merged.obs["Collection_subtype"] = (
    merged.obs['subset'].astype(str)
    .str.cat(merged.obs['Collection'].astype(str), sep='_')
)
selected_groups = ['CD8T_NBM','CD8T_NDMM','CD8T_PT']
merged_subset = merged[merged.obs['Collection_subtype'].isin(selected_groups), :].copy()

# Dot plot of the cytotoxicity-related genes, read from the `normalized` layer and
# scaled per gene (standard_scale='var').
dot = sc.pl.dotplot(
    merged_subset,
    var_names=['GZMA', 'GZMB', 'GZMK', 'GZMH', "CTSC", 'CTSW', 'CST7'],
    groupby="Collection_subtype",
    layer='normalized',
    standard_scale='var',
    return_fig=True,
)
dot.savefig("dotplot_cytotoxic_genes_inCD8.pdf", dpi=300, bbox_inches='tight')


In [None]:
# Heatmap companion to the CD8 dot plot; uses `merged_subset` built in the CD8 dot-plot cell.
# NOTE(review): the dot plot reads layer='normalized' while this heatmap previously read
# the default matrix (X), so the two figures showed different quantities. The layer is now
# set explicitly so both figures use the same values — confirm this is the intended matrix.
sc.pl.matrixplot(
    merged_subset,
    var_names=['GZMA', 'GZMB', 'GZMK', 'GZMH', "CTSC", 'CTSW', 'CST7'],
    groupby="Collection_subtype",
    layer='normalized',
    dendrogram=False,
    standard_scale='var',           # scale every gene to [0, 1] across the groups
    figsize=(3.5, 1),
    show=False                      # keep the figure open so it can be saved below
)
plt.savefig("heatmap_cytotoxic_genes_inCD8.pdf", dpi=300, bbox_inches='tight')



In [None]:
# Keep the three CD4T groups (labels come from the `Collection_subtype` obs column).
selected_groups = ['CD4T_NBM','CD4T_NDMM','CD4T_PT']
merged_subset = merged[merged.obs['Collection_subtype'].isin(selected_groups), :].copy()

# Dot plot of the naive-associated genes, read from the `normalized` layer and scaled per gene.
dot = sc.pl.dotplot(
    merged_subset,
    var_names=['LEF1', 'TCF7', 'SELL', 'IL7R'],
    groupby="Collection_subtype",
    layer='normalized',
    standard_scale='var',
    return_fig=True,
)
dot.savefig("dotplot_naive_genes_inCD4.pdf", dpi=300, bbox_inches='tight')



In [None]:
# Heatmap companion to the CD4 dot plot, on the same three CD4T groups.
selected_groups = ['CD4T_NBM','CD4T_NDMM','CD4T_PT']
merged_subset = merged[merged.obs['Collection_subtype'].isin(selected_groups)].copy()

# NOTE(review): the dot plot reads layer='normalized' while this heatmap previously read
# the default matrix (X), so the two figures showed different quantities. The layer is now
# set explicitly so both figures use the same values — confirm this is the intended matrix.
sc.pl.matrixplot(
    merged_subset,
    var_names=['LEF1', 'TCF7', 'SELL', 'IL7R'],
    groupby="Collection_subtype",
    layer='normalized',
    dendrogram=False,
    standard_scale='var',           # scale every gene to [0, 1] across the groups
    figsize=(2.5, 1),
    show=False                      # keep the figure open so it can be saved below
)
plt.savefig("heatmap_naive_genes_inCD4.pdf", dpi=300, bbox_inches='tight')



In [None]:
# Mean Exhaustion score per Sample / Collection / subset / MRD group.
# observed=True restricts the result to combinations that actually occur; without it,
# categorical keys are expanded to their full cross product only for the empty rows to be
# removed again by dropna(). Rows with a NaN mean are still dropped.
(
    merged.obs
    .groupby(['Sample', 'Collection', 'subset', 'MRD'], observed=True)['Exhaustion']
    .mean()
    .reset_index()
    .dropna()
)


# prepare for survival analysis

In [None]:
# Switch to the survival-analysis folder; the relative paths in the following cells resolve here.
os.chdir(
    Path('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/survival')
)

In [None]:
# Survival clinical table. Only the listed markers are read as missing values
# (keep_default_na=False), so other strings such as "None" stay as text.
# NOTE: this rebinds `clintable`, which earlier held the sample-level table.
clintable = pd.read_csv(
    'Surv_clin_table.tsv',
    sep='\t',
    na_values=["#N/A", "Unknown", "Unk"],
    keep_default_na=False,
)
clintable

In [None]:

# Collapse free-text maintenance entries onto three canonical labels.
# Matching is case-insensitive and the first matching label wins (order of `drug_labels`);
# entries matching none of them, including missing values, are left unchanged.
maintenance = clintable["Primary.Maintenance"]
drug_labels = ["Ixazomib", "Lenalidomide", "None"]
label_hits = [maintenance.str.contains(label, case=False, na=False) for label in drug_labels]

clintable['Primary.Maintenance'] = np.select(label_hits, drug_labels, default=maintenance)

clintable

In [None]:
# Per-sample subset percentages, restricted to PT samples.
df = merged.obs[['Sample', 'Collection', 'UPN', 'subset']].copy()
df = df[df['Collection'] == 'PT']

# Count cells per Sample and subset. Sample-level columns (Collection, UPN) are kept out of
# the group keys on purpose: groupby drops rows whose key is NaN, so a sample with a missing
# UPN would lose all of its cells and end up with a 0/0 proportion.
counts = df.groupby(['Sample', 'subset'], observed=True).size().reset_index(name='count')

# One metadata row per sample
sample_meta = df[['Sample', 'Collection', 'UPN']].drop_duplicates(subset=['Sample'])

# Every Sample x subset combination, so subsets absent from a sample get an explicit row
all_combos = (
    pd.MultiIndex.from_product([df['Sample'].unique(), df['subset'].unique()],
        names=['Sample', 'subset']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts, on=['Sample', 'subset'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)

# Total cells per Sample, aligned row-by-row with full_counts
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Percentage of the sample's cells that fall in each subset
full_counts['proportion'] = full_counts['count'] * 100 / totals

full_counts.head()

In [None]:
# Wide table: one row per patient (UPN), one column per subset; samples of the same
# patient are averaged.
ct_prop = (
    full_counts
    .pivot_table(values="proportion", index="UPN", columns="subset", aggfunc="mean")
    .reset_index('UPN')
)

# Right join: every patient of the clinical table is kept, with or without proportions.
df = pd.merge(
    ct_prop,
    clintable[['upn', 'age','sex','clonoSeq_MRD','ISS.Stage','Primary.Maintenance', 'PFS.time','PFS']],
    how='right',
    left_on='UPN',
    right_on='upn',
)

df.to_csv('Tcell_pct_with_survival.csv', index=False)

df

In [None]:
# Load the annotated AnnData object from the B folder.
# NOTE: this rebinds `merged`; cells above that use `merged` expect the T_NK object.
merged = ad.read_h5ad(
    Path('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/B/annotated.h5ad')
)


In [None]:
# Count total cells per Sample and annot
df = merged.obs[['Sample','Collection', 'UPN', 'subset']].copy()
df = df[df['Collection'] == 'PT']


counts = df.groupby(['Sample','Collection', 'UPN', 'subset'], observed=True).size().reset_index(name='count')
sample_meta = (
    df[['Sample', 'Collection', 'UPN']].drop_duplicates(subset=['Sample'])
)

all_combos = (
    pd.MultiIndex.from_product([df['Sample'].unique(),df['subset'].unique()],
        names=['Sample', 'subset']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts[['Sample','subset', 'count']], on=['Sample','subset'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)


# Count total cells per section_id
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Add a proportion column
full_counts['proportion'] = full_counts['count'] * 100 / totals


ct_prop = full_counts.pivot_table(index="UPN", columns="subset", values="proportion", aggfunc="mean").reset_index('UPN')
ct_prop.head()
ct_prop.to_csv('Bcell_pct.csv', index=False)


In [None]:
# Load the annotated AnnData object from the Mye folder.
# NOTE: this rebinds `merged` once more; earlier cells expect a different object.
merged = ad.read_h5ad(
    Path('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/revision/merge/no_harmony/Mye/annotated.h5ad')
)


In [None]:
# Count total cells per Sample and annot
df = merged.obs[['Sample','Collection', 'UPN', 'subset']].copy()
df = df[df['Collection'] == 'PT']


counts = df.groupby(['Sample','Collection', 'UPN', 'subset'], observed=True).size().reset_index(name='count')
sample_meta = (
    df[['Sample', 'Collection', 'UPN']].drop_duplicates(subset=['Sample'])
)

all_combos = (
    pd.MultiIndex.from_product([df['Sample'].unique(),df['subset'].unique()],
        names=['Sample', 'subset']).to_frame(index=False)
)

# merge and fill missing counts with 0
full_counts = (
    all_combos
    .merge(counts[['Sample','subset', 'count']], on=['Sample','subset'], how='left')
    .merge(sample_meta, on='Sample', how='left')
    .fillna({'count': 0})
)


# Count total cells per section_id
totals = full_counts.groupby('Sample', observed=True)['count'].transform('sum')

# Add a proportion column
full_counts['proportion'] = full_counts['count'] * 100 / totals


ct_prop = full_counts.pivot_table(index="UPN", columns="subset", values="proportion", aggfunc="mean").reset_index('UPN')

ct_prop.to_csv('Mye_pct.csv', index=False)
ct_prop.head()