In [None]:
import os
from pathlib import Path
from typing import Annotated

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
import tifffile

from sklearn.cluster import KMeans
from skimage.color import label2rgb
from sklearn.neighbors import radius_neighbors_graph
from sklearn.neighbors import NearestNeighbors

from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import euclidean_distances
from scipy.stats import wilcoxon
from statsmodels.stats.multitest import multipletests
from scipy.stats import entropy, chi2_contingency
from matplotlib.backends.backend_pdf import PdfPages
from statannotations.Annotator import Annotator

from scipy import sparse

# Keep text as editable text in exported SVG and PDF figures.
plt.rcParams.update({
    'svg.fonttype': 'none',
    'pdf.fonttype': 42,
})

# Relative output paths used by later cells resolve against this directory.
os.chdir('/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/lineage_specific_analysis/Mye/')
os.getcwd()


In [None]:
# Load the AnnData object stored in merged_RN.h5ad (radial_neighborhoods output).
merged = sc.read_h5ad("/diskmnt/Projects/myeloma_scRNA_analysis/MMY_IRD/Xenium/analysis/radial_neighborhoods/Output/merged_RN.h5ad")

In [None]:
# Add an obs column joining UPN and Collection as "<UPN>_<Collection>";
# missing values are rendered as empty strings.
merged.obs['UPN_Collection'] = (
    merged.obs['UPN']
    .astype('string')
    .str.cat(merged.obs['Collection'].astype('string'), sep='_', na_rep='')
)

# Lookup dictionaries built from the per-cell annotations (key column -> value column).
sample_to_collection, uc_to_collection, uc_to_upn = (
    merged.obs.set_index(key_col)[value_col].to_dict()
    for key_col, value_col in (
        ('Sample', 'Collection'),
        ('UPN_Collection', 'Collection'),
        ('UPN_Collection', 'UPN'),
    )
)

In [None]:
# Order in which the collection groups are laid out on plot axes.
collection_order = ["NBM", "NDMM", "PT"]
# Hex colour assigned to each collection group.
timecols = dict(NBM="#0C7515", NDMM="#E619B9", PT="#CF99C3")

In [None]:
# Independent copy of the cells whose obs['ct'] label is one of the listed types.
mye = merged[
    merged.obs['ct'].isin([
        'GMP',
        'Late Myeloid',
        'Neutrophil',
        'cDC',
        'Ba/Eo/Ma',
        'Monocyte',
        'Macrophage',
    ])
].copy()

In [None]:
# Display the distinct Panel labels present in the myeloid subset.
mye.obs['Panel'].unique()

In [None]:
# Display every distinct cell-type label ('ct') found in `merged`.
merged.obs['ct'].unique().tolist()

In [None]:
celltypes = mye.obs['ct'].unique().tolist()
genes = ['CD274', 'CTLA4', 'SPP1', 'HIF1A']
pdf_path = "mye_matrixplots_by_ct.pdf"

# One PDF page per cell type: matrixplot of `genes` grouped by Collection,
# with standard_scale='var'.
with PdfPages(pdf_path) as pdf:
    for ct in celltypes:
        ad = mye[mye.obs['ct'] == ct].copy()

        # matrixplot is called without `ax`, so it builds its own figure. A
        # plt.figure() opened beforehand would stay empty and never be closed,
        # so none is created here. Pass figsize=(3, 2) to matrixplot itself if
        # that size is wanted.
        sc.pl.matrixplot(
            ad,
            var_names=genes,
            groupby='Collection',
            standard_scale='var',
            dendrogram=False,
            show=False
        )

        # Title, save and close the figure that was just drawn.
        fig = plt.gcf()
        fig.suptitle(f"Cell type: {ct}", y=1.02, fontsize=12)
        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

In [None]:
celltypes = mye.obs['ct'].unique().tolist()
genes = ['STAT3', 'STAT5A', 'IFI44L', 'ISG15']
pdf_path = "mye_matrixplots_by_ct_v6.pdf"

# One PDF page per cell type, restricted to cells whose Panel is
# 'BYGXJ6_hMulti': matrixplot of `genes` grouped by Collection.
with PdfPages(pdf_path) as pdf:
    for ct in celltypes:
        ad = mye[(mye.obs['ct'] == ct) & (mye.obs['Panel'] == 'BYGXJ6_hMulti')].copy()

        # matrixplot is called without `ax`, so it builds its own figure. A
        # plt.figure() opened beforehand would stay empty and never be closed,
        # so none is created here. Pass figsize=(3, 2) to matrixplot itself if
        # that size is wanted.
        sc.pl.matrixplot(
            ad,
            var_names=genes,
            groupby='Collection',
            standard_scale='var',
            dendrogram=False,
            show=False
        )

        # Title, save and close the figure that was just drawn.
        fig = plt.gcf()
        fig.suptitle(f"Cell type: {ct}", y=1.02, fontsize=12)
        pdf.savefig(fig, bbox_inches='tight')
        plt.close(fig)

In [None]:
# Single matrixplot over all myeloid cells whose Panel is 'BYGXJ6_hMulti',
# grouped by Collection; the current figure is then written to PDF.
v6_matrixplot_kwargs = dict(
    var_names=['STAT3', 'STAT5A', 'IFI44L', 'ISG15' ],
    groupby=['Collection'],
    standard_scale="var",
    dendrogram=False,
    show=False,
)
sc.pl.matrixplot(mye[(mye.obs['Panel']=='BYGXJ6_hMulti')], **v6_matrixplot_kwargs)
plt.savefig("v6_overall_mye.pdf")

In [None]:
def remove_outliers_iqr(df, col='frac_ct', k=1.5):
    """Return the rows of ``df`` whose ``col`` value lies inside the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive), with
    the quartiles taken over the whole ``col`` column. Rows whose value is NaN
    fail the comparison and are dropped.
    """
    first_q, third_q = (df[col].quantile(q) for q in (0.25, 0.75))
    margin = k * (third_q - first_q)
    inside = df[col].between(first_q - margin, third_q + margin)
    return df[inside]

def compute_positive_fraction(adata, gene, threshold=0):
    """Fraction of observations with ``gene`` expression above ``threshold`` per group.

    Parameters
    ----------
    adata
        Object supporting ``adata[:, gene].X`` and exposing an ``obs`` DataFrame
        (an AnnData in this notebook).
    gene
        Variable name used to select the expression column.
    threshold
        An observation counts as positive when its value is strictly greater
        than this.

    Returns
    -------
    pandas.DataFrame
        One row per observed combination of the grouping columns ("UPN" and
        "Collection", whichever exist in ``adata.obs``), with columns
        ``frac_pos`` (mean of the positive flag), ``n_cells`` (group size) and
        ``gene``.
    """
    x = adata[:, gene].X
    # .X can be a scipy sparse matrix or already dense; only the former has
    # .toarray(), so densify conditionally instead of assuming sparse storage.
    if sparse.issparse(x):
        x = x.toarray()
    pos = np.asarray(x).ravel() > threshold

    group_cols = [c for c in ("UPN", "Collection") if c in adata.obs.columns]
    df = adata.obs[group_cols].copy()
    df["pos"] = pos

    out = (
        df.groupby(group_cols, observed=True)
          .agg(frac_pos=("pos", "mean"), n_cells=("pos", "size"))
          .reset_index()
    )
    out["gene"] = gene
    return out


In [None]:
genes = ["CD274", "CTLA4", "HIF1A",  "SPP1", 'NFKB1']
collection_order = ["NBM", "NDMM", "PT"]
pdf_path = "gene_positive_fraction_mye_boxplots.pdf"
threshold = 0

# All myeloid cells. To restrict to a single cell type, subset first, e.g.
#   adata = mye[mye.obs['ct'] == 'Macrophage'].copy()
adata = mye

# One row per (UPN, Collection, gene): fraction of cells with expression > threshold.
all_results = [compute_positive_fraction(adata, gene, threshold=threshold) for gene in genes]
df_all = pd.concat(all_results, ignore_index=True)

# Every unordered pair of collections, in display order (does not depend on the gene).
pairs = [
    (first, second)
    for idx, first in enumerate(collection_order)
    for second in collection_order[idx + 1:]
]

with PdfPages(pdf_path) as pdf:
    for gene in genes:
        sub = df_all[df_all["gene"] == gene].copy()
        # Samples outside the 1.5*IQR fences are left out of the drawing only;
        # computed once and shared by both plot layers.
        sub_no_out = remove_outliers_iqr(sub, col='frac_pos')

        plt.figure(figsize=(2, 4))
        ax = sns.boxplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            palette=timecols, order=collection_order, fliersize=0,
        )
        sns.stripplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            order=collection_order, color="black", size=3, alpha=0.8, jitter=True,
        )

        # NOTE: the tests run on `sub`, i.e. with the undrawn outliers still
        # included, and no multiple-comparison correction is applied.
        annot = Annotator(
            ax, list(pairs), data=sub,
            x="Collection", y="frac_pos",
            order=collection_order,
        )
        annot.configure(
            test="Mann-Whitney",
            text_format="star",
            comparisons_correction=None,
            hide_non_significant=True,
            loc="inside",
            line_height=0.03,
            line_offset=0.02,
        )
        annot.apply_and_annotate()

        plt.title(f"{gene}")
        # frac_pos is a 0-1 fraction, so the axis must not be labelled as a percentage.
        plt.ylabel("Fraction positive")
        plt.xlabel("Collection")
        plt.tight_layout()
        pdf.savefig()
        plt.close()


In [None]:
genes = ['STAT3', 'STAT5A', 'IFI44L', 'ISG15']
collection_order = ["NBM", "NDMM", "PT"]
pdf_path = "gene_positive_fraction_mye_V6panel_boxplots.pdf"
threshold = 0

# Myeloid cells whose Panel is 'BYGXJ6_hMulti' only.
adata = mye[(mye.obs['Panel'] == 'BYGXJ6_hMulti')].copy()

# One row per (UPN, Collection, gene): fraction of cells with expression > threshold.
all_results = [compute_positive_fraction(adata, gene, threshold=threshold) for gene in genes]
df_all = pd.concat(all_results, ignore_index=True)

# Every unordered pair of collections, in display order (does not depend on the gene).
pairs = [
    (first, second)
    for idx, first in enumerate(collection_order)
    for second in collection_order[idx + 1:]
]

with PdfPages(pdf_path) as pdf:
    for gene in genes:
        sub = df_all[df_all["gene"] == gene].copy()
        # Samples outside the 1.5*IQR fences are left out of the drawing only;
        # computed once and shared by both plot layers.
        sub_no_out = remove_outliers_iqr(sub, col='frac_pos')

        plt.figure(figsize=(2, 4))
        ax = sns.boxplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            palette=timecols, order=collection_order, fliersize=0,
        )
        sns.stripplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            order=collection_order, color="black", size=3, alpha=0.8, jitter=True,
        )

        # NOTE: the tests run on `sub`, i.e. with the undrawn outliers still
        # included, and no multiple-comparison correction is applied.
        annot = Annotator(
            ax, list(pairs), data=sub,
            x="Collection", y="frac_pos",
            order=collection_order,
        )
        annot.configure(
            test="Mann-Whitney",
            text_format="star",
            comparisons_correction=None,
            hide_non_significant=True,
            loc="inside",
            line_height=0.03,
            line_offset=0.02,
        )
        annot.apply_and_annotate()

        plt.title(f"{gene}")
        # frac_pos is a 0-1 fraction, so the axis must not be labelled as a percentage.
        plt.ylabel("Fraction positive")
        plt.xlabel("Collection")
        plt.tight_layout()
        pdf.savefig()
        plt.close()


In [None]:
genes = ["CD274", "CTLA4", "HIF1A",  "SPP1", 'NFKB1']
collection_order = ["NBM", "NDMM", "PT"]
# NOTE(review): the final paired-Wilcoxon cell of this notebook writes to this
# same file name, so on a full run this cell's PDF gets overwritten - confirm
# which output is meant to be kept.
pdf_path = "gene_positive_fraction_mye_panel_boxplots_pairedWilcoxon.pdf"
threshold = 0

adata = mye.copy()

# One row per (UPN, Collection, gene): fraction of cells with expression > threshold.
all_results = [compute_positive_fraction(adata, gene, threshold=threshold) for gene in genes]
df_all = pd.concat(all_results, ignore_index=True)

# UPNs that have both an NDMM and a PT entry.
tmp = (
    df_all[["UPN", "Collection"]]
      .drop_duplicates()
      .query("Collection in ['NDMM', 'PT']")
)
paired_upns = tmp.groupby("UPN")["Collection"].nunique()
paired_upns = paired_upns[paired_upns == 2].index.tolist()

# Axis slots follow collection_order, which is passed explicitly to every
# plotting call below, so the paired lines cannot drift away from the boxes.
x_ndmm = collection_order.index("NDMM")
x_pt = collection_order.index("PT")

with PdfPages(pdf_path) as pdf:
    for gene in genes:
        sub = df_all[
            (df_all["gene"] == gene) &
            (df_all["UPN"].isin(paired_upns))
        ].copy()
        # Sort the frame that is handed to the paired test, so that NDMM and PT
        # values are listed in the same UPN order (presumably the Annotator keeps
        # row order within each group - verify against statannotations).
        sub = sub.sort_values(["UPN", "Collection"])

        # Drawing only: samples outside the 1.5*IQR fences are hidden.
        sub_no_out = remove_outliers_iqr(sub, col='frac_pos')

        plt.figure(figsize=(2.2, 4))
        ax = sns.boxplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            order=collection_order, palette=timecols, fliersize=0,
        )
        sns.stripplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            order=collection_order, color="black", size=3, alpha=0.8, jitter=True,
        )
        # Connect the two time points of each UPN that still has both after filtering.
        for upn, g in sub_no_out.groupby("UPN"):
            g_ndmm = g[g["Collection"] == "NDMM"]
            g_pt   = g[g["Collection"] == "PT"]
            if len(g_ndmm) == 1 and len(g_pt) == 1:
                y_vals = [g_ndmm["frac_pos"].iloc[0], g_pt["frac_pos"].iloc[0]]
                ax.plot([x_ndmm, x_pt], y_vals, color="lightgray", linewidth=1, alpha=0.7)

        # The test uses `sub` (outliers included) so every UPN keeps both members of its pair.
        pairs = [("NDMM", "PT")]
        annot = Annotator(
            ax,
            pairs,
            data=sub,
            x="Collection",
            y="frac_pos",
            order=collection_order,
        )
        annot.configure(
            test="Wilcoxon",
            text_format="star",
            comparisons_correction=None,
            hide_non_significant=True,
            loc="inside",
            line_height=0.03,
            line_offset=0.02,
        )
        annot.apply_and_annotate()

        plt.title(f"{gene} (paired NDMM vs PT)")
        # frac_pos is a 0-1 fraction, so the axis must not be labelled as a percentage.
        plt.ylabel("Fraction positive")
        plt.xlabel("Collection")
        plt.tight_layout()
        pdf.savefig()
        plt.close()

In [None]:
genes = ['STAT3', 'STAT5A', 'IFI44L', 'ISG15']
collection_order = ["NBM", "NDMM", "PT"]
pdf_path = "gene_positive_fraction_mye_V6panel_pairedWilcoxon_boxplots.pdf"
threshold = 0

# Myeloid cells whose Panel is 'BYGXJ6_hMulti' only.
adata = mye[(mye.obs['Panel'] == 'BYGXJ6_hMulti')].copy()

# One row per (UPN, Collection, gene): fraction of cells with expression > threshold.
all_results = [compute_positive_fraction(adata, gene, threshold=threshold) for gene in genes]
df_all = pd.concat(all_results, ignore_index=True)

# UPNs that have both an NDMM and a PT entry.
tmp = (
    df_all[["UPN", "Collection"]]
      .drop_duplicates()
      .query("Collection in ['NDMM', 'PT']")
)
paired_upns = tmp.groupby("UPN")["Collection"].nunique()
paired_upns = paired_upns[paired_upns == 2].index.tolist()

# Axis slots follow collection_order, which is passed explicitly to every
# plotting call below, so the paired lines cannot drift away from the boxes.
x_ndmm = collection_order.index("NDMM")
x_pt = collection_order.index("PT")

with PdfPages(pdf_path) as pdf:
    for gene in genes:
        sub = df_all[
            (df_all["gene"] == gene) &
            (df_all["UPN"].isin(paired_upns))
        ].copy()
        # Sort the frame that is handed to the paired test, so that NDMM and PT
        # values are listed in the same UPN order (presumably the Annotator keeps
        # row order within each group - verify against statannotations).
        sub = sub.sort_values(["UPN", "Collection"])

        # Drawing only: samples outside the 1.5*IQR fences are hidden.
        sub_no_out = remove_outliers_iqr(sub, col='frac_pos')

        plt.figure(figsize=(2.2, 4))
        ax = sns.boxplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            order=collection_order, palette=timecols, fliersize=0,
        )
        sns.stripplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            order=collection_order, color="black", size=3, alpha=0.8, jitter=True,
        )
        # Connect the two time points of each UPN that still has both after filtering.
        for upn, g in sub_no_out.groupby("UPN"):
            g_ndmm = g[g["Collection"] == "NDMM"]
            g_pt   = g[g["Collection"] == "PT"]
            if len(g_ndmm) == 1 and len(g_pt) == 1:
                y_vals = [g_ndmm["frac_pos"].iloc[0], g_pt["frac_pos"].iloc[0]]
                ax.plot([x_ndmm, x_pt], y_vals, color="lightgray", linewidth=1, alpha=0.7)

        # The test uses `sub` (outliers included) so every UPN keeps both members of its pair.
        pairs = [("NDMM", "PT")]
        annot = Annotator(
            ax,
            pairs,
            data=sub,
            x="Collection",
            y="frac_pos",
            order=collection_order,
        )
        annot.configure(
            test="Wilcoxon",
            text_format="star",
            comparisons_correction=None,
            hide_non_significant=True,
            loc="inside",
            line_height=0.03,
            line_offset=0.02,
        )
        annot.apply_and_annotate()

        plt.title(f"{gene} (paired NDMM vs PT)")
        # frac_pos is a 0-1 fraction, so the axis must not be labelled as a percentage.
        plt.ylabel("Fraction positive")
        plt.xlabel("Collection")
        plt.tight_layout()
        pdf.savefig()
        plt.close()

In [None]:
# Per-sample fraction of CTLA4-positive myeloid cells within each radial
# neighborhood ('rn'), drawn as grouped boxplots by Collection.
adata = mye.copy()
gene = "CTLA4"
threshold = 0

# Flatten the gene's expression column to a dense 1-D vector.
Xg = adata[:, gene].X
if sparse.issparse(Xg):
    Xg = Xg.toarray()
expr = np.ravel(Xg)

# 1 where expression exceeds the threshold, else 0.
adata.obs["gene_pos"] = (expr > threshold).astype(int)

# Cell count and positive-cell count per UPN x Collection x rn.
df = (
    adata.obs
    .groupby(["UPN", "Collection", "rn"], observed=True)
    .agg(n_cells=("gene_pos", "size"), n_pos=("gene_pos", "sum"))
    .reset_index()
)
df["frac_pos"] = df["n_pos"].div(df["n_cells"])

collection_order = ["NBM", "NDMM", "PT"]
rns = sorted(df["rn"].unique())
pdf_path = "CTLA4_mye_fracPos_by_RN_groupedBoxplot.pdf"

with PdfPages(pdf_path) as pdf:
    plt.figure(figsize=(10,5))

    # Both layers draw the IQR-filtered rows with the same axis mapping.
    df_shown = remove_outliers_iqr(df, col='frac_pos')
    layer_kwargs = dict(
        data=df_shown, x="rn", y="frac_pos",
        hue="Collection", hue_order=collection_order,
    )
    ax = sns.boxplot(palette=timecols, fliersize=0, **layer_kwargs)
    sns.stripplot(dodge=True, alpha=0.7, size=2, palette="dark:black", **layer_kwargs)

    # Only the first three legend entries are kept.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:3], labels[:3], title="Collection")

    # Within each rn, compare every pair of collections.
    n_groups = len(collection_order)
    pairs = [
        ((rn, collection_order[i]), (rn, collection_order[j]))
        for rn in rns
        for i in range(n_groups)
        for j in range(i + 1, n_groups)
    ]

    # The tests are run on the unfiltered `df`.
    annot = Annotator(
        ax, pairs, data=df,
        x="rn", y="frac_pos", hue="Collection",
        order=rns, hue_order=collection_order,
    )
    annot.configure(
        test="Mann-Whitney",
        text_format="star",
        comparisons_correction='BH',
        hide_non_significant=True,
        loc="inside",
        line_height=0.03,
        line_offset=0.02,
    )
    annot.apply_and_annotate()

    plt.title(f'{gene}+ of fraction of total myeloid in radial neighborhoods')
    plt.tight_layout()
    pdf.savefig()
    plt.close()

In [None]:
# Per-sample fraction of HIF1A-positive myeloid cells within each radial
# neighborhood ('rn'), drawn as grouped boxplots by Collection.
adata = mye.copy()
gene = "HIF1A"
threshold = 0

# Flatten the gene's expression column to a dense 1-D vector.
Xg = adata[:, gene].X
if sparse.issparse(Xg):
    Xg = Xg.toarray()
expr = np.ravel(Xg)

# 1 where expression exceeds the threshold, else 0.
adata.obs["gene_pos"] = (expr > threshold).astype(int)

# Cell count and positive-cell count per UPN x Collection x rn.
df = (
    adata.obs
    .groupby(["UPN", "Collection", "rn"], observed=True)
    .agg(n_cells=("gene_pos", "size"), n_pos=("gene_pos", "sum"))
    .reset_index()
)
df["frac_pos"] = df["n_pos"].div(df["n_cells"])

collection_order = ["NBM", "NDMM", "PT"]
rns = sorted(df["rn"].unique())
pdf_path = "HIF1A_mye_fracPos_by_RN_groupedBoxplot.pdf"

with PdfPages(pdf_path) as pdf:
    plt.figure(figsize=(10,5))

    # Both layers draw the IQR-filtered rows with the same axis mapping.
    df_shown = remove_outliers_iqr(df, col='frac_pos')
    layer_kwargs = dict(
        data=df_shown, x="rn", y="frac_pos",
        hue="Collection", hue_order=collection_order,
    )
    ax = sns.boxplot(palette=timecols, fliersize=0, **layer_kwargs)
    sns.stripplot(dodge=True, alpha=0.7, size=2, palette="dark:black", **layer_kwargs)

    # Only the first three legend entries are kept.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:3], labels[:3], title="Collection")

    # Within each rn, compare every pair of collections.
    n_groups = len(collection_order)
    pairs = [
        ((rn, collection_order[i]), (rn, collection_order[j]))
        for rn in rns
        for i in range(n_groups)
        for j in range(i + 1, n_groups)
    ]

    # The tests are run on the unfiltered `df`.
    annot = Annotator(
        ax, pairs, data=df,
        x="rn", y="frac_pos", hue="Collection",
        order=rns, hue_order=collection_order,
    )
    annot.configure(
        test="Mann-Whitney",
        text_format="star",
        comparisons_correction='BH',
        hide_non_significant=True,
        loc="inside",
        line_height=0.03,
        line_offset=0.02,
    )
    annot.apply_and_annotate()

    plt.title(f'{gene}+ of fraction of total myeloid in radial neighborhoods')
    plt.tight_layout()
    pdf.savefig()
    plt.close()

In [None]:
# Per-sample fraction of CD274-positive myeloid cells within each radial
# neighborhood ('rn'), drawn as grouped boxplots by Collection.
adata = mye.copy()
gene = "CD274"
threshold = 0

# Flatten the gene's expression column to a dense 1-D vector.
Xg = adata[:, gene].X
if sparse.issparse(Xg):
    Xg = Xg.toarray()
expr = np.ravel(Xg)

# 1 where expression exceeds the threshold, else 0.
adata.obs["gene_pos"] = (expr > threshold).astype(int)

# Cell count and positive-cell count per UPN x Collection x rn.
df = (
    adata.obs
    .groupby(["UPN", "Collection", "rn"], observed=True)
    .agg(n_cells=("gene_pos", "size"), n_pos=("gene_pos", "sum"))
    .reset_index()
)
df["frac_pos"] = df["n_pos"].div(df["n_cells"])

collection_order = ["NBM", "NDMM", "PT"]
rns = sorted(df["rn"].unique())
pdf_path = "CD274_mye_fracPos_by_RN_groupedBoxplot.pdf"

with PdfPages(pdf_path) as pdf:
    plt.figure(figsize=(10,5))

    # Both layers draw the IQR-filtered rows with the same axis mapping.
    df_shown = remove_outliers_iqr(df, col='frac_pos')
    layer_kwargs = dict(
        data=df_shown, x="rn", y="frac_pos",
        hue="Collection", hue_order=collection_order,
    )
    ax = sns.boxplot(palette=timecols, fliersize=0, **layer_kwargs)
    sns.stripplot(dodge=True, alpha=0.7, size=2, palette="dark:black", **layer_kwargs)

    # Only the first three legend entries are kept.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:3], labels[:3], title="Collection")

    # Within each rn, compare every pair of collections.
    n_groups = len(collection_order)
    pairs = [
        ((rn, collection_order[i]), (rn, collection_order[j]))
        for rn in rns
        for i in range(n_groups)
        for j in range(i + 1, n_groups)
    ]

    # The tests are run on the unfiltered `df`.
    annot = Annotator(
        ax, pairs, data=df,
        x="rn", y="frac_pos", hue="Collection",
        order=rns, hue_order=collection_order,
    )
    annot.configure(
        test="Mann-Whitney",
        text_format="star",
        comparisons_correction='BH',
        hide_non_significant=True,
        loc="inside",
        line_height=0.03,
        line_offset=0.02,
    )
    annot.apply_and_annotate()

    plt.title(f'{gene}+ of fraction of total myeloid in radial neighborhoods')
    plt.tight_layout()
    pdf.savefig()
    plt.close()

In [None]:
# Per-sample fraction of SPP1-positive myeloid cells within each radial
# neighborhood ('rn'), drawn as grouped boxplots by Collection.
adata = mye.copy()
gene = "SPP1"
threshold = 0

# Flatten the gene's expression column to a dense 1-D vector.
Xg = adata[:, gene].X
if sparse.issparse(Xg):
    Xg = Xg.toarray()
expr = np.ravel(Xg)

# 1 where expression exceeds the threshold, else 0.
adata.obs["gene_pos"] = (expr > threshold).astype(int)

# Cell count and positive-cell count per UPN x Collection x rn.
df = (
    adata.obs
    .groupby(["UPN", "Collection", "rn"], observed=True)
    .agg(n_cells=("gene_pos", "size"), n_pos=("gene_pos", "sum"))
    .reset_index()
)
df["frac_pos"] = df["n_pos"].div(df["n_cells"])

collection_order = ["NBM", "NDMM", "PT"]
rns = sorted(df["rn"].unique())
pdf_path = "SPP1_mye_fracPos_by_RN_groupedBoxplot.pdf"

with PdfPages(pdf_path) as pdf:
    plt.figure(figsize=(10,5))

    # Both layers draw the IQR-filtered rows with the same axis mapping.
    df_shown = remove_outliers_iqr(df, col='frac_pos')
    layer_kwargs = dict(
        data=df_shown, x="rn", y="frac_pos",
        hue="Collection", hue_order=collection_order,
    )
    ax = sns.boxplot(palette=timecols, fliersize=0, **layer_kwargs)
    sns.stripplot(dodge=True, alpha=0.7, size=2, palette="dark:black", **layer_kwargs)

    # Only the first three legend entries are kept.
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[:3], labels[:3], title="Collection")

    # Within each rn, compare every pair of collections.
    n_groups = len(collection_order)
    pairs = [
        ((rn, collection_order[i]), (rn, collection_order[j]))
        for rn in rns
        for i in range(n_groups)
        for j in range(i + 1, n_groups)
    ]

    # The tests are run on the unfiltered `df`.
    annot = Annotator(
        ax, pairs, data=df,
        x="rn", y="frac_pos", hue="Collection",
        order=rns, hue_order=collection_order,
    )
    annot.configure(
        test="Mann-Whitney",
        text_format="star",
        comparisons_correction='BH',
        hide_non_significant=True,
        loc="inside",
        line_height=0.03,
        line_offset=0.02,
    )
    annot.apply_and_annotate()

    plt.title(f'{gene}+ of fraction of total myeloid in radial neighborhoods')
    plt.tight_layout()
    pdf.savefig()
    plt.close()

In [None]:
genes = ["HIF1A", "CTLA4", "SPP1", "CD274"]
collection_order = ["NBM", "NDMM", "PT"]
# NOTE(review): an earlier paired-Wilcoxon cell in this notebook writes to this
# same file name; this cell overwrites that PDF - confirm this is intended.
pdf_path = "gene_positive_fraction_mye_panel_boxplots_pairedWilcoxon.pdf"
threshold = 0

adata = mye.copy()

# One row per (UPN, Collection, gene): fraction of cells with expression > threshold.
all_results = [compute_positive_fraction(adata, gene, threshold=threshold) for gene in genes]
df_all = pd.concat(all_results, ignore_index=True)

# UPNs that have both an NDMM and a PT entry.
tmp = (
    df_all[["UPN", "Collection"]]
      .drop_duplicates()
      .query("Collection in ['NDMM', 'PT']")
)
paired_upns = tmp.groupby("UPN")["Collection"].nunique()
paired_upns = paired_upns[paired_upns == 2].index.tolist()

# Line colour per UPN: WU043 is drawn in black, the others in light gray.
upn_cols= {'WU007':"lightgray", 'WU025':"lightgray", 'WU030':"lightgray", 'WU043':"black", 'WU050':"lightgray", 'WU066':"lightgray", 'WU068':"lightgray", 'WU107':"lightgray"}
default_line_color = "lightgray"

# Axis slots follow collection_order, which is passed explicitly to every
# plotting call below, so the paired lines cannot drift away from the boxes.
x_ndmm = collection_order.index("NDMM")
x_pt = collection_order.index("PT")

with PdfPages(pdf_path) as pdf:
    for gene in genes:
        sub = df_all[
            (df_all["gene"] == gene) &
            (df_all["UPN"].isin(paired_upns))
        ].copy()
        # Sort the frame that is handed to the paired test, so that NDMM and PT
        # values are listed in the same UPN order (presumably the Annotator keeps
        # row order within each group - verify against statannotations).
        sub = sub.sort_values(["UPN", "Collection"])

        # Drawing only: samples outside the 1.5*IQR fences are hidden.
        sub_no_out = remove_outliers_iqr(sub, col='frac_pos')

        plt.figure(figsize=(2.2, 4))
        ax = sns.boxplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            order=collection_order, palette=timecols, fliersize=0,
        )
        sns.stripplot(
            data=sub_no_out, x="Collection", y="frac_pos",
            order=collection_order, color="black", size=3, alpha=0.8, jitter=True,
        )
        # Connect the two time points of each UPN that still has both after filtering.
        for upn, g in sub_no_out.groupby("UPN"):
            g_ndmm = g[g["Collection"] == "NDMM"]
            g_pt   = g[g["Collection"] == "PT"]
            if len(g_ndmm) == 1 and len(g_pt) == 1:
                y_vals = [g_ndmm["frac_pos"].iloc[0], g_pt["frac_pos"].iloc[0]]
                # paired_upns is derived from the data, so a UPN missing from the
                # hard-coded colour table falls back to the default colour
                # instead of raising KeyError.
                line_color = upn_cols.get(upn, default_line_color)
                ax.plot([x_ndmm, x_pt], y_vals, color=line_color, linewidth=1, alpha=0.7)

        # The test uses `sub` (outliers included) so every UPN keeps both members of its pair.
        pairs = [("NDMM", "PT")]
        annot = Annotator(
            ax,
            pairs,
            data=sub,
            x="Collection",
            y="frac_pos",
            order=collection_order,
        )
        annot.configure(
            test="Wilcoxon",
            text_format="star",
            comparisons_correction=None,
            hide_non_significant=True,
            loc="inside",
            line_height=0.03,
            line_offset=0.02,
        )
        annot.apply_and_annotate()

        plt.title(f"{gene} (paired NDMM vs PT)")
        # frac_pos is a 0-1 fraction, so the axis must not be labelled as a percentage.
        plt.ylabel("Fraction positive")
        plt.xlabel("Collection")
        plt.tight_layout()
        pdf.savefig()
        plt.close()