In [None]:
import os, sys, re, subprocess
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, logomaker
from logomaker import transform_matrix
from Bio import AlignIO
from scipy.cluster.hierarchy import linkage, fcluster, cophenet, dendrogram
from scipy.spatial.distance import pdist
import matplotlib.ticker as mtick

In [None]:
def normalize_cols(df):
    """Return a copy of ``df`` whose column labels are normalised.

    Each label is stripped of surrounding whitespace, lower-cased, and has
    every ``-`` and space replaced by ``_``. The input frame is not modified.
    """
    out = df.copy()
    labels = out.columns.str.strip().str.lower()
    # Both separators collapse to an underscore, hyphen first and then space.
    for separator in ('-', ' '):
        labels = labels.str.replace(separator, '_')
    out.columns = labels
    return out

# 1a) CoV-AbDab: keep only rows whose origin mentions "phage" (case-insensitive,
#     missing origins excluded) and carry the Ab_or_Nb label through.
ab = (
    normalize_cols(pd.read_csv('CoV-AbDab_080224.csv', dtype=str))
    .loc[lambda frame: frame['origin'].str.contains('phage', case=False, na=False)]
    .rename(columns={'name': 'Name', 'cdrh3': 'CDRH3', 'ab_or_nb': 'Ab_or_Nb'})
    .loc[:, ['Name', 'CDRH3', 'Ab_or_Nb']]
    .dropna()
)

# 1b) Tim3.csv: every row is labelled as an antibody ('Ab').
t3 = (
    normalize_cols(pd.read_csv('Tim3.csv', dtype=str))
    .rename(columns={'clone': 'Name', 'cdrh3': 'CDRH3'})
    .loc[:, ['Name', 'CDRH3']]
    .dropna()
    .assign(Ab_or_Nb='Ab')
)

# 1c) Sanger.csv: every row is labelled as an antibody ('Ab').
sg = normalize_cols(pd.read_csv('Sanger.csv', dtype=str))

# Several header spellings are accepted; the first alias present (if any) is
# renamed to the canonical column name. Name is resolved before CDRH3.
for target, aliases in (('Name',  ('name', 'clone_name', 'clone')),
                        ('CDRH3', ('cdrh3', 'cdr_h3', 'cdr_h_3'))):
    found = next((alias for alias in aliases if alias in sg.columns), None)
    if found is not None:
        sg = sg.rename(columns={found: target})

sg = sg[['Name', 'CDRH3']].dropna().assign(Ab_or_Nb='Ab')

# 1d) BA5RFullLength Clones.csv: antibodies ('Ab'); names get a "BA5_" prefix.
ba5 = (
    normalize_cols(pd.read_csv('BA5RFullLength Clones.csv', dtype=str))
    .rename(columns={'clone': 'Name', 'cdrh3': 'CDRH3'})
    .loc[:, ['Name', 'CDRH3']]
    .dropna()
    .assign(Name=lambda frame: 'BA5_' + frame['Name'].astype(str),
            Ab_or_Nb='Ab')
)

# Stack the four sources and remove any whitespace embedded in the sequences.
combined = pd.concat([ab, t3, sg, ba5], ignore_index=True)
combined['CDRH3'] = combined['CDRH3'].str.replace(r'\s+', '', regex=True)

# Keep only non-empty sequences built solely from the 20 upper-case residue
# letters. The explicit emptiness check matters: all() over an empty string is
# True, so a blank CDRH3 would otherwise pass and be written out as a FASTA
# record with no sequence.
valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
mask = combined['CDRH3'].apply(lambda s: bool(s) and valid_aa.issuperset(s))
if not mask.all():
    print(f"Removing {(~mask).sum()} invalid sequences")
    combined = combined[mask]

# One row per distinct CDRH3; the first occurrence (in concat order) wins.
combined = combined.drop_duplicates(subset=['CDRH3']).reset_index(drop=True)

combined.to_csv('combined_all_metadata.csv', index=False)
print(f"Saved {len(combined)} entries to combined_all_metadata.csv")

print(combined.head())

In [None]:
# Write one FASTA record per metadata row.
# The CSV is read as text (dtype=str), matching how the clustering cell reads
# the same file, so number-like names such as "1E5" or "007" are not re-typed
# and the FASTA headers stay identical to the names stored in the metadata.
df = pd.read_csv('combined_all_metadata.csv', dtype=str)
with open('combined_all.fasta', 'w') as fh:
    fh.writelines(f">{name}\n{seq}\n" for name, seq in zip(df['Name'], df['CDRH3']))

In [None]:
# Run these on the command line (the -db name must match the -out name given to makeblastdb):
# makeblastdb -in combined_all.fasta -dbtype prot -out cdrh3_db
# blastp -task blastp-short -query combined_all.fasta -db cdrh3_db -outfmt 6 -out results_allvsall.txt

In [None]:
# ---- Input / output locations ------------------------------------------------
COMBINED_META_CSV = 'combined_all_metadata.csv'   # Name / CDRH3 / Ab_or_Nb table
BLAST_OUT   = 'results_allvsall.txt'              # tabular all-vs-all BLAST output
SIM_CSV     = 'similarity_matrix_evalue.csv'      # pairwise e-value matrix written below
OUTPUT_DIR  = 'grouped_clusters'                  # cluster tables, dendrograms, summaries
LOGO_DIR    = 'cluster_logos'                     # per-cluster FASTA, alignments, logos

# ---- Clustering / logo parameters --------------------------------------------
DIST_CUTOFF = 5000     # fcluster threshold (criterion='distance')
LINK_METHOD = 'ward'   # linkage method passed to scipy
LEFT_WIDTH  = 5        # alignment columns kept before the anchor column
RIGHT_WIDTH = 15       # alignment columns kept from the anchor column onwards
PSEUDOCOUNT = 0.01     # pseudocount used for the information-content logo

# Clustal Omega executable. Set the CLUSTALO_EXE environment variable to point
# at another installation; without it the original Windows path is used.
CLUSTALO_EXE = os.environ.get(
    'CLUSTALO_EXE',
    r"C:\Program Files\clustal-omega-1.2.2-win64"
    r"\clustal-omega-1.2.2-win64\clustalo.exe",
)

# Residue -> logo colour. NOTE(review): 'M' and 'N' have no entry here —
# confirm whether that omission is intentional.
COLOR_DICT = {
    **dict.fromkeys(list("AGST"), 'green'),
    **dict.fromkeys(list("CLIV"), 'yellow'),
    **dict.fromkeys(list("DEQ"),  'blue'),
    **dict.fromkeys(list("FWY"),  'orange'),
    **dict.fromkeys(list("KR"),   'purple'),
    'H':'pink','P':'red'
}

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGO_DIR,  exist_ok=True)

# Tabular BLAST output: only the two ids and the e-value are used. The ids are
# forced to text so number-like names are not re-typed by the parser.
blast = pd.read_csv(
    BLAST_OUT, sep='\t', header=None,
    names=['query','subject','pident','length','mismatch','gapopen',
           'qstart','qend','sstart','send','evalue','bitscore'],
    usecols=['query','subject','evalue'], comment='#',
    dtype={'query': str, 'subject': str},
)

# Best (smallest) e-value per ordered (query, subject) pair. Pairs without a
# hit are left as NaN for now: filling them before symmetrising would leave
# nothing for the transpose to fill, so one-directional hits would stay
# asymmetric.
best = blast.pivot_table(index='query', columns='subject', values='evalue', aggfunc='min')

# Square the table over every id seen as query or subject, then take the
# smaller e-value of the two directions (np.fmin ignores a NaN on one side).
ids  = best.index.union(best.columns)
vals = best.reindex(index=ids, columns=ids).to_numpy(dtype=float)
vals = np.fmin(vals, vals.T)

# Pairs with no hit in either direction get a very large e-value.
vals[np.isnan(vals)] = 1e300

# Each sequence's self-entry is set to the best e-value found in its row.
np.fill_diagonal(vals, vals.min(axis=1))

sim = pd.DataFrame(vals, index=ids, columns=ids).rename_axis(index='query', columns='subject')
sim.to_csv(SIM_CSV, float_format='%.3e')
print(f"Wrote similarity matrix → {SIM_CSV}")

# Load the metadata as text and make sure the three required columns exist.
meta = pd.read_csv(COMBINED_META_CSV, dtype=str)
meta = meta.rename(columns=str.strip)
for col in ['Name','CDRH3','Ab_or_Nb']:
    if col not in meta.columns:
        sys.exit(f"ERROR: {col} column not found in {COMBINED_META_CSV}")
meta = meta[['Name','CDRH3','Ab_or_Nb']].dropna(subset=['Name','CDRH3','Ab_or_Nb'])
meta[['Name','CDRH3','Ab_or_Nb']] = meta[['Name','CDRH3','Ab_or_Nb']].apply(lambda s: s.str.strip())

# Name -> CDRH3 lookup; the first row wins when a name occurs more than once.
seq_map = meta.drop_duplicates('Name').set_index('Name')['CDRH3']

# reload & subset sim to only your Names
# The file is parsed entirely as text and only the values are converted to
# numbers afterwards. With index_col=0 pandas would infer the index dtype, so
# number-like ids could become numeric and then neither support .str.strip()
# nor match the (always textual) column labels.
sim_text = pd.read_csv(SIM_CSV, dtype=str)
sim = sim_text.set_index(sim_text.columns[0]).apply(pd.to_numeric, errors='coerce')
sim.index = sim.index.str.strip()
sim.columns= sim.columns.str.strip()
common = seq_map.index.intersection(sim.index).intersection(sim.columns)
missing = set(seq_map.index) - set(common)
if missing:
    # sorted() keeps the preview stable from run to run
    print(f"{len(missing)} names missing from SIM, skipping e.g. {sorted(missing)[:5]}")
seq_map = seq_map.reindex(common)
sim     = sim.loc[common, common]

def run_clustalo(inp, outp):
    """Align the FASTA file ``inp`` with Clustal Omega and write it to ``outp``.

    Any existing ``outp`` is removed first. The executable named by
    ``CLUSTALO_EXE`` is run with ``--auto --force``.

    Returns:
        ``outp``, unchanged.

    Raises:
        RuntimeError: if the process exits with a non-zero status. The message
            is the tool's stderr, or the exit status when stderr is empty.
    """
    if os.path.exists(outp):
        os.remove(outp)
    r = subprocess.run([CLUSTALO_EXE,'-i',inp,'-o',outp,'--auto','--force'],
                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if r.returncode:
        # errors='replace' keeps undecodable bytes from raising UnicodeDecodeError
        # and hiding the actual failure.
        message = r.stderr.decode(errors='replace')
        if not message.strip():
            message = f"clustalo exited with status {r.returncode}"
        raise RuntimeError(message)
    return outp

def window_counts(fa):
    """Count residues per column in a window of the aligned FASTA file ``fa``.

    The anchor is the alignment column containing the most 'C' characters
    (the first such column on ties). The window covers ``LEFT_WIDTH`` columns
    before the anchor up to ``RIGHT_WIDTH`` columns from it, clipped to the
    alignment.

    Returns:
        DataFrame with one row per window column (0-based) and one column per
        residue in "ACDEFGHIKLMNPQRSTVWY"; other characters are not counted.
    """
    alignment = AlignIO.read(fa, 'fasta')
    n_cols = alignment.get_alignment_length()
    rows = [str(record.seq) for record in alignment]

    def cysteines_in(col):
        return sum(row[col] == 'C' for row in rows)

    anchor = max(range(n_cols), key=cysteines_in)
    lo = max(0, anchor - LEFT_WIDTH)
    hi = min(n_cols, anchor + RIGHT_WIDTH)

    residues = list("ACDEFGHIKLMNPQRSTVWY")
    counts = pd.DataFrame(0, index=range(hi - lo), columns=residues)
    for row in rows:
        for col in range(lo, hi):
            residue = row[col]
            if residue in residues:
                counts.at[col - lo, residue] += 1
    return counts

def bits_and_freq(counts):
    """Turn a residue count matrix into information and frequency matrices.

    Args:
        counts: DataFrame of per-position residue counts (rows = positions).

    Returns:
        ``(ic, freq)``: ``ic`` is logomaker's information matrix for the
        counts; ``freq`` holds per-row relative frequencies, with rows that
        have no counts set to 0 and any frequency below 0.05 zeroed.
    """
    # PSEUDOCOUNT is handed to transform_matrix rather than added to the
    # counts: its own pseudocount defaults to 1, so adding ours beforehand
    # gave an effective pseudocount of 1 + PSEUDOCOUNT.
    ic   = transform_matrix(counts.astype(float),
                            from_type='counts', to_type='information',
                            pseudocount=PSEUDOCOUNT)
    freq = counts.div(counts.sum(axis=1), axis=0).fillna(0)
    freq = freq.where(freq >= 0.05, 0)
    return ic, freq

def save_logo(mat, path, mode, cid):
    """Draw ``mat`` as a sequence logo and save it to ``path`` at 300 dpi.

    Args:
        mat: logo matrix (rows = positions, columns = residues).
        path: output image path.
        mode: 'bits' for an information logo; anything else is labelled and
            scaled as a frequency logo.
        cid: cluster label used in the title.
    """
    is_bits = mode == 'bits'
    ylabel = 'Bits' if is_bits else 'Frequency'

    # The information ceiling is log2(alphabet size): about 4.32 bits for 20
    # residues. A fixed limit of 2 clipped conserved positions; 2 is kept as
    # the floor so the axis never gets smaller than before.
    y_top = float(max(2.0, np.log2(max(mat.shape[1], 1)))) if is_bits else 1

    fig,ax = plt.subplots(figsize=(mat.shape[0]*0.5,3))
    logomaker.Logo(mat, ax=ax, color_scheme=COLOR_DICT, stack_order='big_on_top')
    ax.set_xticks(range(mat.shape[0])); ax.set_xticklabels(range(mat.shape[0]))
    ax.set_xlabel(f"Pos (anchor-C at col {LEFT_WIDTH})")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, y_top)
    for sp in ('top','right'): ax.spines[sp].set_visible(False)
    ax.set_title(f"Cluster {cid}: {ylabel}", pad=12)
    fig.savefig(path, dpi=300, bbox_inches='tight')
    plt.close(fig)

# ---------------------------------------------------------------------------
# Cluster each group ('Ab', 'Nb') on its -log10(e-value) profiles and write:
#   * per group:   assignments CSV, LaTeX table, overview / truncated
#                  dendrograms, cluster-size chart, text summary
#   * per cluster: FASTA + alignment + logos and a detail dendrogram
#                  (clusters with fewer than 2 members are skipped)
#   * for cluster SUBCLUSTER_CID: a second-level split with its own outputs
# ---------------------------------------------------------------------------
SUBCLUSTER_CID = 4      # cluster id that is additionally split into sub-clusters
SUB_DIST       = 2000   # linkage distance at which that cluster is cut


def _scores(ids):
    """Return -log10(e-value) for the rows and columns of ``sim`` named in ``ids``."""
    # An e-value of exactly 0 would turn into +inf, which linkage() rejects;
    # clip to the smallest positive normal float first.
    return -np.log10(sim.loc[ids, ids].clip(lower=np.finfo(float).tiny))


def _show_all_spines(ax):
    """Make every spine of ``ax`` visible."""
    for spine in ax.spines.values():
        spine.set_visible(True)


def _align_and_logo(seqs, stem, label):
    """Align ``seqs`` and save frequency and bits logos.

    Writes ``<stem>.fasta``, ``<stem>_aln.fasta``, ``<stem>_freq.png`` and
    ``<stem>_bits.png``; ``label`` is the cluster label used in logo titles.
    """
    fa, aln = f"{stem}.fasta", f"{stem}_aln.fasta"
    with open(fa, 'w') as fh:
        for i, s in enumerate(seqs):
            fh.write(f">seq{i}\n{s}\n")
    run_clustalo(fa, aln)
    ic, fq = bits_and_freq(window_counts(aln))
    save_logo(fq, f"{stem}_freq.png", 'freq', label)
    save_logo(ic, f"{stem}_bits.png", 'bits', label)


def _write_latex_table(grp, dfc):
    """Write ``<grp>_table.tex`` listing name, sequence and cluster of every member."""
    with open(os.path.join(OUTPUT_DIR, f"{grp}_table.tex"), 'w') as fh:
        fh.write(r"\begin{table}[H]\centering\scriptsize" + "\n")
        fh.write(r"\begin{tabular}{|l|l|r|}\hline Name & Sequence & Cluster \\\hline" + "\n")
        for name, seq, cl in zip(dfc['Name'], dfc['Name'].map(seq_map), dfc['cluster']):
            nm = re.sub(r'([_#%&$])', r'\\\1', name)
            # "\\\\" emits the two backslashes LaTeX needs to end a table row
            # (a single trailing backslash is not a row break).
            fh.write(f"{nm} & {seq} & {int(cl)} \\\\\n")
        fh.write(r"\hline\end{tabular}\end{table}" + "\n")
    print(f"Wrote {grp} table → {OUTPUT_DIR}/{grp}_table.tex")


def _plot_group_figures(grp, Z, cluster_counts):
    """Save the overview dendrogram, cluster-size chart and truncated dendrogram."""
    # overview dendrogram
    fig, ax = plt.subplots(figsize=(12, 6))
    dendrogram(
        Z,
        color_threshold=DIST_CUTOFF,
        above_threshold_color='grey',
        no_labels=True,
        ax=ax
    )
    ax.axhline(DIST_CUTOFF, ls='--', c='black')
    ax.set_title(f"{grp} overview dendrogram (cut at {DIST_CUTOFF})")
    ax.set_ylabel('')
    _show_all_spines(ax)
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f"{grp}_overview_dend.png"), dpi=300)
    plt.close(fig)

    # cluster sizes bar chart
    # Bars are placed at the numeric cluster ids so the integer tick locator
    # labels each bar with its actual id.
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(cluster_counts.index, cluster_counts.values)
    ax.set_title(f"{grp} cluster sizes")
    ax.set_xlabel("Cluster"); ax.set_ylabel("# seqs")
    ax.xaxis.set_major_locator(mtick.MaxNLocator(integer=True))
    _show_all_spines(ax)
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f"{grp}_cluster_sizes.png"), dpi=300)
    plt.close(fig)

    # truncated dendrogram
    fig, ax = plt.subplots(figsize=(12, 6))
    dendrogram(
        Z,
        truncate_mode='lastp',
        p=30,
        color_threshold=DIST_CUTOFF,
        above_threshold_color='grey',
        show_leaf_counts=True,
        leaf_rotation=0,
        ax=ax
    )
    ax.axhline(DIST_CUTOFF, ls='--', c='black')
    ax.set_title(f"{grp} truncated dendrogram (last 30 merges)")
    ax.set_ylabel('Distance')
    _show_all_spines(ax)
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f"{grp}_trunc_dend.png"), dpi=300)
    plt.close(fig)


def _subcluster(grp, cid, members):
    """Split cluster ``cid`` at ``SUB_DIST`` and write sub-cluster CSV, dendrograms and logos."""
    tag = f"{grp}_c{cid:02d}"
    Zc = linkage(_scores(members).values, method=LINK_METHOD)
    sub_df = pd.DataFrame({'Name': members,
                           'subcluster': fcluster(Zc, t=SUB_DIST, criterion='distance')})
    sub_df.to_csv(os.path.join(OUTPUT_DIR, f"{tag}_subclusters.csv"), index=False)
    print(f"Cluster {cid} split into {sub_df.subcluster.nunique()} pieces → saved CSV.")

    # subtree dendrogram for the whole cluster
    fig, ax = plt.subplots(figsize=(6, max(4, 0.12*len(members))))
    dendrogram(
        Zc,
        orientation='right',
        labels=members,
        color_threshold=0.5*(Zc[:,2].max() + SUB_DIST),
        above_threshold_color='grey',
        leaf_font_size=5,
        ax=ax
    )
    ax.axvline(SUB_DIST, ls='--', c='black')
    ax.set_title(f"{grp} c{cid} subtree (cut @ {SUB_DIST})", pad=12)
    ax.set_xlabel('Distance')
    _show_all_spines(ax)
    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, f"{tag}_subdend.png"), dpi=300)
    plt.close(fig)

    # one detail dendrogram + logo pair per sub-cluster with at least 2 members
    for sublab in sorted(sub_df.subcluster.unique()):
        leaves = sub_df.loc[sub_df['subcluster'] == sublab, 'Name'].tolist()
        if len(leaves) < 2:
            continue
        Zsub = linkage(_scores(leaves).values, method=LINK_METHOD)
        fig, ax = plt.subplots(figsize=(4, max(2, 0.12*len(leaves))))
        dendrogram(
            Zsub,
            orientation='right',
            labels=leaves,
            color_threshold=0.7*Zsub[:,2].max(),
            leaf_font_size=6,
            above_threshold_color='grey',
            ax=ax
        )
        ax.axvline(DIST_CUTOFF, ls='--', c='black')
        ax.set_title(f"{grp} c{cid}.{sublab} (n={len(leaves)})", pad=8, fontsize=9)
        ax.set_xlabel('Distance')
        _show_all_spines(ax)
        fig.tight_layout()
        fig.savefig(os.path.join(OUTPUT_DIR, f"{tag}_sub{sublab:02d}_detail.png"),
                    dpi=300, bbox_inches='tight')
        plt.close(fig)

        _align_and_logo(seq_map.reindex(leaves).dropna().tolist(),
                        os.path.join(LOGO_DIR, f"{tag}_sub{sublab:02d}"),
                        f"{cid}.{sublab}")


for grp in ['Ab','Nb']:
    # Names of this group that are present in the similarity matrix; a name
    # listed more than once in the metadata is used only once.
    in_group = meta.loc[meta['Ab_or_Nb'] == grp, 'Name']
    names = [n for n in dict.fromkeys(in_group) if n in common]
    if not names:
        print(f"No {grp} sequences, skipping"); continue
    if len(names) < 2:
        # linkage() needs at least two observations
        print(f"Only {len(names)} {grp} sequence(s), cannot cluster; skipping"); continue

    # Each sequence is clustered on its row of -log10(e-value) scores.
    D = _scores(names)
    Z = linkage(D.values, method=LINK_METHOD)
    cls = fcluster(Z, t=DIST_CUTOFF, criterion='distance')
    cc,_= cophenet(Z, pdist(D.values))
    print(f"{grp}: cophenetic corr={cc:.4f}")

    fn = os.path.join(OUTPUT_DIR, f"{grp}_cluster_assignments.csv")
    dfc = pd.DataFrame({'Name':names,'cluster':cls})
    dfc.to_csv(fn, index=False)

    cluster_counts = pd.Series(cls).value_counts().sort_index()

    # Group-level outputs are produced once per group (not once per cluster),
    # and also when every cluster turns out to be a singleton.
    _write_latex_table(grp, dfc)
    _plot_group_figures(grp, Z, cluster_counts)

    cluster_details = {}
    for cid, sub in dfc.groupby('cluster'):
        members = sub['Name'].tolist()
        if len(members) < 2:
            print(f"Skipping cluster {cid} with only {len(members)} sequence(s)")
            continue

        print(f"Processing cluster {cid} with {len(members)} sequences…")

        # logos
        _align_and_logo(seq_map.reindex(members),
                        os.path.join(LOGO_DIR, f"{grp}_c{cid}"), cid)

        # detailed dendrogram (linkage recomputed on this cluster's members only)
        fig,ax = plt.subplots(figsize=(8, max(4,0.15*len(members))))
        dendrogram(
            linkage(_scores(members).values, method=LINK_METHOD),
            orientation='right',
            labels=members,
            above_threshold_color='grey',
            leaf_font_size=6,
            ax=ax
        )
        ax.axvline(DIST_CUTOFF, ls='--', c='black')
        ax.set_title(f"{grp} cluster {cid} detail (n={len(members)})", pad=12)
        _show_all_spines(ax)
        detail_file = os.path.join(OUTPUT_DIR, f"{grp}_c{cid:02d}_detail.png")
        fig.savefig(detail_file, dpi=300, bbox_inches='tight', pad_inches=0.1)
        plt.close(fig)

        cluster_details[cid] = {
            'members': len(members),
            'detail_file': detail_file
        }

        if cid == SUBCLUSTER_CID:
            _subcluster(grp, cid, members)

    summary_file = os.path.join(OUTPUT_DIR, f"{grp}_cluster_summary.txt")
    with open(summary_file, 'w') as fh:
        fh.write(f"# {grp} Cluster Summary\n\n")
        fh.write(f"Total sequences: {len(names)}\n")
        fh.write(f"Total clusters: {len(cluster_counts)}\n\n")
        fh.write("Cluster sizes:\n")
        for cid, count in cluster_counts.items():
            fh.write(f"  Cluster {cid}: {count} sequences\n")
        fh.write("\nCluster details:\n")
        for cid, details in cluster_details.items():
            fh.write(f"  Cluster {cid}:\n")
            fh.write(f"    Members: {details['members']}\n")
            fh.write(f"    Detail file: {os.path.basename(details['detail_file'])}\n")
    print(f"Wrote {grp} cluster summary → {summary_file}")