In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score 
import matplotlib.pyplot as plt, logomaker
from logomaker import transform_matrix
from Bio import AlignIO
import matplotlib.ticker as mtick
import os, sys, re, subprocess
import pandas as pd, numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, cophenet, dendrogram
from scipy.spatial.distance import pdist
import matplotlib.ticker as mtick

In [None]:
# Input tables read by this cell; a clone-name column and a CDRH3 column are
# looked up in each of them after the column labels have been normalised.
BA5_CSV   = 'BA5RFullLength Clones.csv'
TIM3_CSV  = 'Tim3.csv'
SANGER_CSV = 'sanger_dedup.csv'
# Files written by this cell: the merged Name/CDRH3 table and the matching
# FASTA file.
OUT_CSV   = 'sanger_only_metadata.csv'
OUT_FASTA = 'sanger_only_sequences.fasta'

def normalize_cols(df):
    """Return a copy of ``df`` whose column labels are in snake_case form.

    Every label is stripped of surrounding whitespace, lower-cased, and has
    each hyphen and each space replaced by an underscore.  The frame passed
    in is left unchanged.
    """
    renamed = df.copy()
    labels = renamed.columns.str.strip().str.lower()
    # Hyphens first, then spaces: both separators collapse to '_'.
    for separator in ('-', ' '):
        labels = labels.str.replace(separator, '_')
    renamed.columns = labels
    return renamed

def _load_name_cdrh3(csv_path, name_candidates=('clone',),
                     cdrh3_candidates=('cdrh3',), prefix=''):
    """Read one clone table and return its non-null ``Name``/``CDRH3`` columns.

    Column labels are normalised with ``normalize_cols`` first; the first
    label from ``name_candidates`` / ``cdrh3_candidates`` that is present is
    renamed to ``Name`` / ``CDRH3``.  ``prefix`` is prepended to every name.

    Raises:
        KeyError: if none of the candidate columns exists in the file.
    """
    table = normalize_cols(pd.read_csv(csv_path, dtype=str))
    renames = {}
    for target, candidates in (('Name', name_candidates),
                               ('CDRH3', cdrh3_candidates)):
        found = next((c for c in candidates if c in table.columns), None)
        if found is None:
            raise KeyError(
                f"{csv_path}: no {target} column, expected one of "
                f"{list(candidates)}, found {list(table.columns)}"
            )
        renames[found] = target
    table = table.rename(columns=renames)[['Name', 'CDRH3']].dropna()
    if prefix:
        table['Name'] = prefix + table['Name'].astype(str)
    return table


# BA5 → prefix names with "BA5_"
df_ba5 = _load_name_cdrh3(BA5_CSV, prefix='BA5_')

# Tim3 → all Ab
df_t3 = _load_name_cdrh3(TIM3_CSV)

# Sanger dedup: the name / CDRH3 columns may appear under several labels
df_sg = _load_name_cdrh3(
    SANGER_CSV,
    name_candidates=('clone_name', 'name', 'clone'),
    cdrh3_candidates=('cdrh3', 'cdr_h3', 'cdr_h_3'),
)

frames = [df_ba5, df_t3, df_sg]
combined = pd.concat(frames, ignore_index=True)

# Make names whitespace-free *before* writing either file, so that the Name
# column of the CSV is exactly the FASTA header (and therefore the id that
# BLAST reports).  Previously only the FASTA header was sanitised, so names
# containing inner whitespace could not be matched back to the metadata.
combined['Name']  = (combined['Name'].str.strip()
                                     .str.replace(r'\s+', '_', regex=True))
combined['CDRH3'] = combined['CDRH3'].str.replace(r'\s+', '', regex=True)

# Keep only non-empty names and non-empty sequences made of the 20 standard
# amino-acid letters.  The explicit emptiness check matters: an empty string
# would otherwise pass the "all letters valid" test vacuously.
valid_aa = set('ACDEFGHIKLMNPQRSTVWY')
mask = (
    combined['CDRH3'].apply(lambda s: bool(s) and set(s) <= valid_aa)
    & (combined['Name'].str.len() > 0)
)
combined = combined[mask].drop_duplicates(subset=['CDRH3']).reset_index(drop=True)

# Distinct sequences may still share a name (only BA5 names are prefixed).
# Names are used as unique ids downstream, so disambiguate repeats.
dup_rank = combined.groupby('Name').cumcount()
clashes = dup_rank > 0
if clashes.any():
    combined.loc[clashes, 'Name'] = (
        combined.loc[clashes, 'Name'] + '_dup' + dup_rank[clashes].astype(str)
    )
    print(f"Renamed {int(clashes.sum())} duplicate clone name(s) with a _dupN suffix")

combined.to_csv(OUT_CSV, index=False)
with open(OUT_FASTA, 'w') as fh:
    fh.writelines(
        f">{name}\n{seq}\n"
        for name, seq in zip(combined['Name'], combined['CDRH3'])
    )

print(f"Wrote {len(combined)} sequences → {OUT_CSV} and {OUT_FASTA}")

In [None]:
# Run these on the command line (the database name passed to blastp -db must
# be the one created by makeblastdb -out):
# makeblastdb -in cdrh3_sequences.fasta -dbtype prot -out cdrh3_db
# blastp -task blastp-short -query sanger_only_sequences.fasta -db cdrh3_db -outfmt 6 -out results_SangerOnly.txt

In [None]:
# ---- Files and directories ----
META_CSV    = 'sanger_only_metadata.csv'      # Name/CDRH3 table read below
BLAST_OUT   = 'results_SangerOnly.txt'        # tabular BLAST output read below
SIM_CSV     = 'sanger_only_sim.csv'           # square e-value matrix written below
FASTA       = 'sanger_only_sequences.fasta'   # not referenced in the visible cells
OUTPUT_DIR  = 'grouped_clusters_sanger'       # dendrograms, cluster table, LaTeX table
LOGO_DIR    = 'cluster_logos_sanger'          # per-cluster sequence logos

# ---- Clustering ----
DIST_CUTOFF = 1500      # fcluster threshold (criterion='distance') and dendrogram colour cut
LINK_METHOD = 'ward'    # scipy linkage method

# ---- Logo window ----
LEFT_WIDTH  = 5         # alignment columns taken before the anchor column
RIGHT_WIDTH = 15        # alignment columns taken from the anchor column onwards
PSEUDOCOUNT = 0.01      # pseudocount for the counts -> information transform

# Clustal Omega executable.  Set the CLUSTALO environment variable to use a
# different installation; the default below is the original hard-coded path.
CLUSTALO = os.environ.get(
    'CLUSTALO',
    r"C:\Program Files\clustal-omega-1.2.2-win64"
    r"\clustal-omega-1.2.2-win64\clustalo.exe",
)

# Residue -> colour mapping handed to logomaker.
# NOTE(review): M and N have no entry here — confirm that is intentional.
COLOR_DICT  = {
    **dict.fromkeys(list("AGST"), 'green'),
    **dict.fromkeys(list("CLIV"), 'yellow'),
    **dict.fromkeys(list("DEQ"), 'blue'),
    **dict.fromkeys(list("FWY"), 'orange'),
    **dict.fromkeys(list("KR"), 'purple'),
    'H':'pink','P':'red'
}

os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(LOGO_DIR,   exist_ok=True)

def run_clustalo(i,o):
    """Align the FASTA file ``i`` with Clustal Omega and write the result to ``o``.

    Any existing file at ``o`` is removed first.  The executable named by the
    module-level ``CLUSTALO`` is run with ``--auto --force``.

    Raises:
        FileNotFoundError: if the executable cannot be started because the
            path in ``CLUSTALO`` does not exist; the message names the path.
        SystemExit: if the program exits with a non-zero return code; the
            message carries the captured stderr.
    """
    if os.path.exists(o):
        os.remove(o)
    try:
        r = subprocess.run([CLUSTALO, '-i', i, '-o', o, '--auto', '--force'],
                           stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError as exc:
        # Same exception type as before, but say which path was tried.
        raise FileNotFoundError(
            f"Clustal Omega executable not found: {CLUSTALO!r}"
        ) from exc
    if r.returncode:
        # errors='replace': stderr may not be valid UTF-8 (e.g. a Windows code
        # page); a decoding failure must not hide the real error message.
        sys.exit("ClustalΩ failed:\n" + r.stderr.decode(errors='replace'))

def window_counts(aln_fa):
    """Count residues per column in a window around the most C-rich column.

    Reads the FASTA alignment ``aln_fa`` and takes as anchor the first column
    holding the largest number of 'C' characters.  The 20 standard amino
    acids are then tallied over the columns from ``anchor - LEFT_WIDTH``
    (clipped at 0) up to, but not including, ``anchor + RIGHT_WIDTH``
    (clipped at the alignment length).  Gaps and other symbols are ignored.

    Returns:
        DataFrame of integer counts with one row per window column (numbered
        from 0 at the window start) and one column per amino acid.

    NOTE(review): when the anchor lies fewer than LEFT_WIDTH columns from the
    start of the alignment the window is clipped, so the anchor's row index
    is then smaller than LEFT_WIDTH.
    """
    alignment = AlignIO.read(aln_fa, 'fasta')
    n_cols = alignment.get_alignment_length()

    def cysteines_in(col):
        return sum(record.seq[col] == 'C' for record in alignment)

    anchor = max(range(n_cols), key=cysteines_in)
    first = max(0, anchor - LEFT_WIDTH)
    last = min(n_cols, anchor + RIGHT_WIDTH)

    residues = list("ACDEFGHIKLMNPQRSTVWY")
    tally = pd.DataFrame(0, index=range(last - first), columns=residues)
    for record in alignment:
        for row, col in enumerate(range(first, last)):
            symbol = record.seq[col]
            if symbol in residues:
                tally.at[row, symbol] += 1
    return tally

def bits_and_freq(cnt):
    """Turn a per-position count matrix into information and frequency matrices.

    Args:
        cnt: DataFrame of residue counts (rows = positions, columns = residues).

    Returns:
        (ic, fq): ``ic`` is the logomaker information matrix computed with a
        pseudocount of ``PSEUDOCOUNT``; ``fq`` holds the row-wise frequencies
        with all-zero rows left at 0 and every frequency below 0.05 set to 0.
    """
    # Hand the pseudocount to logomaker instead of adding it to the counts:
    # transform_matrix applies its own pseudocount (default 1) when converting
    # counts, so ``cnt + PSEUDOCOUNT`` ended up with an effective pseudocount
    # of 1 + PSEUDOCOUNT, which flattens the logo of small clusters.
    ic = transform_matrix(cnt.astype(float), from_type='counts',
                          to_type='information', pseudocount=PSEUDOCOUNT)
    fq = cnt.div(cnt.sum(axis=1), axis=0).fillna(0)
    fq = fq.where(fq >= 0.05, 0)
    return ic, fq

def save_logo(mat, path, mode, cid):
    """Draw ``mat`` as a sequence logo and save it to ``path``.

    Args:
        mat: logomaker-style matrix (rows = positions, columns = residues).
        path: output image file.
        mode: ``'bits'`` for an information logo; anything else is labelled
            and scaled as a frequency logo.
        cid: cluster id shown in the title.
    """
    is_bits = mode == 'bits'
    ylabel = 'Bits' if is_bits else 'Frequency'
    n_pos = mat.shape[0]
    # Information content is bounded by log2(alphabet size): about 4.32 bits
    # for 20 amino acids.  A fixed limit of 2 (the DNA maximum) would clip
    # conserved positions; never go below the previous limit of 2.
    y_top = max(2.0, float(np.log2(mat.shape[1]))) if is_bits else 1

    fig, ax = plt.subplots(figsize=(n_pos * 0.5, 3))
    logomaker.Logo(mat, ax=ax, color_scheme=COLOR_DICT, stack_order='big_on_top')
    ax.set_xticks(range(n_pos))
    ax.set_xticklabels(range(n_pos))
    # NOTE(review): the anchor is only at column LEFT_WIDTH when the window
    # was not clipped at the start of the alignment.
    ax.set_xlabel(f"Pos (anchor-C at col {LEFT_WIDTH})")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0, y_top)
    for sp in ('top', 'right'):
        ax.spines[sp].set_visible(False)
    ax.set_title(f"Cluster {cid}: {ylabel}", pad=12)
    fig.savefig(path, dpi=300, bbox_inches='tight')
    plt.close(fig)

# Sentinel e-values used when turning the BLAST output into a numeric matrix.
MISSING_EVALUE = 1e300    # stands in for pairs BLAST did not report
MIN_EVALUE     = 1e-300   # floor so that an e-value of 0 keeps -log10 finite

# ---- BLAST tabular output → square e-value matrix ----
blast = pd.read_csv(BLAST_OUT, sep='\t', header=None,
    names=['query','subject','pident','length','mismatch','gapopen',
           'qstart','qend','sstart','send','evalue','bitscore'],
    usecols=['query','subject','evalue'], comment='#'
)
# Leave unreported pairs as NaN until the matrix has been mirrored.  Filling
# them inside pivot_table made combine_first a no-op for hits reported in one
# direction only, and ids seen on a single side were left as NaN.
sim = blast.pivot_table(index='query', columns='subject', values='evalue',
                        aggfunc='min')
all_ids = sim.index.union(sim.columns)
sim = sim.reindex(index=all_ids, columns=all_ids)
sim = sim.combine_first(sim.T).fillna(MISSING_EVALUE)
# Diagonal = best (smallest) e-value seen in that row.
for s in sim.index: sim.at[s,s] = sim.loc[s].min()
sim.to_csv(SIM_CSV, float_format='%.3e')
print("Wrote similarity matrix →", SIM_CSV)

# ---- metadata, restricted to sequences present in the BLAST matrix ----
meta = pd.read_csv(META_CSV, dtype=str)
if not {'Name','CDRH3'}.issubset(meta.columns):
    sys.exit(f"{META_CSV} needs Name, CDRH3 columns")
meta = meta[['Name','CDRH3']].dropna()
# Apply the same whitespace → '_' rule that was used for the FASTA headers,
# so metadata names can be matched against the ids reported by BLAST.
meta = meta.assign(
    Name=meta['Name'].str.strip().str.replace(r'\s+', '_', regex=True)
)
n_before = len(meta)
# Names are used as a unique index below; reindexing fails on duplicates.
meta = meta.drop_duplicates(subset='Name')
if len(meta) < n_before:
    print(f"Dropped {n_before - len(meta)} metadata row(s) with a repeated Name")
seqs = meta.set_index('Name')['CDRH3']
common = seqs.index.intersection(sim.index).intersection(sim.columns)
if len(common) < len(seqs):
    print(f"{len(seqs) - len(common)} metadata sequence(s) have no BLAST entry and are skipped")
if len(common) < 2:
    sys.exit("Need at least two sequences present in both metadata and BLAST output")
seqs   = seqs.reindex(common)
sim    = sim.loc[common, common]

# ---- hierarchical clustering ----
names = list(common)
# D is a square 2-D array, so scipy treats each row as an observation vector
# (a sequence's -log10 e-value profile), consistent with pdist(D.values) below.
D     = -np.log10(sim.loc[names,names].clip(lower=MIN_EVALUE))
Z     = linkage(D.values, method=LINK_METHOD)
cls   = fcluster(Z, t=DIST_CUTOFF, criterion='distance')
cc,_  = cophenet(Z,pdist(D.values))
print(f"Cophenetic corr = {cc:.4f}")

# assignment
pd.DataFrame({'Name':names,'cluster':cls})\
  .to_csv(f"{OUTPUT_DIR}/Sanger_clusters.csv",index=False)

# overview dendrogram
fig,ax=plt.subplots(figsize=(10,6))
dendrogram(Z, color_threshold=DIST_CUTOFF, above_threshold_color='grey',
           no_labels=True, ax=ax)
ax.axhline(DIST_CUTOFF,ls='--',c='black')
# Title follows DIST_CUTOFF instead of a hard-coded "1500".
ax.set_title(f"Sanger overview dendrogram (cut at {DIST_CUTOFF})")
ax.set_ylabel('Distance')
plt.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/Sanger_overview.png",dpi=300); plt.close(fig)

# truncated dendrogram
fig,ax=plt.subplots(figsize=(10,6))
dendrogram(Z, truncate_mode='lastp', p=30, color_threshold=DIST_CUTOFF,
           above_threshold_color='grey', show_leaf_counts=True, ax=ax)
ax.axhline(DIST_CUTOFF,ls='--',c='black')
ax.set_title("Sanger truncated dendrogram")
ax.set_xlabel('Count')
ax.set_ylabel('Distance')
plt.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/Sanger_trunc.png",dpi=300); plt.close(fig)

# cluster‐sizes bar chart
counts = pd.Series(cls).value_counts().sort_index()
fig,ax=plt.subplots(figsize=(6,4))
# Plot against the numeric cluster ids.  pandas' categorical bar plot labels
# ticks by position, so swapping in MaxNLocator attached labels to the wrong
# bars; with a numeric axis the integer locator labels the real ids.
ax.bar(counts.index.to_numpy(), counts.to_numpy())
ax.set_title("Sanger cluster sizes"); ax.set_xlabel('Cluster'); ax.set_ylabel('# seqs')
ax.xaxis.set_major_locator(mtick.MaxNLocator(integer=True))
plt.tight_layout()
fig.savefig(f"{OUTPUT_DIR}/Sanger_sizes.png",dpi=300); plt.close(fig)

# per‐cluster logos & detail
for cid in sorted(set(cls)):
    members = [n for n,c in zip(names,cls) if c==cid]
    if len(members)<2: continue   # nothing to align or cluster in a singleton

    # fasta & align
    fa  = f"Sanger_c{cid}.fasta"
    aln = fa.replace('.fasta','_aln.fasta')
    with open(fa,'w') as fh:
        fh.writelines(f">{m}\n{seqs[m]}\n" for m in members)
    run_clustalo(fa,aln)

    # logos
    cnt = window_counts(aln)
    ic, fq = bits_and_freq(cnt)
    save_logo(fq, f"{LOGO_DIR}/Sanger_c{cid}_freq.png",'freq',cid)
    save_logo(ic, f"{LOGO_DIR}/Sanger_c{cid}_bits.png",'bits',cid)

    # detailed dendrogram
    # NOTE(review): Zsub is a fresh linkage on the member-only sub-matrix, so
    # its distance scale is not the one DIST_CUTOFF was applied to — confirm
    # that the cutoff line is meaningful on this plot.
    Dsub = -np.log10(sim.loc[members,members].clip(lower=MIN_EVALUE))
    Zsub = linkage(Dsub.values, method=LINK_METHOD)
    fig,ax=plt.subplots(figsize=(8, max(4,0.2*len(members))))
    dendrogram(Zsub, orientation='right', labels=members,
               above_threshold_color='grey',
               leaf_font_size=6, ax=ax)
    ax.axvline(DIST_CUTOFF,ls='--',c='black')
    ax.set_title(f"Cluster {cid} detail (n={len(members)})")
    ax.set_ylabel('')
    for sp in ('top','right','bottom','left'): ax.spines[sp].set_visible(True)
    plt.tight_layout()
    fig.savefig(f"{OUTPUT_DIR}/Sanger_c{cid}_detail.png",dpi=300); plt.close(fig)

# LaTeX summary table
tbl = pd.DataFrame({'Name':names,'cluster':cls}).merge(meta,on='Name')
with open(f"{OUTPUT_DIR}/Sanger_table.tex",'w') as fh:
    fh.write(r"\begin{table}[H]\centering\scriptsize"+"\n")
    fh.write(r"\begin{tabular}{|l|l|r|}\hline Name & CDRH3 & Cluster \\\hline"+"\n")
    for row in tbl.itertuples(index=False):
        # Backslash-escape the LaTeX special characters that can occur in names.
        nm = re.sub(r'([&_#%$\{\}])', r'\\\1', row.Name)
        fh.write(f"{nm} & {row.CDRH3} & {row.cluster} \\\\\n")
    fh.write(r"\hline\end{tabular}\end{table}"+"\n")