In [None]:
import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns
import os
import sys
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from scipy.cluster.hierarchy import linkage, dendrogram, cophenet, fcluster
from scipy.spatial.distance import pdist 
from sklearn.metrics import silhouette_score

In [None]:
# Load the CoV-AbDab export and keep only the rows whose 'Origin' text mentions
# "phage" (case-insensitive). Rows with a missing 'Origin' are excluded (na=False).
df = pd.read_csv('CoV-AbDab_080224.csv')
is_phage_origin = df['Origin'].str.contains('phage', case=False, na=False)
filtered_df = df.loc[is_phage_origin]

# CDRH3 column of the phage-derived subset, printed for a quick visual check.
cdrh3_values = filtered_df.loc[:, 'CDRH3']

print(cdrh3_values)

In [None]:
# Report CDRH3 loops shared by several clones, then export every
# (Name, CDRH3) pair of the phage subset as a FASTA file.
cdrh3_sequences = filtered_df['CDRH3']
sequence_names = filtered_df['Name']

# value_counts() ignores missing values, so only real sequences are counted.
counts = cdrh3_sequences.value_counts()
duplicate_loops = counts[counts > 1].index.tolist()

# Group once instead of re-scanning the whole frame for every duplicated loop.
names_by_loop = filtered_df.groupby('CDRH3')['Name'].agg(list)

print("Found {} duplicated CDRH3 sequences:".format(len(duplicate_loops)))
for loop in duplicate_loops:
    names = names_by_loop.loc[loop]
    print(f"{loop!r} appears {len(names)}× in clones: {names}")

# A missing Name or CDRH3 would be formatted as the literal text "nan" and end
# up in the FASTA file as if it were a real identifier / peptide, so such rows
# are left out of the export.
fasta_records = filtered_df[['Name', 'CDRH3']].dropna()
n_skipped = len(filtered_df) - len(fasta_records)
if n_skipped:
    print(f"Skipping {n_skipped} record(s) with a missing Name or CDRH3.")

# NOTE(review): names are written verbatim as FASTA headers; if any contain
# whitespace, confirm the identifiers reported by BLAST still match 'Name'.
with open('cdrh3_sequences.fasta', 'w') as fasta_file:
    for name, sequence in fasta_records.itertuples(index=False, name=None):
        fasta_file.write(f">{name}\n{sequence}\n")

print("FASTA file 'cdrh3_sequences.fasta' created successfully.")

In [None]:
# Turn the tabular BLAST results into a query x subject e-value matrix, save it,
# and draw a Ward-linkage dendrogram of the -log10(e-value) profiles.

# The results file has no header row; columns are named here in file order.
df = pd.read_csv('results_opigOnly.txt', sep='\t', header=None)

df.columns = ['query', 'subject', 'percent_identity', 'alignment_length', 'mismatches',
              'gap_opens', 'q_start', 'q_end', 's_start', 's_end',
              'evalue', 'bit_score']

# pivot() requires unique (query, subject) keys: keep the first row of each pair.
df = df.drop_duplicates(subset=['query', 'subject'])
df_similarity = df[['query', 'subject', 'evalue']]

similarity_matrix_evalue = df_similarity.pivot(index='query', columns='subject', values='evalue')

# Persisted because the clustering cell reloads the matrix from this file.
similarity_matrix_evalue.to_csv('similarity_matrix_evalue.csv')

df = pd.read_csv('similarity_matrix_evalue.csv', index_col=0)
df = df.apply(pd.to_numeric, errors='coerce')

# Zeros are raised to 1e-300 so that log10 stays finite.
# NOTE(review): pairs absent from the BLAST output (NaN after the pivot) are
# filled with the same 1e-300, so they receive the maximum score of 300 after
# the transform below - confirm this is the intended treatment of "no hit".
df = df.replace(0, 1e-300).fillna(1e-300)

evalue_log = -np.log10(df)

# linkage() cannot build a hierarchy from fewer than two observations; fail
# with an explicit message instead of an opaque SciPy error.
if len(evalue_log) < 2:
    raise ValueError("Need at least 2 query sequences in the similarity matrix to cluster.")

# Each row (one query's score profile against all subjects) is one observation.
Z = linkage(evalue_log.values, method='ward')
dists = pdist(evalue_log.values, metric='euclidean')
c, coph_dists = cophenet(Z, dists)
print(f"Cophenetic correlation: {c:.3f}")

plt.figure(figsize=(50, 10))
# Pass a plain list: some SciPy versions truth-test `labels`, which raises for
# a pandas Index.
dendrogram(Z, labels=evalue_log.index.tolist())
plt.title('Dendrogram of Log-Transformed E-values')
plt.xlabel('Clones')
plt.ylabel('Distance')
plt.xticks(rotation=90)

plt.savefig('dendrogram_evalue_log.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
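# Run these on the command line, after cdrh3_sequences.fasta has been written
# and before running the cell that reads results_opigOnly.txt:
# makeblastdb -in cdrh3_sequences.fasta -dbtype prot -out cdrh3_db
# blastp -task blastp-short -query cdrh3_sequences.fasta -db cdrh3_db -outfmt 6 -out results_opigOnly.txt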

In [None]:
# Cluster antibodies ('Ab') and nanobodies ('Nb') separately from the saved
# e-value matrix, then write per-group cluster assignments, the sequences of
# each cluster, an overview dendrogram, a cluster-size bar chart and one detail
# dendrogram per multi-member cluster into OUTPUT_DIR.

# --- Configuration -----------------------------------------------------------
META_CSV       = 'CoV-AbDab_080224.csv'
SIM_CSV        = 'similarity_matrix_evalue.csv'
OUTPUT_DIR     = 'grouped_clusters'
DIST_CUTOFF    = 5000          # linkage distance at which the tree is cut
LINKAGE_METHOD = 'ward'
# E-values of exactly 0 are raised to this floor so that log10 stays finite.
EVALUE_FLOOR   = 1e-300
# Value substituted for matrix cells that are empty or non-numeric.
# NOTE(review): this equals EVALUE_FLOOR, so empty cells get the maximum score
# (300) after -log10. Confirm this is intended; DIST_CUTOFF was presumably
# chosen on this scale, so re-tune it if this value is changed.
MISSING_EVALUE = 1e-300

os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Load metadata -----------------------------------------------------------
print(f"Loading metadata from {META_CSV}…")
try:
    meta = pd.read_csv(META_CSV, dtype=str)
except FileNotFoundError:
    print(f"ERROR: {META_CSV} not found")
    sys.exit(1)

for col in ('Name', 'Ab or Nb', 'CDRH3'):
    if col not in meta.columns:
        print(f"ERROR: missing column {col} in metadata")
        sys.exit(1)

# Trim stray whitespace so names compare cleanly against the matrix labels.
meta['Name']     = meta['Name'].str.strip()
meta['Ab or Nb'] = meta['Ab or Nb'].str.strip()
meta['CDRH3']    = meta['CDRH3'].astype(str).str.strip()

# Name -> CDRH3 lookup; it does not depend on the group, so build it once.
seq_map = dict(zip(meta['Name'], meta['CDRH3']))

# --- Load similarity matrix --------------------------------------------------
try:
    sim_all = pd.read_csv(SIM_CSV, index_col=0)
except FileNotFoundError:
    print(f"ERROR: {SIM_CSV} not found")
    sys.exit(1)

sim_all = (
    sim_all
    .apply(pd.to_numeric, errors='coerce')
    .replace(0, EVALUE_FLOOR)
    .fillna(MISSING_EVALUE)
)

# The square sub-matrix below is selected by name on BOTH axes, so a name is
# usable only if it is a row label and a column label.
known_names = set(sim_all.index) & set(sim_all.columns)

# --- Per-group clustering ----------------------------------------------------
for ab_or_nb in ['Ab', 'Nb']:
    print(f"\n=== {ab_or_nb}s ===")
    ids = meta.loc[meta['Ab or Nb'] == ab_or_nb, 'Name'].unique().tolist()
    if not ids:
        print(" none found, skipping")
        continue

    usable_ids = [i for i in ids if i in known_names]
    n_missing = len(ids) - len(usable_ids)
    if n_missing:
        print(f" WARNING: {n_missing} names missing from sim matrix, dropping them")
    ids = usable_ids

    # linkage() needs at least two observations.
    if len(ids) < 2:
        print(" fewer than 2 sequences left in sim matrix, skipping")
        continue

    sim_grp = sim_all.loc[ids, ids]

    evalue_log = -np.log10(sim_grp)

    # Each row of the transformed matrix is treated as one observation vector.
    Z = linkage(evalue_log.values, method=LINKAGE_METHOD)
    cluster_ids = fcluster(Z, t=DIST_CUTOFF, criterion='distance')
    clusters = pd.Series(cluster_ids, index=evalue_log.index, name='cluster')
    nclus = clusters.nunique()
    print(f" → {nclus} clusters at cutoff {DIST_CUTOFF}")

    prefix = os.path.join(OUTPUT_DIR, f"{ab_or_nb}_clusters")

    clusters.to_csv(prefix + "_assignments.csv", header=True)

    # cluster id -> Index of member names; reused for the text dump and plots.
    members_by_cluster = clusters.groupby(clusters).groups

    with open(prefix + "_sequences_by_cluster.txt", 'w') as fh:
        for cid, members in members_by_cluster.items():
            fh.write(f"Cluster {cid} (n={len(members)}):\n")
            for name in members:
                fh.write(f"> {name}\n{seq_map.get(name,'[missing]')}\n")
            fh.write("\n")

    # Overview dendrogram: branches below the cutoff are coloured per cluster.
    plt.figure(figsize=(10, 6))
    dendrogram(Z,
               color_threshold=DIST_CUTOFF,
               above_threshold_color='grey',
               no_labels=True)
    plt.axhline(DIST_CUTOFF, color='k', ls='--')
    plt.title(f"{ab_or_nb} overview dendrogram")
    plt.xlabel('Sample index')
    plt.ylabel('Distance')
    plt.tight_layout()
    plt.savefig(prefix + "_overview.png", dpi=300)
    plt.close()

    # Bar chart of the number of sequences per cluster.
    sizes = clusters.value_counts().sort_index()
    plt.figure(figsize=(6, 4))
    plt.bar(sizes.index.astype(str), sizes.values)
    plt.title(f"{ab_or_nb}: Cluster sizes at d={DIST_CUTOFF}")
    plt.xlabel('Cluster ID')
    plt.ylabel('Number of sequences')
    plt.tight_layout()
    plt.savefig(prefix + "_cluster_sizes.png", dpi=300)
    plt.close()

    # Detail dendrogram for every cluster with at least two members, built
    # from the within-cluster block of the transformed matrix only.
    for cid, members in members_by_cluster.items():
        if len(members) < 2:
            continue
        submat = evalue_log.loc[members, members].values
        subZ   = linkage(submat, method=LINKAGE_METHOD)
        # Grow the figure with the number of leaves so labels stay legible.
        plt.figure(figsize=(6, max(4, 0.25 * len(members))))
        # Plain list of labels: some SciPy versions truth-test `labels`,
        # which raises for a pandas Index.
        dendrogram(subZ,
                   labels=list(members),
                   orientation='right',
                   color_threshold=None)
        plt.title(f"{ab_or_nb} cluster {cid} (n={len(members)})")
        plt.xlabel('Distance')
        plt.ylabel('')
        plt.tight_layout()
        fn = f"{prefix}_cluster_{cid:02d}_detail.png"
        plt.savefig(fn, dpi=300)
        plt.close()