In [21]:
# Notebook 01: PF03060 Family Pipeline (Data -> MSA -> PSSM -> HMM)
# ---------------------------------------------------------------
# This notebook implements the first half of the project pipeline:
# 1) Fetch SwissProt proteins annotated with PF03060
# 2) Extract Pfam domain sequences
# 3) Build and clean a multiple sequence alignment (MSA)
# 4) Build a PSSM profile model
# 5) Build a lightweight profile HMM

!pip install biopython

import requests
import pandas as pd
import numpy as np
from Bio import SeqIO, pairwise2
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from collections import Counter, defaultdict
import math
import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

PFAM_ID = "PF03060"
OUTPUT_DIR = "../data"
RESULTS_DIR = "../results"




Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [22]:
# -----------------------------
# STEP 1: Fetch representative UniProt SwissProt entries
# -----------------------------
MAX_SEQ = 150

def fetch_uniprot_sample_from_pfam(pfam_id, max_entries=150, page_size=50):
    url = f"https://www.ebi.ac.uk/interpro/api/protein/UniProt/entry/pfam/{pfam_id}/?page_size={page_size}"
    results = []
    page = 1

    while url and len(results) < max_entries:
        print(f"Fetching page {page} from InterPro...")
        r = requests.get(url, headers={"Accept": "application/json"}, timeout=15)
        r.raise_for_status()

        js = r.json()
        for res in js.get("results", []):
            results.append(res["metadata"]["accession"])
            if len(results) >= max_entries:
                break

        url = js.get("next")
        page += 1

    print(f"Collected {len(results)} UniProt IDs")
    return results

import time
from io import StringIO

def fetch_uniprot_entries_via_idmapping(uniprot_ids):
    """
    Robust UniProt batch retrieval using the ID-mapping API.
    """
    # 1) Submit mapping job
    submit_url = "https://rest.uniprot.org/idmapping/run"
    data = {
        "from": "UniProtKB_AC-ID",
        "to": "UniProtKB",
        "ids": ",".join(uniprot_ids)
    }

    r = requests.post(submit_url, data=data)
    r.raise_for_status()
    job_id = r.json()["jobId"]
    print(f"Submitted ID-mapping job: {job_id}")

    # 2) Poll job status
    status_url = f"https://rest.uniprot.org/idmapping/status/{job_id}"
    while True:
        r = requests.get(status_url)
        r.raise_for_status()
        js = r.json()
        if js.get("jobStatus") == "RUNNING":
            print("ID-mapping job running...")
            time.sleep(3)
        else:
            break

    # 3) Retrieve results as TSV
    result_url = f"https://rest.uniprot.org/idmapping/uniprotkb/results/{job_id}"
    params = {
        "format": "tsv",
        "fields": "accession,sequence,organism_name,lineage,go_id"
    }

    r = requests.get(result_url, params=params)
    r.raise_for_status()

    df = pd.read_csv(StringIO(r.text), sep="\t")
    return df

print(f"Fetching up to {MAX_SEQ} UniProt IDs for Pfam family {PFAM_ID}...")
uniprot_ids = fetch_uniprot_sample_from_pfam(PFAM_ID, max_entries=MAX_SEQ)

print("Fetching full UniProt entries via ID-mapping API...")
pfam_df = fetch_uniprot_entries_via_idmapping(uniprot_ids)

out_path = f"{OUTPUT_DIR}/raw_{PFAM_ID}_uniprot.tsv"
pfam_df.to_csv(out_path, index=False)

print(f"Downloaded {len(pfam_df)} proteins, saved to {out_path}")

Fetching up to 150 UniProt IDs for Pfam family PF03060...
Fetching page 1 from InterPro...
Fetching page 2 from InterPro...
Fetching page 3 from InterPro...
Collected 150 UniProt IDs
Fetching full UniProt entries via ID-mapping API...
Submitted ID-mapping job: eAVJOPPtVL
Downloaded 25 proteins, saved to ../data/raw_PF03060_uniprot.tsv


In [24]:
#column name check 

print(pfam_df.columns.tolist())


['From', 'Entry', 'Sequence', 'Organism', 'Taxonomic lineage', 'Gene Ontology IDs']


In [25]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

# -----------------------------
# STEP 2: Extract full-length PF03060 protein sequences
# -----------------------------

def extract_pfam_sequences(df):
    records = []
    meta = []

    for _, row in df.iterrows():
        acc = row["Entry"]          # UniProt accession
        seq = row["Sequence"]       # Protein sequence

        if pd.isna(seq) or not seq:
            continue

        rec = SeqRecord(
            Seq(seq),
            id=acc,
            description=""
        )
        records.append(rec)

        meta.append({
            "accession": acc,
            "length": len(seq),
            "organism": row.get("Organism", ""),
            "lineage": row.get("Taxonomic lineage", ""),
            "go_ids": row.get("Gene Ontology IDs", "")
        })

    return records, pd.DataFrame(meta)


print("Extracting PF03060 protein sequences (full-length)...")
records, domain_meta = extract_pfam_sequences(pfam_df)

fasta_path = f"{OUTPUT_DIR}/PF03060_full_length.fasta"
meta_path = f"{OUTPUT_DIR}/PF03060_metadata.csv"

SeqIO.write(records, fasta_path, "fasta")
domain_meta.to_csv(meta_path, index=False)

print(f"Extracted {len(records)} sequences")
print(f"Saved FASTA to: {fasta_path}")
print(f"Saved metadata to: {meta_path}")


Extracting PF03060 protein sequences (full-length)...
Extracted 25 sequences
Saved FASTA to: ../data/PF03060_full_length.fasta
Saved metadata to: ../data/PF03060_metadata.csv


In [26]:
print("Number of sequences:", len(records))
lengths = [len(r.seq) for r in records]
print("Min length:", min(lengths))
print("Max length:", max(lengths))
print("Mean length:", sum(lengths) / len(lengths))


Number of sequences: 25
Min length: 224
Max length: 415
Mean length: 345.16


In [None]:
import subprocess
from pathlib import Path
from Bio import SeqIO, AlignIO
import numpy as np
from collections import Counter

# -----------------------------
# Folders and paths
# -----------------------------
INPUT_FASTA = "../data/PF03060_full_length.fasta"
RESULTS_DIR = Path("../results")
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

MSA_RAW = RESULTS_DIR / "PF03060.aln.fasta"
MSA_CLEAN = RESULTS_DIR / "PF03060.cleaned.aln.fasta"
PSSM_FILE = RESULTS_DIR / "PF03060.pssm.txt"
HMM_FILE = RESULTS_DIR / "PF03060.hmm"

# -----------------------------
# STEP 3: Build MSA via Clustal Omega
# -----------------------------
print("Running Clustal Omega to generate MSA...")
try:
    subprocess.run(
        ["clustalo", "-i", INPUT_FASTA, "-o", str(MSA_RAW), "--force", "--threads=4"],
        check=True
    )
    print("MSA generated:", MSA_RAW)
except subprocess.CalledProcessError:
    print("Error: Clustal Omega failed. Make sure it is installed and in your PATH.")

# -----------------------------
# STEP 3b: Clean MSA (remove columns >50% gaps)
# -----------------------------
print("Cleaning MSA (removing highly gappy columns)...")
alignment = AlignIO.read(MSA_RAW, "fasta")
seqs = np.array([list(str(rec.seq)) for rec in alignment])
keep_cols = [i for i in range(seqs.shape[1]) if np.mean(seqs[:, i] == '-') < 0.5]

cleaned_records = []
for rec in alignment:
    new_seq = ''.join([rec.seq[i] for i in keep_cols])
    rec.seq = new_seq
    cleaned_records.append(rec)

AlignIO.write(cleaned_records, MSA_CLEAN, "fasta")
print("Cleaned MSA saved:", MSA_CLEAN)
print("Cleaned MSA length:", len(cleaned_records[0].seq))

# -----------------------------
# STEP 4: Build PSSM
# -----------------------------
print("Building crude PSSM from cleaned MSA...")
seq_array = np.array([list(str(r.seq)) for r in cleaned_records])
pssm = []

for col in range(seq_array.shape[1]):
    counts = Counter(seq_array[:, col])
    total = sum(counts.values())
    freqs = {aa: counts.get(aa, 0)/total for aa in "ACDEFGHIKLMNPQRSTVWY-"}
    pssm.append(freqs)

# Save PSSM
with open(PSSM_FILE, "w") as f:
    f.write("\t".join(["Pos"] + list("ACDEFGHIKLMNPQRSTVWY-")) + "\n")
    for i, col in enumerate(pssm, 1):
        f.write(f"{i}\t" + "\t".join(f"{col[aa]:.3f}" for aa in "ACDEFGHIKLMNPQRSTVWY-") + "\n")
print("PSSM saved to:", PSSM_FILE)

# -----------------------------
# STEP 5: Build HMM via HMMER
# -----------------------------
print("Building HMM from cleaned MSA using hmmbuild...")
try:
    subprocess.run(
        ["hmmbuild", str(HMM_FILE), str(MSA_CLEAN)],
        check=True
    )
    print("HMM saved to:", HMM_FILE)
except subprocess.CalledProcessError:
    print("Error: HMMER hmmbuild failed. Make sure hmmbuild is installed and in your PATH.")



In [None]:
# -----------------------------
# STEP 3: Build MSA (python version - old)
# -----------------------------

print("Building crude MSA using seed-based alignment")
seed_seq = records[0].seq
aligned_records = []

for rec in records:
    alns = pairwise2.align.globalxx(seed_seq, rec.seq)
    best = alns[0]
    aligned_seed, aligned_seq = best.seqA, best.seqB
    aligned_records.append(SeqRecord(Seq(aligned_seq), id=rec.id, description=""))

SeqIO.write(aligned_records, f"{OUTPUT_DIR}/pf03060_msa_raw.fasta", "fasta")

# -----------------------------
# STEP 3b: Clean MSA (remove gappy columns)
# -----------------------------

def clean_msa(records, gap_thresh=0.5):
    seqs = np.array([list(str(r.seq)) for r in records])
    keep_cols = []
    for i in range(seqs.shape[1]):
        col = seqs[:, i]
        gap_frac = np.mean(col == '-')
        if gap_frac < gap_thresh:
            keep_cols.append(i)
    cleaned = []
    for r in records:
        new_seq = ''.join([str(r.seq)[i] for i in keep_cols])
        cleaned.append(SeqRecord(Seq(new_seq), id=r.id, description=""))
    return cleaned

msa_clean = clean_msa(aligned_records)
SeqIO.write(msa_clean, f"{OUTPUT_DIR}/pf03060_msa_clean.fasta", "fasta")
print("Cleaned MSA length:", len(msa_clean[0].seq))

In [None]:

# -----------------------------
# STEP 4: Build a PSSM - Python (old)
# -----------------------------

print("Building PSSM from cleaned MSA")

AA_LIST = list("ACDEFGHIKLMNPQRSTVWY")
bg_freq = {aa: 1/20 for aa in AA_LIST}

msa_array = np.array([list(str(r.seq)) for r in msa_clean])

pssm = []

for i in range(msa_array.shape[1]):
    col = msa_array[:, i]
    counts = Counter(col)
    total = sum(counts[aa] for aa in counts if aa != '-') + 20  # pseudocounts
    col_scores = {}
    for aa in AA_LIST:
        freq = (counts.get(aa, 0) + 1) / total
        col_scores[aa] = math.log2(freq / bg_freq[aa])
    pssm.append(col_scores)

pssm_df = pd.DataFrame(pssm)
pssm_df.to_csv(f"{RESULTS_DIR}/pssm_matrix.csv", index=False)
print("PSSM saved")


In [None]:

# -----------------------------
# STEP 5: Build lightweight profile HMM - Python (old)
# -----------------------------

print("Building lightweight profile HMM")

hmm = []

for i in range(msa_array.shape[1]):
    col = msa_array[:, i]
    counts = Counter(col)
    total = sum(counts[aa] for aa in counts if aa != '-') + 20
    emissions = {}
    for aa in AA_LIST:
        emissions[aa] = (counts.get(aa, 0) + 1) / total
    hmm.append(emissions)

import json
with open(f"{RESULTS_DIR}/hmm_params.json", "w") as f:
    json.dump(hmm, f, indent=2)

print("HMM parameters saved")
print("Notebook 01 complete: Data -> MSA -> PSSM -> HMM")
