In [4]:
import gzip
from Bio import SeqIO
from pathlib import Path

DATA = Path("../data")

chroms = ["chr1", "chr2", "chr3", "chr22"]
ref_dict = {}

for chrom in chroms:
    gz_path = DATA / f"{chrom}.fa.gz"
    print("Loading:", gz_path)

    with gzip.open(gz_path, "rt") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            key = rec.id.replace("chr","")  # e.g. "chr1" → "1"
            ref_dict[key] = rec.seq

print("Loaded chromosomes:", list(ref_dict.keys()))

Loading: ..\data\chr1.fa.gz
Loading: ..\data\chr2.fa.gz
Loading: ..\data\chr3.fa.gz
Loading: ..\data\chr22.fa.gz
Loaded chromosomes: ['1', '2', '3', '22']


In [8]:
import requests
from pathlib import Path

DATA = Path("../data")
DATA.mkdir(exist_ok=True)

url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
dest = DATA / "clinvar.vcf.gz"

print("Downloading ClinVar VCF...")

with requests.get(url, stream=True, allow_redirects=True) as r:
    r.raise_for_status()  # ensures no HTML errors
    with open(dest, "wb") as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)

print("Download complete:", dest)


Downloading ClinVar VCF...
Download complete: ..\data\clinvar.vcf.gz


In [5]:
import gzip

with gzip.open("../data/clinvar.vcf.gz", "rt") as f:
    print(f.readline())


##fileformat=VCFv4.1



In [6]:
import pandas as pd
import gzip

vcf_path = "../data/clinvar.vcf.gz"

records = []

with gzip.open(vcf_path, "rt") as f:
    for line in f:
        if line.startswith("#"):
            continue
        
        cols = line.strip().split("\t")
        
        chrom = cols[0].replace("chr","")
        pos   = int(cols[1])
        ref   = cols[3]
        alt   = cols[4]
        info  = cols[7]
        
        # Extract clinical significance
        csig = "Uncertain"
        for item in info.split(";"):
            if item.startswith("CLNSIG="):
                csig = item.split("=")[1]
                break
        
        records.append([chrom, pos, ref, alt, csig])

df = pd.DataFrame(records, columns=["Chromosome","Start","REF","ALT","CLNSIG"])
df.head(), df.shape


(  Chromosome  Start REF ALT                  CLNSIG
 0          1  66926  AG   A  Uncertain_significance
 1          1  69134   A   G           Likely_benign
 2          1  69308   A   G  Uncertain_significance
 3          1  69314   T   G  Uncertain_significance
 4          1  69404   T   C  Uncertain_significance,
 (4125815, 5))

In [7]:
keep_chr = ["1", "2", "3", "22"]
df = df[df["Chromosome"].isin(keep_chr)].reset_index(drop=True)
df.shape


(1048764, 5)

In [8]:
df = df[(df["REF"].str.len() == 1) & (df["ALT"].str.len() == 1)]
df.shape


(977239, 5)

In [9]:
def map_label(x):
    x = x.lower()
    if "pathogenic" in x:
        return 1
    if "benign" in x:
        return 0
    return -1

df["label"] = df["CLNSIG"].apply(map_label)
df = df[df["label"] != -1].reset_index(drop=True)
df.shape


(399216, 6)

In [10]:
df.to_csv("../data/df_multi_chrom.csv", index=False)
print("Saved:", "../data/df_multi_chrom.csv")

Saved: ../data/df_multi_chrom.csv


In [11]:
import pandas as pd

df = pd.read_csv("../data/df_multi_chrom.csv")
df.shape
df.head()


Unnamed: 0,Chromosome,Start,REF,ALT,CLNSIG,label
0,1,69134,A,G,Likely_benign,0
1,1,924518,G,C,Likely_benign,0
2,1,925956,C,T,Likely_benign,0
3,1,925969,C,T,Likely_benign,0
4,1,925980,C,T,Likely_benign,0


In [12]:
import gzip
from Bio import SeqIO
from pathlib import Path

DATA = Path("../data")

chroms = ["chr1", "chr2", "chr3", "chr22"]
ref_dict = {}

for chrom in chroms:
    gz_path = DATA / f"{chrom}.fa.gz"
    print("Loading:", gz_path)

    with gzip.open(gz_path, "rt") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            key = rec.id.replace("chr","")
            ref_dict[key] = rec.seq

print("Loaded:", list(ref_dict.keys()))

Loading: ..\data\chr1.fa.gz
Loading: ..\data\chr2.fa.gz
Loading: ..\data\chr3.fa.gz
Loading: ..\data\chr22.fa.gz
Loaded: ['1', '2', '3', '22']


In [13]:
WIDTH = 100

def fetch_seq_window(chrom, pos, flank = 100):
    chrom = str(chrom).replace("chr", "")
    if chrom not in ref_dict:
        return None

    seq = ref_dict[chrom]
    pos = int(pos)

    start = max(pos - flank, 0)
    end = pos + flank + 1

    if end > len(seq):
        return None

    return str(seq[start:end]).upper()

In [14]:
df["seq201"] = df.apply(
    lambda r: fetch_seq_window(r["Chromosome"], r["Start"], flank=WIDTH), axis = 1
)

df = df.dropna(subset = ["seq201"]).reset_index(drop=True)
df.shape

(399216, 7)

In [15]:
df.to_csv("../data/df_multi_seq201.csv", index = False)
print("saved: df_multi_seq201.csv")

saved: df_multi_seq201.csv


In [16]:
import pandas as pd

df = pd.read_csv("../data/df_multi_seq201.csv")
df.shape
df.head()

Unnamed: 0,Chromosome,Start,REF,ALT,CLNSIG,label,seq201
0,1,69134,A,G,Likely_benign,0,AGGTAACTGCAGAGGCTATTTCCTGGAATGAATCAACGAGTGAAAC...
1,1,924518,G,C,Likely_benign,0,CCACCGGGGCGCCATGCCGGCGGTCAAGAAGGAGTTCCCGGGCCGC...
2,1,925956,C,T,Likely_benign,0,CTGCCGCTGACTGCGCGCAGAAGCGTGCCGCTCCCTCACAGGGTCT...
3,1,925969,C,T,Likely_benign,0,CGCGCAGAAGCGTGCCGCTCCCTCACAGGGTCTGCCTCGGCTCTGC...
4,1,925980,C,T,Likely_benign,0,GTGCCGCTCCCTCACAGGGTCTGCCTCGGCTCTGCTCGCAGGGAAA...


In [17]:
import numpy as np

# ASCII → index mapping
BASE_TO_INT = np.full(256, -1, dtype=np.int16)
for base, idx in zip("ACGTacgt", [0,1,2,3,0,1,2,3]):
    BASE_TO_INT[ord(base)] = idx

EYE4 = np.eye(4, dtype=np.float32)

def one_hot_vec(seq: str) -> np.ndarray:
    """
    Safe one-hot encoding for A,C,G,T only.
    Handles lowercase, N characters, and ensures mask always exists.
    """
    if not isinstance(seq, str):
        seq = ""

    b = np.frombuffer(seq.encode(), dtype=np.uint8)  # ASCII bytes
    idx = BASE_TO_INT[b]                             # convert ASCII → 0..3

    # Always create mask, even if bad characters exist
    mask = (idx >= 0)

    # Initialize output matrix
    oh = np.zeros((len(idx), 4), dtype=np.float32)

    # Fill only valid positions
    if mask.any():
        oh[mask] = EYE4[idx[mask]]

    return oh


In [18]:
seqs = df["seq201"].tolist()
labels = df["label"].astype(int).to_numpy()

X = np.stack([one_hot_vec(s) for s in seqs], axis=0)
y = labels

X.shape, y.shape


((399216, 201, 4), (399216,))

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size = 0.3, stratify = y, random_state = 42
)

X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size = 0.5, stratify = y_tmp, random_state = 42
)

X_train.shape, X_val.shape, X_test.shape, np.bincount(y_train), np.bincount(y_val), np.bincount(y_test)

((279451, 201, 4),
 (59882, 201, 4),
 (59883, 201, 4),
 array([220522,  58929]),
 array([47254, 12628]),
 array([47255, 12628]))

In [20]:
np.savez_compressed(
    "../data/multi_chrom_dataset_seq201.npz",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    X_test=X_test,
    y_test=y_test
)

print("Saved compressed dataset.")


Saved compressed dataset.


In [21]:
import torch
from torch.utils.data import TensorDataset, DataLoader

X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)

X_val_t   = torch.tensor(X_val,   dtype=torch.float32)
y_val_t   = torch.tensor(y_val,   dtype=torch.long)

X_test_t  = torch.tensor(X_test,  dtype=torch.float32)
y_test_t  = torch.tensor(y_test,  dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=64, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val_t,   y_val_t),   batch_size=128, shuffle=False)

len(train_loader), len(val_loader)


(4367, 468)

In [22]:
import torch.nn as nn

class BetterCNNv2(nn.Module):
    def __init__(self):
        super().__init__()

        self.net = nn.Sequential(
            # Block 1
            nn.Conv1d(4, 64, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.Conv1d(64, 64, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(2),      # 201 -> 100

            # Block 2
            nn.Conv1d(64, 128, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.Conv1d(128, 128, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(2),      # 100 -> 50

            # Block 3
            nn.Conv1d(128, 256, kernel_size=7, padding=3),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.MaxPool1d(2)       # 50 -> 25
        )

        L = 25   # final length after 3 pools
        self.fc = nn.Sequential(
            nn.Linear(256 * L, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)
        )

    def forward(self, x):
        # x: (B, 201, 4) → (B, 4, 201)
        x = x.permute(0, 2, 1)
        x = self.net(x)
        x = x.flatten(1)
        return self.fc(x)

model = BetterCNNv2()
model

BetterCNNv2(
  (net): Sequential(
    (0): Conv1d(4, 64, kernel_size=(7,), stride=(1,), padding=(3,))
    (1): ReLU()
    (2): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
    (3): ReLU()
    (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv1d(64, 128, kernel_size=(7,), stride=(1,), padding=(3,))
    (6): ReLU()
    (7): Conv1d(128, 128, kernel_size=(7,), stride=(1,), padding=(3,))
    (8): ReLU()
    (9): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv1d(128, 256, kernel_size=(7,), stride=(1,), padding=(3,))
    (11): ReLU()
    (12): Dropout(p=0.3, inplace=False)
    (13): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc): Sequential(
    (0): Linear(in_features=6400, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [None]:
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
import torch

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

loss_history = []
val_auc_history = []

EPOCHS = 10

for ep in range(EPOCHS):
    # Train 
    model.train()
    total_loss = 0.0

    for xb, yb in train_loader:
        optimizer.zero_grad()
        out = model(xb)
        loss = loss_fn(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    loss_history.append(avg_loss)

    # Validate
    model.eval()
    with torch.no_grad():
        logits = model(X_val_t)
        probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
        val_auc = roc_auc_score(y_val_t.cpu().numpy(), probs)
        val_auc_history.append(val_auc)

    print(f"Epoch {ep+1:02d} — loss={avg_loss:.4f} — val_AUC={val_auc:.3f}")
