In [1]:
from pathlib import Path
import os, pandas as pd

ROOT = Path("..").resolve()
DATA = ROOT / "data"
DATA.exists(), os.getcwd()
#Expect: True, andd cwd ends with /genomics-dl-snp/notebooks

(True, 'C:\\Users\\Gulsher\\Desktop\\genomics-dl-snp\\notebooks')

In [2]:
clinvar_path = DATA / "variant_summary.txt.gz"
clinvar_path.exists(), clinvar_path

(True,
 WindowsPath('C:/Users/Gulsher/Desktop/genomics-dl-snp/data/variant_summary.txt.gz'))

In [3]:
use_rows = 120_000
skip_head = 0

df = pd.read_csv(
    clinvar_path,
    sep= "\t",
    compression="gzip",
    low_memory=False,
    nrows=use_rows,
    dtype=str #safer; we will cast later
)

df.columns[:20] #peek at column names

Index(['#AlleleID', 'Type', 'Name', 'GeneID', 'GeneSymbol', 'HGNC_ID',
       'ClinicalSignificance', 'ClinSigSimple', 'LastEvaluated', 'RS# (dbSNP)',
       'nsv/esv (dbVar)', 'RCVaccession', 'PhenotypeIDS', 'PhenotypeList',
       'Origin', 'OriginSimple', 'Assembly', 'ChromosomeAccession',
       'Chromosome', 'Start'],
      dtype='object')

In [4]:
# Try likely allele column names
REF_CAND = ["ReferenceAlleleVCF", "ReferenceAllele", "REF"]
ALT_CAND = ["AlternateAlleleVCF", "AlternateAllele", "ALT"]
TYPE_COL = "Type"
CHR_COL  = "Chromosome"
POS_COL  = "Start"
CLN_COL  = "ClinicalSignificance"

def first_present(cands):
    for c in cands:
        if c in df.columns:
            return c
    return None

REF_COL = first_present(REF_CAND)
ALT_COL = first_present(ALT_CAND)
REF_COL, ALT_COL

('ReferenceAlleleVCF', 'AlternateAlleleVCF')

In [5]:
df = df[df[TYPE_COL] == "single nucleotide variant"].copy()

keep = df[CLN_COL].isin(["Benign", "Pathogenic"])
df = df[keep].copy()

df["label"] = (df[CLN_COL] == "Pathogenic").astype(int)

# Keep essentials
cols = [CHR_COL, POS_COL, REF_COL, ALT_COL, CLN_COL, "label"]
df_small = df[cols].dropna().copy()

# Cast numeric position safely
df_small[POS_COL] = pd.to_numeric(df_small[POS_COL], errors="coerce")
df_small = df_small.dropna(subset=[POS_COL]).reset_index(drop=True)

df_small.head()
df_small["label"].value_counts()


label
1    26907
0     7892
Name: count, dtype: int64

In [8]:
import urllib.request, gzip
from Bio import SeqIO

#Download chr22 if not present
chr22_gz = DATA / "chr22.fa.gz"
if not chr22_gz.exists():
    urllib.request.urlretrieve(
        "https://hgdownload.soe.ucsc.edu/goldenPath/hg38/chromosomes/chr22.fa.gz", chr22_gz
    )
#Load FASTA
ref_dict = {}
with gzip.open(chr22_gz, "rt") as handle:
    for record in SeqIO.parse(handle, "fasta"):
        key = record.id.replace("chr","")
        ref_dict[key] = record

def fetch_seq_101(chrom, pos, flank=50):
    chrom = str(chrom).replace("chr","")
    rec = ref_dict.get(chrom)
    if rec is None:
        return None
    start = max(int(pos) - flank, 0)
    end = int(pos) + flank + 1
    if end > len(rec):
        return None
    return str(rec.seq[start:end]).upper()

# Only keep chr22 rows for now (fast + guaranteed reference)
mask22 = df_small[CHR_COL].astype(str).str.replace("chr","") == "22"
df22 = df_small[mask22].copy()

df22["seq101"] = df22.apply(lambda r: fetch_seq_101(r[CHR_COL], r[POS_COL]), axis=1)
df22 = df22.dropna(subset=["seq101"]).reset_index(drop=True)

len(df22), df22.head()

(437,
   Chromosome     Start ReferenceAlleleVCF AlternateAlleleVCF  \
 0         22  41320486                  G                  T   
 1         22  40924482                  G                  T   
 2         22  31011610                  G                  C   
 3         22  30615623                  G                  C   
 4         22  31009031                  T                  G   
 
   ClinicalSignificance  label  \
 0           Pathogenic      1   
 1           Pathogenic      1   
 2               Benign      0   
 3               Benign      0   
 4           Pathogenic      1   
 
                                               seq101  
 0  CCCTTGAGGGTACCACACAATCTAGGAGTCCAGGGGGCCATGGGGG...  
 1  GCCCCGTTCCCTCCCTCTGCAGCCTGGGATGGTAATCACAATTGAG...  
 2  TCTCAAAAGAAGACATTTATACAGCCAAAAAACACATGAAAAAATG...  
 3  TCTCTCTCTTCCTCACTCTATCACCAGTTCCTCATGACTTCCCCCA...  
 4  TGGGTTCAGGTGATTCTCCTGTCTCGGCCCCCCGAGTAGCTGGGAC...  )

In [9]:
from IPython.display import display, HTML

# Show the first 10 rows in a tidy HTML table
html = df22.head(10).to_html(index=False)
display(HTML(f"<h4>Example Variants on Chr22</h4>{html}"))

Chromosome,Start,ReferenceAlleleVCF,AlternateAlleleVCF,ClinicalSignificance,label,seq101
22,41320486,G,T,Pathogenic,1,CCCTTGAGGGTACCACACAATCTAGGAGTCCAGGGGGCCATGGGGGTGGAGGACACTGGCCTGAGGGTGGTCATCGCCCTCAGGGACACGCCTCCCGACAT
22,40924482,G,T,Pathogenic,1,GCCCCGTTCCCTCCCTCTGCAGCCTGGGATGGTAATCACAATTGAGCCCGGTAAGGAGAGGTGTTACAATAGTAGTATGAGGTAAATGTTTGTTTGTTTGT
22,31011610,G,C,Benign,0,TCTCAAAAGAAGACATTTATACAGCCAAAAAACACATGAAAAAATGCTCAACATCACTGGCCATCAGAGAAATGCAAATCAAAACCACAATGAGATACCAT
22,30615623,G,C,Benign,0,TCTCTCTCTTCCTCACTCTATCACCAGTTCCTCATGACTTCCCCCATGCGTGGGGCAGAACTGGGAACAGCATGTCTCAAGGCGAGGGTTGCTTTGCTGGC
22,31009031,T,G,Pathogenic,1,TGGGTTCAGGTGATTCTCCTGTCTCGGCCCCCCGAGTAGCTGGGACTATAGGCACGCACCACCATGCCGGGCTATTTTTTTGTATTTTTTAGTAGAGGCGG
22,30613044,T,G,Pathogenic,1,TGGTCTCACAGCTCAAATGGTTCCTGGAGGATGAGAAGAGAGCCATTGGTGAGCAGACACCATCCGCTGGGGGTGGGGAGCAGCTGGGAGGGCTCATCAGA
22,31011112,A,T,Pathogenic,1,AACTATCTGATCTTTGACAAACCTGAGAAAAACAAGCAATGGGGAAAGGATTCCCTATCTAATAAATGGTGCTGGGAAAACTGGCTAGCCATATGTAGAAA
22,30615125,A,T,Pathogenic,1,AGAAATGAGCAGTTTCCTGGGACACAGGATTTTCAGAGTCCAGACAAGGAAAGTCTTGGGCAGACCAGGTTGAGTTGGTGCCCTTAGCTGATCTGACCATG
22,43024239,A,G,Pathogenic,1,GCCTCCTGGGTAGCTGGGATTACAGGTGCCTGCCACCATGCCCGGCTCATTTTTTATATTTTTGGTAGAGAGGGGGTTTCACCATGTTGGCCAGGCTGGTC
22,42628233,A,G,Pathogenic,1,CCCCGGAACTCAATGGTGTCTCCAATCTGCATGCTCTCCAGGTACTGAGACATCTTCCCTCCAGCGGGAAACTTGGGATGGGTGTCCTTGAAGTAAACCTG


In [19]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch

#reuse vectorized one-hot from earlier:
BASE_TO_INT = np.full(256, -1, dtype=np.int8)
for a,i in zip("ACGTacgt",[0,1,2,3,0,1,2,3]): BASE_TO_INT[ord(a)] = i
EYE4 = np.eye(4, dtype=np.float32)

def one_hot_vec(seq: str) -> np.ndarray:
    b = np.frombuffer(seq.encode(), dtype = np.uint8)
    idx = BASE_TO_INT[b]
    oh = np.zeros((len(idx), 4), dtype = np.uint8)
    mask = idx >= 0
    oh[mask] = EYE4[idx[mask]]
    return oh

seqs = df22["seq101"].tolist()
X = np.stack([one_hot_vec(s) for s in seqs], axis = 0)
y = df22["label"].to_numpy().astype(np.int64)

X_train, X_tmp, y_train, y_tmp = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_tmp, y_tmp, test_size=0.5, random_state=42, stratify=y_tmp
)

X_train.shape, y_train.shape, np.bincount(y_train)

((262, 101, 4), (262,), array([ 79, 183]))

In [31]:
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.long)
X_val_t   = torch.tensor(X_val,   dtype=torch.float32)
y_val_t   = torch.tensor(y_val,   dtype=torch.long)

train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=32, shuffle=True)
val_loader   = DataLoader(TensorDataset(X_val_t,   y_val_t),   batch_size=32)

class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv = nn.Conv1d(4, 32, kernel_size=7)
        self.pool = nn.MaxPool1d(2)
        self.relu = nn.ReLU()
        self.flat = nn.Flatten()
        self.fc = nn.Linear(32*47, 2)
    def forward(self, x):
        x = x.permute(0,2,1)
        x = self.relu(self.conv(x))
        x = self.pool(x)
        x = self.flat(x)
        return self.fc(x)

model = SimpleCNN()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [45]:
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F

EPOCHS=5
for ep in range(EPOCHS):
    model.train()
    tot_loss = 0
    for xb,yb in train_loader:
        optimizer.zero_grad()
        out = model(xb)
        loss = loss_fn(out, yb)
        loss.backward()
        optimizer.step()
        tot_loss += loss.item()
    # Validate
    model.eval()
    with torch.no_grad():
        logits = model(X_val_t)
        probs  = F.softmax(logits, dim=1)[:,1].cpu().numpy()
        auc = roc_auc_score(y_val_t.cpu().numpy(), probs)
    print(f"Epoch {ep+1}: loss={tot_loss/len(train_loader):.4f}, val_AUC={auc:.3f}")


Epoch 1: loss=0.0334, val_AUC=0.581
Epoch 2: loss=0.0343, val_AUC=0.584
Epoch 3: loss=0.0333, val_AUC=0.586
Epoch 4: loss=0.0317, val_AUC=0.586
Epoch 5: loss=0.0293, val_AUC=0.586


In [47]:
# Make sure df22 exists in your notebook before running this
from pathlib import Path

DATA = Path("../data")

save_path = DATA / "df22_sequences.csv"

df22.to_csv(save_path, index=False)

print("df22 saved to:", save_path)
print("Rows:", len(df22))
print("Columns:", df22.columns.tolist())


df22 saved to: ..\data\df22_sequences.csv
Rows: 437
Columns: ['Chromosome', 'Start', 'ReferenceAlleleVCF', 'AlternateAlleleVCF', 'ClinicalSignificance', 'label', 'seq101']
