In [1]:
# Mount Google Drive into the Colab filesystem; the CSV read further down
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd


# Load the variant table from the mounted Drive folder.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which avoids mixed-type columns (and the
# DtypeWarning pandas raises for them) when a column mixes numbers and text.
df = pd.read_csv('/content/drive/MyDrive/543 Project/clinvar.csv', low_memory=False)


  df = pd.read_csv('/content/drive/MyDrive/543 Project/cleaned_clinvar.csv')  # Replace with your actual file path


In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
def encode_allele(allele):
    """One-hot encode a single-nucleotide allele.

    Args:
        allele: Allele value taken from the ``ref``/``alt`` column. A
            one-letter base (``A``, ``C``, ``G`` or ``T``) in either case,
            optionally surrounded by whitespace, is recognised.

    Returns:
        A 4-element list ordered ``[A, C, G, T]`` with a 1 at the matching
        base. Anything else (multi-base alleles, other letters, missing or
        non-string values) maps to ``[0, 0, 0, 0]``.
    """
    mapping = {'A': [1, 0, 0, 0],
               'C': [0, 1, 0, 0],
               'G': [0, 0, 1, 0],
               'T': [0, 0, 0, 1]}
    # Normalise case/whitespace so 'a' or ' A' is not silently encoded as
    # "unknown"; non-string values (e.g. NaN) fall through to the zero vector.
    key = allele.strip().upper() if isinstance(allele, str) else allele
    return mapping.get(key, [0, 0, 0, 0])

# Attach the one-hot encodings of the reference and alternate alleles as new columns.
for source_col, encoded_col in (('ref', 'ref_encoded'), ('alt', 'alt_encoded')):
    df[encoded_col] = df[source_col].apply(encode_allele)

In [None]:
# Cast to str so every chromosome name is the same type before it is
# handed to the encoder as a category.
df['chrom'] = df['chrom'].astype(str)

# handle_unknown='ignore' encodes a chromosome that was not present when the
# encoder was fitted as an all-zero vector instead of raising in transform(),
# mirroring encode_allele's all-zero fallback for unrecognised alleles.
chrom_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
chrom_encoded = chrom_encoder.fit_transform(df[['chrom']])

# Store one encoded row (a 1-D array) per variant.
df['chrom_encoded'] = list(chrom_encoded)

In [None]:
# Rescale positions relative to the largest position in the table
# (the maximum maps to 1.0).
df['pos_normalized'] = df['pos'].div(df['pos'].max())

In [None]:
def combine_features(row):
    """Flatten one variant's encoded columns into a single feature list.

    The values are concatenated in this order: ``ref_encoded``,
    ``alt_encoded``, ``chrom_encoded`` and finally the scalar
    ``pos_normalized``.
    """
    return [
        *row['ref_encoded'],
        *row['alt_encoded'],
        *row['chrom_encoded'],
        row['pos_normalized'],
    ]


# Build one flat feature list per row, then store the labels as 32-bit floats.
df['combined_features'] = df.apply(combine_features, axis='columns')
df['label'] = df['label'].astype('float32')

In [None]:
# Materialise the per-row feature lists and the labels as float32 NumPy arrays.
features = np.array(df['combined_features'].tolist(), dtype=np.float32)
labels = np.array(df['label'].values, dtype=np.float32)



In [None]:

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)


In [None]:
class VariantDataset(Dataset):
    """In-memory dataset pairing each feature vector with its label.

    Args:
        features: Array-like of per-sample features; copied into a float32 tensor.
        labels: Array-like with one label per sample; copied into a float32 tensor.

    Raises:
        ValueError: If ``features`` and ``labels`` do not contain the same
            number of samples.
    """

    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
        # Fail fast: __len__ is driven by the labels, so a mismatch would
        # otherwise silently ignore extra features or raise mid-iteration.
        if len(self.features) != len(self.labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(self.features)} and {len(self.labels)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(feature, label)`` pair stored at ``idx``."""
        return self.features[idx], self.labels[idx]


In [None]:
# Mini-batch size shared by both loaders.
batch_size = 32

# Training batches are reshuffled every epoch.
train_dataset = VariantDataset(X_train, y_train)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Test batches keep their original order.
test_dataset = VariantDataset(X_test, y_test)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)


In [None]:

# Width of one feature vector (number of columns in X_train); passed to the
# classifier below to size its first Linear layer.
input_dim = X_train.shape[1]

class SimpleClassifier(nn.Module):
    """Feed-forward binary classifier that emits one probability per sample.

    Each hidden stage is ``Linear -> ReLU -> Dropout``; the head is
    ``Linear(…, 1) -> Sigmoid``.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(64, 32)`` reproduces the original fixed architecture.
        dropout: Dropout probability applied after every hidden layer.
    """

    def __init__(self, input_dim, hidden_dims=(64, 32), dropout=0.5):
        super().__init__()
        stages = []
        in_features = input_dim
        for width in hidden_dims:
            stages += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        # Single sigmoid output so the result can be read as a probability.
        stages += [nn.Linear(in_features, 1), nn.Sigmoid()]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the layer stack and return its output."""
        return self.layers(x)


# Seed PyTorch's global RNG before building the model so weight
# initialisation (and the shuffling/dropout that follow when the notebook is
# run top to bottom) is repeatable between runs.
torch.manual_seed(42)
model = SimpleClassifier(input_dim)

In [None]:
# Binary cross-entropy on the model's sigmoid outputs, optimised with Adam.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)


In [None]:
# Number of full passes over the training set.
epochs = 10

for epoch in range(epochs):
    model.train()  # training mode (dropout active)
    batch_losses = []
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        # Drop singleton dimensions so outputs and labels have matching shapes.
        outputs = model(batch_features).squeeze()
        batch_labels = batch_labels.squeeze()
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())
    # Mean of the per-batch losses for this epoch.
    total_loss = sum(batch_losses)
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

Epoch 1/10, Loss: 0.4261
Epoch 2/10, Loss: 0.4227
Epoch 3/10, Loss: 0.4221
Epoch 4/10, Loss: 0.4219
Epoch 5/10, Loss: 0.4215
Epoch 6/10, Loss: 0.4215
Epoch 7/10, Loss: 0.4212
Epoch 8/10, Loss: 0.4212
Epoch 9/10, Loss: 0.4212
Epoch 10/10, Loss: 0.4208


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model.eval()  # evaluation mode (dropout disabled)
all_preds = []
all_labels = []
with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        outputs = model(batch_features)
        # squeeze(1) removes only the trailing output column. A bare squeeze()
        # would also drop the batch dimension when the final batch holds a
        # single sample, producing a 0-d array that list.extend() cannot
        # iterate over.
        predicted = (outputs.squeeze(1) >= 0.5).float()
        all_preds.extend(predicted.numpy())
        all_labels.extend(batch_labels.numpy())

# Metrics over the whole test set; zero_division=0 reports 0 instead of
# warning when a metric's denominator is empty.
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, zero_division=0)
recall = recall_score(all_labels, all_preds, zero_division=0)
f1 = f1_score(all_labels, all_preds, zero_division=0)

print(f"Test Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1 Score: {f1 * 100:.2f}%")



Test Accuracy: 84.64%
Precision: 86.09%
Recall: 96.80%
F1 Score: 91.13%


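Predicting on new variants: apply the same preprocessing used for the training data, then score the variants with the trained model.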


In [None]:
# Two example variants, written one record per variant.
new_data = pd.DataFrame([
    {'chrom': '1', 'pos': 123456, 'ref': 'A', 'alt': 'G'},
    {'chrom': '2', 'pos': 234567, 'ref': 'G', 'alt': 'C'},
])

# Preprocess the new variants with the same steps applied to the training table.
# The encoder was fitted on string chromosome names, so cast here as well;
# otherwise integer chromosome values would not match any fitted category.
new_data['chrom'] = new_data['chrom'].astype(str)
new_data['ref_encoded'] = new_data['ref'].apply(encode_allele)
new_data['alt_encoded'] = new_data['alt'].apply(encode_allele)
new_chrom_encoded = chrom_encoder.transform(new_data[['chrom']])
new_data['chrom_encoded'] = list(new_chrom_encoded)
# Scale by the training table's maximum position, not the new data's own maximum.
new_data['pos_normalized'] = new_data['pos'] / df['pos'].max()
new_data['combined_features'] = new_data.apply(combine_features, axis=1)

new_features = np.array(new_data['combined_features'].tolist(), dtype=np.float32)
new_features = torch.tensor(new_features, dtype=torch.float32)


model.eval()  # evaluation mode (dropout disabled)
with torch.no_grad():
    outputs = model(new_features)
    # squeeze(1) keeps the batch dimension, so a single new variant still
    # yields length-1 arrays instead of collapsing to 0-d scalars.
    probabilities = outputs.squeeze(1).numpy()
    predictions = (probabilities >= 0.5).astype(int)
    print("New Data Predictions (Probabilities):", probabilities)
    print("New Data Predictions (Classes):", predictions)