In [69]:
import sys
import os
import io
import logging

# Python packages for data, stats
import numpy as np
import ast
import pandas as pd
import seaborn as sns
import random

from google.cloud import bigquery
from google.cloud import storage

from google.cloud import bigquery
from dask import delayed
import pandas as pd
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, classification_report, f1_score


# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from xgboost import XGBClassifier


# Set a random seed for reproducibility
random_seed = 42

# Setting the random seed for various libraries
random.seed(random_seed)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

bq_client = bigquery.Client()
storage_client = storage.Client()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [105]:
datasets = ["TRAINING", "VALIDATION", "TESTING", "TRAINING_AND_VALIDATION"]

project = "hmh-em-deepasm"
samples_dataset = "samples_250bp"
ml_dataset = "ml_250bp_db7e6e4"
ml_mode = 'TESTING'
ml_nb_datapoints_for_testing = 200000
metrics = ['asm', 'asm_not_corrected']
bucket = "hmh_deepasm"
label_var = 'asm_not_corrected'
padding_value = 0
batch_size = 1024

# Make sure this is the same as in the config file
samples_dic = {'TRAINING': ['gm12878',
                                  'CD14',
                                  'fibroblast',
                                  'A549',
                                  'spleen_female_adult',
                                  'HeLa_S3'],
            'VALIDATION': ['mammary_epithelial',
                                       'sk_n_sh',
                                       'CD34'],
            'TESTING': ['HepG2',
                                 'righ_lobe_liver',
                                 't_cell_male_adult']}

dataset_types = ["TRAINING", "VALIDATION", "TESTING"]


In [106]:
def compute_classes(dic_data, device):
    class_weight_dic = {"TRAINING": {}, "TRAINING_AND_VALIDATION": {}}
    for data in ["TRAINING", "TRAINING_AND_VALIDATION"]:
        labels = dic_data[data]["labels"]
        weights = np.round(
            compute_class_weight("balanced", classes=[0, 1], y=labels), 2
        )
        class_weight_dic[data]["class_weight"] = {0: weights[0], 1: weights[1]}
        scale_pos_weight = len(labels[labels == 0]) / len(labels[labels == 1])
        class_weight_dic[data]["scale_pos_weight"] = scale_pos_weight
        class_weight_dic[data]["weight_tensor"] = torch.tensor(
            weights, dtype=torch.float32
        ).to(device)
    return class_weight_dic

def evaluate_model(labels, predictions):
  confusion = confusion_matrix(labels, predictions)
  report = classification_report(labels, predictions, output_dict=True)
  sum_f1 = np.round(report['0.0']['f1-score'] + report['1.0']['f1-score'],3)
  report = classification_report(labels, predictions, output_dict=False)
  report = classification_report(labels, predictions, digits = 3)
  return sum_f1, confusion, report

In [107]:
dic_data = {
        **{dataset: {} for dataset in dataset_types},
        **{"TRAINING_AND_VALIDATION": {}},
    }

max_sequence_length = 0
for dataset in dataset_types:
    print(f"Processing {dataset} dataset...")
    quoted_samples = ",".join([f"'{sample}'" for sample in samples_dic[dataset]])
    print(f"Importing the samples: {quoted_samples}")
    query = f"""
        SELECT * EXCEPT (region_sup, clustering_index, region_nb_cpg, cpgs_w_padding)
        FROM {project}.{ml_dataset}.features_wo_hmm
        WHERE
            cpg_directional_fm IS NOT NULL AND
            {label_var} IS NOT NULL AND
            sample IN ({quoted_samples})
        """
    if ml_mode == "TESTING":
        print("In testing mode. Adding a limit to the import.")
        query += f"LIMIT {ml_nb_datapoints_for_testing}"
    df = bq_client.query(query).to_dataframe()
    print(f"Number of rows in DF: {len(df):,}")
    dic_data[dataset]["labels"] = df[label_var].astype(int)
    dic_data[dataset]["region_info"] = df[
        ["asm", "sample", "chr", "region_inf", "nb_cpg_found", "nb_reads"]
    ]
    dic_data[dataset]["1d_seq"] = df["cpg_directional_fm"].apply(
        lambda x: ast.literal_eval(x.strip('"'))
    )
    current_max_sequence_length = max(dic_data[dataset]["1d_seq"].apply(len))
    print(
        f"Max sequence length in the dataset: {current_max_sequence_length}"
    )
    if current_max_sequence_length > max_sequence_length:
        max_sequence_length = current_max_sequence_length

for data in ["labels", "region_info", "1d_seq"]:  # '1d_seq', '2d_seq',
    dic_data["TRAINING_AND_VALIDATION"][data] = pd.concat(
        [dic_data["TRAINING"][data], dic_data["VALIDATION"][data]]
    )

print("Adding a dummy row of variables to adjust the max sequence length")
zero_sequence = [0] * max_sequence_length  # This is the sequence to append
for dataset in dic_data.keys():
    # Append a new sequence of zeros to the '1d_seq' array
    dic_data[dataset]["1d_seq"] = pd.concat(
        [dic_data[dataset]["1d_seq"], pd.Series([zero_sequence])], ignore_index=True
    )
    # Assuming you need to add a corresponding new label
    dic_data[dataset]["labels"] = pd.concat(
        [dic_data[dataset]["labels"], pd.Series([0])], ignore_index=True
    )

for dataset in datasets:
    print(f"Processing Data Loader for: {dataset}")
    sequences = [list(row) for row in dic_data[dataset]["1d_seq"]]
    labels = [int(label) for label in dic_data[dataset]["labels"]]
    print(f"Unique labels: {np.unique(labels)}")
    # Convert sequences to tensors and pad them
    padded_sequences = torch.nn.utils.rnn.pad_sequence(
        [torch.tensor(s) for s in sequences],
        batch_first=True,
        padding_value=padding_value,
    )
    #padded_sequences = padded_sequences.squeeze(-1)
    print(f"Shape of the Tensor for the data: {padded_sequences.shape}")
    # Convert labels to a tensor
    labels = torch.tensor(labels, dtype=torch.float32)
    print(f"Shape of the Tensor for the labels: {labels.shape}")
    # Create DataLoader for batch processing
    data = TensorDataset(padded_sequences, labels)
    dic_data[dataset]["dataloader"] = DataLoader(
        data, batch_size=batch_size, shuffle=True
    )

print("Computing class weights")
class_weight_dic = compute_classes(dic_data, device)

Processing TRAINING dataset...
Importing the samples: 'gm12878','CD14','fibroblast','A549','spleen_female_adult','HeLa_S3'
In testing mode. Adding a limit to the import.
Number of rows in DF: 200,000
Max sequence length in the dataset: 56
Processing VALIDATION dataset...
Importing the samples: 'mammary_epithelial','sk_n_sh','CD34'
In testing mode. Adding a limit to the import.
Number of rows in DF: 200,000
Max sequence length in the dataset: 57
Processing TESTING dataset...
Importing the samples: 'HepG2','righ_lobe_liver','t_cell_male_adult'
In testing mode. Adding a limit to the import.
Number of rows in DF: 200,000
Max sequence length in the dataset: 56
Adding a dummy row of variables to adjust the max sequence length
Processing Data Loader for: TRAINING
Unique labels: [0 1]
Shape of the Tensor for the data: torch.Size([200001, 57])
Shape of the Tensor for the labels: torch.Size([200001])
Processing Data Loader for: VALIDATION
Unique labels: [0 1]
Shape of the Tensor for the data: to

In [146]:
# Hyperparameters
input_dim = max_sequence_length
hidden_dim = 2000
latent_dim = 50
num_epochs = 60


# class VAE(nn.Module):
#     def __init__(self, input_dim, hidden_dim, latent_dim):
#         super(VAE, self).__init__()

#         # Encoder layers
#         self.fc1 = nn.Linear(input_dim, hidden_dim)
#         self.fc_mu = nn.Linear(hidden_dim, latent_dim)  # Mean of the latent space
#         self.fc_logvar = nn.Linear(hidden_dim, latent_dim)  # Log variance of the latent space

#         # Decoder layers
#         self.fc3 = nn.Linear(latent_dim, hidden_dim)
#         self.fc4 = nn.Linear(hidden_dim, input_dim)

#     def encode(self, x):
#         h1 = F.relu(self.fc1(x))
#         return self.fc_mu(h1), self.fc_logvar(h1)

#     def reparameterize(self, mu, logvar):
#         std = torch.exp(0.5 * logvar)
#         eps = torch.randn_like(std)
#         return mu + eps * std

#     def decode(self, z):
#         h3 = F.relu(self.fc3(z))
#         return self.fc4(h3)  # Raw logits, no sigmoid

#     def forward(self, x):
#         mu, logvar = self.encode(x)
#         z = self.reparameterize(mu, logvar)
#         return self.decode(z), mu, logvar


class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim, dropout_rate=0.5):
        super(VAE, self).__init__()

        # Dropout layer
        self.dropout = nn.Dropout(dropout_rate)

        # Encoder layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc_mu = nn.Linear(hidden_dim, latent_dim)  # Mean of the latent space
        self.fc_logvar = nn.Linear(hidden_dim, latent_dim)  # Log variance of the latent space

        # Decoder layers
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return self.fc_mu(h1), self.fc_logvar(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h3 = F.relu(self.fc3(z))
        return self.fc4(h3)  # Raw logits, no sigmoid

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

    def get_latent_representation(self, x):
        mu, logvar = self.encode(x)
        return self.reparameterize(mu, logvar)

def loss_function(recon_x, x, mu, logvar):
    BCE = F.binary_cross_entropy_with_logits(recon_x, x, reduction='sum')
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD

def train(model, dataloader, optimizer):
    model.train()
    train_loss = 0
    for batch_idx, (data, _) in enumerate(dataloader):
        data = data.to(device)
        optimizer.zero_grad()
        recon_batch, mu, logvar = model(data)
        loss = loss_function(recon_batch, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    return train_loss / len(dataloader.dataset)

# Sample usage (assuming `dataset` is your dataset)
dataloader = dic_data['TRAINING']["dataloader"]
model = VAE(input_dim, hidden_dim, latent_dim).to(device)
optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=0.005,
            weight_decay=1e-3,
        )
for epoch in range(num_epochs):
    print(f"epoch: {epoch}")
    train_loss = train(model, dataloader, optimizer)
    print(f'Epoch {epoch+1}, Loss: {train_loss:.4f}')


epoch: 0
Epoch 1, Loss: 3.6885
epoch: 1
Epoch 2, Loss: 3.3176
epoch: 2
Epoch 3, Loss: 3.3181
epoch: 3
Epoch 4, Loss: 3.3222
epoch: 4
Epoch 5, Loss: 3.3206
epoch: 5
Epoch 6, Loss: 3.3201
epoch: 6
Epoch 7, Loss: 3.3210
epoch: 7
Epoch 8, Loss: 3.3189
epoch: 8
Epoch 9, Loss: 3.3184
epoch: 9
Epoch 10, Loss: 3.3180
epoch: 10
Epoch 11, Loss: 3.3181
epoch: 11
Epoch 12, Loss: 3.3151
epoch: 12
Epoch 13, Loss: 3.3142
epoch: 13
Epoch 14, Loss: 3.3128
epoch: 14
Epoch 15, Loss: 3.3109
epoch: 15
Epoch 16, Loss: 3.3124
epoch: 16
Epoch 17, Loss: 3.3084
epoch: 17
Epoch 18, Loss: 3.3083
epoch: 18
Epoch 19, Loss: 3.3058
epoch: 19
Epoch 20, Loss: 3.3063
epoch: 20
Epoch 21, Loss: 3.3061
epoch: 21
Epoch 22, Loss: 3.3035
epoch: 22
Epoch 23, Loss: 3.3014
epoch: 23
Epoch 24, Loss: 3.3037
epoch: 24
Epoch 25, Loss: 3.3013
epoch: 25
Epoch 26, Loss: 3.2979
epoch: 26
Epoch 27, Loss: 3.2955
epoch: 27
Epoch 28, Loss: 3.2955
epoch: 28
Epoch 29, Loss: 3.2923
epoch: 29
Epoch 30, Loss: 3.2949
epoch: 30
Epoch 31, Loss: 3.2

In [147]:
# def extract_latent_representations(model, dataloader):
#     model.eval()
#     latent_vectors = []
#     labels = []
#     with torch.no_grad():
#         for data, target in dataloader:
#             data = data.to(device)
#             target = target.to(device)
#             mu, _ = model.encode(data)
#             latent_vectors.append(mu.cpu())
#             labels.append(target.cpu())
#     return torch.cat(latent_vectors), torch.cat(labels)

# def extract_latent_vectors(model, dataloader):
#   # Returns tensors
#     model.eval()
#     latent_vectors = []
#     labels = []
#     with torch.no_grad():
#         for data, target in dataloader:
#             data = data.to(device)
#             latent_vector = model.get_latent_representation(data)
#             latent_vectors.append(latent_vector.cpu())
#             labels.append(target.cpu())
#     return torch.cat(latent_vectors), torch.cat(labels)

def extract_latent_vectors(model, dataloader):
    model.eval()
    latent_vectors = []
    labels = []
    with torch.no_grad():
        for data, target in dataloader:
            data = data.to(device)
            latent_vector = model.get_latent_representation(data)
            latent_vectors.append(latent_vector.cpu().numpy())
            labels.append(target.cpu().numpy())

    latent_vectors_np = np.concatenate(latent_vectors, axis=0)
    labels_np = np.concatenate(labels, axis=0)

    # Create DataFrame
    df_latent_vectors = pd.DataFrame(latent_vectors_np)
    df_latent_vectors['label'] = labels_np

    return df_latent_vectors

# Assuming dataloader is the DataLoader for your entire dataset
latent_vectors_training = extract_latent_vectors(model, dic_data['TRAINING_AND_VALIDATION']["dataloader"])
latent_vectors_testing = extract_latent_vectors(model, dic_data['TESTING']["dataloader"])

In [136]:
def evaluate_model(labels, predictions):
  confusion = confusion_matrix(labels, predictions)
  report = classification_report(labels, predictions, output_dict=True)
  sum_f1 = np.round(report['0']['f1-score'] + report['1']['f1-score'],3)
  report = classification_report(labels, predictions, output_dict=False)
  report = classification_report(labels, predictions, digits = 3)
  return sum_f1, confusion, report

In [148]:
#var_for_training = 'hs_features'
params = {
    'max_depth': 100,
    'eta': 0.001, # learning rate
    'objective': 'binary:logistic',
    'nthread': 8,
    'eval_metric': ['logloss'],
    'subsample': 0.5,
    #'scale_pos_weight': 0.02  # Adjust based on your exact dataset
}

train_labels = latent_vectors_training['label'].astype(int)
scale_pos_weight = len(train_labels[train_labels == 0]) / len(train_labels[train_labels == 1])
print(f"Scale pos weight: {scale_pos_weight}")

model_w_params = XGBClassifier(scale_pos_weight=scale_pos_weight, **params)
model_w_params.fit(latent_vectors_training.drop(columns=['label']), train_labels)

labels = latent_vectors_testing['label'].astype(int)
predictions = model_w_params.predict(latent_vectors_testing.drop(columns=['label']))

Scale pos weight: 90.86977491961414


In [149]:

#sum_f1, confusion, report = evaluate_model(np.array(labels), predictions)
#print(f"SUm of F1s: {sum_f1}\nConfusion matrix: {confusion}\nReport:{report}")

confusion = confusion_matrix(np.array(labels), predictions)
report = classification_report(np.array(labels), predictions, output_dict=True)

print(confusion)
print(report)

[[196105    220]
 [  3652     24]]
{'0': {'precision': 0.9817177871113403, 'recall': 0.9988794091430027, 'f1-score': 0.9902242464944128, 'support': 196325}, '1': {'precision': 0.09836065573770492, 'recall': 0.006528835690968444, 'f1-score': 0.012244897959183673, 'support': 3676}, 'accuracy': 0.980640096799516, 'macro avg': {'precision': 0.5400392214245227, 'recall': 0.5027041224169856, 'f1-score': 0.5012345722267982, 'support': 200001}, 'weighted avg': {'precision': 0.9654817642168074, 'recall': 0.980640096799516, 'f1-score': 0.972249075944188, 'support': 200001}}


In [151]:
sum_f1 = np.round(report['0']['f1-score'] + report['1']['f1-score'],3)
print(sum_f1)

1.002


In [56]:
class Classifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, 100)
        self.fc2 = nn.Linear(100, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Hyperparameters for the classifier
num_classes = 2  # Assuming binary classification
classifier_input_dim = latent_dim  # Latent dimension from VAE
classifier_learning_rate = 1e-3
classifier_num_epochs = 10

classifier = Classifier(classifier_input_dim, num_classes).to(device)
classifier_optimizer = torch.optim.AdamW(
            classifier.parameters(),
            lr=0.001,
            weight_decay=1e-3,
        )
criterion = nn.BCEWithLogitsLoss(
        pos_weight=class_weight_dic["TRAINING_AND_VALIDATION"]["weight_tensor"][1]
    )

# Convert latent vectors and labels to DataLoader
latent_dataset = TensorDataset(latent_vectors, labels)
latent_dataloader = DataLoader(latent_dataset, batch_size=batch_size, shuffle=True)

# Training loop for the classifier
def train_classifier(classifier, dataloader, optimizer, criterion):
    classifier.train()
    train_loss = 0
    for batch_idx, (data, target) in enumerate(dataloader):
        data = data.to(device)
        target = target.to(device).long()
        optimizer.zero_grad()
        output = classifier(data)
        loss = criterion(output, target)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    return train_loss / len(dataloader.dataset)

for epoch in range(classifier_num_epochs):
    train_loss = train_classifier(classifier, latent_dataloader, classifier_optimizer, criterion)
    print(f'Classifier Epoch {epoch+1}, Loss: {train_loss:.4f}')


ValueError: Target size (torch.Size([1024])) must be the same as input size (torch.Size([1024, 2]))

In [None]:
sumf1, report, confusion = evaluate_model(
        classifier, dic_data["TESTING"]["dataloader"], criterion, device
    )