In [2]:
import json
import os
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import average_precision_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from torch import nn
from torch.utils.data import Dataset , DataLoader
warnings.filterwarnings("ignore") 


In [5]:
# *****************************************************************************
# Load the Dataframes :
# Working directory holding the input tables. Export PPT_PATH_WORK to point the
# notebook at another location without editing it, e.g. the cluster copy at
# /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023
path_work = os.environ.get("PPT_PATH_WORK", "/media/concha-eloko/Linux/PPT_clean")

# Metadata table: tab-separated, first row is the header.
DF_info = pd.read_csv(f"{path_work}/DF_Dpo.final.2705.tsv", sep="\t", header=0)

# Embedding table: comma-separated with no header row, so pandas names the
# columns 0, 1, 2, ... Column 0 is renamed to "index" to serve as the merge key.
DF_embeddings = pd.read_csv(
    f"{path_work}/Dpo.2705.embeddings.ultimate.csv", sep=",", header=None
).rename(columns={0: "index"})

# Split the metadata on whether KL_type_LCA contains a literal "|".
# NOTE(review): presumably "|" separates several candidate KL types - confirm.
has_pipe = DF_info["KL_type_LCA"].str.contains("|", regex=False)
DF_info_filtered = DF_info[~has_pipe]
DF_info_ToReLabel = DF_info[has_pipe]

# Attach the embedding columns to the rows without a "|" in their label.
all_data = pd.merge(DF_info_filtered, DF_embeddings, on="index")


In [29]:
# *****************************************************************************
# Preprocess the dataframe :

# Remove "same" Dpos: keep the first row of every (Infected_ancestor, index) pair.
all_data = all_data.drop_duplicates(subset=["Infected_ancestor", "index"], keep="first").reset_index(drop=True)

# Create new categories for the punctual phages and ancestors :
# values that occur only once are pooled into a single "punctual_*" category.
dico_infected_a = dict(Counter(all_data["Infected_ancestor"]))
dico_phage = dict(Counter(all_data["Phage"]))

all_data["labeled_ancestor"] = all_data["Infected_ancestor"].apply(
    lambda x: x if dico_infected_a[x] > 1 else "punctual_ancestor"
)
all_data["labeled_phage"] = all_data["Phage"].apply(
    lambda x: x if dico_phage[x] > 1 else "punctual_phage"
)

# Label / OneHotEncode the categories and the labels :
# The raw labels are kept in their own column and the encoder is always fitted
# on them. Re-running this cell therefore yields the same encoding instead of
# refitting the LabelEncoder on already-encoded integers.
if "KL_type_LCA_raw" not in all_data.columns:
    all_data["KL_type_LCA_raw"] = all_data["KL_type_LCA"]

LE = LabelEncoder()
all_data["KL_type_LCA"] = LE.fit_transform(all_data["KL_type_LCA_raw"])

# One encoder per feature so that both fitted category lists stay available.
OHE_ancestor = OneHotEncoder()
OHE_phage = OneHotEncoder()
labeled_ancestor_OHE = OHE_ancestor.fit_transform(all_data[["labeled_ancestor"]]).toarray()
labeled_phage_OHE = OHE_phage.fit_transform(all_data[["labeled_phage"]]).toarray()
# Backward-compatible alias: `OHE` used to end up fitted on the phage column.
OHE = OHE_phage

df_labeled_ancestor_OHE = pd.DataFrame(
    labeled_ancestor_OHE,
    columns=[f"OHE_ancestor_{i}" for i in range(labeled_ancestor_OHE.shape[1])],
)
df_labeled_phage_OHE = pd.DataFrame(
    labeled_phage_OHE,
    columns=[f"OHE_phage_{i}" for i in range(labeled_phage_OHE.shape[1])],
)

# The column-wise concat aligns on the row index; all three frames share the
# default 0..n-1 index because all_data was re-indexed after deduplication.
final_all_data = pd.concat([all_data, df_labeled_ancestor_OHE, df_labeled_phage_OHE], axis=1)
assert len(final_all_data) == len(all_data), "one-hot blocks are misaligned with all_data"

In [33]:
# Width of each one-hot block: (ancestor categories, phage categories).
labeled_ancestor_OHE.shape[1], labeled_phage_OHE.shape[1]

(2179, 1625)

In [48]:
# *****************************************************************************
# The dataloader : 

class MultiDomainDataset(Dataset):
    """Dataset yielding (embedding, ancestor one-hot, phage one-hot, label) tuples.

    Parameters
    ----------
    final_all_data : pandas.DataFrame
        Must contain the embedding columns named by the integers
        ``1 .. embedding_dim``, the one-hot columns ``OHE_ancestor_<i>`` and
        ``OHE_phage_<i>``, and the integer-encoded ``KL_type_LCA`` label column.
    embedding_dim : int, default 1280
        Number of embedding columns to read.

    The one-hot columns are discovered from the frame itself rather than from
    notebook-level variables, so the class only depends on its argument.
    """

    def __init__(self, final_all_data, embedding_dim=1280):
        embedding_cols = list(range(1, embedding_dim + 1))
        ancestor_cols = self._ohe_columns(final_all_data, "OHE_ancestor_")
        phage_cols = self._ohe_columns(final_all_data, "OHE_phage_")

        self.embeddings = torch.tensor(final_all_data[embedding_cols].values, dtype=torch.float)
        self.ancestor = torch.tensor(final_all_data[ancestor_cols].values, dtype=torch.float)
        self.prophage_instance = torch.tensor(final_all_data[phage_cols].values, dtype=torch.float)
        self.labels = torch.tensor(final_all_data["KL_type_LCA"].values, dtype=torch.long)

    @staticmethod
    def _ohe_columns(frame, prefix):
        """Return the ``<prefix><i>`` columns of ``frame`` ordered by the integer ``i``."""
        # Embedding columns are named with ints, hence the isinstance guard.
        matches = [
            col for col in frame.columns
            if isinstance(col, str) and col.startswith(prefix) and col[len(prefix):].isdigit()
        ]
        return sorted(matches, key=lambda col: int(col[len(prefix):]))

    def __len__(self):
        """Number of samples (rows of the source frame)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(embedding, ancestor_one_hot, phage_one_hot, label)`` for row ``idx``."""
        item_domain1 = self.embeddings[idx]
        item_domain2 = self.ancestor[idx]
        item_domain3 = self.prophage_instance[idx]
        item_domain4 = self.labels[idx]
        return item_domain1, item_domain2, item_domain3, item_domain4

# Hold out 25 % of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    final_all_data, final_all_data["KL_type_LCA"], test_size=0.25, random_state=42
)

train_singledata = MultiDomainDataset(X_train)
test_singledata = MultiDomainDataset(X_test)

# Only the training batches are reshuffled every epoch. The evaluation loader
# keeps a fixed order so collected predictions line up with X_test row order.
train_loader = DataLoader(train_singledata, batch_size=12, shuffle=True, num_workers=4)
test_loader = DataLoader(test_singledata, batch_size=12, shuffle=False, num_workers=4)


In [49]:
# Sanity check: print the tensor shapes of the first training batch only.
for first_batch in train_loader:
    item_domain1, item_domain2, item_domain3, item_domain4 = first_batch
    print(*(tensor.shape for tensor in first_batch))
    break

torch.Size([12, 1280]) torch.Size([12, 2179]) torch.Size([12, 1625]) torch.Size([12])


In [41]:
# Inspect the first training sample: (embedding, ancestor one-hot, phage one-hot, label).
train_singledata[0]

(tensor([ 0.0284,  0.0405, -0.0099,  ..., -0.0841,  0.0540,  0.1031]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 tensor(69))

In [None]:
# *****************************************************************************
# The model : 

class MultiBranchModel(nn.Module):
    """Three-branch classifier over an embedding and two one-hot feature vectors.

    Branches:
      * ``lstm`` + ``lstm_branch``: a 2-layer bidirectional LSTM applied to the
        embedding viewed as a sequence of length 1, followed by two linear layers.
      * ``fnn_branch1``: feed-forward network over the ancestor one-hot vector.
      * ``fnn_branch2``: feed-forward network over the phage one-hot vector.

    The three ``branch_dim``-sized outputs are concatenated, reduced by
    ``pre_classifier`` and mapped to ``n_classes`` logits by ``classifier``.

    Parameters (defaults reproduce the original hard-coded architecture)
    ----------
    embedding_dim : int, default 1280
        Size of the embedding vector.
    ancestor_dim : int, default 2179
        Width of the ancestor one-hot vector.
    phage_dim : int, default 1625
        Width of the phage one-hot vector.
    n_classes : int, default 127
        Number of output logits.
    hidden_dim : int, default 512
        LSTM hidden size and hidden width of every branch.
    branch_dim : int, default 256
        Output width of every branch and of ``pre_classifier``.
    """

    def __init__(self, embedding_dim=1280, ancestor_dim=2179, phage_dim=1625,
                 n_classes=127, hidden_dim=512, branch_dim=256):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        # Bidirectional => the LSTM emits 2 * hidden_dim features per step.
        # Unlike the two dense branches, this branch has no trailing ReLU.
        self.lstm_branch = nn.Sequential(
            nn.ReLU(),
            nn.Linear(2 * hidden_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=branch_dim),
        )
        self.fnn_branch1 = self._dense_branch(ancestor_dim, hidden_dim, branch_dim)
        self.fnn_branch2 = self._dense_branch(phage_dim, hidden_dim, branch_dim)
        # Input is 3 * branch_dim because the three branch outputs are concatenated.
        self.pre_classifier = nn.Sequential(
            nn.Linear(3 * branch_dim, branch_dim),
            nn.ReLU(),
        )
        self.classifier = nn.Linear(branch_dim, n_classes)

    @staticmethod
    def _dense_branch(in_features, hidden_dim, out_features):
        """Build the Linear-ReLU-Linear-ReLU stack shared by both one-hot branches."""
        return nn.Sequential(
            nn.Linear(in_features, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_features),
            nn.ReLU(),
        )

    def forward(self, embeddings, ancestor, prophage):
        """Return the class logits, shape ``(batch_size, n_classes)``.

        ``embeddings`` is expected as ``(batch_size, embedding_dim)``;
        ``ancestor`` and ``prophage`` as ``(batch_size, ancestor_dim)`` and
        ``(batch_size, phage_dim)``.
        """
        # The LSTM wants (batch, seq_len, features): add a seq_len axis of size 1 ...
        lstm_out, _ = self.lstm(embeddings.unsqueeze(1))
        # ... and drop it again. squeeze(1) only touches that axis, so a batch
        # of size 1 keeps its batch dimension.
        lstm_out = self.lstm_branch(lstm_out.squeeze(1))
        fnn_out1 = self.fnn_branch1(ancestor)
        fnn_out2 = self.fnn_branch2(prophage)
        out = torch.cat((lstm_out, fnn_out1, fnn_out2), dim=1)
        out = self.pre_classifier(out)
        return self.classifier(out)


In [None]:
# *****************************************************************************
# Train / Eval : 

# Adding the weights :
# each class is weighted by 1 / sqrt(number of samples of that class).
class_samples = np.bincount(all_data.KL_type_LCA)
class_weights = 1. / torch.tensor(np.sqrt(class_samples), dtype=torch.float)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # Use GPU if available

model = MultiBranchModel()
model = model.to(device)  # Move model to device

# Fail early with a readable message if the output layer does not match the labels.
if model.classifier.out_features != len(class_samples):
    raise ValueError(
        f"model outputs {model.classifier.out_features} classes "
        f"but the labels define {len(class_samples)}"
    )

# define loss function
# The weight tensor must live on the same device as the logits, otherwise the
# loss raises a device-mismatch error when running on GPU.
criterion = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)  # Decrease learning rate every 30 epochs by 0.1

history = {
    "train_loss": [],
    "train_accuracy": [],
    "val_loss": [],
    "val_accuracy": [],
    "val_auprc": {},  # Store AUPRC for each class
    "val_auroc": {},  # Store AUROC for each class
    "confusion_matrix": {},  # Store confusion matrix for each class
}

for epoch in range(100):
    # Switch model to training mode
    model.train()
    epoch_loss = 0.0
    train_correct = 0
    train_seen = 0
    for embeddings, ancestor, prophage, labels in train_loader:
        embeddings = embeddings.to(device)
        ancestor = ancestor.to(device)
        prophage = prophage.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(embeddings, ancestor, prophage)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        train_correct += (outputs.argmax(dim=1) == labels).sum().item()
        train_seen += labels.size(0)
    epoch_loss /= len(train_loader)  # mean of the per-batch losses
    history["train_loss"].append(epoch_loss)
    history["train_accuracy"].append(train_correct / train_seen)
    scheduler.step()

    # Switch model to evaluation mode
    model.eval()
    val_loss = 0.0
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for embeddings, ancestor, prophage, labels in test_loader:
            embeddings = embeddings.to(device)
            ancestor = ancestor.to(device)
            prophage = prophage.to(device)
            labels = labels.to(device)

            outputs = model(embeddings, ancestor, prophage)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_predictions.append(outputs.cpu().numpy())  # raw logits
            val_true_labels.append(labels.cpu().numpy())
    # Average over the batches of the loader that was actually iterated
    # (the previous `val_loader` name was never defined).
    val_loss /= len(test_loader)
    history["val_loss"].append(val_loss)
    val_predictions = np.concatenate(val_predictions, axis=0)
    val_true_labels = np.concatenate(val_true_labels, axis=0)
    # float(...) keeps the history JSON-serialisable.
    history["val_accuracy"].append(float((val_predictions.argmax(axis=1) == val_true_labels).mean()))
    # TODO: "val_auprc", "val_auroc" and "confusion_matrix" are not filled yet.
    # A macro average_precision_score needs binarised labels and must cope with
    # classes that are absent from the validation split.


In [None]:
# *****************************************************************************
# Save model after training
# torch.save and open() do not create missing directories; make sure the target
# exists so a finished training run is not lost to a FileNotFoundError.
output_dir = f"{path_work}/train_nn"
os.makedirs(output_dir, exist_ok=True)

# Model weights (state_dict only; the architecture comes from MultiBranchModel).
torch.save(model.state_dict(), f"{output_dir}/MultiDomain.LSTM.model")

# Training history, serialised as JSON.
with open(f"{output_dir}/MultiDomain.LSTM.model.out", "w") as outfile:
    json.dump(history, outfile)

