In [19]:
import torch
from torch import nn 
from torch.utils.data import Dataset , DataLoader
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import matthews_corrcoef, precision_score, recall_score, accuracy_score

import os 
import pandas as pd
import numpy as np

from collections import Counter
import warnings
import random
warnings.filterwarnings("ignore") 


In [7]:
# *****************************************************************************
# Load the Dataframes :
#path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_work = "/media/concha-eloko/Linux/PPT_clean"

# Open the DF (tab-separated, first row is the header)
DF_info = pd.read_csv(f"{path_work}/DF_Dpo.final.2705.tsv", sep="\t", header=0)

# Open the embeddings (header-less CSV); column 0 is renamed to "index" so it
# can serve as the merge key with DF_info
DF_embeddings = pd.read_csv(f"{path_work}/Dpo.2705.embeddings.ultimate.csv", sep=",", header=None)
DF_embeddings = DF_embeddings.rename(columns={0: "index"})

# Filter the DF : rows whose KL_type_LCA contains a literal "|" are set aside
# in DF_info_ToReLabel; only the remaining rows are merged with the embeddings
has_pipe = DF_info["KL_type_LCA"].str.contains("\\|")
DF_info_filtered = DF_info[~has_pipe]
DF_info_ToReLabel = DF_info[has_pipe]
all_data = pd.merge(DF_info_filtered, DF_embeddings, on="index")


In [35]:
# *****************************************************************************
# Preprocess the dataframe :

# Remove "same" Dpos: keep the first row of every (Infected_ancestor, index) pair
all_data = all_data.drop_duplicates(subset=["Infected_ancestor", "index"], keep="first").reset_index(drop=True)

# Create new categories for the punctual phages and ancestors :
# a value that occurs only once is pooled under a shared "punctual_*" label
dico_infected_a = dict(Counter(all_data["Infected_ancestor"]))
dico_phage = dict(Counter(all_data["Phage"]))

all_data["labeled_ancestor"] = all_data["Infected_ancestor"].apply(lambda x: x if dico_infected_a[x] > 1 else "punctual_ancestor")
all_data["labeled_phage"] = all_data["Phage"].apply(lambda x: x if dico_phage[x] > 1 else "punctual_phage")

# Label / OneHotEncode encode the categories and the labels :
LE = LabelEncoder()
OHE = OneHotEncoder()

# The embedding values live in the integer-named columns 1..1280
embeddings = all_data[list(range(1, 1281))].values
# OHE is re-fitted by every fit_transform call, so after this cell it only
# remembers the categories of "labeled_phage"
KL_type_OHE = OHE.fit_transform(all_data[["KL_type_LCA"]]).toarray()
labeled_ancestor_OHE = OHE.fit_transform(all_data[["labeled_ancestor"]]).toarray()
labeled_phage_OHE = OHE.fit_transform(all_data[["labeled_phage"]]).toarray()

positive_data = {
    "embeddings": embeddings,
    "KL_types": KL_type_OHE,
    "ancestor": labeled_ancestor_OHE,
    "phage": labeled_phage_OHE,
}
# Every observed row is a positive example. The length must come from
# positive_data itself: `data` does not exist yet on a fresh kernel.
positive_data["labels"] = [1] * len(positive_data["embeddings"])



In [29]:
def generate_negative_samples(data, neg_pos_ratio, seed=None):
    """Build negative examples by re-pairing rows of the positive samples.

    Each negative sample keeps the ``embeddings`` and ``phage`` rows of one
    randomly drawn sample and receives the ``ancestor`` and ``KL_types`` rows
    of another randomly drawn sample whose ``KL_types`` row differs from the
    first one.

    Parameters
    ----------
    data : dict
        Must provide "embeddings", "KL_types", "ancestor" and "phage", each
        indexable by row position. Any other key is ignored.
    neg_pos_ratio : int or float
        ``int(len(data["embeddings"]) * neg_pos_ratio)`` negatives are drawn.
    seed : int, optional
        When given, a private ``random.Random(seed)`` is used so the result is
        reproducible. When None (default) the module-level ``random``
        generator is used, exactly as before.

    Returns
    -------
    dict
        Keys "embeddings", "KL_types", "ancestor", "phage", each a numpy array
        with one row per negative sample.

    Raises
    ------
    ValueError
        If negatives are requested but all ``KL_types`` rows are identical:
        no differing pair exists and rejection sampling would never finish.
    """
    rng = random if seed is None else random.Random(seed)

    pos_samples = len(data['embeddings'])
    neg_samples = int(pos_samples * neg_pos_ratio)

    kl_types = np.asarray(data["KL_types"])
    # Guard against an endless rejection loop below.
    if neg_samples > 0 and len(np.unique(kl_types[:pos_samples], axis=0)) < 2:
        raise ValueError(
            "Cannot generate negative samples: at least two distinct KL_types rows are required."
        )

    negative_data = {"embeddings": [], "KL_types": [], "ancestor": [], "phage": []}

    for _ in range(neg_samples):
        # Randomly select a pair from embeddings and phage
        emb_phage_index = rng.randint(0, pos_samples - 1)
        negative_data["embeddings"].append(data["embeddings"][emb_phage_index])
        negative_data["phage"].append(data["phage"][emb_phage_index])

        # Redraw the ancestor / KL_types row until its KL type differs
        anc_kl_index = rng.randint(0, pos_samples - 1)
        while np.all(kl_types[anc_kl_index] == kl_types[emb_phage_index]):
            anc_kl_index = rng.randint(0, pos_samples - 1)
        negative_data["ancestor"].append(data["ancestor"][anc_kl_index])
        negative_data["KL_types"].append(data["KL_types"][anc_kl_index])

    # Convert lists to numpy arrays for consistency
    return {key: np.array(rows) for key, rows in negative_data.items()}

# Draw two negatives per positive. The source must be positive_data: `data` is
# only built afterwards (from positive_data + negative_data), so passing it
# here fails on a fresh kernel or re-samples from already generated negatives.
negative_data = generate_negative_samples(positive_data, 2)
negative_data["labels"] = [0] * len(negative_data["embeddings"])


In [38]:
# For every field (including "labels"), stack the positive rows first and the
# negative rows after them.
data = {}
for field, positive_values in positive_data.items():
    data[field] = np.concatenate((positive_values, negative_data[field]))
data

{'embeddings': array([[-0.01841583,  0.02238694,  0.00239867, ..., -0.07089869,
          0.0160682 ,  0.06533931],
        [ 0.03601578,  0.00593843, -0.0435346 , ..., -0.13660194,
         -0.19137819,  0.13565759],
        [ 0.02600367,  0.02437204, -0.00023745, ..., -0.02442309,
         -0.02799782,  0.0280894 ],
        ...,
        [-0.01253937,  0.04425263, -0.02174671, ..., -0.07589217,
         -0.08787988,  0.10967518],
        [ 0.00080826,  0.01944812, -0.01075577, ..., -0.1129315 ,
         -0.11068641,  0.1909886 ],
        [ 0.02741208,  0.0601121 , -0.02414271, ..., -0.08106059,
          0.060077  ,  0.07810764]]),
 'KL_types': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 'ancestor': array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        

In [43]:
# *****************************************************************************
# The dataloader : 
class MultiDomainDataset(Dataset):
    """Torch dataset that serves the four input domains plus the label.

    ``data`` is a mapping with the keys "embeddings", "ancestor", "phage",
    "KL_types" and "labels". The first four are converted to float tensors,
    the labels to a long tensor.
    """

    def __init__(self, data):
        def as_float(key):
            return torch.tensor(data[key], dtype=torch.float)

        self.embeddings = as_float("embeddings")
        self.ancestor = as_float("ancestor")
        self.phage = as_float("phage")
        self.KLtypes = as_float("KL_types")
        self.labels = torch.tensor(data["labels"], dtype=torch.long)

    def __len__(self):
        # One item per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Order matters: (embeddings, ancestor, phage, KL types, label)
        return (
            self.embeddings[idx],
            self.ancestor[idx],
            self.phage[idx],
            self.KLtypes[idx],
            self.labels[idx],
        )

# Split the row indices once and apply them to every field, so embeddings,
# one-hot blocks and labels are guaranteed to stay aligned.
# 70% of the rows go to training; the held-out 30% is then divided 75% / 25%
# into validation / test.
all_indices = np.arange(len(data["labels"]))
train_idx, tmp_idx = train_test_split(all_indices, test_size=0.30, random_state=42)
val_idx, test_idx = train_test_split(tmp_idx, test_size=0.25, random_state=42)

# Four dicts for four names (the original unpacked only three dicts into four
# names, which raises ValueError).
X_train, X_tmp, X_val, X_test = {}, {}, {}, {}
for key, values in data.items():
    values = np.asarray(values)
    X_train[key] = values[train_idx]
    X_tmp[key] = values[tmp_idx]
    X_val[key] = values[val_idx]
    X_test[key] = values[test_idx]

train_singledata = MultiDomainDataset(X_train)
test_singledata = MultiDomainDataset(X_test)
val_singledata = MultiDomainDataset(X_val)

# Only the training batches are shuffled; evaluation metrics do not depend on
# batch order.
train_loader = DataLoader(train_singledata, batch_size=12, shuffle=True, num_workers=4)
test_loader = DataLoader(test_singledata, batch_size=12, shuffle=False, num_workers=4)
val_loader = DataLoader(val_singledata, batch_size=12, shuffle=False, num_workers=4)


In [47]:
# Width of one KL_types row (number of one-hot columns)
data["KL_types"].shape[1]

127

In [None]:
# *****************************************************************************
# The model : 
class MultiBranchModel(nn.Module):
    """Four-branch classifier over embeddings, ancestor, phage and KL types.

    The embedding vector is passed through a 2-layer bidirectional LSTM as a
    length-1 sequence and then through a small MLP. The ancestor and phage
    vectors each go through a two-layer MLP, and the KL-type vector through a
    single linear layer. The four branch outputs are concatenated and
    classified.

    Parameters
    ----------
    embedding_dim : int
        Width of the embedding input (default 1280).
    ancestor_dim : int
        Width of the ancestor input (default 2179).
    phage_dim : int
        Width of the phage input (default 1625).
    kl_dim : int
        Width of the KL-type input (default 127).
    lstm_hidden_size : int
        Hidden size of each LSTM direction (default 512).
    n_classes : int
        Number of output logits (default 2).

    The defaults reproduce the previously hard-coded architecture. The input
    widths follow the number of one-hot categories in the data, so pass the
    actual widths whenever the encoding changes. Sub-module names are
    unchanged, so existing state dicts still load.
    """

    def __init__(self, embedding_dim=1280, ancestor_dim=2179, phage_dim=1625,
                 kl_dim=127, lstm_hidden_size=512, n_classes=2):
        super().__init__()
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=lstm_hidden_size,
                            num_layers=2, batch_first=True, bidirectional=True)
        self.lstm_branch = nn.Sequential(
            nn.ReLU(),
            # bidirectional => forward and backward hidden states are concatenated
            nn.Linear(lstm_hidden_size * 2, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=256)
        )
        self.fnn_branch2 = nn.Sequential(
            nn.Linear(ancestor_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )
        self.fnn_branch3 = nn.Sequential(
            nn.Linear(phage_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU()
        )
        self.fnn_branch4 = nn.Sequential(
            nn.Linear(kl_dim, kl_dim),
        )
        self.pre_classifier = nn.Sequential(
            # three 256-wide branches plus the KL branch
            nn.Linear(256 * 3 + kl_dim, 256),
            nn.ReLU()
        )
        self.classifier = nn.Linear(256, n_classes)

    def forward(self, embeddings, ancestor, phage, KLtypes):
        """Return the raw class logits, shape (batch, n_classes)."""
        # unsqueeze embeddings to add sequence length dimension
        embeddings = embeddings.unsqueeze(1)  # (batch, embedding_dim) -> (batch, 1, embedding_dim)
        lstm_out, _ = self.lstm(embeddings)
        lstm_out = lstm_out.squeeze(1)  # squeeze out the sequence length dimension
        lstm_out = self.lstm_branch(lstm_out)  # Apply the remaining layers
        fnn_out2 = self.fnn_branch2(ancestor)
        fnn_out3 = self.fnn_branch3(phage)
        fnn_out4 = self.fnn_branch4(KLtypes)
        out = torch.cat((lstm_out, fnn_out4, fnn_out2, fnn_out3), dim=1)
        out = self.pre_classifier(out)
        out = self.classifier(out)
        return out


In [None]:
# *****************************************************************************
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Your Model
model = MultiBranchModel().to(device)

# Loss Function and Optimizer
# The model emits two logits per sample and the labels are class indices (0/1),
# so cross-entropy is the matching loss. BCEWithLogitsLoss requires input and
# target of the same shape and fails on (batch, 2) logits vs (batch,) labels.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


def _to_numpy_1d(values):
    """Return `values` (tensor, array or sequence) as a flat numpy array."""
    if torch.is_tensor(values):
        values = values.detach().cpu().numpy()
    return np.asarray(values).ravel()


# Function to calculate metrics
def calculate_metrics(preds, labels):
    """Compute MCC, precision, recall and accuracy of binary predictions.

    Both arguments may be torch tensors or numpy arrays / sequences of 0/1
    values. Returns the tuple (mcc, precision, recall, accuracy).
    """
    preds = _to_numpy_1d(preds)
    labels = _to_numpy_1d(labels)
    mcc = matthews_corrcoef(labels, preds)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    accuracy = accuracy_score(labels, preds)
    return mcc, precision, recall, accuracy


def evaluate(model, loader):
    """Run `model` over `loader` without gradients and return its metrics."""
    model.eval()
    preds, truths = [], []
    with torch.no_grad():
        for embeddings, ancestor, phage, KLtypes, labels in loader:
            embeddings, ancestor, phage, KLtypes = embeddings.to(device), ancestor.to(device), phage.to(device), KLtypes.to(device)
            outputs = model(embeddings, ancestor, phage, KLtypes)
            # predicted class = index of the larger logit
            preds.extend(outputs.argmax(dim=1).cpu().numpy())
            truths.extend(labels.numpy())
    return calculate_metrics(np.array(preds), np.array(truths))


# Per-epoch record of the run; plain floats only so it can be dumped as JSON.
history = {"train_loss": [], "val_mcc": [], "val_precision": [], "val_recall": [], "val_accuracy": []}

# Training Loop
n_epochs = 20
for epoch in range(n_epochs):
    model.train()
    running_loss, seen = 0.0, 0
    for embeddings, ancestor, phage, KLtypes, labels in train_loader:
        embeddings, ancestor, phage, KLtypes, labels = embeddings.to(device), ancestor.to(device), phage.to(device), KLtypes.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(embeddings, ancestor, phage, KLtypes)
        loss = criterion(outputs, labels)  # labels are already torch.long class indices
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * labels.size(0)
        seen += labels.size(0)

    # Evaluation Loop
    mcc, precision, recall, accuracy = evaluate(model, val_loader)
    history["train_loss"].append(running_loss / max(seen, 1))
    history["val_mcc"].append(float(mcc))
    history["val_precision"].append(float(precision))
    history["val_recall"].append(float(recall))
    history["val_accuracy"].append(float(accuracy))
    print(f'Epoch {epoch+1}/{n_epochs}')
    print(f'Validation MCC: {mcc}, Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}')

# Testing Loop
mcc, precision, recall, accuracy = evaluate(model, test_loader)
history["test"] = {"mcc": float(mcc), "precision": float(precision), "recall": float(recall), "accuracy": float(accuracy)}
print(f'Test MCC: {mcc}, Precision: {precision}, Recall: {recall}, Accuracy: {accuracy}')
    


In [None]:
# *****************************************************************************
# Save model after training
import json

output_dir = f"{path_work}/train_nn"
# Neither torch.save nor open() creates missing directories.
os.makedirs(output_dir, exist_ok=True)

torch.save(model.state_dict(), f"{output_dir}/MultiDomain.LSTM.2207.model")

# The training record is expected in a global named `history`. If the training
# cell did not populate it, write an empty record instead of failing with a
# NameError after the weights have already been saved.
training_history = globals().get("history")
if training_history is None:
    print("No `history` found: writing an empty training record.")
    training_history = {}

with open(f"{output_dir}/MultiDomain.LSTM.model.out", "w") as outfile:
    json.dump(training_history, outfile)



In [None]:
#!/bin/bash
# SLURM submission script for the multi-branch NN training run.
# Every directive must start with "#SBATCH"; the job-name line previously
# read "#BATCH" and was therefore treated as a plain comment.
#SBATCH --job-name=MultibranchNN
#SBATCH --qos=short
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10
#SBATCH --mem=80gb
#SBATCH --time=1-00:00:00
#SBATCH --output=MultibranchNN%j.log

source /storage/apps/ANACONDA/anaconda3/etc/profile.d/conda.sh
conda activate torch_geometric

python /home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023/train_nn/script_files/NN/multibranchmodel.LSTM.py
