# The goal here is to correct the labels before fine-tuning ESM2
***
## I. Generate the DF_info and the DF_embeddings 
## II. Try out multiple algorithms : 
> NN : Graph Neural Networks  <br>
***
https://theaisummer.com/gnn-architectures/
***

### I.

In [42]:
import torch
from torch_geometric.data import Data

from torch import nn 
from torch.utils.data import Dataset , DataLoader
import torch.nn.functional as F
import torch.optim as optim

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder , label_binarize , OneHotEncoder
from sklearn.metrics import average_precision_score
import os 
import pandas as pd
import numpy as np
from tqdm import tqdm

from collections import Counter
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning) 


> Open the Dataframe

In [18]:
#path_work = "/home/conchae/prediction_depolymerase_tropism/prophage_prediction/depolymerase_decipher/ficheros_28032023"
path_work = "/media/concha-eloko/Linux/PPT_clean"

# Depolymerase metadata table (tab-separated, first line is the header).
DF_info = pd.read_csv(f"{path_work}/DF_Dpo.final.2705.tsv", sep="\t", header=0)

# Embedding table (comma-separated, no header): column 0 holds the
# depolymerase identifier and is renamed so it can serve as the merge key.
DF_embeddings = pd.read_csv(f"{path_work}/Dpo.2705.embeddings.ultimate.csv", sep=",", header=None)
DF_embeddings = DF_embeddings.rename(columns={0: "index"})

# Rows whose KL_type_LCA contains a "|" are set aside for re-labelling;
# only the remaining rows are joined with their embeddings.
has_pipe = DF_info["KL_type_LCA"].str.contains("\\|")
DF_info_filtered = DF_info[~has_pipe]
DF_info_ToReLabel = DF_info[has_pipe]
all_data = pd.merge(DF_info_filtered, DF_embeddings, on="index")

# Mind the over-representation of outbreaks: keep a single row per
# (Infected_ancestor, index, prophage_id) combination.
all_data = (
    all_data
    .drop_duplicates(subset=["Infected_ancestor", "index", "prophage_id"], keep="first")
    .reset_index(drop=True)
)


In [45]:
DF_embeddings

Unnamed: 0,index,1,2,3,4,5,6,7,8,9,...,1271,1272,1273,1274,1275,1276,1277,1278,1279,1280
0,ppt__2930,-0.000061,-0.017329,0.012884,0.037123,-0.123747,0.004186,-0.061367,-0.056718,-0.037215,...,0.098806,0.012989,-0.001155,0.139749,-0.030987,0.059306,0.107041,-0.041463,-0.085581,0.114973
1,ppt__3300,0.004044,0.040011,-0.001234,-0.095745,-0.058056,-0.002394,0.007648,-0.059740,0.060850,...,-0.020369,0.016287,0.062586,-0.024336,0.019276,0.069623,0.035261,-0.118962,0.035672,0.085582
2,ppt__1182,0.018767,0.068116,-0.009109,-0.012598,-0.107001,0.011569,-0.030943,-0.045359,0.048923,...,0.014524,-0.024645,0.071878,0.018206,0.042790,0.088410,0.031970,-0.124592,0.070040,0.065348
3,ppt__3540,-0.028261,-0.047253,-0.027340,-0.052824,-0.089644,-0.023079,0.094861,0.026104,0.024001,...,0.051728,0.005634,-0.077874,0.030336,-0.037648,0.050625,0.046142,-0.158841,-0.007670,0.034556
4,ppt__942,0.014863,0.028030,0.014927,-0.025997,-0.096138,0.016290,0.015008,-0.066254,0.077959,...,0.008521,-0.019820,0.123201,-0.040306,0.030893,0.051362,0.047316,-0.102698,0.044830,0.084530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3603,anubis__304,0.006264,0.006471,-0.031665,0.078502,-0.131247,0.077167,0.043005,-0.183636,-0.022181,...,0.044299,-0.061847,0.017696,0.054798,-0.035830,-0.030202,0.039051,-0.127020,-0.113630,0.211258
3604,anubis__1273,-0.019114,0.063302,0.006635,-0.060343,-0.034054,-0.003895,0.033920,-0.080352,0.073579,...,-0.004504,-0.007906,0.075141,-0.052423,0.027127,0.073984,0.030664,-0.096409,0.011906,0.124885
3605,anubis__1311,0.051261,0.067942,0.005061,-0.019131,-0.060296,0.000984,0.037515,-0.033887,0.091774,...,0.044678,0.052609,0.112994,-0.000592,0.027122,0.086020,0.013660,-0.055491,0.021665,0.049301
3606,anubis__1525,-0.010655,0.083864,0.009084,-0.042220,-0.066479,0.008724,0.010109,-0.078033,0.065285,...,0.020752,0.024543,0.071302,0.035980,0.012171,0.054399,0.032167,-0.151018,0.042541,0.035221


In [52]:
len(DF_embeddings[DF_embeddings["index"] == "ppt__942"].values[0][1:1281])

1280

***
## Build the Graph Data

> Indexation process (shall add the N phages to predict)

In [28]:
# Global node ordering: bacteria first, then prophages, then depolymerases,
# then one "Dpo_to_predict_<id>" placeholder per identifier in DF_info.
ancestor_nodes = all_data["Infected_ancestor"].unique().tolist()
phage_nodes = all_data["Phage"].unique().tolist()
dpo_nodes = all_data["index"].unique().tolist()
to_predict_nodes = [f"Dpo_to_predict_{n}" for n in DF_info["index"].unique().tolist()]
indexation = [*ancestor_nodes, *phage_nodes, *dpo_nodes, *to_predict_nodes]

# Node name -> position in `indexation`. If a name occurs more than once,
# the last position is the one that is kept.
dico_ID = {}
for position, node_name in enumerate(indexation):
    dico_ID[node_name] = position


> Make edge file 

In [58]:
# Build the edge list as [source_id, target_id] pairs, using the integer
# node ids stored in `dico_ID`.
edge_index = []

# Node A (bacteria) - Node B1 (prophage), followed by an A -> A self-loop.
# Both edges are appended once per row of `all_data`, so a pair that appears
# on several rows yields repeated edges.
# NOTE(review): confirm that the repeated edges and self-loops are intended.
for ancestor, phage in zip(all_data["Infected_ancestor"], all_data["Phage"]):
    ancestor_id = dico_ID[ancestor]
    edge_index.append([ancestor_id, dico_ID[phage]])
    edge_index.append([ancestor_id, ancestor_id])

# Node B1 - Node B2 (depolymerase).
# groupby(sort=False) yields the phages in order of first appearance and
# keeps the row order inside each group, so the edges come out in the same
# order as re-filtering `all_data` for every unique phage, without
# rescanning the whole frame each time.
for phage, dpo_ids in all_data.groupby("Phage", sort=False)["index"]:
    phage_id = dico_ID[phage]
    edge_index.extend([phage_id, dico_ID[dpo]] for dpo in dpo_ids)

# Transform into tensor (one row per edge, two columns):
edge_index_tensor = torch.tensor(edge_index, dtype=torch.long)

# Write file (no header, no row index):
numpy_array = edge_index_tensor.numpy()
df = pd.DataFrame(numpy_array)
df.to_csv(f"{path_work}/edge_index.csv", index=False, header=False)

> Make the node feature file : 

In [39]:
# Encode each KL type as an integer class id.
LE = LabelEncoder()
di = LE.fit_transform(all_data["KL_type_LCA"])

# KL type name -> integer code, listed in the encoder's class order.
encoded_classes = LE.transform(LE.classes_)
label_mapping = {kl_name: kl_code for kl_name, kl_code in zip(LE.classes_, encoded_classes)}
label_mapping

{'KL1': 0,
 'KL10': 1,
 'KL101': 2,
 'KL102': 3,
 'KL103': 4,
 'KL104': 5,
 'KL105': 6,
 'KL106': 7,
 'KL107': 8,
 'KL108': 9,
 'KL109': 10,
 'KL11': 11,
 'KL110': 12,
 'KL111': 13,
 'KL112': 14,
 'KL113': 15,
 'KL114': 16,
 'KL115': 17,
 'KL116': 18,
 'KL117': 19,
 'KL118': 20,
 'KL119': 21,
 'KL12': 22,
 'KL121': 23,
 'KL122': 24,
 'KL123': 25,
 'KL124': 26,
 'KL125': 27,
 'KL126': 28,
 'KL127': 29,
 'KL128': 30,
 'KL13': 31,
 'KL130': 32,
 'KL131': 33,
 'KL132': 34,
 'KL134': 35,
 'KL136': 36,
 'KL137': 37,
 'KL139': 38,
 'KL14': 39,
 'KL140': 40,
 'KL141': 41,
 'KL142': 42,
 'KL143': 43,
 'KL144': 44,
 'KL145': 45,
 'KL146': 46,
 'KL147': 47,
 'KL148': 48,
 'KL149': 49,
 'KL15': 50,
 'KL150': 51,
 'KL151': 52,
 'KL152': 53,
 'KL153': 54,
 'KL154': 55,
 'KL155': 56,
 'KL157': 57,
 'KL158': 58,
 'KL159': 59,
 'KL16': 60,
 'KL162': 61,
 'KL163': 62,
 'KL164': 63,
 'KL166': 64,
 'KL169': 65,
 'KL17': 66,
 'KL170': 67,
 'KL18': 68,
 'KL19': 69,
 'KL2': 70,
 'KL20': 71,
 'KL21': 72,
 'KL

In [59]:
# Build one feature row per node of `indexation`:
#   column 0      -> position of the node in `indexation`
#   column 1      -> encoded KL type for bacteria nodes, -1 otherwise
#   columns 2...  -> embedding values for depolymerase nodes, -1 otherwise
node_feature = []

# All lookups are prepared once, outside the loop. Recomputing `.unique()`,
# rebuilding the "Dpo_to_predict_*" list and boolean-masking the frames on
# every iteration made each of the ~27k iterations scan whole tables.

# Bacterium -> KL type; setdefault keeps the first row's label, matching the
# previous `.values[0]` lookup.
ancestor_to_kl = {}
for ancestor, kl_type in zip(all_data["Infected_ancestor"], all_data["KL_type_LCA"]):
    ancestor_to_kl.setdefault(ancestor, kl_type)

phage_names = set(all_data["Phage"].unique())
dpo_names = set(all_data["index"].unique())
to_predict_names = {f"Dpo_to_predict_{n}" for n in DF_info["index"].unique().tolist()}

# Depolymerase id -> embedding taken from column positions 1..1280 of
# DF_embeddings; the first matching row wins, as before.
embedding_by_dpo = {}
embedding_rows = DF_embeddings.iloc[:, 1:1281].values.tolist()
for dpo_id, embedding in zip(DF_embeddings["index"], embedding_rows):
    embedding_by_dpo.setdefault(dpo_id, embedding)

for index, item in tqdm(enumerate(indexation), total=len(indexation)):
    features = [index]
    # The branch order is significant when a name belongs to several groups.
    if item in ancestor_to_kl:
        features = features + [label_mapping[ancestor_to_kl[item]]] + [-1] * 1280
    elif item in phage_names:
        features = features + [-1] * 1281
    elif item in dpo_names:
        features = features + [-1] + embedding_by_dpo[item]
    elif item in to_predict_names:
        features = features + [-1] * 1281
    node_feature.append(features)

# Transform into tensor :
node_feature_tensor = torch.tensor(node_feature, dtype=torch.float)

# Write file (no header, no row index):
numpy_array = node_feature_tensor.numpy()
df = pd.DataFrame(numpy_array)
df.to_csv(f"{path_work}/node_features.csv", index=False, header=False)

26766it [02:01, 219.57it/s]


> Make the Y file : 

In [56]:
# Every edge listed in `edge_index` is a known link, so each one gets the
# label 1.
y_file = [1 for _ in range(len(edge_index))]

# Transform into tensor :
y_tensor = torch.tensor(y_file, dtype=torch.float)

# Write file (single column, no header, no row index):
numpy_array = y_tensor.numpy()
df = pd.DataFrame(numpy_array)
df.to_csv(f"{path_work}/y_file.csv", index=False, header=False)

***
## Create the Data instance 

In [60]:
import torch
from torch_geometric.data import Data

# Gather the tensors built in the previous cells.
x, y = node_feature_tensor, y_tensor
edge_index = edge_index_tensor

# Create the Data instance. The edge tensor is stored one edge per row, so it
# is transposed (and made contiguous) before being handed to Data.
data = Data(x=x, edge_index=edge_index.t().contiguous(), y=y)

# Print out the data instance
print(data)

Data(x=[26766, 1282], edge_index=[2, 19354], y=[19354])


In [47]:
from torch_geometric.data import Dataset, Data
import torch
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

class CustomDataset(Dataset):
    """Single-graph dataset assembled from three raw CSV files.

    The raw files are 'ancestors.csv' (read for its 'KL_type' column),
    'prophages.csv' (read for its columns named '1' to '1280') and
    'edges.csv' (read as the edge list). `process` turns them into one
    `Data` object saved as 'data.pt'; `get` reloads that object.
    """

    def __init__(self, root, transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """Names of the raw CSV files read by `process`."""
        return ['ancestors.csv', 'prophages.csv', 'edges.csv']

    @property
    def processed_file_names(self):
        """Name of the file written by `process`."""
        return ['data.pt']

    def download(self):
        """Do nothing: the raw files are expected to be present already."""
        pass

    @staticmethod
    def _pad_columns(features, width):
        """Right-pad a 2-D array with zero columns until it has `width` columns."""
        missing = width - features.shape[1]
        if missing <= 0:
            return features
        return np.pad(features, ((0, 0), (0, missing)), mode="constant")

    def process(self):
        """Read the raw files, build the `Data` object and save it."""
        # Read the data files
        ancestors = pd.read_csv(self.raw_paths[0])
        prophages = pd.read_csv(self.raw_paths[1])
        edges = pd.read_csv(self.raw_paths[2])

        # Dense one-hot encoding of the ancestor KL types. Recent scikit-learn
        # releases name the argument `sparse_output`; older ones only accept
        # `sparse`, so fall back when the new name is rejected.
        try:
            OHE = OneHotEncoder(sparse_output=False)
        except TypeError:
            OHE = OneHotEncoder(sparse=False)
        OHE.fit(ancestors[['KL_type']].values)

        # Convert the ancestor and prophage features to a PyG x feature matrix
        ancestor_features = OHE.transform(ancestors[['KL_type']].values)
        prophage_features = np.array(prophages[[str(i) for i in range(1, 1281)]].values.tolist())

        # Stacking rows requires both matrices to have the same number of
        # columns, which a one-hot matrix and a 1280-wide matrix generally do
        # not. Zero-pad the narrower one to the common width.
        width = max(ancestor_features.shape[1], prophage_features.shape[1])
        ancestor_features = self._pad_columns(ancestor_features, width)
        prophage_features = self._pad_columns(prophage_features, width)

        # Ancestor rows first, prophage rows after them.
        x = torch.tensor(np.concatenate([ancestor_features, prophage_features], axis=0), dtype=torch.float)

        # Convert the edge list (one edge per CSV row) to a PyG edge index
        edge_index = torch.tensor(edges.values, dtype=torch.long).t().contiguous()

        # In this case, there are no edge labels, so a zero placeholder is
        # stored for each edge.
        y = torch.tensor([0]*edges.shape[0], dtype=torch.long)

        # Save the data
        data = Data(x=x, edge_index=edge_index, y=y)
        torch.save(data, self.processed_paths[0])

    def len(self):
        """Return the number of graphs in the dataset (always one)."""
        return 1

    def get(self, idx):
        """Load and return the saved `Data` object; `idx` is not used."""
        # NOTE(review): recent PyTorch releases may default `torch.load` to
        # weights-only loading, which can refuse a pickled Data object;
        # confirm against the installed version.
        data = torch.load(self.processed_paths[0])
        return data

In [None]:
import torch
from torch_geometric.data import Data

# Toy example: a graph with 4 nodes: 1 bacterial strain (A), 1 prophage (B1),
# and 2 depolymerases (B2).
# Node indices: bacterial strain: 0, prophage: 1, depolymerases: 2, 3

# edge_index tensor, written one [source, target] pair per row
edge_index = torch.tensor([[0, 1],  # edge from bacterial strain to prophage
                           [1, 2],  # edge from prophage to first depolymerase
                           [1, 3]],  # edge from prophage to second depolymerase
                          dtype=torch.long)

# node feature tensor: one row per node, a single feature column
x = torch.tensor([[1.0],  # feature of bacterial strain (e.g., KL type)
                  [0.0],  # feature of prophage (no specific feature)
                  [-1.0],  # feature of first depolymerase (embedding representation)
                  [-1.0]],  # feature of second depolymerase (embedding representation)
                 dtype=torch.float)

# target variable tensor
# NOTE(review): y holds 4 values, which matches the 4 nodes and not the 3
# edges, yet the first comment describes an edge; confirm whether these
# labels are meant per node or per edge.
y = torch.tensor([1,  # connection between bacterial strain and prophage
                  0,  # no specific target variable for prophage
                  0, 0],  # no specific target variable for depolymerases
                 dtype=torch.float)

# create Data instance; edge_index is transposed so that each column is one edge
data = Data(x=x, edge_index=edge_index.t().contiguous(), y=y)

# print out the data instance
print(data)

In [None]:
import torch
from torch_geometric.data import Data

# edge_index tensor, written one [source, target] pair per row
# Node indices used below: bacterial strains 0-1, prophages 2-4,
# depolymerases 5-14.
edge_index = torch.tensor([
    [0, 2],  # edge from 1st bacterial strain to 1st prophage
    [0, 3],  # edge from 1st bacterial strain to 2nd prophage
    [1, 3],  # edge from 2nd bacterial strain to 2nd prophage
    [1, 4],  # edge from 2nd bacterial strain to 3rd prophage
    [2, 5], [2, 6],  # edges from 1st prophage to 1st and 2nd depolymerases
    [3, 7], [3, 8], [3, 9],  # edges from 2nd prophage to 3rd, 4th, 5th depolymerases
    [4, 10], [4, 11], [4, 12], [4, 13], [4, 14],  # edges from 3rd prophage to 6th-10th depolymerases
], dtype=torch.long)

# node feature tensor
# pad with zeros so that all nodes have the same number of features
x = torch.zeros(15, 1280)  # 15 nodes, maximum F is 1280
x[0, 0] = 1  # 1st bacterial strain with KL type 1
x[1, 0] = 2  # 2nd bacterial strain with KL type 2
# Prophage rows 2-4 are left as zeros.
x[5:15] = torch.randn(10, 1280)  # depolymerases with random embeddings

# target variable tensor
# here, we assume a binary link prediction task: does a link exist between a prophage and a bacterial strain?
# we need to structure this according to our specific needs and model
# for this example, let's assume we have a binary label for each prophage-bacterial strain pair
# NOTE(review): y holds 6 values (2 strains x 3 prophages), whereas
# edge_index lists 14 edges and x has 15 rows, so y aligns with neither the
# edges nor the nodes; confirm how the model is expected to index it.
y = torch.tensor([1,  # link exists between 1st bacterial strain and 1st prophage
                  1,  # link exists between 1st bacterial strain and 2nd prophage
                  0,  # no link between 1st bacterial strain and 3rd prophage
                  1,  # link exists between 2nd bacterial strain and 2nd prophage
                  0,  # no link between 2nd bacterial strain and 1st prophage
                  1],  # link exists between 2nd bacterial strain and 3rd prophage
                 dtype=torch.float)

# create Data instance; edge_index is transposed so that each column is one edge
data = Data(x=x, edge_index=edge_index.t().contiguous(), y=y)

# print out the data instance
print(data)

In [None]:
import torch
from torch.nn import Linear
from torch_geometric.nn import GCNConv

class GCN(torch.nn.Module):
    """Two graph-convolution layers followed by a linear classifier.

    The classifier is applied only to the nodes listed in
    `data.b1_indices`.

    Args:
        num_node_features: Number of input features per node.
        num_classes: Number of output scores per selected node.
        hidden_channels: Width of both convolution outputs and of the
            classifier input.
    """

    def __init__(self, num_node_features, num_classes, hidden_channels=128):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, hidden_channels)
        # The second convolution keeps the hidden width: the classifier
        # expects `hidden_channels` inputs, so emitting `num_classes` here
        # would make the classifier fail on a shape mismatch.
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.classifier = Linear(hidden_channels, num_classes)

    def forward(self, data):
        """Return class scores for the nodes selected by `data.b1_indices`.

        Args:
            data: Object exposing `x`, `edge_index` and `b1_indices`.

        Returns:
            The classifier output, one row per selected node.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()

        # Here, we assume that we're only interested in the output of the B1 nodes
        # For this, we'll gather their outputs using their indices in `data`
        x_b1 = x[data.b1_indices]
        out = self.classifier(x_b1)

        return out

A1: Inductive Learning

In [None]:
from torch_geometric.data import DataLoader

# Define your model and optimizer
# NOTE(review): num_node_features=1280 and num_classes=3 are placeholder
# values; confirm them against the real feature matrix and label set.
model = GCN(num_node_features=1280, num_classes=3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Create DataLoader instances for your training and test datasets
# (`train_dataset` and `test_dataset` must be defined beforehand).
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# Training loop
model.train()
for epoch in range(100):  # 100 epochs
    for data in train_loader:
        optimizer.zero_grad()
        out = model(data)
        loss = torch.nn.functional.cross_entropy(out, data.y)
        loss.backward()
        optimizer.step()

# Evaluation: keep the output of every test batch. Assigning
# `predictions` inside the loop kept only the last batch.
model.eval()
batch_predictions = []
with torch.no_grad():
    for data in test_loader:
        batch_predictions.append(model(data))

# One row per predicted node, batches stacked in loader order.
predictions = torch.cat(batch_predictions, dim=0) if batch_predictions else torch.empty(0)

A2: Transductive Learning

In [None]:
# Transductive setting: a single graph, with masks selecting the nodes used
# for training and the nodes to predict.
model = GCN(num_node_features=1280, num_classes=3)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(100):  # 100 epochs
    optimizer.zero_grad()
    out = model(data)
    # The loss is computed on the training nodes only.
    train_mask = data.train_mask
    loss = torch.nn.functional.cross_entropy(out[train_mask], data.y[train_mask])
    loss.backward()
    optimizer.step()

# Score the graph without tracking gradients, then keep the rows selected
# by the test mask.
model.eval()
with torch.no_grad():
    predictions = model(data)
    unlabeled_predictions = predictions[data.test_mask]

In [None]:
Ok I do not think your answer makes sense to me for the following reasons : 
It seems to me that you forgot that the goal is to predict the link between the nodes A (bacteria to which are assigned KL_types) and the nodes B1 (prophages, with no features). So that renders your node feature file incorrect (unless I am wrong).
Secondly, I thought the y file had only "1" and "0" with n rows equal to the number of edges, and "0" not a label and "1" a label. Can you confirm/ correct the y file ? 




In [None]:
It is clearer. However I still have questions : 
1. If I am trying to predict the link between a node B1 and a node A that is unknown to me, which index should I use for node A?
2. So if I understand correctly, as I only know the edges that exist, all the values in the y file are going to be "1" ?
That leads to another question : how should I encode the fact that the links that I want to predict are between the nodes A and B1 ?


In [None]:
Let's dive into that a little more as it does not make complete sense to me. Each node A is assigned a KL_type. There are 127 KL_types in total. If I want the score, not only between a node B1 and a given node A but a node B1 with a KL_type, can I get the mean score of all the nodes for a given KL_type ? Or is there a better way to go about it ?


