In [1]:
import torch
from torch.utils.data import Dataset
import os 
import pandas as pd
import warnings


In [45]:
path_work = "/home/robbyce/Documents/bioinformatics/PPT"

DF_info = pd.read_csv(f"{path_work}/DF_Dpo.final.2705.tsv", sep = "\t" ,  header = 0 )
DF_embeddings = pd.read_csv(f"{path_work}/Dpo.2705.embeddings.ultimate.csv", sep = ",", header= None )
DF_embeddings.rename(columns={0: 'index'}, inplace=True)

merged_embeddings = pd.merge(DF_info["index"] , DF_embeddings , on = "index")
#merged_embeddings = merged_embeddings.drop("index" , axis = 1)
#merged_embeddings.set_index("index" , inplace = True)


> Single domain model eg 

In [42]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn 

import pandas as pd

class MultiDomainDataset(Dataset):
    def __init__(self, merged_embeddings, DF_info):
        self.embeddings = merged_embeddings
        self.ancestor = DF_info["Infected_ancestor"]
        self.prophage_id = DF_info["prophage_id"]
        self.prophage_instance = DF_info["Phage"]
        self.labels = DF_info["KL_type_LCA"]


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item_domain1 = self.embeddings.iloc[idx]
        item_domain2 = self.ancestor.iloc[idx]
        item_domain3 = self.prophage_id.iloc[idx]
        item_domain4 = self.prophage_instance.iloc[idx]
        item_domain5 = self.labels.iloc[idx]
        return item_domain1, item_domain2, item_domain3 , item_domain4 , item_domain5
    
multi_domain_dataset = MultiDomainDataset(merged_embeddings, DF_info)
dataloader = DataLoader(multi_domain_dataset, batch_size=4, shuffle=True, num_workers=4)


In [43]:
multi_domain_dataset.__getitem__(1)

(index    anubis__0
 1         0.036016
 2         0.005938
 3        -0.043535
 4         0.008279
            ...    
 1276     -0.012549
 1277      0.092624
 1278     -0.136602
 1279     -0.191378
 1280      0.135658
 Name: 1, Length: 1281, dtype: object,
 'GCF_016651625.1',
 'prophage_12186',
 'GCF_016651625.1__phage29',
 'KL43')

In [None]:
class SingleDomainModel(nn.Module):
    def __init__(self):
        super(SingleDomainModel, self).__init__()
        self.layer1 = nn.Linear(in_features=10, out_features=20)
        self.layer2 = nn.Linear(in_features=20, out_features=1)

    def forward(self, domain1_data):
        out = F.relu(self.layer1(domain1_data))
        out = self.layer2(out)
        return out

In [12]:
class MultiDomainModel(nn.Module):
    def __init__(self):
        super(MultiDomainModel, self).__init__()
        self.branch_embeddings = nn.Linear(in_features=10, out_features=20)
        self.branch_ancestor = nn.Linear(in_features=10, out_features=20)
        self.branch_prophage_id = nn.Linear(in_features=10, out_features=20)
        self.branch_prophage_instance = nn.Linear(in_features=10, out_features=20)
        self.final_layer = nn.Linear(in_features=80, out_features=1)
        
    def forward(self, domain1_data, domain2_data, domain3_data):
        out1 = F.relu(self.branch_embeddings(domain1_data))
        out2 = F.relu(self.branch_ancestor(domain2_data))
        out3 = F.relu(self.branch_prophage_id(domain3_data))
        out = torch.cat((out1, out2, out3), dim=1)
        out = self.final_layer(out)
        return out

class MultiDomainModel(nn.Module):
    def __init__(self):
        super(MultiDomainModel, self).__init__()
        self.branch1 = nn.Sequential(
            nn.Linear(in_features=10, out_features=20),
            nn.ReLU(),
            nn.Linear(in_features=20, out_features=30),
            nn.ReLU()
        )
        self.branch2 = nn.Sequential(
            nn.Linear(in_features=10, out_features=20),
            nn.ReLU(),
            nn.Linear(in_features=20, out_features=30),
            nn.ReLU()
        )
        self.branch3 = nn.Sequential(
            nn.Linear(in_features=10, out_features=20),
            nn.ReLU(),
            nn.Linear(in_features=20, out_features=30),
            nn.ReLU()
        )
        self.final_layer = nn.Linear(in_features=90, out_features=1)

    def forward(self, domain1_data, domain2_data, domain3_data):
        out1 = self.branch1(domain1_data)
        out2 = self.branch2(domain2_data)
        out3 = self.branch3(domain3_data)
        out = torch.cat((out1, out2, out3), dim=1)
        out = self.final_layer(out)
        out = F.softmax(out, dim=1)
        return out

In [None]:
model = MultiDomainModel()

for domain1_data, domain2_data, domain3_data, labels in dataloader:
    outputs = model(domain1_data, domain2_data, domain3_data)
    loss = criterion(outputs, labels)

***
# Explore the data

In [8]:
from collections import Counter

occurences_prophage = dict(Counter(DF_info["Phage"].tolist()))
counts = {value for key,value in occurences_prophage.items()}

In [9]:
counts

{1, 2, 3, 4, 5, 6, 7, 11, 13}

In [11]:
dico_effectif = {}
for count in counts :
    n = 0
    for key, value in occurences_prophage.items() :
        if value == count :
            n+=1
    dico_effectif[f"Effectif_{count}"] = n

In [12]:
dico_effectif

{'Effectif_1': 24180,
 'Effectif_2': 8877,
 'Effectif_3': 2925,
 'Effectif_4': 1468,
 'Effectif_5': 806,
 'Effectif_6': 147,
 'Effectif_7': 14,
 'Effectif_11': 1,
 'Effectif_13': 1}

In [35]:
test = DF_info[DF_info["index"] == "ppt__12"]
test.drop_duplicates(subset = [""])

In [33]:
DF_info["index"].tolist()

['ppt__0',
 'ppt__1',
 'ppt__2',
 'anubis__0',
 'ppt__3',
 'ppt__4',
 'ppt__5',
 'ppt__6',
 'ppt__7',
 'ppt__8',
 'ppt__6',
 'ppt__5',
 'ppt__9',
 'ppt__10',
 'ppt__11',
 'ppt__12',
 'ppt__13',
 'ppt__14',
 'ppt__15',
 'ppt__16',
 'ppt__17',
 'ppt__18',
 'anubis__5',
 'ppt__19',
 'anubis__7',
 'ppt__20',
 'ppt__21',
 'ppt__22',
 'ppt__23',
 'ppt__24',
 'anubis__12',
 'ppt__25',
 'anubis__13',
 'ppt__26',
 'ppt__21',
 'ppt__27',
 'anubis__12',
 'ppt__25',
 'ppt__28',
 'ppt__29',
 'anubis__14',
 'ppt__30',
 'ppt__31',
 'ppt__32',
 'anubis__16',
 'ppt__33',
 'ppt__34',
 'minibatch__37',
 'minibatch__139',
 'ppt__35',
 'ppt__36',
 'ppt__37',
 'minibatch__847',
 'ppt__38',
 'ppt__39',
 'ppt__40',
 'anubis__17',
 'ppt__41',
 'ppt__42',
 'ppt__43',
 'ppt__44',
 'ppt__3',
 'ppt__45',
 'ppt__46',
 'ppt__47',
 'ppt__48',
 'ppt__49',
 'ppt__50',
 'ppt__51',
 'ppt__52',
 'ppt__53',
 'ppt__54',
 'anubis__18',
 'ppt__55',
 'ppt__56',
 'ppt__57',
 'ppt__58',
 'ppt__37',
 'ppt__59',
 'ppt__60',
 'ppt_

In [29]:
dict(Counter(DF_info[DF_info["prophage_id"] == "prophage_18"]["index"].tolist()))

{'ppt__19': 1,
 'anubis__7': 1,
 'ppt__476': 22,
 'ppt__65': 1,
 'ppt__630': 11,
 'ppt__298': 1,
 'ppt__1969': 1,
 'minibatch__1165': 1,
 'ppt__2969': 1,
 'ppt__2970': 1,
 'ppt__2971': 1,
 'ppt__63': 1,
 'anubis__47': 1,
 'ppt__76': 1,
 'ppt__3594': 1,
 'ppt__91': 1,
 'ppt__3027': 1,
 'minibatch__281': 1,
 'ppt__196': 1,
 'ppt__157': 1,
 'ppt__23': 2,
 'ppt__154': 1,
 'anubis__40': 1,
 'ppt__5052': 1,
 'ppt__316': 2,
 'ppt__5413': 1,
 'ppt__37': 1,
 'ppt__35': 1,
 'ppt__36': 1,
 'minibatch__604': 1,
 'anubis__11': 1}

In [18]:
DF_info["prophage_id"].unique()

array(['prophage_12186', 'prophage_117', 'prophage_924', ...,
       'prophage_15319', 'prophage_11944', 'prophage_11522'], dtype=object)