In [1]:
from model import LDM
import torch
import torch.nn as nn
from torch.distributions import Normal
import pandas as pd
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import dendrogram
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def load_data(path_to_csv, device):
    df = pd.read_csv(path_to_csv, index_col=0)
    Aij = torch.tensor(df.values, dtype=torch.float32).to(device)
    return Aij

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
csv_path = "/Users/christine/Bachelor/src/data/adj_matrix.csv" 
Aij_real = load_data(csv_path, device)
print(Aij_real.shape)

torch.Size([968, 3964])


In [3]:
#importing the data
feature_vec = pd.read_csv('/Users/christine/LatentDistanceModel/data/feature_vector.tsv', sep='\t')
feature_vec.drop(columns=['Unnamed: 0'], inplace=True)
feature_vec

Unnamed: 0,0,Chewing gum,Inhal,Inhal.aerosol,Inhal.powder,Inhal.solution,N,O,P,R,...,V08AB05,V08AB06,V08AB07,V08AB09,V08CA03,V08CA04,V08CA06,V08CA08,V08CA09,V09AB03
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1090,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
#find indexing for the two vectors
adj_matrix_names = pd.read_csv('/Users/christine/LatentDistanceModel/data/adj_matrix_names.csv', sep='\t')
adj_matrix_idx = adj_matrix_names['Stitch flat']
adj_matrix_idx = adj_matrix_idx.to_numpy()

#index of feature vector
feature_vector_names = pd.read_csv('/Users/christine/LatentDistanceModel/data/feature_vector_names.tsv', sep = '\t', )
feature_vector_names['ID Adm.Rs'] = feature_vector_names['ID Adm.Rs'].str.split('_').str[0]
feature_vector_idx = feature_vector_names['ID Adm.Rs'].to_numpy()
adj_matrix_idx, feature_vector_idx

(array(['CID100000085', 'CID100000137', 'CID100000143', 'CID100000158',
        'CID100000159', 'CID100000160', 'CID100000191', 'CID100000214',
        'CID100000232', 'CID100000247', 'CID100000271', 'CID100000311',
        'CID100000444', 'CID100000450', 'CID100000453', 'CID100000581',
        'CID100000596', 'CID100000598', 'CID100000699', 'CID100000700',
        'CID100000727', 'CID100000738', 'CID100000750', 'CID100000772',
        'CID100000813', 'CID100000861', 'CID100000923', 'CID100000937',
        'CID100000942', 'CID100001003', 'CID100001065', 'CID100001125',
        'CID100001134', 'CID100001301', 'CID100001546', 'CID100001690',
        'CID100001727', 'CID100001775', 'CID100001805', 'CID100001971',
        'CID100001972', 'CID100001978', 'CID100001983', 'CID100001990',
        'CID100002019', 'CID100002022', 'CID100002082', 'CID100002083',
        'CID100002088', 'CID100002092', 'CID100002094', 'CID100002099',
        'CID100002118', 'CID100002130', 'CID100002153', 'CID1000

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_dim = 14
n_epochs = 100
Aij = torch.tensor([[0, 2, 0, 3, 1, 2, 0, 0, 2, 0, 1, 0], 
                    [0, 0, 2, 0, 1, 0, 3, 0, 0, 1, 0, 0],
                    [3, 3, 0, 0, 0, 1, 0, 3, 0, 0, 0, 1],
                    [3, 3, 0, 0, 0, 2, 0, 0, 1, 0, 1, 0],
                    [0, 0, 2, 0, 0, 0, 3, 0, 1, 0, 0, 0],
                    [1, 2, 0, 3, 1, 2, 0, 0, 2, 0, 1, 0], 
                    [0, 0, 2, 0, 1, 0, 0, 1, 0, 1, 0, 0],
                    [0, 3, 1, 0, 0, 1, 0, 3, 0, 0, 0, 1],
                    [3, 3, 0, 0, 0, 1, 0, 0, 0, 0, 1, 2],
                    [0, 0, 2, 1, 0, 0, 0, 0, 1, 0, 0, 0]],dtype=torch.float32, device=device)
lr = 0.01
seed = 20
ldm_trained = LDM(Aij, embedding_dim, device, n_epochs, lr, seed)
ldm_trained.train()
Aij_probs_true = ldm_trained.probit()  # Compute the probit probability matrix
loss_out = ldm_trained.train()
w, v = ldm_trained.get_embeddings()

In [19]:
f_vec = torch.tensor([[0, 1, 0, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0],
                      [0, 0, 1, 0, 0, 0, 0, 20, 0, 0, 0, 0, 0, 0, 0],
                      [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 300, 0, 0, 0], 
                      [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 300, 0, 0, 0], 
                      [0, 0, 0, 0, 1, 0, 0, 0, 0, 10, 0, 0, 0, 0, 0], 
                      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 0], 
                      [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 25, 0, 0],
                      [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30, 0, 0, 0, 0], 
                      [0, 0, 0, 1, 0, 0, 0, 0, 47, 0, 0, 0, 0, 0, 0], 
                      [0, 1, 0, 0, 0, 0, 34, 0, 0, 0, 0, 0, 0, 0, 0], 
                      [1, 0, 0, 0, 0, 18, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                      [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17]
                      ], dtype=torch.float32, device=device)
Aij_names = adj_matrix_idx[0:Aij.shape[0]]
Aij_idx = {drug_id: idx for idx, drug_id in enumerate(Aij_names)}
Aij_idx = np.array([Aij_idx[drug_id] for drug_id in Aij_names])
f_vec_idx = np.array([0, 0, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9])
print(f'Aij_idx: {Aij_idx}. There are {Aij.shape[0]} drugs in Aij and {len(Aij_idx)} drugs in the index.\nf_vec_idx: {f_vec_idx}. There are {f_vec.shape[0]} drugs in f_vec and {len(f_vec_idx)} drugs in the index.')

Aij_idx: [0 1 2 3 4 5 6 7 8 9]. There are 10 drugs in Aij and 10 drugs in the index.
f_vec_idx: [0 0 1 1 2 3 4 5 6 7 8 9]. There are 12 drugs in f_vec and 12 drugs in the index.


In [5]:
#for real data
Aij_dic = {drug_id: idx for idx, drug_id in enumerate(adj_matrix_idx)}
unique_drugs_Aij = pd.unique(adj_matrix_idx)
Aij_idx = np.array([Aij_dic[drug_id] for drug_id in adj_matrix_names['Stitch flat']])
unique_drugs_f = pd.unique(feature_vector_idx)
f_dic = {drug_id: idx for idx, drug_id in enumerate(unique_drugs_f)}
f_idx = np.array([f_dic[drug_id] for drug_id in feature_vector_names['ID Adm.Rs']])
f_idx

array([  0,   0,   1, ..., 742, 743, 744], shape=(1095,))

In [6]:
only_in_Aij = set(unique_drugs_Aij) - set(unique_drugs_f)
only_in_f = set(unique_drugs_f) - set(unique_drugs_Aij)
not_in_both = only_in_Aij.union(only_in_f)
print(f"Only in Aij: {only_in_Aij}\nOnly in feature vector: {only_in_f}\nNot in either: {not_in_both}")
print(f"{len(only_in_Aij)} are missing from feature vector")

Only in Aij: {'CID111234049', 'CID100644241', 'CID103081361', 'CID100160352', 'CID109887712', 'CID116158207', 'CID109941444', 'CID110163178', 'CID100042395', 'CID124812758', 'CID116126651', 'CID100000923', 'CID110107393', 'CID100065840', 'CID100005515', 'CID106918366', 'CID100069512', 'CID109831783', 'CID100002308', 'CID116004692', 'CID154677977', 'CID100003750', 'CID100082146', 'CID100003399', 'CID116220172', 'CID116131310', 'CID106918638', 'CID106918430', 'CID116134956', 'CID100062956', 'CID100160036', 'CID100125889', 'CID100047419', 'CID110324367', 'CID103006171', 'CID100107969', 'CID100063001', 'CID100158781', 'CID109966051', 'CID100003446', 'CID109831414', 'CID109800339', 'CID124762228', 'CID106331630', 'CID100023926', 'CID109912092', 'CID105328940', 'CID100061799', 'CID109940864', 'CID100656892', 'CID109865528', 'CID100134780', 'CID103086685', 'CID103086686', 'CID111304743', 'CID116139605', 'CID100004695', 'CID111597571', 'CID106477186', 'CID100003899', 'CID100062965', 'CID100214

In [20]:
#convert to tensor
feature_tensor = torch.tensor(feature_vec.astype(np.float32).to_numpy(), dtype=torch.float32)

In [21]:
class FeatureMapper(nn.Module):
    def __init__(self, input_dim, embedding_dim, dropout = 0.1):
        super(FeatureMapper, self).__init__()
        self.input_dim = input_dim
        self.embedding_dim = embedding_dim

        self.feature_net = nn.Sequential(
            nn.Linear(self.input_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(16, self.embedding_dim)
        )

    def forward(self, x):
        return self.feature_net(x)

In [22]:
mapper = FeatureMapper(input_dim=f_vec.shape[1], embedding_dim=w.shape[1])
optimizer = torch.optim.Adam(mapper.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
num_epochs = 100

w_frozen = ldm_trained.w.detach().clone()
f_vec_tensor = torch.tensor(f_vec, dtype=torch.float32)
drug_idx_tensor = torch.tensor(f_vec_idx, dtype=torch.long)
w_tensor = torch.tensor(w, dtype=torch.float32)

for epoch in range(num_epochs):
    mapper.train()
    optimizer.zero_grad()

    z_pred = mapper(f_vec_tensor)
    z_true = w_tensor[drug_idx_tensor]
    loss = loss_fn(z_pred, z_true)
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 9.3058
Epoch 1, Loss: 5.4871
Epoch 2, Loss: 3.8101
Epoch 3, Loss: 4.2218
Epoch 4, Loss: 3.0747
Epoch 5, Loss: 3.3336
Epoch 6, Loss: 3.2125
Epoch 7, Loss: 2.1656
Epoch 8, Loss: 2.3584
Epoch 9, Loss: 2.3068
Epoch 10, Loss: 2.5082
Epoch 11, Loss: 1.9071
Epoch 12, Loss: 1.7170
Epoch 13, Loss: 1.4988
Epoch 14, Loss: 1.3508
Epoch 15, Loss: 1.4064
Epoch 16, Loss: 1.3740
Epoch 17, Loss: 1.2781
Epoch 18, Loss: 1.4407
Epoch 19, Loss: 1.1021
Epoch 20, Loss: 1.4355
Epoch 21, Loss: 1.1805
Epoch 22, Loss: 0.9681
Epoch 23, Loss: 1.1237
Epoch 24, Loss: 1.0525
Epoch 25, Loss: 1.1001
Epoch 26, Loss: 1.0438
Epoch 27, Loss: 0.9619
Epoch 28, Loss: 1.0205
Epoch 29, Loss: 1.1321
Epoch 30, Loss: 0.9217
Epoch 31, Loss: 0.9684
Epoch 32, Loss: 1.0532
Epoch 33, Loss: 0.8894
Epoch 34, Loss: 0.9480
Epoch 35, Loss: 0.9277
Epoch 36, Loss: 1.1275
Epoch 37, Loss: 0.9097
Epoch 38, Loss: 0.8111
Epoch 39, Loss: 0.9594
Epoch 40, Loss: 0.9886
Epoch 41, Loss: 0.8919
Epoch 42, Loss: 0.8071
Epoch 43, Loss: 0.906

  f_vec_tensor = torch.tensor(f_vec, dtype=torch.float32)
  w_tensor = torch.tensor(w, dtype=torch.float32)


In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embedding_dim = 140
lr = 0.01
seed = 20
ldm_trained_r = LDM(Aij_real, embedding_dim, device, n_epochs, lr, seed)
ldm_trained_r.train()
Aij_probs_true_r = ldm_trained_r.probit()  # Compute the probit probability matrix
loss_out_r = ldm_trained_r.train()
w_r, v_r = ldm_trained_r.get_embeddings()

KeyboardInterrupt: 

In [None]:
w_frozen = ldm_trained.w.detach().clone()
feature_vec_tensor = torch.tensor(feature_vec.astype(np.float32).to_numpy(), dtype=torch.float32)
feature_idx_tensor = torch.tensor(f_idx, dtype=torch.long)
w_tensor = torch.tensor(w, dtype=torch.float32)