This notebook was prepared for the Digital Drug Twin Study.

In [None]:
# Mount Google Drive into the Colab runtime; every dataset and model path
# used in this notebook lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Modelling

In [None]:
 # Installing the pykeen
%%capture
pip install pykeen

### R-GCN: Preparing the dataset

In [None]:
from pykeen.pipeline import pipeline
from pykeen.triples import TriplesFactory
import pandas as pd
from pykeen.triples import TriplesFactory

# Build the PyKEEN triples factories shared by all R-GCN runs in this notebook.
DATASET_DIR = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets'
TRIPLE_COLUMNS = ['SUBJECT_CUI', 'PREDICATE', 'OBJECT_CUI']
SPLIT_SEED = 42  # fixed so the train/validation split is the same on every run

# Keep only the (subject, predicate, object) columns, in that order.
training_df = pd.read_csv(f'{DATASET_DIR}/positive_dataset.csv')[TRIPLE_COLUMNS]
test_df = pd.read_csv(f'{DATASET_DIR}/test_dataset.csv')[TRIPLE_COLUMNS]

# A factory over train + test triples is created only to obtain a single
# entity/relation -> id mapping, so both factories below share the same ids.
combined_triples = pd.concat([training_df, test_df]).values
combined_factory = TriplesFactory.from_labeled_triples(triples=combined_triples)

triples_factory = TriplesFactory.from_labeled_triples(
    triples=training_df.values,
    entity_to_id=combined_factory.entity_to_id,
    relation_to_id=combined_factory.relation_to_id,
)

testing = TriplesFactory.from_labeled_triples(
    triples=test_df.values,
    entity_to_id=combined_factory.entity_to_id,
    relation_to_id=combined_factory.relation_to_id,
)

# 90 % training / 10 % validation; without random_state the split would be
# drawn differently on every execution and results could not be compared.
training, validation = triples_factory.split(
    ratios=(.9, .1),
    random_state=SPLIT_SEED,
)

INFO:pykeen.utils:Using opt_einsum
INFO:pykeen.triples.splitting:done splitting triples to groups of sizes [50435, 7220]


### R-GCN Modelling

#### Layer: 01 - Embedding Dim: 100

In [None]:
# Train and evaluate an R-GCN with 1 layer and 100-dimensional embeddings.
# No interaction function is passed, so the model's default is used.
result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='RGCN',
    model_kwargs={
        'embedding_dim': 100,
        'num_layers': 1,
    },
    training_kwargs={
        'num_epochs': 100,  # upper bound; the early stopper may end training sooner
    },
    stopper='early',
    random_seed=42,  # fixed seed so this run can be reproduced and compared
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 10000)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 10000)
      )
    )
  )
  (self_loop): Linear(in_features=100, out_features=100, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) has parameters, but no reset_parameters.
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /root/.data/pykeen/checkpoints/best-model-weights-fd326622-bbd2-4554-9241-1529ee8b4f69.pt
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
 

Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.19s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.1834487534626039. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-fd326622-bbd2-4554-9241-1529ee8b4f69.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.09s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 20: 0.22486149584487536. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-fd326622-bbd2-4554-9241-1529ee8b4f69.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 20.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.11s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 30: 0.23767313019390582. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-fd326622-bbd2-4554-9241-1529ee8b4f69.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 30.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.24s seconds


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

KeyboardInterrupt: 

In [None]:
# Saving the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

import os

model = result.model

model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-100/01_100_trained_rgcn_model.pt'
# torch.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Only the state_dict is stored; re-loading it requires rebuilding an RGCN
# with the same configuration and the same training triples factory.
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-100/01_100_trained_rgcn_model.pt


In [None]:
# Loading the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch


model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-100/01_100_trained_rgcn_model.pt'

# Rebuild the architecture exactly as it was trained. The R-GCN keeps the
# graph edges (sources / targets / edge_types) in its state_dict, so the
# triples factory must be the *training split* the model was fitted on.
# Building it from combined_factory (train + test) gives 73775 edges versus
# the 64973 stored in the checkpoint and load_state_dict fails with a size
# mismatch.
loaded_model = RGCN(
    triples_factory=training,
    embedding_dim=100,
    num_layers=1,
)
# The checkpoint was written from a CUDA model; map_location lets it load on a
# CPU-only runtime as well (the freshly built model lives on the CPU).
loaded_model.load_state_dict(torch.load(model_path, map_location='cpu'))
print("Model reloaded successfully")

INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 10000)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 10000)
      )
    )
  )
  (self_loop): Linear(in_features=100, out_features=100, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) has parameters, but no reset_parameters.
  loaded_model.load_state_dict(torch.load(model_path))


RuntimeError: Error(s) in loading state_dict for RGCN:
	size mismatch for entity_representations.0.sources: copying a param with shape torch.Size([64973]) from checkpoint, the shape in current model is torch.Size([73775]).
	size mismatch for entity_representations.0.targets: copying a param with shape torch.Size([64973]) from checkpoint, the shape in current model is torch.Size([73775]).
	size mismatch for entity_representations.0.edge_types: copying a param with shape torch.Size([64973]) from checkpoint, the shape in current model is torch.Size([73775]).

In [None]:
# Saving the model representations as a NumPy array after training
import numpy as np

import os

import torch

trained_model = result.model
# The R-GCN layers contain Dropout(p=0.2) (see the model summary printed
# during training). Switch to eval mode so the exported embeddings are
# deterministic, and skip autograd bookkeeping since no gradients are needed.
trained_model.eval()
node_embeddings = trained_model.entity_representations[0]
with torch.no_grad():
    # Calling the representation without indices returns the embeddings of
    # all entities.
    node_embeddings_numpy = node_embeddings().cpu().detach().numpy()

embeddings_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-100/node_embeddings_100_1.npy'
# np.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
np.save(embeddings_path, node_embeddings_numpy)

#### Layer: 01 - Embedding Dim: 150

In [None]:
# Train and evaluate an R-GCN with 1 layer and 150-dimensional embeddings.
# No interaction function is passed, so the model's default is used.
result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='RGCN',
    model_kwargs={
        'embedding_dim': 150,
        'num_layers': 1,
    },
    training_kwargs={
        'num_epochs': 100,  # upper bound; the early stopper may end training sooner
    },
    stopper='early',
    random_seed=42,  # fixed seed so this run can be reproduced and compared
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 22500)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 22500)
      )
    )
  )
  (self_loop): Linear(in_features=150, out_features=150, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) has parameters, but no reset_parameters.
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /root/.data/pykeen/checkpoints/best-model-weights-a08baf7d-274e-4192-bd58-41a90f634b7e.pt
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
 

Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.78s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.1948060941828255. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-a08baf7d-274e-4192-bd58-41a90f634b7e.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.71s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 20: 0.20297783933518004. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-a08baf7d-274e-4192-bd58-41a90f634b7e.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 20.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.78s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 30: 0.2350415512465374. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-a08baf7d-274e-4192-bd58-41a90f634b7e.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 30.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.93s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 40: 0.25761772853185594. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-a08baf7d-274e-4192-bd58-41a90f634b7e.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 40.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.55s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 50: 0.27479224376731304. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-a08baf7d-274e-4192-bd58-41a90f634b7e.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 50.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.78s seconds


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.75s seconds
INFO:pykeen.stoppers.early_stopping:Stopping early at epoch 70. The best result 0.27479224376731304 occurred at epoch 50.
INFO:pykeen.stoppers.early_stopping:Re-loading weights from best epoch from /root/.data/pykeen/checkpoints/best-model-weights-a08baf7d-274e-4192-bd58-41a90f634b7e.pt
  self.model.load_state_dict(torch.load(self.best_model_path))


Evaluating on cuda:0:   0%|          | 0.00/1.58k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 1.53s seconds


In [None]:
# Saving the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

import os

model = result.model

model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-150/01_150_trained_rgcn_model.pt'
# torch.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Only the state_dict is stored; re-loading it requires rebuilding an RGCN
# with the same configuration and the same training triples factory.
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-150/01_150_trained_rgcn_model.pt


In [None]:
# Loading the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

# Define the checkpoint location here instead of relying on a `model_path`
# variable left over from the save cell, so this cell also works on its own.
model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-150/01_150_trained_rgcn_model.pt'

# Rebuild the architecture with the same hyper-parameters and the same
# training triples factory that were used for training; the state_dict can
# only be loaded into a model whose shapes match.
loaded_model = RGCN(
    triples_factory=result.training,
    embedding_dim=150,
    num_layers=1,
    interaction="DistMult",
)
# The checkpoint was written from a CUDA model; map_location lets it load on a
# CPU-only runtime as well (the freshly built model lives on the CPU).
loaded_model.load_state_dict(torch.load(model_path, map_location='cpu'))
print("Model reloaded successfully")

#### Layer: 01 - Embedding Dim: 200

In [None]:
# Train and evaluate an R-GCN with 1 layer and 200-dimensional embeddings.
# No interaction function is passed, so the model's default is used.
result = pipeline(
    training=training,
    testing=testing,
    validation=validation,
    model='RGCN',
    model_kwargs={
        'embedding_dim': 200,
        'num_layers': 1,
    },
    training_kwargs={
        'num_epochs': 100,  # upper bound; the early stopper may end training sooner
    },
    stopper='early',
    random_seed=42,  # fixed seed so this run can be reproduced and compared
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 40000)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 40000)
      )
    )
  )
  (self_loop): Linear(in_features=200, out_features=200, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) has parameters, but no reset_parameters.
INFO:pykeen.stoppers.early_stopping:Inferred checkpoint path for best model weights: /root/.data/pykeen/checkpoints/best-model-weights-d115e93c-28e0-49e0-8443-0491d7d3ed09.pt
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
 

Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.05s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.2025623268698061. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-d115e93c-28e0-49e0-8443-0491d7d3ed09.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.28s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 20: 0.2299168975069252. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-d115e93c-28e0-49e0-8443-0491d7d3ed09.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 20.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.99s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 30: 0.2443905817174515. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-d115e93c-28e0-49e0-8443-0491d7d3ed09.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 30.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.04s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 40: 0.2680747922437673. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-d115e93c-28e0-49e0-8443-0491d7d3ed09.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 40.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.00s seconds


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.04s seconds
INFO:pykeen.stoppers.early_stopping:Stopping early at epoch 60. The best result 0.2680747922437673 occurred at epoch 40.
INFO:pykeen.stoppers.early_stopping:Re-loading weights from best epoch from /root/.data/pykeen/checkpoints/best-model-weights-d115e93c-28e0-49e0-8443-0491d7d3ed09.pt
  self.model.load_state_dict(torch.load(self.best_model_path))


Evaluating on cuda:0:   0%|          | 0.00/1.58k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 1.53s seconds


In [None]:
# Saving the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

import os

model = result.model

model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-200/01_200_trained_rgcn_model.pt'
# torch.save does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Only the state_dict is stored; re-loading it requires rebuilding an RGCN
# with the same configuration and the same training triples factory.
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-200/01_200_trained_rgcn_model.pt


In [None]:
# Loading the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/01-200/01_200_trained_rgcn_model.pt'


# Rebuild the architecture before restoring weights: the checkpoint only holds
# a state_dict, so the constructor arguments must match the trained model
# (01-200 in this notebook's naming: 1 layer, 200 dimensions).
loaded_model = RGCN(
    triples_factory=result.training,
    embedding_dim=200,
    num_layers=1,
    interaction="DistMult",
)
# map_location="cpu" lets a checkpoint written from a CUDA model be read on a
# CPU-only runtime; weights_only=True restricts unpickling to tensor data.
loaded_model.load_state_dict(
    torch.load(model_path, map_location="cpu", weights_only=True)
)
print("Model reloaded successfully")

#### Layer: 02 - Embedding Dim: 100

In [None]:
# Train a 2-layer R-GCN with 100-dimensional embeddings.
# The interaction function is not passed, so the model's default is used.
result = pipeline(
    model='RGCN',
    model_kwargs=dict(
        embedding_dim=100,
        num_layers=2,
    ),
    training=training,
    validation=validation,
    testing=testing,
    training_kwargs=dict(num_epochs=100),  # upper bound; early stopping may end sooner
    stopper='early',
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 10000)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 10000)
      )
    )
  )
  (self_loop): Linear(in_features=100, out_features=100, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) has parameters, but no reset_parameters.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
  

Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.46s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.2154432132963989. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-c7d4439c-dbc6-4fd0-94d3-ab333fa5543f.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.45s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 20: 0.24826869806094182. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-c7d4439c-dbc6-4fd0-94d3-ab333fa5543f.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 20.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.21s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 30: 0.2693213296398892. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-c7d4439c-dbc6-4fd0-94d3-ab333fa5543f.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 30.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.21s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 40: 0.3025623268698061. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-c7d4439c-dbc6-4fd0-94d3-ab333fa5543f.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 40.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.19s seconds


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.20s seconds
INFO:pykeen.stoppers.early_stopping:Stopping early at epoch 60. The best result 0.3025623268698061 occurred at epoch 40.
INFO:pykeen.stoppers.early_stopping:Re-loading weights from best epoch from /root/.data/pykeen/checkpoints/best-model-weights-c7d4439c-dbc6-4fd0-94d3-ab333fa5543f.pt
  self.model.load_state_dict(torch.load(self.best_model_path))


Evaluating on cuda:0:   0%|          | 0.00/1.58k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.71s seconds


In [None]:
# Saving the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

# Destination on Drive for the 02-100 checkpoint.
model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-100/02_100_trained_rgcn_model.pt'

# Only the state_dict is written, so reloading requires rebuilding an RGCN
# with the same hyper-parameters before calling load_state_dict.
model = result.model
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-100/02_100_trained_rgcn_model.pt


In [None]:
# Loading the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-100/02_100_trained_rgcn_model.pt'


# Rebuild the architecture before restoring weights: the checkpoint only holds
# a state_dict, so the constructor arguments must match the trained model.
# The 02-100 run was trained with embedding_dim=100 and num_layers=2; any other
# values give parameter shapes/keys that load_state_dict cannot match.
loaded_model = RGCN(
    triples_factory=result.training,
    embedding_dim=100,
    num_layers=2,
    interaction="DistMult",
)
# map_location="cpu" lets a checkpoint written from a CUDA model be read on a
# CPU-only runtime; weights_only=True restricts unpickling to tensor data.
loaded_model.load_state_dict(
    torch.load(model_path, map_location="cpu", weights_only=True)
)
print("Model reloaded successfully")





#### Layer: 02 - Embedding Dim: 150

In [None]:
# Train a 2-layer R-GCN with 150-dimensional embeddings.
# The interaction function is not passed, so the model's default is used.
result = pipeline(
    model='RGCN',
    model_kwargs=dict(
        embedding_dim=150,
        num_layers=2,
    ),
    training=training,
    validation=validation,
    testing=testing,
    training_kwargs=dict(num_epochs=100),  # upper bound; early stopping may end sooner
    stopper='early',
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 22500)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 22500)
      )
    )
  )
  (self_loop): Linear(in_features=150, out_features=150, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) has parameters, but no reset_parameters.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
  

Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.69s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.24889196675900277. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-5f5ca07c-8a0e-4e2f-bb50-1b1cb3959f9f.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.65s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 20: 0.2551246537396122. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-5f5ca07c-8a0e-4e2f-bb50-1b1cb3959f9f.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 20.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.67s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 30: 0.2835872576177285. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-5f5ca07c-8a0e-4e2f-bb50-1b1cb3959f9f.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 30.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.64s seconds


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.67s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 50: 0.30692520775623267. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-5f5ca07c-8a0e-4e2f-bb50-1b1cb3959f9f.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 50.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.67s seconds


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 2.66s seconds
INFO:pykeen.stoppers.early_stopping:Stopping early at epoch 70. The best result 0.30692520775623267 occurred at epoch 50.
INFO:pykeen.stoppers.early_stopping:Re-loading weights from best epoch from /root/.data/pykeen/checkpoints/best-model-weights-5f5ca07c-8a0e-4e2f-bb50-1b1cb3959f9f.pt
  self.model.load_state_dict(torch.load(self.best_model_path))


Evaluating on cuda:0:   0%|          | 0.00/1.58k [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 1.12s seconds


In [None]:
# Saving the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

# Destination on Drive for the 02-150 checkpoint.
model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-150/02_150_trained_rgcn_model.pt'

# Only the state_dict is written, so reloading requires rebuilding an RGCN
# with the same hyper-parameters before calling load_state_dict.
model = result.model
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-150/02_150_trained_rgcn_model.pt


In [None]:
# Loading the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-150/02_150_trained_rgcn_model.pt'


# Rebuild the architecture before restoring weights: the checkpoint only holds
# a state_dict, so the constructor arguments must match the trained model.
# The 02-150 run was trained with embedding_dim=150 and num_layers=2; building
# a 1-layer model here would not line up with the saved parameters.
loaded_model = RGCN(
    triples_factory=result.training,
    embedding_dim=150,
    num_layers=2,
    interaction="DistMult",
)
# map_location="cpu" lets a checkpoint written from a CUDA model be read on a
# CPU-only runtime; weights_only=True restricts unpickling to tensor data.
loaded_model.load_state_dict(
    torch.load(model_path, map_location="cpu", weights_only=True)
)
print("Model reloaded successfully")

#### Layer: 02 - Embedding Dim: 200

In [None]:
# Train a 2-layer R-GCN with 200-dimensional embeddings.
# The interaction function is not passed, so the model's default is used.
result = pipeline(
    model='RGCN',
    model_kwargs=dict(
        embedding_dim=200,
        num_layers=2,
    ),
    training=training,
    validation=validation,
    testing=testing,
    training_kwargs=dict(num_epochs=100),  # upper bound; early stopping may end sooner
    stopper='early',
)

INFO:pykeen.pipeline.api:Using device: None
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
INFO:pykeen.nn.message_passing:No num_bases was provided. Using sqrt(num_relations)=4.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 40000)
      )
    )
  )
  (bwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
        (_embeddings): Embedding(4, 40000)
      )
    )
  )
  (self_loop): Linear(in_features=200, out_features=200, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
) has parameters, but no reset_parameters.
  (fwd): BasesDecomposition(
    (relation_representations): LowRankRepresentation(
      (bases): Embedding(
  

Training epochs on cuda:0:   0%|          | 0/100 [00:00<?, ?epoch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 3.13s seconds
INFO:pykeen.stoppers.early_stopping:New best result at epoch 10: 0.22894736842105262. Saved model weights to /root/.data/pykeen/checkpoints/best-model-weights-e6f197ef-988a-4898-a722-0154d1758c57.pt
INFO:pykeen.training.training_loop:=> Saved checkpoint after having finished epoch 10.


Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

Training batches on cuda:0:   0%|          | 0/254 [00:00<?, ?batch/s]

In [None]:
# Saving the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

# Destination on Drive for the 02-200 checkpoint.
model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-200/02_200_trained_rgcn_model.pt'

# Only the state_dict is written, so reloading requires rebuilding an RGCN
# with the same hyper-parameters before calling load_state_dict.
model = result.model
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-150/02_150_trained_rgcn_model.pt


In [None]:
# Loading the model

from pykeen.pipeline import pipeline
from pykeen.models import RGCN
import torch

model_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Models/02-200/02_200_trained_rgcn_model.pt'


# Rebuild the architecture before restoring weights: the checkpoint only holds
# a state_dict, so the constructor arguments must match the trained model.
# The 02-200 run was trained with embedding_dim=200 and num_layers=2; building
# a 1-layer model here would not line up with the saved parameters.
loaded_model = RGCN(
    triples_factory=result.training,
    embedding_dim=200,
    num_layers=2,
    interaction="DistMult",
)
# map_location="cpu" lets a checkpoint written from a CUDA model be read on a
# CPU-only runtime; weights_only=True restricts unpickling to tensor data.
loaded_model.load_state_dict(
    torch.load(model_path, map_location="cpu", weights_only=True)
)
print("Model reloaded successfully")

### Getting the Node Representations

#### 01-100


In [None]:
# Saving the model representations as a NumPy array after training
import numpy as np

# `result` is rebound by every pipeline() cell, so this export reflects
# whichever run was executed last. Run it right after the run it belongs to.
trained_model = result.model
node_embeddings = trained_model.entity_representations[0]
node_embeddings_numpy = node_embeddings().cpu().detach().numpy()

# Fail fast rather than silently writing another run's vectors under the
# 01-100 file name. This catches a wrong embedding size only; it cannot tell
# apart runs that differ just in the number of layers.
if node_embeddings_numpy.shape[1] != 100:
    raise ValueError(
        f"Expected 100-dimensional embeddings for the 01-100 export, "
        f"got {node_embeddings_numpy.shape[1]}; re-run the matching training cell first."
    )

np.save('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-100/node_embeddings_100_1.npy', node_embeddings_numpy)

In [None]:
# Quick sanity check of the exported matrix: overall shape, embedding size,
# and the vectors of the first five entities.
print(f'Node Embeddings Shape: {node_embeddings_numpy.shape}')

# Read the dimension from the array already in memory instead of invoking the
# representation module a second time just to look at its output shape.
print(f'Embedding Dimension (from shape): {node_embeddings_numpy.shape[1]}')


node_names = list(result.training.entity_to_id.keys())  # Get node names
for name in node_names[:5]:  # Print the first 5 node names and their embeddings
    entity_id = result.training.entity_to_id[name]
    print(f'Embedding for node "{name}" (ID: {entity_id}): {node_embeddings_numpy[entity_id]}')


Node Embeddings Shape: (15518, 100)
Embedding Dimension (from shape): 100
Embedding for node "C0000096" (ID: 0): [-0.11289769  0.00999163 -0.39415392  0.19607046 -0.03759082 -0.20682581
 -0.08060362 -0.00707026  0.02205646  0.40811375 -0.365069    0.71018845
  0.09294644  0.00217488 -0.1133211   0.4646202   0.3484602   0.51715726
  0.17154676  0.39260274  0.837311    0.0165761   0.01905302 -0.6333302
 -0.01312902 -0.4055386   0.02670068 -0.35558492  0.4614056   0.43922892
  0.03548956  0.08128919 -0.13223088 -0.3134276   0.19797435  0.00767466
 -0.11503783 -0.08591489 -0.21942385  0.40063524  0.2113705   0.24561095
 -0.3038859   0.34154826 -0.0185794   0.22447497  0.19757234  0.09893186
 -0.01112168  0.45918602  0.06952477 -0.4362905  -0.31074387  0.2011947
  0.04621235  0.02038645 -0.18426879  0.2824593  -0.1117462   0.00733418
  0.06038496 -0.22983168 -0.3435099   0.14704192 -0.2952311   0.8874371
  0.07755027 -0.08342972  0.0484646   0.28922927 -0.00533857 -0.46373722
 -0.3478379   

In [None]:
# For saving the nodes as a csv file
# Claude 3.5 Sonnet Part: 14.10.2024

import numpy as np
import csv
import json

# Export every entity's embedding as one CSV row: the node name plus the
# vector serialised as a JSON array string.
# `result` is rebound by every pipeline() cell, so this export reflects
# whichever run was executed last. Run it right after the run it belongs to.
node_names = list(result.training.entity_to_id.keys())

trained_model = result.model
node_embeddings = trained_model.entity_representations[0]
node_embeddings_numpy = node_embeddings().cpu().detach().numpy()

# Fail fast rather than silently writing another run's vectors under the
# 01-100 file name. This catches a wrong embedding size only; it cannot tell
# apart runs that differ just in the number of layers.
if node_embeddings_numpy.shape[1] != 100:
    raise ValueError(
        f"Expected 100-dimensional embeddings for the 01-100 export, "
        f"got {node_embeddings_numpy.shape[1]}; re-run the matching training cell first."
    )

with open('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-100/node_embeddings_100_1_with_names_array.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(['Node Name', 'Embedding'])

    # Walk name/id pairs directly instead of looking each id up again by name
    for name, entity_id in result.training.entity_to_id.items():
        embedding = node_embeddings_numpy[entity_id]

        # JSON keeps the whole vector inside a single CSV field
        embedding_json = json.dumps(embedding.tolist())

        writer.writerow([name, embedding_json])

print("Node embeddings with names have been saved successfully.")

Node embeddings with names have been saved successfully.


#### 01-150

In [None]:
# Saving the model representations as a NumPy array after training
import numpy as np

# `result` is rebound by every pipeline() cell, so this export reflects
# whichever run was executed last. Run it right after the run it belongs to.
trained_model = result.model
node_embeddings = trained_model.entity_representations[0]
node_embeddings_numpy = node_embeddings().cpu().detach().numpy()

# Fail fast rather than silently writing another run's vectors under the
# 01-150 file name. This catches a wrong embedding size only; it cannot tell
# apart runs that differ just in the number of layers.
if node_embeddings_numpy.shape[1] != 150:
    raise ValueError(
        f"Expected 150-dimensional embeddings for the 01-150 export, "
        f"got {node_embeddings_numpy.shape[1]}; re-run the matching training cell first."
    )

np.save('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-150/node_embeddings_150_1.npy', node_embeddings_numpy)

In [None]:
# Quick sanity check of the exported matrix: overall shape, embedding size,
# and the vectors of the first five entities.
print(f'Node Embeddings Shape: {node_embeddings_numpy.shape}')
# Read the dimension from the array already in memory instead of invoking the
# representation module a second time just to look at its output shape.
print(f'Embedding Dimension (from shape): {node_embeddings_numpy.shape[1]}')


node_names = list(result.training.entity_to_id.keys())  # Get node names
for name in node_names[:5]:  # Print the first 5 node names and their embeddings
    entity_id = result.training.entity_to_id[name]
    print(f'Embedding for node "{name}" (ID: {entity_id}): {node_embeddings_numpy[entity_id]}')


Node Embeddings Shape: (15518, 150)
Embedding Dimension (from shape): 150
Embedding for node "C0000096" (ID: 0): [-0.65645707 -0.35677192  0.04226214 -0.5761852  -0.04424713 -0.3136382
 -0.18982059  0.42088905  0.2067413   0.34918132 -0.15542683  0.09093646
 -0.04941174 -0.6316112   0.16140373  0.5102934  -0.40652403  0.3959045
  0.2731803   0.1353044  -0.19180359 -0.02093718 -0.35321528  0.37403935
  0.40576613 -0.14315224  0.23171075 -0.33504188 -0.23487385 -0.15755114
 -0.19873437 -0.2863714   0.2192438  -0.01706611 -0.18424726  0.06720024
 -0.09231669 -0.78592026 -0.26768875 -0.40341887  0.44765636  0.23036312
  0.10213154 -0.22573395  0.24552006  0.4974863  -0.21962781  0.09780585
  0.16321018 -0.2170775   0.36787277  0.4343975   0.2179236   0.04961491
 -0.09486363 -0.2507771   0.03253027  0.28903842 -0.55439746 -0.04602926
  0.24576502  0.05057489 -0.01700646 -0.02531819 -0.541708    0.33237547
 -0.75003755 -0.08031674 -0.52622545  0.10108993  0.5384226  -0.4496648
 -0.06424493  

In [None]:
# For saving the nodes as a csv file
# Claude 3.5 Sonnet Part: 14.10.2024

import numpy as np
import csv
import json

# Export every entity's embedding as one CSV row: the node name plus the
# vector serialised as a JSON array string.
# `result` is rebound by every pipeline() cell, so this export reflects
# whichever run was executed last. Run it right after the run it belongs to.
node_names = list(result.training.entity_to_id.keys())

trained_model = result.model
node_embeddings = trained_model.entity_representations[0]
node_embeddings_numpy = node_embeddings().cpu().detach().numpy()

# Fail fast rather than silently writing another run's vectors under the
# 01-150 file name. This catches a wrong embedding size only; it cannot tell
# apart runs that differ just in the number of layers.
if node_embeddings_numpy.shape[1] != 150:
    raise ValueError(
        f"Expected 150-dimensional embeddings for the 01-150 export, "
        f"got {node_embeddings_numpy.shape[1]}; re-run the matching training cell first."
    )

with open('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-150/node_embeddings_150_1_with_names_array.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(['Node Name', 'Embedding'])

    # Walk name/id pairs directly instead of looking each id up again by name
    for name, entity_id in result.training.entity_to_id.items():
        embedding = node_embeddings_numpy[entity_id]

        # JSON keeps the whole vector inside a single CSV field
        embedding_json = json.dumps(embedding.tolist())

        writer.writerow([name, embedding_json])

print("Node embeddings with names have been saved successfully.")

Node embeddings with names have been saved successfully.


#### 01-200

In [None]:
# Saving the model representations as a NumPy array after training
import numpy as np

# `result` is rebound by every pipeline() cell, so this export reflects
# whichever run was executed last. Run it right after the run it belongs to.
trained_model = result.model
node_embeddings = trained_model.entity_representations[0]
node_embeddings_numpy = node_embeddings().cpu().detach().numpy()

# Fail fast rather than silently writing another run's vectors under the
# 01-200 file name. This catches a wrong embedding size only; it cannot tell
# apart runs that differ just in the number of layers.
if node_embeddings_numpy.shape[1] != 200:
    raise ValueError(
        f"Expected 200-dimensional embeddings for the 01-200 export, "
        f"got {node_embeddings_numpy.shape[1]}; re-run the matching training cell first."
    )

np.save('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-200/node_embeddings_200_1.npy', node_embeddings_numpy)

In [None]:
# Quick sanity check of the exported matrix: overall shape, embedding size,
# and the vectors of the first five entities.
print(f'Node Embeddings Shape: {node_embeddings_numpy.shape}')
# Read the dimension from the array already in memory instead of invoking the
# representation module a second time just to look at its output shape.
print(f'Embedding Dimension (from shape): {node_embeddings_numpy.shape[1]}')


node_names = list(result.training.entity_to_id.keys())  # Get node names
for name in node_names[:5]:  # Print the first 5 node names and their embeddings
    entity_id = result.training.entity_to_id[name]
    print(f'Embedding for node "{name}" (ID: {entity_id}): {node_embeddings_numpy[entity_id]}')


Node Embeddings Shape: (15518, 200)
Embedding Dimension (from shape): 200
Embedding for node "C0000096" (ID: 0): [ 4.81862664e-01  1.15577251e-01 -3.89112830e-01 -1.16667032e-01
  3.30075800e-01  3.41593295e-01  1.07627623e-01 -1.60101682e-01
  1.73498198e-01 -4.65439856e-01 -2.24998221e-02 -2.09287062e-01
 -4.87710759e-02 -8.70119259e-02  9.54701364e-01 -1.53305739e-01
 -2.09575087e-01 -1.00276709e+00 -3.67170662e-01 -2.81005412e-01
 -4.34104234e-01 -2.92191118e-01 -2.55441070e-01  1.41325817e-01
  1.14594489e-01  2.64843971e-01 -2.31377229e-01 -2.61286676e-01
 -3.68940830e-01  1.85181960e-01 -2.28712007e-01 -2.58183867e-01
  4.79366392e-01  6.62248671e-01  6.65932298e-01  4.15172428e-04
 -7.38146603e-02 -2.45148629e-01 -1.24390692e-01 -2.07830563e-01
  1.18526079e-01 -2.60365993e-01  1.84749827e-01  2.87288249e-01
  6.48682639e-02 -2.48235270e-01 -3.55777666e-02  9.17391106e-02
 -1.69480443e-01  1.05122373e-01  5.45383990e-03  5.36244988e-01
  5.51152825e-02 -1.00760065e-01  4.901920

In [None]:
# For saving the nodes as a csv file
# Claude 3.5 Sonnet Part: 14.10.2024

import numpy as np
import csv
import json

# Export every entity's embedding as one CSV row: the node name plus the
# vector serialised as a JSON array string.
# `result` is rebound by every pipeline() cell, so this export reflects
# whichever run was executed last. Run it right after the run it belongs to.
node_names = list(result.training.entity_to_id.keys())

trained_model = result.model
node_embeddings = trained_model.entity_representations[0]
node_embeddings_numpy = node_embeddings().cpu().detach().numpy()

# Fail fast rather than silently writing another run's vectors under the
# 01-200 file name. This catches a wrong embedding size only; it cannot tell
# apart runs that differ just in the number of layers.
if node_embeddings_numpy.shape[1] != 200:
    raise ValueError(
        f"Expected 200-dimensional embeddings for the 01-200 export, "
        f"got {node_embeddings_numpy.shape[1]}; re-run the matching training cell first."
    )

with open('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-200/node_embeddings_200_1_with_names_array.csv', mode='w', newline='') as file:
    writer = csv.writer(file)

    # Write the header
    writer.writerow(['Node Name', 'Embedding'])

    # Walk name/id pairs directly instead of looking each id up again by name
    for name, entity_id in result.training.entity_to_id.items():
        embedding = node_embeddings_numpy[entity_id]

        # JSON keeps the whole vector inside a single CSV field
        embedding_json = json.dumps(embedding.tolist())

        writer.writerow([name, embedding_json])

print("Node embeddings with names have been saved successfully.")

Node embeddings with names have been saved successfully.


### Getting the Drug Treatments

In [None]:
import pandas as pd

# Load the repoDB positive dataset
df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/positive_dataset_repoDB.csv')

total_elements_original = df.shape[0]

# One row per distinct OBJECT_CUI.
# NOTE(review): the column is renamed to 'UNIQUE_DRUG' although the grouping
# below treats OBJECT_CUI as the disease -- confirm the intended name.
df_unique = df.drop_duplicates(subset='OBJECT_CUI')
df_unique = df_unique.rename(columns={'OBJECT_CUI': 'UNIQUE_DRUG'})

# Collect drugs (SUBJECT_CUI) that treat each disease (OBJECT_CUI)
# Group by 'OBJECT_CUI' and aggregate 'SUBJECT_CUI' into a list
drug_treatments = df.groupby('OBJECT_CUI')['SUBJECT_CUI'].apply(list).reset_index()

drug_treatments.to_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/drug_treatments_dataset.csv', index=False)


drug_treatments.head()

# Sum of the per-group list lengths, i.e. the number of SUBJECT_CUI entries overall
total_subject_cui_count = drug_treatments['SUBJECT_CUI'].apply(len).sum()


print(f"Total number of elements in SUBJECT_CUI: {total_subject_cui_count}")
# The row count of the grouped frame is the number of distinct OBJECT_CUI values,
# not a SUBJECT_CUI count, so it gets its own label.
print(f"Total number of unique OBJECT_CUI groups: {drug_treatments.shape[0]}")

Total number of elements in SUBJECT_CUI: 7231
Total number of elements in SUBJECT_CUI: 1244


### Statistics


#### 01. 100-01

In [None]:
# Calculate pairwise drug distances and add them to the dataset (14.10.2024)
# Euclidean distance - Manhattan distance - cosine similarity
import pandas as pd
import numpy as np
import json
import csv
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

def read_embeddings_from_csv(file_path):
    """Load node embeddings from a two-column CSV file.

    The first row is treated as a header and skipped. Every following row
    with exactly two fields is read as ``name, embedding`` where the
    embedding field is a JSON array; rows with any other field count are
    ignored.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        dict mapping each name to a NumPy array built from its JSON array.
        An empty file yields an empty dict.
    """
    embeddings_dict = {}
    # newline='' is what the csv module asks for; it keeps line breaks inside
    # quoted fields intact and mirrors how the file is written.
    with open(file_path, mode='r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row (no error on an empty file)
        for row in reader:
            if len(row) == 2:
                name, embedding_json = row
                embeddings_dict[name] = np.array(json.loads(embedding_json))
    return embeddings_dict

# Input locations: node-embedding CSV and the grouped drug-treatment table
embeddings_file = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-100/node_embeddings_100_1_with_names_array.csv'
drug_dataset_file = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/drug_treatments_dataset.csv"

# name -> vector lookup used by calculate_drug_metrics, plus the table to annotate
embeddings = read_embeddings_from_csv(embeddings_file)
drug_df = pd.read_csv(drug_dataset_file)

def calculate_drug_metrics(drug_list):
    """Compute pairwise metrics between the embeddings of the given drugs.

    Each drug is looked up in the module-level ``embeddings`` dict; drugs
    that are missing (or whose embedding is empty) are ignored.

    Args:
        drug_list: Sequence of drug identifiers (keys of ``embeddings``).

    Returns:
        A 3-tuple ``(euclidean, manhattan, cosine)`` of dicts, each keyed by
        the sorted ``(drug_a, drug_b)`` tuple. All three are empty when fewer
        than two drugs have an embedding. Values are plain Python floats so
        that the dicts, once written to CSV as text, can be parsed back with
        ``ast.literal_eval`` (NumPy 2 renders its scalars as
        ``np.float64(...)``, which is not a literal).
    """
    if len(drug_list) < 2:
        return {}, {}, {}  # Nothing to compare

    drug_embeddings = [embeddings.get(drug, []) for drug in drug_list]

    # Keep only the drugs for which a non-empty embedding was found
    valid_drugs = [drug for drug, emb in zip(drug_list, drug_embeddings) if len(emb) > 0]
    valid_embeddings = [emb for emb in drug_embeddings if len(emb) > 0]

    if len(valid_drugs) < 2:
        return {}, {}, {}  # Fewer than two usable drugs

    # pdist returns the condensed form; squareform expands it to an n x n matrix
    euclidean_matrix = squareform(pdist(valid_embeddings, metric='euclidean'))
    manhattan_matrix = squareform(pdist(valid_embeddings, metric='cityblock'))  # 'cityblock' is Manhattan distance
    cosine_similarities = cosine_similarity(valid_embeddings)

    euclidean_dict = {}
    manhattan_dict = {}
    cosine_dict = {}
    for i in range(len(valid_drugs)):
        for j in range(i + 1, len(valid_drugs)):
            pair = tuple(sorted([valid_drugs[i], valid_drugs[j]]))
            # float(...) strips the NumPy scalar type so the value serialises as a bare number
            euclidean_dict[pair] = float(euclidean_matrix[i, j])
            manhattan_dict[pair] = float(manhattan_matrix[i, j])
            cosine_dict[pair] = float(cosine_similarities[i, j])

    return euclidean_dict, manhattan_dict, cosine_dict

# Parse the stored drug lists back into Python lists. The column holds the text
# of a list literal, so ast.literal_eval is sufficient and -- unlike the eval()
# used previously -- cannot execute code that might be embedded in the CSV.
import ast

drug_df['SUBJECT_CUI'] = drug_df['SUBJECT_CUI'].apply(ast.literal_eval)

# Compute the three pairwise-metric dicts per row and store each in its own column
drug_df['Euclidean_Distances'], drug_df['Manhattan_Distances'], drug_df['Cosine_Similarities'] = zip(*drug_df['SUBJECT_CUI'].apply(calculate_drug_metrics))

# Save the updated DataFrame
output_file = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/drugs_with_distances_and_similarities_100_01.csv"
drug_df.to_csv(output_file, index=False)

print(f"Updated dataset saved to {output_file}")

# Display a sample of the results
print(drug_df[['OBJECT_CUI', 'SUBJECT_CUI', 'Euclidean_Distances', 'Manhattan_Distances', 'Cosine_Similarities']].head())

Updated dataset saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/drugs_with_distances_and_similarities_100_01.csv
  OBJECT_CUI                                        SUBJECT_CUI  \
0   C0000810                     [C0012471, C0012472, C0030095]   
1   C0000880                                         [C0062648]   
2   C0001126                                         [C0137996]   
3   C0001144  [C0002607, C0002679, C0052761, C0008947, C0055...   
4   C0001206                                         [C0023863]   

                                 Euclidean_Distances  \
0  {('C0012471', 'C0012472'): 5.412913485405073, ...   
1                                                 {}   
2                                                 {}   
3  {('C0002607', 'C0002679'): 10.712140264558668,...   
4                                                 {}   

                                 Manhattan_Distances  \
0  {('C0012471', 'C0012472'): 42.26382644288242, ...   
1    

##### Euclidean


In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 100 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

# Distance/similarity table for the 100-dim, one-layer run (same path the
# distance cell of this section writes as its output_file)
file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/drugs_with_distances_and_similarities_100_01.csv"
df = pd.read_csv(file_path)

def extract_distances(distance_dict):
    """Return the values of a stringified dict as a list.

    ``distance_dict`` is the text of a dict literal; it is parsed with
    ``ast.literal_eval`` and its values are returned in dict order.
    """
    parsed = ast.literal_eval(distance_dict)
    return [*parsed.values()]

# Flatten the per-row dicts into one list of values; rows whose dict is empty
# explode to NaN and are dropped.
all_distances = (
    df['Euclidean_Distances']
    .apply(extract_distances)
    .explode()
    .dropna()
    .tolist()
)

# Summary statistics
max_dist, min_dist = np.max(all_distances), np.min(all_distances)
median_dist, mean_dist = np.median(all_distances), np.mean(all_distances)

# Counts and shares of values above the mean / median
total_values = len(all_distances)
values_above_mean = sum(1 for x in all_distances if x > mean_dist)
values_above_median = sum(1 for x in all_distances if x > median_dist)
percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# 2 x 2 panel figure
fig, ((ax_bar, ax_hist), (ax_box, ax_text)) = plt.subplots(2, 2, figsize=(15, 15))

# 1. Bar chart of the four summary statistics
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Euclidean Distances')
ax_bar.set_ylabel('Distance')

# 2. Histogram with KDE overlay
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Euclidean Distances')
ax_hist.set_xlabel('Distance')
ax_hist.set_ylabel('Frequency')

# 3. Box plot
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Euclidean Distances')
ax_box.set_ylabel('Distance')

# 4. Text-only panel listing the numbers
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Write the figure to Drive and release it without displaying
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Euclidean_Distances_01-100.png')
plt.close(fig)

# Print statistics


##### Manhattan

In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 100 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

# Distance/similarity table for the 100-dim, one-layer run (same path the
# distance cell of this section writes as its output_file)
file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/drugs_with_distances_and_similarities_100_01.csv"
df = pd.read_csv(file_path)

def extract_distances(distance_dict):
    """Return the values of a stringified dict as a list.

    ``distance_dict`` is the text of a dict literal; it is parsed with
    ``ast.literal_eval`` and its values are returned in dict order.
    """
    parsed = ast.literal_eval(distance_dict)
    return [*parsed.values()]

# Flatten the per-row dicts into one list of values; rows whose dict is empty
# explode to NaN and are dropped.
all_distances = (
    df['Manhattan_Distances']
    .apply(extract_distances)
    .explode()
    .dropna()
    .tolist()
)

# Summary statistics
max_dist, min_dist = np.max(all_distances), np.min(all_distances)
median_dist, mean_dist = np.median(all_distances), np.mean(all_distances)

# Counts and shares of values above the mean / median
total_values = len(all_distances)
values_above_mean = sum(1 for x in all_distances if x > mean_dist)
values_above_median = sum(1 for x in all_distances if x > median_dist)
percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# 2 x 2 panel figure
fig, ((ax_bar, ax_hist), (ax_box, ax_text)) = plt.subplots(2, 2, figsize=(15, 15))

# 1. Bar chart of the four summary statistics
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Manhattan Distances')
ax_bar.set_ylabel('Distance')

# 2. Histogram with KDE overlay
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Manhattan Distances')
ax_hist.set_xlabel('Distance')
ax_hist.set_ylabel('Frequency')

# 3. Box plot
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Manhattan Distances')
ax_box.set_ylabel('Distance')

# 4. Text-only panel listing the numbers
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Write the figure to Drive and release it without displaying
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Manhattan_Distances_01-100.png')
plt.close(fig)

# Print statistics


##### Cosine

In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 100 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

# Distance/similarity table for the 100-dim, one-layer run (same path the
# distance cell of this section writes as its output_file)
file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/drugs_with_distances_and_similarities_100_01.csv"
df = pd.read_csv(file_path)

def extract_distances(distance_dict):
    """Return the values of a stringified dict as a list.

    ``distance_dict`` is the text of a dict literal; it is parsed with
    ``ast.literal_eval`` and its values are returned in dict order.
    """
    parsed = ast.literal_eval(distance_dict)
    return [*parsed.values()]

# Flatten the per-row dicts into one list of similarity values; rows whose dict
# is empty explode to NaN and are dropped. (Variable names keep the "dist"
# wording shared with the distance cells.)
all_distances = (
    df['Cosine_Similarities']
    .apply(extract_distances)
    .explode()
    .dropna()
    .tolist()
)

# Summary statistics
max_dist, min_dist = np.max(all_distances), np.min(all_distances)
median_dist, mean_dist = np.median(all_distances), np.mean(all_distances)

# Counts and shares of values above the mean / median
total_values = len(all_distances)
values_above_mean = sum(1 for x in all_distances if x > mean_dist)
values_above_median = sum(1 for x in all_distances if x > median_dist)
percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# 2 x 2 panel figure
fig, ((ax_bar, ax_hist), (ax_box, ax_text)) = plt.subplots(2, 2, figsize=(15, 15))

# Axis labels say "Cosine Similarity": these values are similarities, not
# distances, so the former 'Distance' label was misleading.

# 1. Bar chart of the four summary statistics
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Cosine Similarities')
ax_bar.set_ylabel('Cosine Similarity')

# 2. Histogram with KDE overlay
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Cosine Similarities')
ax_hist.set_xlabel('Cosine Similarity')
ax_hist.set_ylabel('Frequency')

# 3. Box plot
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Cosine Similarities')
ax_box.set_ylabel('Cosine Similarity')

# 4. Text-only panel listing the numbers
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Write the figure to Drive and release it without displaying
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Cosine_Similarities_01-100.png')
plt.close(fig)

# Print statistics


#### 02. 150-01

In [None]:
# Calculate pairwise drug distances and add them to the dataset (14.10.2024)
# Euclidean distance - Manhattan distance - cosine similarity
import pandas as pd
import numpy as np
import json
import csv
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

def read_embeddings_from_csv(file_path):
    """Load node embeddings from a two-column CSV file.

    The first row is treated as a header and skipped. Every following row
    with exactly two fields is read as ``name, embedding`` where the
    embedding field is a JSON array; rows with any other field count are
    ignored.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        dict mapping each name to a NumPy array built from its JSON array.
        An empty file yields an empty dict.
    """
    embeddings_dict = {}
    # newline='' is what the csv module asks for; it keeps line breaks inside
    # quoted fields intact and mirrors how the file is written.
    with open(file_path, mode='r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row (no error on an empty file)
        for row in reader:
            if len(row) == 2:
                name, embedding_json = row
                embeddings_dict[name] = np.array(json.loads(embedding_json))
    return embeddings_dict

# Input locations: node-embedding CSV and the grouped drug-treatment table
embeddings_file = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-150/node_embeddings_150_1_with_names_array.csv'
drug_dataset_file = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/drug_treatments_dataset.csv"

# name -> vector lookup used by calculate_drug_metrics, plus the table to annotate
embeddings = read_embeddings_from_csv(embeddings_file)
drug_df = pd.read_csv(drug_dataset_file)

def calculate_drug_metrics(drug_list):
    """Compute pairwise metrics between the embeddings of the given drugs.

    Each drug is looked up in the module-level ``embeddings`` dict; drugs
    that are missing (or whose embedding is empty) are ignored.

    Args:
        drug_list: Sequence of drug identifiers (keys of ``embeddings``).

    Returns:
        A 3-tuple ``(euclidean, manhattan, cosine)`` of dicts, each keyed by
        the sorted ``(drug_a, drug_b)`` tuple. All three are empty when fewer
        than two drugs have an embedding. Values are plain Python floats so
        that the dicts, once written to CSV as text, can be parsed back with
        ``ast.literal_eval`` (NumPy 2 renders its scalars as
        ``np.float64(...)``, which is not a literal).
    """
    if len(drug_list) < 2:
        return {}, {}, {}  # Nothing to compare

    drug_embeddings = [embeddings.get(drug, []) for drug in drug_list]

    # Keep only the drugs for which a non-empty embedding was found
    valid_drugs = [drug for drug, emb in zip(drug_list, drug_embeddings) if len(emb) > 0]
    valid_embeddings = [emb for emb in drug_embeddings if len(emb) > 0]

    if len(valid_drugs) < 2:
        return {}, {}, {}  # Fewer than two usable drugs

    # pdist returns the condensed form; squareform expands it to an n x n matrix
    euclidean_matrix = squareform(pdist(valid_embeddings, metric='euclidean'))
    manhattan_matrix = squareform(pdist(valid_embeddings, metric='cityblock'))  # 'cityblock' is Manhattan distance
    cosine_similarities = cosine_similarity(valid_embeddings)

    euclidean_dict = {}
    manhattan_dict = {}
    cosine_dict = {}
    for i in range(len(valid_drugs)):
        for j in range(i + 1, len(valid_drugs)):
            pair = tuple(sorted([valid_drugs[i], valid_drugs[j]]))
            # float(...) strips the NumPy scalar type so the value serialises as a bare number
            euclidean_dict[pair] = float(euclidean_matrix[i, j])
            manhattan_dict[pair] = float(manhattan_matrix[i, j])
            cosine_dict[pair] = float(cosine_similarities[i, j])

    return euclidean_dict, manhattan_dict, cosine_dict

# Parse the stored drug lists back into Python lists. The column holds the text
# of a list literal, so ast.literal_eval is sufficient and -- unlike the eval()
# used previously -- cannot execute code that might be embedded in the CSV.
import ast

drug_df['SUBJECT_CUI'] = drug_df['SUBJECT_CUI'].apply(ast.literal_eval)

# Compute the three pairwise-metric dicts per row and store each in its own column
drug_df['Euclidean_Distances'], drug_df['Manhattan_Distances'], drug_df['Cosine_Similarities'] = zip(*drug_df['SUBJECT_CUI'].apply(calculate_drug_metrics))

# Save the updated DataFrame
output_file = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01.csv"
drug_df.to_csv(output_file, index=False)

print(f"Updated dataset saved to {output_file}")

# Display a sample of the results
print(drug_df[['OBJECT_CUI', 'SUBJECT_CUI', 'Euclidean_Distances', 'Manhattan_Distances', 'Cosine_Similarities']].head())

Updated dataset saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01.csv
  OBJECT_CUI                                        SUBJECT_CUI  \
0   C0000810                     [C0012471, C0012472, C0030095]   
1   C0000880                                         [C0062648]   
2   C0001126                                         [C0137996]   
3   C0001144  [C0002607, C0002679, C0052761, C0008947, C0055...   
4   C0001206                                         [C0023863]   

                                 Euclidean_Distances  \
0  {('C0012471', 'C0012472'): 5.628151803946335, ...   
1                                                 {}   
2                                                 {}   
3  {('C0002607', 'C0002679'): 10.516206011606151,...   
4                                                 {}   

                                 Manhattan_Distances  \
0  {('C0012471', 'C0012472'): 55.38754403684288, ...   
1    

##### Euclidean


In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 150 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

# Distance/similarity table for the 150-dim, one-layer run (same path the
# distance cell of this section writes as its output_file)
file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01.csv"
df = pd.read_csv(file_path)

def extract_distances(distance_dict):
    """Return the values of a stringified dict as a list.

    ``distance_dict`` is the text of a dict literal; it is parsed with
    ``ast.literal_eval`` and its values are returned in dict order.
    """
    parsed = ast.literal_eval(distance_dict)
    return [*parsed.values()]

# Flatten the per-row dicts into one list of values; rows whose dict is empty
# explode to NaN and are dropped.
all_distances = (
    df['Euclidean_Distances']
    .apply(extract_distances)
    .explode()
    .dropna()
    .tolist()
)

# Summary statistics
max_dist, min_dist = np.max(all_distances), np.min(all_distances)
median_dist, mean_dist = np.median(all_distances), np.mean(all_distances)

# Counts and shares of values above the mean / median
total_values = len(all_distances)
values_above_mean = sum(1 for x in all_distances if x > mean_dist)
values_above_median = sum(1 for x in all_distances if x > median_dist)
percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# 2 x 2 panel figure
fig, ((ax_bar, ax_hist), (ax_box, ax_text)) = plt.subplots(2, 2, figsize=(15, 15))

# 1. Bar chart of the four summary statistics
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Euclidean Distances')
ax_bar.set_ylabel('Distance')

# 2. Histogram with KDE overlay
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Euclidean Distances')
ax_hist.set_xlabel('Distance')
ax_hist.set_ylabel('Frequency')

# 3. Box plot
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Euclidean Distances')
ax_box.set_ylabel('Distance')

# 4. Text-only panel listing the numbers
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Write the figure to Drive and release it without displaying
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Euclidean_Distances_01-150.png')
plt.close(fig)

# Print statistics


##### Manhattan

In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 150 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

# Distance/similarity table for the 150-dim, one-layer run (same path the
# distance cell of this section writes as its output_file)
file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01.csv"
df = pd.read_csv(file_path)

def extract_distances(distance_dict):
    """Return the values of a stringified dict as a list.

    ``distance_dict`` is the text of a dict literal; it is parsed with
    ``ast.literal_eval`` and its values are returned in dict order.
    """
    parsed = ast.literal_eval(distance_dict)
    return [*parsed.values()]

# Flatten the per-row dicts into one list of values; rows whose dict is empty
# explode to NaN and are dropped.
all_distances = (
    df['Manhattan_Distances']
    .apply(extract_distances)
    .explode()
    .dropna()
    .tolist()
)

# Summary statistics
max_dist, min_dist = np.max(all_distances), np.min(all_distances)
median_dist, mean_dist = np.median(all_distances), np.mean(all_distances)

# Counts and shares of values above the mean / median
total_values = len(all_distances)
values_above_mean = sum(1 for x in all_distances if x > mean_dist)
values_above_median = sum(1 for x in all_distances if x > median_dist)
percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# 2 x 2 panel figure
fig, ((ax_bar, ax_hist), (ax_box, ax_text)) = plt.subplots(2, 2, figsize=(15, 15))

# 1. Bar chart of the four summary statistics
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Manhattan Distances')
ax_bar.set_ylabel('Distance')

# 2. Histogram with KDE overlay
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Manhattan Distances')
ax_hist.set_xlabel('Distance')
ax_hist.set_ylabel('Frequency')

# 3. Box plot
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Manhattan Distances')
ax_box.set_ylabel('Distance')

# 4. Text-only panel listing the numbers
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Write the figure to Drive and release it without displaying
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Manhattan_Distances_01-150.png')
plt.close(fig)

# Print statistics


##### Cosine

In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 150 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

# Distance/similarity table for the 150-dim, one-layer run (same path the
# distance cell of this section writes as its output_file)
file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01.csv"
df = pd.read_csv(file_path)

def extract_distances(distance_dict):
    """Return the values of a stringified dict as a list.

    ``distance_dict`` is the text of a dict literal; it is parsed with
    ``ast.literal_eval`` and its values are returned in dict order.
    """
    parsed = ast.literal_eval(distance_dict)
    return [*parsed.values()]

# Flatten the per-row dicts into one list of similarity values; rows whose dict
# is empty explode to NaN and are dropped. (Variable names keep the "dist"
# wording shared with the distance cells.)
all_distances = (
    df['Cosine_Similarities']
    .apply(extract_distances)
    .explode()
    .dropna()
    .tolist()
)

# Summary statistics
max_dist, min_dist = np.max(all_distances), np.min(all_distances)
median_dist, mean_dist = np.median(all_distances), np.mean(all_distances)

# Counts and shares of values above the mean / median
total_values = len(all_distances)
values_above_mean = sum(1 for x in all_distances if x > mean_dist)
values_above_median = sum(1 for x in all_distances if x > median_dist)
percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# 2 x 2 panel figure
fig, ((ax_bar, ax_hist), (ax_box, ax_text)) = plt.subplots(2, 2, figsize=(15, 15))

# Axis labels say "Cosine Similarity": these values are similarities, not
# distances, so the former 'Distance' label was misleading.

# 1. Bar chart of the four summary statistics
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Cosine Similarities')
ax_bar.set_ylabel('Cosine Similarity')

# 2. Histogram with KDE overlay
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Cosine Similarities')
ax_hist.set_xlabel('Cosine Similarity')
ax_hist.set_ylabel('Frequency')

# 3. Box plot
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Cosine Similarities')
ax_box.set_ylabel('Cosine Similarity')

# 4. Text-only panel listing the numbers
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Write the figure to Drive and release it without displaying
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Cosine_Similarities_01-150.png')
plt.close(fig)

# Print statistics


#### 03. 200-01

In [None]:
# Calculate pairwise drug distances and add them to the dataset (14.10.2024)
# Euclidean distance - Manhattan distance - cosine similarity
import pandas as pd
import numpy as np
import json
import csv
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity

def read_embeddings_from_csv(file_path):
    """Load node embeddings from a two-column CSV file.

    The first row is treated as a header and skipped. Every following row
    with exactly two fields is read as ``name, embedding`` where the
    embedding field is a JSON array; rows with any other field count are
    ignored.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        dict mapping each name to a NumPy array built from its JSON array.
        An empty file yields an empty dict.
    """
    embeddings_dict = {}
    # newline='' is what the csv module asks for; it keeps line breaks inside
    # quoted fields intact and mirrors how the file is written.
    with open(file_path, mode='r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header row (no error on an empty file)
        for row in reader:
            if len(row) == 2:
                name, embedding_json = row
                embeddings_dict[name] = np.array(json.loads(embedding_json))
    return embeddings_dict

# Input locations: node-embedding CSV and the grouped drug-treatment table
embeddings_file = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-200/node_embeddings_200_1_with_names_array.csv'
drug_dataset_file = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/drug_treatments_dataset.csv"

# name -> vector lookup used by calculate_drug_metrics, plus the table to annotate
embeddings = read_embeddings_from_csv(embeddings_file)
drug_df = pd.read_csv(drug_dataset_file)

def calculate_drug_metrics(drug_list):
    """Compute pairwise metrics between the embeddings of the given drugs.

    Each drug is looked up in the module-level ``embeddings`` dict; drugs
    that are missing (or whose embedding is empty) are ignored.

    Args:
        drug_list: Sequence of drug identifiers (keys of ``embeddings``).

    Returns:
        A 3-tuple ``(euclidean, manhattan, cosine)`` of dicts, each keyed by
        the sorted ``(drug_a, drug_b)`` tuple. All three are empty when fewer
        than two drugs have an embedding. Values are plain Python floats so
        that the dicts, once written to CSV as text, can be parsed back with
        ``ast.literal_eval`` (NumPy 2 renders its scalars as
        ``np.float64(...)``, which is not a literal).
    """
    if len(drug_list) < 2:
        return {}, {}, {}  # Nothing to compare

    drug_embeddings = [embeddings.get(drug, []) for drug in drug_list]

    # Keep only the drugs for which a non-empty embedding was found
    valid_drugs = [drug for drug, emb in zip(drug_list, drug_embeddings) if len(emb) > 0]
    valid_embeddings = [emb for emb in drug_embeddings if len(emb) > 0]

    if len(valid_drugs) < 2:
        return {}, {}, {}  # Fewer than two usable drugs

    # pdist returns the condensed form; squareform expands it to an n x n matrix
    euclidean_matrix = squareform(pdist(valid_embeddings, metric='euclidean'))
    manhattan_matrix = squareform(pdist(valid_embeddings, metric='cityblock'))  # 'cityblock' is Manhattan distance
    cosine_similarities = cosine_similarity(valid_embeddings)

    euclidean_dict = {}
    manhattan_dict = {}
    cosine_dict = {}
    for i in range(len(valid_drugs)):
        for j in range(i + 1, len(valid_drugs)):
            pair = tuple(sorted([valid_drugs[i], valid_drugs[j]]))
            # float(...) strips the NumPy scalar type so the value serialises as a bare number
            euclidean_dict[pair] = float(euclidean_matrix[i, j])
            manhattan_dict[pair] = float(manhattan_matrix[i, j])
            cosine_dict[pair] = float(cosine_similarities[i, j])

    return euclidean_dict, manhattan_dict, cosine_dict

# Parse the stored drug lists back into Python lists. The column holds the text
# of a list literal, so ast.literal_eval is sufficient and -- unlike the eval()
# used previously -- cannot execute code that might be embedded in the CSV.
import ast

drug_df['SUBJECT_CUI'] = drug_df['SUBJECT_CUI'].apply(ast.literal_eval)

# Compute the three pairwise-metric dicts per row and store each in its own column
drug_df['Euclidean_Distances'], drug_df['Manhattan_Distances'], drug_df['Cosine_Similarities'] = zip(*drug_df['SUBJECT_CUI'].apply(calculate_drug_metrics))

# Save the updated DataFrame
output_file = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv"
drug_df.to_csv(output_file, index=False)

print(f"Updated dataset saved to {output_file}")

# Display a sample of the results
print(drug_df[['OBJECT_CUI', 'SUBJECT_CUI', 'Euclidean_Distances', 'Manhattan_Distances', 'Cosine_Similarities']].head())

Updated dataset saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv
  OBJECT_CUI                                        SUBJECT_CUI  \
0   C0000810                     [C0012471, C0012472, C0030095]   
1   C0000880                                         [C0062648]   
2   C0001126                                         [C0137996]   
3   C0001144  [C0002607, C0002679, C0052761, C0008947, C0055...   
4   C0001206                                         [C0023863]   

                                 Euclidean_Distances  \
0  {('C0012471', 'C0012472'): 7.218873740645879, ...   
1                                                 {}   
2                                                 {}   
3  {('C0002607', 'C0002679'): 12.91133515471207, ...   
4                                                 {}   

                                 Manhattan_Distances  \
0  {('C0012471', 'C0012472'): 79.93982968479395, ...   
1    

##### Euclidean

In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 200 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

# Distance/similarity table for the 200-dim, one-layer run (same path the
# distance cell of this section writes as its output_file)
file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv"
df = pd.read_csv(file_path)

def extract_distances(distance_dict):
    """Return the values of a stringified dict as a list.

    ``distance_dict`` is the text of a dict literal; it is parsed with
    ``ast.literal_eval`` and its values are returned in dict order.
    """
    parsed = ast.literal_eval(distance_dict)
    return [*parsed.values()]

# Flatten the per-row dicts into one list of values; rows whose dict is empty
# explode to NaN and are dropped.
all_distances = (
    df['Euclidean_Distances']
    .apply(extract_distances)
    .explode()
    .dropna()
    .tolist()
)

# Summary statistics
max_dist, min_dist = np.max(all_distances), np.min(all_distances)
median_dist, mean_dist = np.median(all_distances), np.mean(all_distances)

# Counts and shares of values above the mean / median
total_values = len(all_distances)
values_above_mean = sum(1 for x in all_distances if x > mean_dist)
values_above_median = sum(1 for x in all_distances if x > median_dist)
percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# 2 x 2 panel figure
fig, ((ax_bar, ax_hist), (ax_box, ax_text)) = plt.subplots(2, 2, figsize=(15, 15))

# 1. Bar chart of the four summary statistics
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Euclidean Distances')
ax_bar.set_ylabel('Distance')

# 2. Histogram with KDE overlay
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Euclidean Distances')
ax_hist.set_xlabel('Distance')
ax_hist.set_ylabel('Frequency')

# 3. Box plot
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Euclidean Distances')
ax_box.set_ylabel('Distance')

# 4. Text-only panel listing the numbers
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Write the figure to Drive and release it without displaying
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Euclidean_Distances_01-200.png')
plt.close(fig)

# Print statistics


##### Manhattan

In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 200 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Parse a stringified ``{pair: value}`` dictionary and return its values as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Pool the Manhattan distances of every row into one flat list
per_row_values = df['Manhattan_Distances'].apply(extract_distances)
all_distances = per_row_values.explode().dropna().tolist()

# Summary statistics over the pooled values
max_dist, min_dist = np.max(all_distances), np.min(all_distances)
median_dist, mean_dist = np.median(all_distances), np.mean(all_distances)

# How many pooled values lie strictly above the mean / the median
total_values = len(all_distances)
values_above_mean = len([d for d in all_distances if d > mean_dist])
values_above_median = len([d for d in all_distances if d > median_dist])

percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# One 2x2 figure: bar chart, histogram, box plot, text summary
fig = plt.figure(figsize=(15, 15))

# Panel 1: bar chart of the four summary statistics
ax_bar = fig.add_subplot(2, 2, 1)
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Manhattan Distances')
ax_bar.set_ylabel('Distance')

# Panel 2: histogram with KDE overlay
ax_hist = fig.add_subplot(2, 2, 2)
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Manhattan Distances')
ax_hist.set_xlabel('Distance')
ax_hist.set_ylabel('Frequency')

# Panel 3: box plot
ax_box = fig.add_subplot(2, 2, 3)
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Manhattan Distances')
ax_box.set_ylabel('Distance')

# Panel 4: the numbers themselves, written as text on an axis with hidden frame
ax_text = fig.add_subplot(2, 2, 4)
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Adjust layout, write the PNG to Drive and release the figure
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Manhattan_Distances_01-200.png')
plt.close(fig)

# Print statistics


##### Cosine

In [None]:
# 21.10.2024 - Statistical Metrics - Embedding Dim: 200 - One layer Implementation

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv"
df = pd.read_csv(file_path)


# Function to extract the values from the dictionary string
def extract_distances(distance_dict):
    """Parse a stringified ``{pair: value}`` dictionary and return its values as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Pool the cosine similarities of every row into one flat list.
# (Variable names keep the "_dist" suffix used by the sibling cells.)
all_distances = df['Cosine_Similarities'].apply(extract_distances).explode().dropna().tolist()

# Calculate statistics
max_dist = np.max(all_distances)
min_dist = np.min(all_distances)
median_dist = np.median(all_distances)
mean_dist = np.mean(all_distances)

# How many pooled values lie strictly above the mean / the median
total_values = len(all_distances)
values_above_mean = sum(1 for x in all_distances if x > mean_dist)
values_above_median = sum(1 for x in all_distances if x > median_dist)

percentage_above_mean = (values_above_mean / total_values) * 100
percentage_above_median = (values_above_median / total_values) * 100

# Set up the matplotlib figure (2x2 grid)
fig = plt.figure(figsize=(15, 15))

# 1. Bar Chart
ax_bar = fig.add_subplot(2, 2, 1)
ax_bar.bar(['Max', 'Min', 'Median', 'Mean'], [max_dist, min_dist, median_dist, mean_dist])
ax_bar.set_title('Statistics of Cosine Similarities')
# The plotted quantity is a similarity, so the axes are labelled accordingly
# (they were previously labelled 'Distance').
ax_bar.set_ylabel('Similarity')

# 2. Distribution Graph (Histogram)
ax_hist = fig.add_subplot(2, 2, 2)
sns.histplot(all_distances, kde=True, ax=ax_hist)
ax_hist.set_title('Distribution of Cosine Similarities')
ax_hist.set_xlabel('Similarity')
ax_hist.set_ylabel('Frequency')

# 3. Box Plot
ax_box = fig.add_subplot(2, 2, 3)
sns.boxplot(y=all_distances, ax=ax_box)
ax_box.set_title('Box Plot of Cosine Similarities')
ax_box.set_ylabel('Similarity')

# 4. Text panel with the numeric summary
ax_text = fig.add_subplot(2, 2, 4)
summary_lines = [
    f'Max: {max_dist:.4f}',
    f'Min: {min_dist:.4f}',
    f'Median: {median_dist:.4f}',
    f'Mean: {mean_dist:.4f}',
    f'Total Values: {total_values:,}',
    f'Above Mean: {values_above_mean:,} ({percentage_above_mean:.2f}%)',
    f'Above Median: {values_above_median:,} ({percentage_above_median:.2f}%)',
]
for y_pos, line in zip((0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3), summary_lines):
    ax_text.text(0.1, y_pos, line, fontsize=12, transform=ax_text.transAxes)
ax_text.axis('off')
ax_text.set_title('Statistical Metrics')

# Adjust layout and save the figure
fig.tight_layout()
fig.savefig('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Visualizations/Cosine_Similarities_01-200.png')
plt.close(fig)

# Print statistics


### Dataset Filtering

#### 01. 100-01

##### Euclidean

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/drugs_with_distances_and_similarities_100_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every Euclidean distance in the file, all rows pooled
all_distances = df['Euclidean_Distances'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Euclidean_Distances'] = df['Euclidean_Distances'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Euclidean_Distances'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Euclidean_Distances']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/Euclidean_Distances_above_mean_100_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 595
Mean distance threshold: 5.4778


##### Manhattan

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/drugs_with_distances_and_similarities_100_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every Manhattan distance in the file, all rows pooled
all_distances = df['Manhattan_Distances'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Manhattan_Distances'] = df['Manhattan_Distances'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Manhattan_Distances'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Manhattan_Distances']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/Manhattan_Distances_above_mean_100_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 597
Mean distance threshold: 43.7232


##### Cosine

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/drugs_with_distances_and_similarities_100_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every cosine similarity in the file, all rows pooled.
# (Names keep the "_dist" wording shared with the distance cells.)
all_distances = df['Cosine_Similarities'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Cosine_Similarities'] = df['Cosine_Similarities'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Cosine_Similarities'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Cosine_Similarities']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-100/Cosine_Similarities_above_mean_100_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 680
Mean distance threshold: 0.6550


#### 02. 150-01

##### Euclidean

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every Euclidean distance in the file, all rows pooled
all_distances = df['Euclidean_Distances'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Euclidean_Distances'] = df['Euclidean_Distances'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Euclidean_Distances'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Euclidean_Distances']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/Euclidean_Distances_above_mean_150_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 597
Mean distance threshold: 6.1952


##### Manhattan

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every Manhattan distance in the file, all rows pooled
all_distances = df['Manhattan_Distances'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Manhattan_Distances'] = df['Manhattan_Distances'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Manhattan_Distances'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Manhattan_Distances']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/Manhattan_Distances_above_mean_150_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 599
Mean distance threshold: 60.5927


##### Cosine

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every cosine similarity in the file, all rows pooled.
# (Names keep the "_dist" wording shared with the distance cells.)
all_distances = df['Cosine_Similarities'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Cosine_Similarities'] = df['Cosine_Similarities'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Cosine_Similarities'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Cosine_Similarities']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/Cosine_Similarities_above_mean_150_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 680
Mean distance threshold: 0.6362


#### 03. 200-01

##### Euclidean

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every Euclidean distance in the file, all rows pooled
all_distances = df['Euclidean_Distances'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Euclidean_Distances'] = df['Euclidean_Distances'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Euclidean_Distances'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Euclidean_Distances']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/Euclidean_Distances_above_mean_200_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 617
Mean distance threshold: 7.0065


##### Manhattan

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every Manhattan distance in the file, all rows pooled
all_distances = df['Manhattan_Distances'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Manhattan_Distances'] = df['Manhattan_Distances'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Manhattan_Distances'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Manhattan_Distances']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/Manhattan_Distances_above_mean_200_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 620
Mean distance threshold: 78.7988


##### Cosine

In [None]:
import ast

import numpy as np  # np.mean is used below; the cell previously relied on an earlier cell's import
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv"
df = pd.read_csv(file_path)


def extract_distances(distance_dict):
    """Return the values of a stringified ``{key: value}`` dictionary as a list."""
    return list(ast.literal_eval(distance_dict).values())


# Threshold: mean of every cosine similarity in the file, all rows pooled.
# (Names keep the "_dist" wording shared with the distance cells.)
all_distances = df['Cosine_Similarities'].apply(extract_distances).explode().dropna().tolist()
mean_dist = np.mean(all_distances)


def filter_distances_above_mean(distance_dict, threshold=None):
    """Keep only the entries of a stringified dictionary whose value exceeds a threshold.

    Args:
        distance_dict: String representation of a ``{key: value}`` dictionary.
        threshold: Cut-off value; when omitted, the module-level ``mean_dist`` is used.

    Returns:
        The filtered dictionary, or ``None`` when no entry is above the threshold.
    """
    if threshold is None:
        threshold = mean_dist
    kept = {k: v for k, v in ast.literal_eval(distance_dict).items() if v > threshold}
    return kept or None


df['Filtered_Cosine_Similarities'] = df['Cosine_Similarities'].apply(filter_distances_above_mean)

# Rows with nothing above the threshold (None) are dropped from the export
df_filtered = df[df['Filtered_Cosine_Similarities'].notna()]
df_filtered = df_filtered[['OBJECT_CUI', 'SUBJECT_CUI', 'Filtered_Cosine_Similarities']]

output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/Cosine_Similarities_above_mean_200_01.csv"
df_filtered.to_csv(output_path, index=False)

print(f"Original number of rows: {len(df)}")
print(f"Number of rows in filtered dataset: {len(df_filtered)}")
print(f"Mean distance threshold: {mean_dist:.4f}")

Original number of rows: 1244
Number of rows in filtered dataset: 682
Mean distance threshold: 0.6388


### Testing


#### Controlling

In [None]:
import pandas as pd
# Load the output file
output_df = pd.read_csv("/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv")

# Load test dataset
test_df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/test_dataset.csv')

# Collect the distinct raw SUBJECT_CUI cell values of each file.
# The variable names say "object", but the column compared here is SUBJECT_CUI;
# the names are kept so that anything referring to them keeps working.
# NOTE(review): the cells are compared as raw strings. If the output file stores
# SUBJECT_CUI as a stringified list per row while the test dataset stores one CUI
# per row (presumably the case — confirm), no value can match and the overlap is 0.
output_object_cuis = set(output_df['SUBJECT_CUI'].unique())
test_object_cuis = set(test_df['SUBJECT_CUI'].unique())

# Values present in both files
common_cuis = output_object_cuis.intersection(test_object_cuis)

# Values present in only one of the two files
unique_to_output = output_object_cuis - test_object_cuis
unique_to_test_df = test_object_cuis - output_object_cuis

# Display results (labels now name the column that is actually compared)
print(f"Number of SUBJECT_CUI values in output file: {len(output_object_cuis)}")
print(f"Number of SUBJECT_CUI values in test dataset: {len(test_object_cuis)}")
print(f"Number of common SUBJECT_CUI values: {len(common_cuis)}")
print(f"Unique SUBJECT_CUI values in output file: {unique_to_output}")
print(f"Unique SUBJECT_CUI values in test dataset: {unique_to_test_df}")


Number of OBJECT_CUI values in output file: 1064
Number of OBJECT_CUI values in test dataset: 553
Number of common OBJECT_CUI values: 0
Unique OBJECT_CUI values in output file: {"['C0010582', 'C0062922', 'C0036442', 'C0041190']", "['C0003143', 'C0728803', 'C0032483', 'C0027373', 'C0030071', 'C0031408', 'C0031469', 'C0032623', 'C3652618', 'C0078794']", "['C0019134']", "['C0010961']", "['C0041031', 'C0002083', 'C0718495', 'C0003143', 'C0052759', 'C0005025', 'C0005041', 'C0005059', 'C0005100', 'C0006246', 'C0006400', 'C0054235', 'C0054672', 'C0008281', 'C0012050', 'C0008929', 'C0056519', 'C0010620', 'C0057605', 'C0057606', 'C0058389', 'C0013085', 'C0013092', 'C0058831', 'C0064263', 'C0060240', 'C0020404', 'C0064582', 'C0771655', 'C0023660', 'C4542172', 'C0025625', 'C0066101', 'C0070563', 'C0031408', 'C0070570', 'C0070709', 'C0071810', 'C0033399', 'C0073085', 'C0039542', 'C0039629', 'C0041090', 'C0041098', 'C0149368', 'C0043491']", "['C3713906', 'C4519114']", "['C0007257']", "['C0540776', 

In [None]:
import pandas as pd
# Read the distances/similarities table for embedding dim 200, one layer
output_csv_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-200/drugs_with_distances_and_similarities_200_01.csv"
output_df = pd.read_csv(output_csv_path)

# Function to convert SUBJECT_CUI strings to sets
def format_subject_cui(subject_cui_str):
    """Turn a stringified list of CUIs into a set.

    Args:
        subject_cui_str: Text such as ``"['C0010582', 'C0062922']"``, optionally
            wrapped in extra double quotes.

    Returns:
        set: The distinct elements of the parsed list.

    Raises:
        ValueError, SyntaxError: If the text is not a plain Python literal.
    """
    import ast  # local import: this cell only imports pandas

    # literal_eval only accepts literals, so arbitrary expressions found in the
    # CSV are rejected instead of being executed (the cell previously used eval).
    subject_cui_list = ast.literal_eval(subject_cui_str.strip("\""))
    return set(subject_cui_list)

# Parse each SUBJECT_CUI cell of the output file into a set (the frame itself is not modified)
temp_subject_cuis = output_df['SUBJECT_CUI'].apply(format_subject_cui)

# Test dataset to compare against; its SUBJECT_CUI column is used as-is
test_df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/test_dataset.csv')

# Flatten both columns into sets of individual values.
# explode() expands the per-row sets of the output file into one value per row.
output_subject_cuis = set(temp_subject_cuis.explode())
test_subject_cuis = set(test_df['SUBJECT_CUI'].explode())

# Overlap and one-sided differences between the two sets
common_subject_cuis = output_subject_cuis & test_subject_cuis
unique_to_output = output_subject_cuis - test_subject_cuis
unique_to_test = test_subject_cuis - output_subject_cuis

# Report sizes first, then the values found on only one side
print(f"Number of SUBJECT_CUI values in output: {len(output_subject_cuis)}")
print(f"Number of SUBJECT_CUI values in test dataset: {len(test_subject_cuis)}")
print(f"Number of common SUBJECT_CUI values: {len(common_subject_cuis)}")
print(f"Unique SUBJECT_CUI values in output file: {unique_to_output}")
print(f"Unique SUBJECT_CUI values in test dataset: {unique_to_test}")


Number of SUBJECT_CUI values in output: 1937
Number of SUBJECT_CUI values in test dataset: 553
Number of common SUBJECT_CUI values: 383
Unique SUBJECT_CUI values in output file: {'C0033447', 'C3714801', 'C0018033', 'C2000261', 'C0052585', 'C0031392', 'C0108101', 'C3851350', 'C0030040', 'C0130137', 'C0028158', 'C0020264', 'C0287721', 'C0031441', 'C0674432', 'C2697961', 'C0002421', 'C0068992', 'C0070455', 'C0039840', 'C0137996', 'C0012093', 'C0025387', 'C0006466', 'C3854019', 'C0055461', 'C2980094', 'C0004599', 'C4519232', 'C3467876', 'C4542175', 'C0056391', 'C4043480', 'C4547051', 'C0085379', 'C0123047', 'C0004969', 'C0078643', 'C3652978', 'C0537670', 'C2717174', 'C0045093', 'C1609686', 'C0053526', 'C0073601', 'C0012384', 'C4086264', 'C0013065', 'C0025631', 'C0030077', 'C0038665', 'C0054120', 'C0076189', 'C1456409', 'C2827241', 'C0016225', 'C0005099', 'C0007080', 'C2933904', 'C1435444', 'C0014921', 'C0052761', 'C0048038', 'C2346836', 'C0292819', 'C0065760', 'C1101838', 'C4291409', 'C454

#### Test Dataset Analysis

In [None]:
# 24.11.2024

import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean, cityblock
from scipy.spatial.distance import cosine
import ast
from tqdm import tqdm

def load_embeddings(embeddings_path):
    """
    Read the embeddings CSV and build a CUI -> vector lookup.

    The CSV must provide a 'Node Name' column and an 'Embedding' column holding
    the vector as a stringified list. The key is the part of the node name
    before the first '|'; when several rows share a key, the last row wins.

    Returns a dictionary mapping each key to a numpy array.
    """
    table = pd.read_csv(embeddings_path)

    return {
        node_name.split('|')[0]: np.array(ast.literal_eval(raw_vector))
        for node_name, raw_vector in zip(table['Node Name'], table['Embedding'])
    }

def calculate_distances(vec1, vec2):
    """
    Compare two vectors with three measures.

    Returns a tuple in this order: (Manhattan distance, cosine similarity,
    Euclidean distance). The similarity is 1 minus scipy's cosine distance.
    """
    similarity = 1 - cosine(vec1, vec2)
    return cityblock(vec1, vec2), similarity, euclidean(vec1, vec2)

def create_pair_key(test_drug, subject_drug):
    """
    Build the dictionary key for a drug pair: "(<test_drug>,<subject_drug>)".
    """
    return "({},{})".format(test_drug, subject_drug)

def process_drug_distances(treatments_path, embeddings_path, output_path):
    """
    Compute distances between every (test drug, subject drug) pair of each row.

    For each row of the treatments CSV, every drug in TEST_DRUGS is compared with
    every drug in SUBJECT_CUI. Drugs without an embedding are skipped. The results
    are stored per row as stringified dictionaries keyed by the pair name.

    Args:
        treatments_path: CSV with stringified-list columns 'SUBJECT_CUI' and 'TEST_DRUGS'.
        embeddings_path: CSV passed to ``load_embeddings``.
        output_path: Destination CSV for the augmented table.

    Returns:
        The treatments DataFrame with the added columns 'MANHATTAN_DIST',
        'COSINE_SIM' and 'EUCLIDEAN_DIST'.
    """
    treatments_df = pd.read_csv(treatments_path)
    embeddings_dict = load_embeddings(embeddings_path)

    treatments_df['SUBJECT_CUI'] = treatments_df['SUBJECT_CUI'].apply(ast.literal_eval)
    treatments_df['TEST_DRUGS'] = treatments_df['TEST_DRUGS'].apply(ast.literal_eval)

    manhattan_distances = []
    cosine_similarities = []
    euclidean_distances = []

    for _, row in tqdm(treatments_df.iterrows(), total=len(treatments_df), desc="Calculating distances"):
        # Only drugs with a known embedding can form a pair; filter once per row
        # instead of re-checking the subject drugs for every test drug.
        test_drugs = [drug for drug in row['TEST_DRUGS'] if drug in embeddings_dict]
        subject_drugs = [drug for drug in row['SUBJECT_CUI'] if drug in embeddings_dict]

        row_manhattan = {}
        row_cosine = {}
        row_euclidean = {}

        for test_drug in test_drugs:
            test_embedding = embeddings_dict[test_drug]

            for subject_drug in subject_drugs:
                subject_embedding = embeddings_dict[subject_drug]

                manhattan, cosine_sim, eucl = calculate_distances(test_embedding, subject_embedding)

                pair_key = create_pair_key(test_drug, subject_drug)

                # Cast to plain Python floats before rounding: the dictionaries are
                # stored via str(), and NumPy >= 2 prints its scalars as
                # "np.float64(...)", which ast.literal_eval cannot read back.
                row_manhattan[pair_key] = round(float(manhattan), 6)
                row_cosine[pair_key] = round(float(cosine_sim), 6)
                row_euclidean[pair_key] = round(float(eucl), 6)

        manhattan_distances.append(str(row_manhattan))
        cosine_similarities.append(str(row_cosine))
        euclidean_distances.append(str(row_euclidean))

    treatments_df['MANHATTAN_DIST'] = manhattan_distances
    treatments_df['COSINE_SIM'] = cosine_similarities
    treatments_df['EUCLIDEAN_DIST'] = euclidean_distances

    treatments_df.to_csv(output_path, index=False)

    return treatments_df

def print_example_distances(df, row_index=0):
    """
    Print the drugs and all stored pair measurements of one row.

    The 'MANHATTAN_DIST', 'COSINE_SIM' and 'EUCLIDEAN_DIST' cells of the row are
    stringified dictionaries; they are parsed and listed one pair per line.
    """
    print(f"\nDetailed example for row {row_index}:")

    # Parse the three stringified dictionaries before printing any details
    sections = [
        ("\nManhattan distances:", ast.literal_eval(df['MANHATTAN_DIST'].iloc[row_index])),
        ("\nCosine similarities:", ast.literal_eval(df['COSINE_SIM'].iloc[row_index])),
        ("\nEuclidean distances:", ast.literal_eval(df['EUCLIDEAN_DIST'].iloc[row_index])),
    ]

    print("\nTest Drugs:", df['TEST_DRUGS'].iloc[row_index])
    print("Subject Drugs:", df['SUBJECT_CUI'].iloc[row_index])

    print("\nDistance measurements:")
    for heading, values in sections:
        print(heading)
        for pair, value in values.items():
            print(f"{pair}: {value}")

if __name__ == "__main__":
    # Input and output locations on the mounted Drive
    treatments_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/drug_treatments_dataset_all.csv"
    embeddings_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Node Embeddings/01-200/node_embeddings_200_1_with_names_array.csv"
    output_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_200.csv"

    # Compute the pairwise measurements and write the augmented table to output_path
    result_df = process_drug_distances(
        treatments_path=treatments_path,
        embeddings_path=embeddings_path,
        output_path=output_path,
    )

    # Show the first row as a worked example, then the number of rows handled
    print_example_distances(result_df, row_index=0)

    print(f"\nProcessed {len(result_df)} rows")

Calculating distances: 100%|██████████| 1244/1244 [00:01<00:00, 1019.96it/s]



Detailed example for row 0:

Test Drugs: []
Subject Drugs: ['C0012471', 'C0012472', 'C0030095']

Distance measurements:

Manhattan distances:

Cosine similarities:

Euclidean distances:

Processed 1244 rows


In [None]:
import pandas as pd
import ast

treatments_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/drug_treatments_dataset_all.csv"
df = pd.read_csv(treatments_path)


def _parse_list_cell(cell):
    """Parse a stringified list; return non-string cells unchanged."""
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    return cell


df['TEST_DRUGS'] = df['TEST_DRUGS'].apply(_parse_list_cell)

# Count the rows whose TEST_DRUGS list is empty, and the remainder
total_rows = len(df)
empty_arrays = (df['TEST_DRUGS'].map(len) == 0).sum()
non_empty_arrays = total_rows - empty_arrays

print(f"Total rows: {total_rows}")
print(f"Empty arrays: {empty_arrays}")
print(f"Non-empty arrays: {non_empty_arrays}")

Total rows: 1244
Empty arrays: 987
Non-empty arrays: 257


In [None]:
# Clear the columns
import pandas as pd

file_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_200.csv'  # input table
df = pd.read_csv(file_path)

# Keep the rows whose TEST_DRUGS cell is anything other than the text '[]'
has_test_drugs = df['TEST_DRUGS'].ne('[]')
filtered_df = df[has_test_drugs]

filtered_file_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_200_clear.csv'  # output table
filtered_df.to_csv(filtered_file_path, index=False)

row_count = len(filtered_df)

print(f"Number of rows after filtering: {row_count}")


Number of rows after filtering: 257


### 100

In [None]:
import pandas as pd

# Distances table for embedding dim 100; determine_pass_fail below reads its *_DIST/*_SIM and *_MEAN columns
pass_fail_input_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_100_clear_meaned_new.csv'
df = pd.read_csv(pass_fail_input_path)

def determine_pass_fail(row):
    """Label every test drug of one row as PASS or FAIL against the row's mean thresholds.

    A (test drug, subject drug) pair meets a condition when its Manhattan distance
    is below MANHATTAN_MEAN, its cosine similarity is above COSINE_MEAN, or its
    Euclidean distance is below EUCLIDEAN_MEAN. A test drug passes as soon as one
    pair meets at least two of the three conditions.

    Pairs with no stored measurements are skipped. They used to default to 0,
    which counted as "closer than the mean" for both distances and produced a
    PASS without any data.

    Args:
        row: Mapping or Series with the stringified columns TEST_DRUGS, SUBJECT_CUI,
            MANHATTAN_DIST, COSINE_SIM, EUCLIDEAN_DIST and the numeric columns
            MANHATTAN_MEAN, COSINE_MEAN, EUCLIDEAN_MEAN.

    Returns:
        list[str]: One entry per test drug, 'PASS_<conditions met>_<drug>' or
        'FAIL_<conditions failed>_<drug>'. For FAIL the count refers to the last
        pair that had measurements (3 when none had).
    """
    import ast  # local import: this cell only imports pandas

    pass_fail_results = []

    # literal_eval instead of eval: the cells are plain list/dict literals
    test_drugs = ast.literal_eval(row['TEST_DRUGS'])
    subject_drugs = ast.literal_eval(row['SUBJECT_CUI'])
    manhattan_distances = ast.literal_eval(row['MANHATTAN_DIST'])
    cosine_similarities = ast.literal_eval(row['COSINE_SIM'])
    euclidean_distances = ast.literal_eval(row['EUCLIDEAN_DIST'])

    manhattan_mean = row['MANHATTAN_MEAN']
    cosine_mean = row['COSINE_MEAN']
    euclidean_mean = row['EUCLIDEAN_MEAN']

    for test_drug in test_drugs:
        pass_found = False
        conditions_met = 0
        fail_conditions = 3  # Start with all conditions failing

        for subject_drug in subject_drugs:
            pair = f'({test_drug},{subject_drug})'

            # No measurements for this pair: there is nothing to judge
            if (pair not in manhattan_distances
                    or pair not in cosine_similarities
                    or pair not in euclidean_distances):
                continue

            manhattan_pass = manhattan_distances[pair] < manhattan_mean
            cosine_pass = cosine_similarities[pair] > cosine_mean
            euclidean_pass = euclidean_distances[pair] < euclidean_mean

            conditions_met = sum([manhattan_pass, cosine_pass, euclidean_pass])
            fail_conditions = 3 - conditions_met  # Calculate fail conditions

            if conditions_met >= 2:
                pass_found = True
                break

        if pass_found:
            pass_fail_results.append(f'PASS_{conditions_met}_{test_drug}')
        else:
            pass_fail_results.append(f'FAIL_{fail_conditions}_{test_drug}')

    return pass_fail_results

# Label every row, then save the table together with the new PASS_FAIL column
pass_fail_labels = df.apply(determine_pass_fail, axis=1)
df['PASS_FAIL'] = pass_fail_labels

df.to_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_100_last.csv', index=False)


In [None]:
import pandas as pd
# Table with the PASS_FAIL column (embedding dim 100)
pass_fail_results_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_100_last.csv'
df = pd.read_csv(pass_fail_results_path)

# Function to count total PASS and FAIL

def count_pass_fail(pass_fail_column):
    """Count PASS and FAIL labels across all entries of a PASS_FAIL column.

    Each entry is a string that is split on ', '. A piece containing 'PASS'
    counts as a pass; otherwise a piece containing 'FAIL' counts as a fail;
    anything else is ignored.

    Args:
        pass_fail_column: iterable of strings (e.g. the PASS_FAIL column).

    Returns:
        Tuple ``(pass_count, fail_count)``.
    """
    tally = {'PASS': 0, 'FAIL': 0}
    for cell in pass_fail_column:
        for label in cell.split(', '):
            # 'PASS' takes precedence when both markers appear in one piece.
            verdict = 'PASS' if 'PASS' in label else ('FAIL' if 'FAIL' in label else None)
            if verdict is not None:
                tally[verdict] += 1
    return tally['PASS'], tally['FAIL']

# Tally the labels over the whole PASS_FAIL column and report both totals.
pass_count, fail_count = count_pass_fail(df['PASS_FAIL'])

print(f"Total PASS: {pass_count}")
print(f"Total FAIL: {fail_count}")


Total PASS: 549
Total FAIL: 630


### 150

In [None]:
import pandas as pd

# Load the emb_dim_150 distance table; determine_pass_fail below reads its
# TEST_DRUGS, SUBJECT_CUI, MANHATTAN_DIST, COSINE_SIM, EUCLIDEAN_DIST and *_MEAN columns.
df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_150_clear_updated.csv')

def determine_pass_fail(row):
    """Label each test drug of one row as PASS or FAIL against the row's means.

    A test drug passes as soon as one subject drug satisfies at least two of:
    Manhattan distance below MANHATTAN_MEAN, cosine similarity above
    COSINE_MEAN, Euclidean distance below EUCLIDEAN_MEAN.

    Args:
        row: mapping or DataFrame row holding TEST_DRUGS and SUBJECT_CUI
            (string form of a list), MANHATTAN_DIST, COSINE_SIM and
            EUCLIDEAN_DIST (string form of a dict keyed by '(test,subject)'),
            plus the numeric MANHATTAN_MEAN, COSINE_MEAN and EUCLIDEAN_MEAN.

    Returns:
        A list with one 'PASS_<conditions met>_<drug>' or
        'FAIL_<conditions failed>_<drug>' string per test drug. For a FAIL the
        count comes from the last subject drug compared (3 if there were none).

    Raises:
        ValueError, SyntaxError: if a list/dict column is not a plain Python
            literal.
    """
    import ast  # local import: this cell only imports pandas

    # literal_eval accepts literals only, so CSV content can never execute
    # code the way eval() would.
    test_drugs = ast.literal_eval(row['TEST_DRUGS'])
    subject_drugs = ast.literal_eval(row['SUBJECT_CUI'])
    manhattan_distances = ast.literal_eval(row['MANHATTAN_DIST'])
    cosine_similarities = ast.literal_eval(row['COSINE_SIM'])
    euclidean_distances = ast.literal_eval(row['EUCLIDEAN_DIST'])

    manhattan_mean = row['MANHATTAN_MEAN']
    cosine_mean = row['COSINE_MEAN']
    euclidean_mean = row['EUCLIDEAN_MEAN']

    # A pair missing from a dict must not satisfy its condition. The previous
    # default of 0 made both distance checks pass, so unmeasured pairs were
    # reported as PASS_2.
    missing_distance = float('inf')
    missing_similarity = float('-inf')

    pass_fail_results = []
    for test_drug in test_drugs:
        pass_found = False
        fail_conditions = 3  # Start with all conditions failing

        for subject_drug in subject_drugs:
            pair = f'({test_drug},{subject_drug})'

            manhattan_pass = manhattan_distances.get(pair, missing_distance) < manhattan_mean
            cosine_pass = cosine_similarities.get(pair, missing_similarity) > cosine_mean
            euclidean_pass = euclidean_distances.get(pair, missing_distance) < euclidean_mean

            conditions_met = sum([manhattan_pass, cosine_pass, euclidean_pass])
            fail_conditions = 3 - conditions_met

            if conditions_met >= 2:
                # One sufficiently close subject drug is enough.
                pass_found = True
                break

        if pass_found:
            pass_fail_results.append(f'PASS_{conditions_met}_{test_drug}')
        else:
            pass_fail_results.append(f'FAIL_{fail_conditions}_{test_drug}')

    return pass_fail_results

# Label every row: PASS_FAIL receives the list of PASS_/FAIL_ strings returned for that row.
df['PASS_FAIL'] = df.apply(determine_pass_fail, axis=1)

# Persist the labelled emb_150 table.
df.to_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_150_new.csv', index=False)


In [None]:
import pandas as pd
# Read back the emb_150 table labelled by the previous cell. The path used to
# point at all_distances_emb_200_new.csv, which made this "150" section report
# the counts of the 200-dimension run.
df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_150_new.csv')

# Function to count total PASS and FAIL

def count_pass_fail(pass_fail_column):
    """Count PASS and FAIL labels across all entries of a PASS_FAIL column.

    Each entry is a string that is split on ', '. A piece containing 'PASS'
    counts as a pass; otherwise a piece containing 'FAIL' counts as a fail;
    anything else is ignored.

    Args:
        pass_fail_column: iterable of strings (e.g. the PASS_FAIL column).

    Returns:
        Tuple ``(pass_count, fail_count)``.
    """
    tally = {'PASS': 0, 'FAIL': 0}
    for cell in pass_fail_column:
        for label in cell.split(', '):
            # 'PASS' takes precedence when both markers appear in one piece.
            verdict = 'PASS' if 'PASS' in label else ('FAIL' if 'FAIL' in label else None)
            if verdict is not None:
                tally[verdict] += 1
    return tally['PASS'], tally['FAIL']

# Tally the labels over the whole PASS_FAIL column and report both totals.
pass_count, fail_count = count_pass_fail(df['PASS_FAIL'])

print(f"Total PASS: {pass_count}")
print(f"Total FAIL: {fail_count}")


Total PASS: 1141
Total FAIL: 358


### 200

In [None]:
import pandas as pd

# Load the emb_dim_200 distance table; determine_pass_fail below reads its
# TEST_DRUGS, SUBJECT_CUI, MANHATTAN_DIST, COSINE_SIM, EUCLIDEAN_DIST and *_MEAN columns.
df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_200_clear_updated.csv')

def determine_pass_fail(row):
    """Label each test drug of one row as PASS or FAIL against the row's means.

    A test drug passes as soon as one subject drug satisfies at least two of:
    Manhattan distance below MANHATTAN_MEAN, cosine similarity above
    COSINE_MEAN, Euclidean distance below EUCLIDEAN_MEAN.

    Args:
        row: mapping or DataFrame row holding TEST_DRUGS and SUBJECT_CUI
            (string form of a list), MANHATTAN_DIST, COSINE_SIM and
            EUCLIDEAN_DIST (string form of a dict keyed by '(test,subject)'),
            plus the numeric MANHATTAN_MEAN, COSINE_MEAN and EUCLIDEAN_MEAN.

    Returns:
        A list with one 'PASS_<conditions met>_<drug>' or
        'FAIL_<conditions failed>_<drug>' string per test drug. For a FAIL the
        count comes from the last subject drug compared (3 if there were none).

    Raises:
        ValueError, SyntaxError: if a list/dict column is not a plain Python
            literal.
    """
    import ast  # local import: this cell only imports pandas

    # literal_eval accepts literals only, so CSV content can never execute
    # code the way eval() would.
    test_drugs = ast.literal_eval(row['TEST_DRUGS'])
    subject_drugs = ast.literal_eval(row['SUBJECT_CUI'])
    manhattan_distances = ast.literal_eval(row['MANHATTAN_DIST'])
    cosine_similarities = ast.literal_eval(row['COSINE_SIM'])
    euclidean_distances = ast.literal_eval(row['EUCLIDEAN_DIST'])

    manhattan_mean = row['MANHATTAN_MEAN']
    cosine_mean = row['COSINE_MEAN']
    euclidean_mean = row['EUCLIDEAN_MEAN']

    # A pair missing from a dict must not satisfy its condition. The previous
    # default of 0 made both distance checks pass, so unmeasured pairs were
    # reported as PASS_2.
    missing_distance = float('inf')
    missing_similarity = float('-inf')

    pass_fail_results = []
    for test_drug in test_drugs:
        pass_found = False
        fail_conditions = 3  # Start with all conditions failing

        for subject_drug in subject_drugs:
            pair = f'({test_drug},{subject_drug})'

            manhattan_pass = manhattan_distances.get(pair, missing_distance) < manhattan_mean
            cosine_pass = cosine_similarities.get(pair, missing_similarity) > cosine_mean
            euclidean_pass = euclidean_distances.get(pair, missing_distance) < euclidean_mean

            conditions_met = sum([manhattan_pass, cosine_pass, euclidean_pass])
            fail_conditions = 3 - conditions_met

            if conditions_met >= 2:
                # One sufficiently close subject drug is enough.
                pass_found = True
                break

        if pass_found:
            pass_fail_results.append(f'PASS_{conditions_met}_{test_drug}')
        else:
            pass_fail_results.append(f'FAIL_{fail_conditions}_{test_drug}')

    return pass_fail_results

# Label every row: PASS_FAIL receives the list of PASS_/FAIL_ strings returned for that row.
df['PASS_FAIL'] = df.apply(determine_pass_fail, axis=1)

# Persist the labelled emb_200 table.
df.to_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_200_new.csv', index=False)


In [None]:
import pandas as pd
# Load the labelled emb_200 table whose PASS_FAIL column is counted below.
# NOTE(review): the preceding cell writes all_distances_emb_200_new.csv, but this
# reads all_distances_emb_200_clear_with_pass_fail.csv — confirm this is the intended file.
df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_200_clear_with_pass_fail.csv')

# Function to count total PASS and FAIL

def count_pass_fail(pass_fail_column):
    """Count PASS and FAIL labels across all entries of a PASS_FAIL column.

    Each entry is a string that is split on ', '. A piece containing 'PASS'
    counts as a pass; otherwise a piece containing 'FAIL' counts as a fail;
    anything else is ignored.

    Args:
        pass_fail_column: iterable of strings (e.g. the PASS_FAIL column).

    Returns:
        Tuple ``(pass_count, fail_count)``.
    """
    tally = {'PASS': 0, 'FAIL': 0}
    for cell in pass_fail_column:
        for label in cell.split(', '):
            # 'PASS' takes precedence when both markers appear in one piece.
            verdict = 'PASS' if 'PASS' in label else ('FAIL' if 'FAIL' in label else None)
            if verdict is not None:
                tally[verdict] += 1
    return tally['PASS'], tally['FAIL']

# Tally the labels over the whole PASS_FAIL column and report both totals.
pass_count, fail_count = count_pass_fail(df['PASS_FAIL'])

print(f"Total PASS: {pass_count}")
print(f"Total FAIL: {fail_count}")


Total PASS: 707
Total FAIL: 792


### Clean the Dataset's Brackets

In [None]:
import pandas as pd

data = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/Final/all_distances_emb_200.csv')

# List-like columns: drop the surrounding square brackets.
for _column in ('OBJECT_CUI', 'SUBJECT_CUI', 'TEST_DRUGS', 'PASS_FAIL'):
    data[_column] = data[_column].str.strip("[]")

# Dict-like columns: drop the surrounding curly braces.
for _column in ('MANHATTAN_DIST', 'EUCLIDEAN_DIST', 'COSINE_SIM'):
    data[_column] = data[_column].str.strip("{}")

# Write the cleaned copy next to the source file and report where it went.
cleaned_file_path = '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/Final/all_distances_emb_200_clean.csv'
data.to_csv(cleaned_file_path, index=False)

print(f"Cleaned file saved to {cleaned_file_path}")


Cleaned file saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/Final/all_distances_emb_200_clean.csv


In [None]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/Temp/all_distances_emb_150_clear_with_pass_fail_cleaned.csv')

# Add each fixed threshold as a constant column, then render it as a
# four-decimal string.
for _name, _value in (
    ('THRESHOLD_MANHATTAN', 60.5927),
    ('THRESHOLD_COSINE_SIM', 0.6362),
    ('THRESHOLD_EUCLIDEAN', 6.1952),
):
    df[_name] = _value
    df[_name] = df[_name].map('{:.4f}'.format)

# The result overwrites the file that was just read.
df.to_csv(
    '/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/Temp/all_distances_emb_150_clear_with_pass_fail_cleaned.csv',
    index=False,
)




THRESHOLD sütunları doğru formatta eklendi ve kaydedildi.


In [None]:
import pandas as pd
import ast  # To safely parse string representations of dictionaries

# Source table whose Manhattan_Distances / Cosine_Similarities / Euclidean_Distances
# columns are parsed by calculate_means below, and the destination for the copy
# extended with the three mean columns.
input_file = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01_cleaned.csv"
output_file = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01_cleaned_updated.csv"

df = pd.read_csv(input_file)

def calculate_means(row):
    """Average the values of the three distance/similarity dicts of one row.

    Args:
        row: mapping or DataFrame row whose "Manhattan_Distances",
            "Cosine_Similarities" and "Euclidean_Distances" entries are the
            string form of a dict with numeric values.

    Returns:
        pd.Series of three floats in the order Manhattan mean, cosine mean,
        Euclidean mean. An empty dict yields NaN for its mean instead of
        raising ZeroDivisionError.

    Raises:
        ValueError, SyntaxError: if an entry is not a plain Python literal.
    """
    def _mean_of(column):
        # Parse the stored dict safely and average its values.
        values = ast.literal_eval(row[column]).values()
        if len(values) == 0:
            return float('nan')
        return sum(values) / len(values)

    return pd.Series([
        _mean_of("Manhattan_Distances"),
        _mean_of("Cosine_Similarities"),
        _mean_of("Euclidean_Distances"),
    ])

# Compute the three means row by row and store them as new columns.
df[["MANHATTAN_MEAN", "COSINE_MEAN", "EUCLIDEAN_MEAN"]] = df.apply(calculate_means, axis=1)
# Write the extended table to output_file and report the destination.
df.to_csv(output_file, index=False)

print(f"Updated dataset saved to {output_file}")


Updated dataset saved to /content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01_cleaned_updated.csv


In [None]:
import pandas as pd

# file_path_1: table carrying the per-object mean columns.
# file_path_2: emb_dim_150 distance table that receives those means.
file_path_1 = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/01-150/drugs_with_distances_and_similarities_150_01_cleaned_updated.csv"
file_path_2 = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_150_clear.csv"

df1 = pd.read_csv(file_path_1)
df2 = pd.read_csv(file_path_2)

# Left join on OBJECT_CUI: every row of df2 is kept, and objects without a
# match in df1 get NaN in the three mean columns.
merged_df = pd.merge(
    df2,
    df1[['OBJECT_CUI', 'MANHATTAN_MEAN', 'COSINE_MEAN', 'EUCLIDEAN_MEAN']],
    how='left',
    on='OBJECT_CUI',
)

merged_df.to_csv("/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_150_clear_meaned.csv", index=False)
print(merged_df)


    OBJECT_CUI                                        SUBJECT_CUI  \
0     C0001206                                       ['C0023863']   
1     C0001627  ['C0005308', 'C0056391', 'C0025815', 'C0600901...   
2     C0002171   ['C0040864', 'C0040866', 'C0040867', 'C0077004']   
3     C0002395               ['C0001040', 'C3714499', 'C0051691']   
4     C0002726               ['C0001047', 'C0010654', 'C2745274']   
..         ...                                                ...   
252   C2267227                                       ['C0016365']   
253   C2316304                                       ['C3853822']   
254   C2585890                                       ['C0796392']   
255   C2973725  ['C1176329', 'C0053336', 'C0252643', 'C0033567...   
256   C3495559  ['C0018033', 'C0005308', 'C0538927', 'C0056391...   

                                            TEST_DRUGS  \
0                 ['C0013030', 'C0023570', 'C0037659']   
1    ['C0014563', 'C0544368', 'C0025819', 'C0032952... 

In [None]:
import pandas as pd

file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_150_clear_meaned.csv"

df = pd.read_csv(file_path)

# Keep only the rows that have a MANHATTAN_MEAN value.
df_cleaned = df[df['MANHATTAN_MEAN'].notna()]

cleaned_file_path = "/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_150_clear_meaned_new.csv"
df_cleaned.to_csv(cleaned_file_path, index=False)

print(df_cleaned)


    OBJECT_CUI                                        SUBJECT_CUI  \
1     C0001627  ['C0005308', 'C0056391', 'C0025815', 'C0600901...   
2     C0002171   ['C0040864', 'C0040866', 'C0040867', 'C0077004']   
3     C0002395               ['C0001040', 'C3714499', 'C0051691']   
4     C0002726               ['C0001047', 'C0010654', 'C2745274']   
5     C0002874                           ['C0030072', 'C2364481']   
..         ...                                                ...   
243   C1269683  ['C3885614', 'C1880288', 'C0013065', 'C0066624...   
247   C1384606   ['C0014938', 'C0012145', 'C1313616', 'C0011185']   
249   C1527383                           ['C0000473', 'C0053225']   
255   C2973725  ['C1176329', 'C0053336', 'C0252643', 'C0033567...   
256   C3495559  ['C0018033', 'C0005308', 'C0538927', 'C0056391...   

                                            TEST_DRUGS  \
1    ['C0014563', 'C0544368', 'C0025819', 'C0032952...   
2                                         ['C1619966'] 

In [None]:
import pandas as pd

# Load the emb_dim_200 distance table (TEST_DRUGS, SUBJECT_CUI, distance dicts and *_MEAN columns).
# NOTE(review): the "### 200" cell earlier in the notebook reads this same file and
# writes the same output path — confirm this repeat cell is still needed.
df = pd.read_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_dim_200_clear_updated.csv')

# Define the function to determine pass/fail
def determine_pass_fail(row):
    """Label each test drug of one row as PASS or FAIL against the row's means.

    A test drug passes as soon as one subject drug satisfies at least two of:
    Manhattan distance below MANHATTAN_MEAN, cosine similarity above
    COSINE_MEAN, Euclidean distance below EUCLIDEAN_MEAN.

    Args:
        row: mapping or DataFrame row holding TEST_DRUGS and SUBJECT_CUI
            (string form of a list), MANHATTAN_DIST, COSINE_SIM and
            EUCLIDEAN_DIST (string form of a dict keyed by '(test,subject)'),
            plus the numeric MANHATTAN_MEAN, COSINE_MEAN and EUCLIDEAN_MEAN.

    Returns:
        A list with one 'PASS_<conditions met>_<drug>' or
        'FAIL_<conditions failed>_<drug>' string per test drug. For a FAIL the
        count comes from the last subject drug compared (3 if there were none).

    Raises:
        ValueError, SyntaxError: if a list/dict column is not a plain Python
            literal.
    """
    import ast  # local import: this cell only imports pandas

    # literal_eval accepts literals only, so CSV content can never execute
    # code the way eval() would.
    test_drugs = ast.literal_eval(row['TEST_DRUGS'])
    subject_drugs = ast.literal_eval(row['SUBJECT_CUI'])
    manhattan_distances = ast.literal_eval(row['MANHATTAN_DIST'])
    cosine_similarities = ast.literal_eval(row['COSINE_SIM'])
    euclidean_distances = ast.literal_eval(row['EUCLIDEAN_DIST'])

    # Per-row thresholds: each row carries its own means.
    manhattan_mean = row['MANHATTAN_MEAN']
    cosine_mean = row['COSINE_MEAN']
    euclidean_mean = row['EUCLIDEAN_MEAN']

    # A pair missing from a dict must not satisfy its condition. The previous
    # default of 0 made both distance checks pass, so unmeasured pairs were
    # reported as PASS_2.
    missing_distance = float('inf')
    missing_similarity = float('-inf')

    pass_fail_results = []
    for test_drug in test_drugs:
        pass_found = False
        fail_conditions = 3  # Start with all conditions failing

        for subject_drug in subject_drugs:
            pair = f'({test_drug},{subject_drug})'

            manhattan_pass = manhattan_distances.get(pair, missing_distance) < manhattan_mean
            cosine_pass = cosine_similarities.get(pair, missing_similarity) > cosine_mean
            euclidean_pass = euclidean_distances.get(pair, missing_distance) < euclidean_mean

            conditions_met = sum([manhattan_pass, cosine_pass, euclidean_pass])
            fail_conditions = 3 - conditions_met

            if conditions_met >= 2:
                # One sufficiently close subject drug is enough.
                pass_found = True
                break

        # Record the verdict for this test drug.
        if pass_found:
            pass_fail_results.append(f'PASS_{conditions_met}_{test_drug}')
        else:
            pass_fail_results.append(f'FAIL_{fail_conditions}_{test_drug}')

    return pass_fail_results

# Label every row: PASS_FAIL receives the list of PASS_/FAIL_ strings returned for that row.
df['PASS_FAIL'] = df.apply(determine_pass_fail, axis=1)

# Save the labelled table as all_distances_emb_200_new.csv (index column omitted).
df.to_csv('/content/drive/MyDrive/Digital Twin Study/Mehmet Can/Datasets/Test/all_distances_emb_200_new.csv', index=False)
