<a href="https://colab.research.google.com/github/dikraMasrour/Breast_Cancer_Risk_Factor_Prediction_KG/blob/main/Preproc_Modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Breast cancer biomedical KG Preprocessing

In [1]:
%%capture
!pip install torchkge

#### Mounting drive

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### Imports

In [3]:
# basics
import pandas as pd
import numpy as np
from os.path import join
from os import makedirs, remove
from os.path import exists
from tqdm.autonotebook import tqdm
import pickle
import gc

# torchkge related
import torch
from torch import cuda
from torch.optim import Adam
from torchkge.models import TransEModel
from torchkge.sampling import BernoulliNegativeSampler
from torchkge.utils import MarginLoss, DataLoader
from torchkge.data_structures import KnowledgeGraph
from torchkge.utils import get_data_home
from torchkge.utils.operations import extend_dicts
from torchkge.evaluation import LinkPredictionEvaluator

MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/models'

  from tqdm.autonotebook import tqdm


In [4]:
# set up cuda if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
dataPath = "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/preprocessed_KG.pkl"
df = pd.read_pickle(dataPath)
df.head()

Unnamed: 0,rel,from,to
2,ISA,C0318627,C0206590
3,ISA,C0446169,C0003725
4,PROCESS_OF,C0012634,C0020114
5,CAUSES,C0042776,C0012634
12,PRODUCES,C0007523,C0019878


In [None]:
df.shape

(8279795, 3)

In [None]:
def split(df_):
    # group by is used, so the model is able to train on all the nodes
    train_df = df_.groupby(['from', 'to']).sample(frac = 0.8) # samples triples from each group
    test_val = df_[~df_.index.isin(train_df.index)]
    test_df = df_.sample(frac = 0.5)
    val_df = test_val[~test_val.index.isin(test_df.index)]
    print(train_df.shape, test_df.shape, val_df.shape)
    return train_df, test_df, val_df


In [None]:
# RUN ONCE
train_df, test_df, val_df = split(df)

(7192092, 3) (4139898, 3) (543861, 3)


In [None]:
# RUN ONCE : save splits as pickle files
train_df.to_pickle('/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/splits/train_df.pkl')
test_df.to_pickle('/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/splits/test_df.pkl')
val_df.to_pickle('/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/splits/val_df.pkl')

In [None]:
gc.collect()

0

In [5]:
def load_kgs_from_df(data_home=None):
    data_home = '/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/splits'
    df1 = pd.read_pickle(data_home + '/train_df.pkl')
    df2 = pd.read_pickle(data_home + '/val_df.pkl')
    df3 = pd.read_pickle(data_home + '/test_df.pkl')
    df = pd.concat([df1, df2, df3])
    kg = KnowledgeGraph(df)

    return kg.split_kg(sizes=(len(df1), len(df2), len(df3)))

In [6]:
# RUN ONCE
kg_train, kg_val, kg_test = load_kgs_from_df()

In [7]:
gc.collect()

0

In [33]:
class TrainLoop():
    def __init__(self, args , kg_train, kg_val, kg_test):
        self.n_epochs = args.n_epochs
        self.kg_train, self.kg_val, self.kg_test = kg_train, kg_val, kg_test
        self.train_dataloader = DataLoader(kg_train, batch_size=args.b_size, use_cuda='all')
        self.val_dataloader = DataLoader(kg_val, batch_size=args.b_size, use_cuda='all')
        self.test_dataloader = DataLoader(kg_test, batch_size=args.b_size, use_cuda='all')
        self.model_path = args.model_path
        self.model = TransEModel(args.emb_dim, kg_train.n_ent, kg_train.n_rel, dissimilarity_type='L2')
        self.criterion = MarginLoss(args.margin)
        # Move everything to CUDA if available
        if cuda.is_available():
            cuda.empty_cache()
            self.model.cuda()
            self.criterion.cuda()

        # Define the torch optimizer to be used
        self.optimizer = Adam(self.model.parameters(), lr=args.lr, weight_decay=1e-5)

        self.sampler = BernoulliNegativeSampler(self.kg_train)

    def fit_step(self):
        running_loss = 0.0
        for i, batch in enumerate(self.train_dataloader):
            h, t, r = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            n_h, n_t = self.sampler.corrupt_batch(h.to('cpu'), t.to('cpu'), r.to('cpu'))
            n_h, n_t = n_h.to(device), n_t.to(device)
            self.optimizer.zero_grad()

            # forward + backward + optimize
            pos, neg = self.model(h, t, r, n_h, n_t)
            print(pos, neg)
            loss = self.criterion(pos, neg)
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()
            print(running_loss)
        return running_loss
        
    def test_step(self):
        self.model = torch.load(self.model_path)
        self.model.eval()
        evaluator = LinkPredictionEvaluator(self.model, self.kg_test)
        evaluator.evaluate(200, verbose=True)
        evaluator.print_results()

    def eval_step(self):
        evaluator = LinkPredictionEvaluator(self.model, self.kg_val)
        evaluator.evaluate(200, verbose=False)
        return evaluator.mrr()[1]

    def fit(self):
        iterator = tqdm(range(self.n_epochs), unit='epoch')
        best_val = -np.inf
        patience= 10
        patience_count = 0
        for epoch in iterator:
            running_loss = self.fit_step()
            val_ = self.eval_step()

            if val_> best_val:
                best_val = val_
                torch.save(self.model, self.model_path)
                patience_count = 0
            else:
                if patience_count == patience:
                    break
                else:
                    patience_count += 1

            iterator.set_description(
                'Epoch {} | mean loss: {:.5f}'.format(epoch + 1,
                                                      running_loss / len(self.train_dataloader)))

        self.model.normalize_parameters()

In [23]:
class Params():
    def __init__(self):
        pass

In [24]:
args = Params()

In [29]:
args.emb_dim = 100
args.lr = 0.0004
args.n_epochs = 1
args.b_size = 9143 # following the proportion of the batch size for 28M samples
args.margin = 0.5
args.model_path = join(MODEL_PATH, "basic.bt")

In [30]:
tloop = TrainLoop(args, kg_train, kg_val, kg_test)

In [31]:
gc.collect()

319

In [None]:
tloop.fit()

In [None]:
tloop.test_step()