<a href="https://colab.research.google.com/github/dikraMasrour/Breast_Cancer_Risk_Factor_Prediction_KG/blob/main/untitled.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
# Install the torchkge package; the %%capture cell magic above hides this cell's output.
!pip install torchkge

#### Mounting drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and pickle paths used later live under it.
drive.mount('/content/drive')

#### Imports

In [None]:
from torch import cuda
from torch.optim import Adam
from torchkge.models import TransEModel
from torchkge.sampling import BernoulliNegativeSampler
from torchkge.utils import MarginLoss, DataLoader
from torchkge.utils.datasets import load_fb15k
from tqdm.autonotebook import tqdm
import torch
from torchkge.evaluation import LinkPredictionEvaluator
import numpy as np
from os.path import join
from os import makedirs, remove
from os.path import exists
from pandas import concat, DataFrame, merge, read_csv
from urllib.request import urlretrieve
from torchkge.data_structures import KnowledgeGraph
# from torchkge.utils import get_data_home, safe_extract
from torchkge.utils import get_data_home
from torchkge.utils.operations import extend_dicts
from pandas import concat, DataFrame, merge, read_csv

In [None]:
# Use CUDA if it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Echo the selected device as the cell output
device

In [None]:
# Location of the raw triples file on the mounted Google Drive
dataPath = "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/Copie de all_data_triples_can.csv"
# NOTE(review): the file name ends in .csv but it is read as gzip — confirm the file really is gzip-compressed
df = read_csv(dataPath, compression='gzip')
# Preview the first rows
df.head()

In [None]:
def split(df_, random_state=None):
    """Split a triples DataFrame into disjoint train / test / validation parts.

    80% of the rows of every ('from', 'to') group go to the training set.
    The remaining rows are divided evenly: half are sampled into the test
    set and whatever is left becomes the validation set.

    Parameters
    ----------
    df_: pandas.DataFrame
        Triples with at least the columns 'from' and 'to'. Rows are tracked
        by index label, so the index must be unique.
    random_state: int, optional
        Seed forwarded to both sampling calls to make the split reproducible.
        The default (None) keeps the sampling unseeded.

    Returns
    -------
    train_df: pandas.DataFrame
    test_df: pandas.DataFrame
    val_df: pandas.DataFrame

    """
    train_df = df_.groupby(['from', 'to']).sample(frac=0.8,
                                                  random_state=random_state)

    # Everything that was not picked for training is held out
    held_out = df_[~df_.index.isin(train_df.index)]

    # Sample the test set from the held-out rows only; sampling from the full
    # frame would put training rows into the test set
    test_df = held_out.sample(frac=0.5, random_state=random_state)
    val_df = held_out[~held_out.index.isin(test_df.index)]

    print(train_df.shape, test_df.shape, val_df.shape)
    return train_df, test_df, val_df


In [None]:
# Rename the raw columns to from / to / rel, one column per step
# (presumably the column names torchkge's KnowledgeGraph expects — TODO confirm)
df1 = df.rename(columns={'SUBJECT_CUI': 'from'})
df2 = df1.rename(columns={'OBJECT_CUI': 'to'})
df3 = df2.rename(columns={'PREDICATE': 'rel'})

In [None]:
# Preview the frame after the column renames
df3.head()

In [None]:
# Drop the ORIGIN_ID column in place, then preview the result
del df3["ORIGIN_ID"]
df3.head()

In [None]:
# train_df, test_df, val_df = split(df3)

In [None]:
#  In general
def load_biodata(data_home=None):
    """Load the pre-split biodata triples as three knowledge graphs.

    Reads `train.csv`, `valid.csv` and `test.csv` from the `biodata` folder
    inside `data_home`, builds a single knowledge graph from their
    concatenation and splits it back into three graphs whose sizes match
    the three files. Nothing is downloaded: the files must already exist.

    Parameters
    ----------
    data_home: str, optional
        Path to the directory containing the `biodata` folder. When omitted,
        the directory returned by `torchkge.utils.get_data_home()` is used.

    Returns
    -------
    kg_train: torchkge.data_structures.KnowledgeGraph
    kg_val: torchkge.data_structures.KnowledgeGraph
    kg_test: torchkge.data_structures.KnowledgeGraph

    """
    # The default of None cannot be concatenated with a string, so resolve it
    # to the torchkge data directory first
    if data_home is None:
        data_home = get_data_home()
    data_path = data_home + '/biodata'

    df1 = read_csv(data_path + '/train.csv')
    df2 = read_csv(data_path + '/valid.csv')
    df3 = read_csv(data_path + '/test.csv')

    # Build one graph over all triples and split it by the original file
    # sizes; the concatenation order (train, valid, test) fixes which rows
    # end up in which split
    df = concat([df1, df2, df3])
    kg = KnowledgeGraph(df)

    return kg.split_kg(sizes=(len(df1), len(df2), len(df3)))


In [None]:
# Locations of the pickled train / test / validation objects on the mounted Google Drive
kg_train_path = "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/DATA/kg_train_pickle"
kg_test_path = "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/DATA/kg_test_pickle"
kg_val_path = "/content/drive/MyDrive/Colab Notebooks/KG_breast_cancer/DATA/kg_val_pickle"

In [None]:
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The with-block closes the file even if unpickling raises.
with open(kg_train_path, 'rb') as file:
    train_data = pickle.load(file)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The with-block closes the file even if unpickling raises.
with open(kg_test_path, 'rb') as file:
    test_data = pickle.load(file)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
# The with-block closes the file even if unpickling raises.
with open(kg_val_path, 'rb') as file:
    val_data = pickle.load(file)

In [None]:
# Load dataset
# NOTE(review): no data_home is passed here — confirm load_biodata can resolve a default directory.
# NOTE(review): these graphs, not the pickled train_data / val_data / test_data loaded above,
# are what the training loop receives — confirm that is intended.
kg_train, kg_val, kg_test = load_biodata()

In [None]:
class TrainLoop:
    """Train a TransE model with early stopping, then evaluate it.

    Parameters
    ----------
    args: object
        Hyper-parameter container. The attributes read are `n_epochs`,
        `b_size`, `model_path`, `emb_dim`, `margin` and `lr`.
    kg_train, kg_val, kg_test:
        Knowledge graphs used for training, for validation during training
        and for the final test (presumably torchkge KnowledgeGraph
        objects — `kg_train` must expose `n_ent` and `n_rel`).

    Notes
    -----
    `fit_step` and `test_step` read the module-level `device` variable.
    """

    def __init__(self, args, kg_train, kg_val, kg_test):
        self.n_epochs = args.n_epochs
        self.kg_train, self.kg_val, self.kg_test = kg_train, kg_val, kg_test

        # Only ask the DataLoaders to put the data on the GPU when one exists,
        # matching the cuda.is_available() guard used for the model below
        use_cuda = 'all' if cuda.is_available() else None
        self.train_dataloader = DataLoader(kg_train, batch_size=args.b_size, use_cuda=use_cuda)
        self.val_dataloader = DataLoader(kg_val, batch_size=args.b_size, use_cuda=use_cuda)
        self.test_dataloader = DataLoader(kg_test, batch_size=args.b_size, use_cuda=use_cuda)

        self.model_path = args.model_path
        self.model = TransEModel(args.emb_dim, kg_train.n_ent, kg_train.n_rel, dissimilarity_type='L2')
        self.criterion = MarginLoss(args.margin)

        # Move everything to CUDA if available
        if cuda.is_available():
            cuda.empty_cache()
            self.model.cuda()
            self.criterion.cuda()

        # Define the torch optimizer to be used
        self.optimizer = Adam(self.model.parameters(), lr=args.lr, weight_decay=1e-5)

        self.sampler = BernoulliNegativeSampler(self.kg_train)

    def fit_step(self):
        """Run one pass over the training batches.

        Returns
        -------
        float
            Sum of the per-batch loss values (not averaged).
        """
        running_loss = 0.0
        for batch in self.train_dataloader:
            h, t, r = batch[0].to(device), batch[1].to(device), batch[2].to(device)

            # The sampler is given CPU copies of the batch (presumably because
            # its internal tensors live on the CPU — TODO confirm); the
            # corrupted heads/tails are then moved to the training device
            n_h, n_t = self.sampler.corrupt_batch(h.to('cpu'), t.to('cpu'), r.to('cpu'))
            n_h, n_t = n_h.to(device), n_t.to(device)
            self.optimizer.zero_grad()

            # forward + backward + optimize
            pos, neg = self.model(h, t, r, n_h, n_t)
            loss = self.criterion(pos, neg)
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()
        return running_loss

    def test_step(self):
        """Reload the checkpoint saved by `fit` and print test-set results.

        Replaces `self.model` with the model stored at `self.model_path`.
        """
        # map_location lets a checkpoint saved on a GPU be loaded on whatever
        # device this session uses.
        # NOTE(review): the checkpoint is a whole pickled model; recent PyTorch
        # releases may restrict such loads by default — confirm on the target version.
        self.model = torch.load(self.model_path, map_location=device)
        self.model.eval()
        evaluator = LinkPredictionEvaluator(self.model, self.kg_test)
        evaluator.evaluate(200, verbose=True)
        evaluator.print_results()

    def eval_step(self):
        """Evaluate the current model on the validation graph.

        Returns
        -------
        The second element of `evaluator.mrr()` (the filtered MRR according
        to the torchkge documentation).
        """
        evaluator = LinkPredictionEvaluator(self.model, self.kg_val)
        evaluator.evaluate(200, verbose=False)
        return evaluator.mrr()[1]

    def fit(self):
        """Train for up to `n_epochs` epochs with early stopping.

        After every epoch the model is scored with `eval_step`. Each time the
        score improves on the best seen so far, the whole model is saved to
        `self.model_path`. Training stops once the score has failed to
        improve for more than `patience` (10) consecutive epochs.
        """
        iterator = tqdm(range(self.n_epochs), unit='epoch')
        best_val = -np.inf
        patience = 10
        patience_count = 0
        for epoch in iterator:
            running_loss = self.fit_step()
            val_ = self.eval_step()

            if val_ > best_val:
                # New best validation score: checkpoint and reset the counter
                best_val = val_
                torch.save(self.model, self.model_path)
                patience_count = 0
            else:
                if patience_count == patience:
                    break
                patience_count += 1

            mean_loss = running_loss / len(self.train_dataloader)
            iterator.set_description(f'Epoch {epoch + 1} | mean loss: {mean_loss:.5f}')

        # Applies to the in-memory model only; the checkpoint on disk is not rewritten
        self.model.normalize_parameters()

In [None]:
class Params:
    """Empty attribute container.

    Instances start with no attributes; hyper-parameters are attached to
    them by plain attribute assignment after construction.
    """

In [None]:
# Empty container; the hyper-parameters are attached as attributes in the next cell
args = Params()

In [None]:
# Hyper-parameters read by TrainLoop
args.emb_dim = 100  # embedding dimension passed to TransEModel
args.lr = 0.0004  # learning rate of the Adam optimizer
args.n_epochs = 1000  # maximum number of epochs (fit may stop earlier)
args.b_size = 32768  # batch size of the train / val / test DataLoaders
args.margin = 0.5  # margin passed to MarginLoss
# NOTE(review): MODEL_PATH is not defined anywhere in the visible notebook — define it before running this cell.
args.model_path = join(MODEL_PATH, "basic.bt")

In [None]:
# Build the training loop from the hyper-parameters and the three knowledge graphs
tloop = TrainLoop(args, kg_train, kg_val, kg_test)

In [None]:
# Train the model; the best-scoring checkpoint is saved to args.model_path
tloop.fit()

In [None]:
# Reload the saved checkpoint and print link-prediction results on the test graph
tloop.test_step()