In [None]:
# Install required packages.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
!pip install -q torch-geometric
!pip install transformers

In [None]:
import os
import shutil
import time

import math

import csv
import random

import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity



import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder, TransformerDecoderLayer


import tokenizers
from tokenizers import Tokenizer
from transformers import BertTokenizerFast, BertModel

In [None]:
import os.path as osp
import zipfile

import torch
from torch_geometric.data import download_url, Data
from torch_geometric.data import Dataset as GeoDataset
from torch_geometric.data import DataLoader as GeoDataLoader
from torch_geometric.data import Data, Batch
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool

In [None]:
#Need a special generator for random sampling:


class GenerateData():
  """Loads the text/molecule splits and yields randomly paired examples.

  Every yielded example pairs one molecule (its mol2vec vector) with a
  tokenized description: the molecule's own description when ``label`` is 1,
  or the description of a randomly drawn molecule from the same split when
  ``label`` is 0.
  """

  def __init__(self, path_train, path_val, path_test, path_molecules, path_token_embs):
    """Store the input paths, build the tokenizer and load all data files.

    Args:
      path_train, path_val, path_test: tab-separated files with the columns
        ``cid``, ``mol2vec`` and ``desc`` (no header row).
      path_molecules: text file with one ``cid: token token ...`` line per molecule.
      path_token_embs: ``.npy`` file holding the token embedding lookup.
    """
    self.path_train = path_train
    self.path_val = path_val
    self.path_test = path_test
    self.path_molecules = path_molecules
    self.path_token_embs = path_token_embs

    self.mol_trunc_length = 512
    self.text_trunc_length = 256

    self.prep_text_tokenizer()

    self.load_substructures()

    self.batch_size = 32

    self.store_descriptions()

  def load_substructures(self):
    """Read the substructure corpus and the token embedding lookup.

    Fills ``molecule_sentences`` (cid -> token string), tracks the longest
    token sequence in ``max_mol_length`` and loads ``token_embs``.
    """
    self.molecule_sentences = {}
    self.molecule_tokens = {}

    self.max_mol_length = 0
    with open(self.path_molecules) as f:
      for line in f:
        spl = line.split(":")
        cid = spl[0]
        tokens = spl[1].strip()
        self.molecule_sentences[cid] = tokens
        self.max_mol_length = max(self.max_mol_length, len(tokens.split()))

    # `[()]` unwraps the 0-d object array that np.save produces for a single
    # Python object (used as a dict-like lookup elsewhere in this notebook).
    self.token_embs = np.load(self.path_token_embs, allow_pickle = True)[()]

  def prep_text_tokenizer(self):
    """Load the SciBERT tokenizer used for the descriptions."""
    self.text_tokenizer = BertTokenizerFast.from_pretrained("allenai/scibert_scivocab_uncased")

  def _read_split(self, path):
    """Read one split file into ``descriptions``/``mols``; return its cids in file order."""
    cids = []
    with open(path) as f:
      reader = csv.DictReader(f, delimiter="\t", quoting=csv.QUOTE_NONE, fieldnames = ['cid', 'mol2vec', 'desc'])
      for line in reader:
        self.descriptions[line['cid']] = line['desc']
        self.mols[line['cid']] = line['mol2vec']
        cids.append(line['cid'])
    return cids

  def store_descriptions(self):
    """Load the train/val/test files and remember which cids belong to which split."""
    self.descriptions = {}

    self.mols = {}

    self.training_cids = self._read_split(self.path_train)
    self.validation_cids = self._read_split(self.path_val)
    self.test_cids = self._read_split(self.path_test)

  #transformers can't take array with full attention so have to pad a 0...
  def padarray(self, A, size, value=0):
      """Right-pad the 1-D array ``A`` with ``value`` up to length ``size``."""
      t = size - len(A)
      return np.pad(A, pad_width=(0, t), mode='constant', constant_values = value)

  def _generate_examples(self, cids):
    """Shuffle ``cids`` in place and yield one example per cid.

    The order of the np.random calls (shuffle, then randint + choice per cid)
    is the same as in the three public generators this helper backs.
    """
    np.random.shuffle(cids)

    for cid in cids:
      label = np.random.randint(2)
      # NOTE(review): the random draw may return `cid` itself, in which case a
      # matching pair is labelled 0 - confirm whether that is acceptable.
      rand_cid = np.random.choice(cids)
      text_cid = cid if label else rand_cid

      # Tokenize to one position short of the target length; padarray then
      # appends a trailing 0 so the attention mask is never all ones.
      text_input = self.text_tokenizer(self.descriptions[text_cid], truncation=True, max_length=self.text_trunc_length - 1,
                                      padding='max_length', return_tensors = 'np')

      text_ids = self.padarray(text_input['input_ids'].squeeze(), self.text_trunc_length)
      text_mask = self.padarray(text_input['attention_mask'].squeeze(), self.text_trunc_length)

      yield {
          'cid': cid,
          'input': {
              'text': {
                'input_ids': text_ids,
                'attention_mask': text_mask,
              },
              'molecule' : {
                    'mol2vec' : np.fromstring(self.mols[cid], sep = " "),
                    'cid' : cid
              },
          },
          'label': label
      }

  def generate_examples_train(self):
    """Yields examples."""
    yield from self._generate_examples(self.training_cids)

  def generate_examples_val(self):
    """Yields examples."""
    yield from self._generate_examples(self.validation_cids)

  def generate_examples_test(self):
    """Yields examples."""
    yield from self._generate_examples(self.test_cids)




# Input files, given relative to the notebook's working directory.
mounted_path_token_embs = "input/token_embedding_dict.npy"
mounted_path_train = "input/mol2vec_ChEBI_20_training.txt"
mounted_path_val = "input/mol2vec_ChEBI_20_val.txt"
mounted_path_test = "input/mol2vec_ChEBI_20_test.txt"
mounted_path_molecules = "input/ChEBI_defintions_substructure_corpus.cp"
# Building the generator loads the tokenizer and reads every file above;
# `gt` is used by the later cells of this notebook.
gt = GenerateData(mounted_path_train, mounted_path_val, mounted_path_test, mounted_path_molecules, mounted_path_token_embs)


In [None]:


class Dataset(Dataset):
  """Map-style dataset facade over an example generator.

  ``gen`` is a zero-argument callable returning an iterator of dicts with the
  keys ``'input'`` and ``'label'``. The requested index is ignored: each
  lookup simply returns the next generated example, restarting the generator
  once it is exhausted.
  """

  # Marker distinguishing "generator exhausted" from any yielded value.
  _EXHAUSTED = object()

  def __init__(self, gen, length):
      """Keep the generator factory, open a first iterator and record the length."""
      self.gen = gen
      self.it = iter(self.gen())
      self.length = length

  def __len__(self):
      """Number of samples reported to the data loader."""
      return self.length

  def __getitem__(self, index):
      """Return ``(input, label)`` of the next example; ``index`` is not used."""
      example = next(self.it, self._EXHAUSTED)
      if example is self._EXHAUSTED:
        # Start a fresh pass over the data and take its first example.
        self.it = iter(self.gen())
        example = next(self.it)

      return example['input'], example['label']

# One generator-backed dataset per split; the reported length is the number of cids in that split.
training_set = Dataset(gt.generate_examples_train, len(gt.training_cids))
validation_set = Dataset(gt.generate_examples_val, len(gt.validation_cids))
test_set = Dataset(gt.generate_examples_test, len(gt.test_cids))


In [None]:

# DataLoader parameters shared by all three splits.
# NOTE(review): the datasets above ignore the requested index and shuffle
# inside their generators, so 'shuffle' here should have no effect on the
# example order - confirm.
params = {'batch_size': 1,
          'shuffle': True,
          'num_workers': 1}

training_generator = DataLoader(training_set, **params)
validation_generator = DataLoader(validation_set, **params)
test_generator = DataLoader(test_set, **params)


In [None]:

class MoleculeGraphDataset(GeoDataset):
    """Molecule graphs stored as one processed ``data_<cid>.pt`` file per cid.

    The raw ``<cid>.graph`` files are extracted from a zip archive that is
    first copied into ``raw_dir``. Node features are looked up in
    ``gt.token_embs``.
    """

    # File name of the graph archive inside `raw_dir`. It must be a relative
    # name: os.path.join discards `raw_dir` when the last component is absolute.
    _ARCHIVE_NAME = "mol_graphs.zip"
    _RAW_SUFFIX = ".graph"

    def __init__(self, root, cids, data_path, gt, transform=None, pre_transform=None):
        """
        Args:
            root: dataset root directory handed to the base class.
            cids: molecule ids (strings) that make up this dataset.
            data_path: path of the zip archive containing the ``.graph`` files.
            gt: object exposing ``token_embs`` (substructure id -> embedding).
            transform, pre_transform: forwarded to the base class.
        """
        self.cids = cids
        self.data_path = data_path
        self.gt = gt
        super(MoleculeGraphDataset, self).__init__(root, transform, pre_transform)

        # Positional index -> integer cid, in the order of `raw_paths`.
        self.idx_to_cid = {
            i: self._cid_from_path(raw_path) for i, raw_path in enumerate(self.raw_paths)
        }

    @classmethod
    def _cid_from_path(cls, raw_path):
        """Return the integer cid encoded in a ``<cid>.graph`` file path."""
        return int(osp.basename(raw_path)[:-len(cls._RAW_SUFFIX)])

    @property
    def raw_file_names(self):
        return [cid + self._RAW_SUFFIX for cid in self.cids]

    @property
    def processed_file_names(self):
        return ['data_{}.pt'.format(cid) for cid in self.cids]

    def download(self):
        """Copy the graph archive into ``self.raw_dir``."""
        shutil.copy(self.data_path, osp.join(self.raw_dir, self._ARCHIVE_NAME))

    def process_graph(self, raw_path):
        """Parse one ``.graph`` file.

        The file is read as: one skipped header line, one edge per line (two
        integers) up to the first blank line, one more skipped line, then one
        node per line whose last whitespace-separated field is the
        substructure id.

        Returns:
            ``(edge_index, x)``: a LongTensor holding the transposed edge list
            and a FloatTensor with one embedding row per node.
        """
        edge_index = []
        x = []
        with open(raw_path, 'r') as f:
            next(f)  # skip the header of the edge section
            for line in f:  # edges
                if line == "\n":
                    break
                edge_index.append(tuple(map(int, line.split())))
            next(f)  # skip the header of the node section
            for line in f:  # get mol2vec features
                substruct_id = line.strip().split()[-1]
                if substruct_id in self.gt.token_embs:
                    x.append(self.gt.token_embs[substruct_id])
                else:
                    # Unknown substructures share the 'UNK' embedding.
                    x.append(self.gt.token_embs['UNK'])

        # Stack the rows with numpy first: building a tensor directly from a
        # Python list of arrays converts element by element.
        return torch.LongTensor(edge_index).T, torch.tensor(np.asarray(x), dtype=torch.float)

    def process(self):
        """Extract the archive and write one processed ``Data`` file per raw graph."""
        with zipfile.ZipFile(osp.join(self.raw_dir, self._ARCHIVE_NAME), 'r') as zip_ref:
            zip_ref.extractall(self.raw_dir)

        for raw_path in self.raw_paths:
            cid = self._cid_from_path(raw_path)

            edge_index, x = self.process_graph(raw_path)
            data = Data(x=x, edge_index = edge_index)

            if self.pre_filter is not None and not self.pre_filter(data):
                continue

            if self.pre_transform is not None:
                data = self.pre_transform(data)

            torch.save(data, osp.join(self.processed_dir, 'data_{}.pt'.format(cid)))

    def len(self):
        return len(self.processed_file_names)

    def get(self, idx):
        """Load the processed graph at positional index ``idx``."""
        return self.get_cid(self.idx_to_cid[idx])

    def get_cid(self, cid):
        """Load the processed graph of molecule ``cid``."""
        data = torch.load(osp.join(self.processed_dir, 'data_{}.pt'.format(cid)))
        return data

#To get specific lists...

class CustomGraphCollater(object):
    """Batches the graphs of a list of cids and builds one boolean mask per graph."""

    def __init__(self, dataset, mask_len, follow_batch = None, exclude_keys = None):
        """
        Args:
            dataset: object exposing ``get_cid(cid)`` that returns a graph.
            mask_len: length of every generated mask.
            follow_batch, exclude_keys: stored on the instance. They default
                to a fresh empty list per instance instead of a shared
                mutable default object.
        """
        self.follow_batch = [] if follow_batch is None else follow_batch
        self.exclude_keys = [] if exclude_keys is None else exclude_keys
        self.dataset = dataset
        self.mask_len = mask_len
        self.mask_indices = np.array(range(mask_len))

    def generate_mask(self, sz):
        """Return a bool tensor of length ``mask_len`` that is True at positions ``< sz``.

        The last position is always False, even when ``sz >= mask_len``.
        """
        mask = torch.tensor(self.mask_indices < sz, dtype = torch.bool) #pytorch transformer input version
        mask[-1] = False #set last value to 0 because pytorch can't handle all 1s
        return mask

    def get_masks(self, batch):
        """Stack one mask per graph, sized by the graph's number of node rows."""
        return torch.stack([self.generate_mask(b.x.shape[0]) for b in batch])

    def collate(self, batch):
        """Merge a list of ``Data`` objects into one ``Batch``.

        Raises:
            TypeError: if the first element is not a ``Data`` instance.
        """
        elem = batch[0]
        if isinstance(elem, Data):
            return Batch.from_data_list(batch)

        raise TypeError('DataLoader found invalid type: {}'.format(type(elem)))

    def __call__(self, cids):
        """Load the graphs of ``cids``; return ``(batched graphs, stacked masks)``."""
        graphs = [self.dataset.get_cid(int(cid)) for cid in cids]
        return self.collate(graphs), self.get_masks(graphs)



In [None]:
# Root directory of the graph datasets and the zip archive with the raw graph files.
root = 'graph-data/'
graph_data_path = "input/mol_graphs.zip"


# One graph dataset and one collater per split; all three share `root`.
mg_data_tr = MoleculeGraphDataset(root, gt.training_cids, graph_data_path, gt)
graph_batcher_tr = CustomGraphCollater(mg_data_tr, gt.mol_trunc_length)

mg_data_val = MoleculeGraphDataset(root, gt.validation_cids, graph_data_path, gt)
graph_batcher_val = CustomGraphCollater(mg_data_val, gt.mol_trunc_length)

mg_data_test = MoleculeGraphDataset(root, gt.test_cids, graph_data_path, gt)
graph_batcher_test = CustomGraphCollater(mg_data_test, gt.mol_trunc_length)


#Get ranks

In [None]:

# Directory with the precomputed embeddings of "space 1".
# NOTE(review): this path starts with "inputs/" while every other input in
# this notebook lives under "input/" - confirm the directory name.
dir1 = "inputs/MLP1/embeddings/"


# Per split: the cids plus the molecule and text embeddings.
# allow_pickle=True is needed for object arrays; only load trusted files.
cids_train1 = np.load(dir1 + "cids_train.npy", allow_pickle=True)
cids_val1 = np.load(dir1 + "cids_val.npy", allow_pickle=True)
cids_test1 = np.load(dir1 + "cids_test.npy", allow_pickle=True)
chem_embeddings_train1 = np.load(dir1 + "chem_embeddings_train.npy")
chem_embeddings_val1 = np.load(dir1 + "chem_embeddings_val.npy")
chem_embeddings_test1 = np.load(dir1 + "chem_embeddings_test.npy")
text_embeddings_train1 = np.load(dir1 + "text_embeddings_train.npy")
text_embeddings_val1 = np.load(dir1 + "text_embeddings_val.npy")
text_embeddings_test1 = np.load(dir1 + "text_embeddings_test.npy")


# Candidate pool: molecule embeddings of all splits, stacked in the order train, val, test.
all_chem_embbedings1 = np.concatenate((chem_embeddings_train1, chem_embeddings_val1, chem_embeddings_test1), axis = 0)

# cids in the same train, val, test order as the rows of the candidate pool.
cids_all = np.concatenate((cids_train1, cids_val1, cids_test1), axis = 0)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def memory_efficient_similarity_matrix_custom(func, embedding1, embedding2, chunk_size = 1000):
    """Lazily evaluate ``func`` on row blocks of ``embedding1`` against all of ``embedding2``.

    Yields ``func(block, embedding2)`` for consecutive blocks of at most
    ``chunk_size`` rows, so the full result never has to be held at once.
    """
    total_rows = embedding1.shape[0]
    n_blocks = int(np.ceil(total_rows / chunk_size))

    for block in range(n_blocks):
        start = block * chunk_size
        # The final block is shorter when chunk_size does not divide the row count.
        stop = min(start + chunk_size, total_rows)
        yield func(embedding1[start:stop, :], embedding2)

#Calculate mean rank, hits at ten

def dot_product(a, b):
  """Dot product of ``a`` with the transpose of ``b``."""
  b_transposed = b.T
  return np.dot(a, b_transposed)


def sigmoid(x):
  """Logistic function ``1 / (1 + exp(-x))``, applied element-wise."""
  return 1 / (1 + np.exp(-x))


def compose(a, b):
  """Sigmoid of the dot-product similarity between ``a`` and ``b``."""
  return sigmoid(dot_product(a, b))

# Cosine similarity of every text embedding of a split against the whole candidate pool.
# These are generators: each one can be iterated exactly once, so this cell
# has to be re-run before the ranking loops can be executed again.
text_chem_cos1 = memory_efficient_similarity_matrix_custom(cosine_similarity, text_embeddings_train1, all_chem_embbedings1)
text_chem_cos_val1 = memory_efficient_similarity_matrix_custom(cosine_similarity, text_embeddings_val1, all_chem_embbedings1)
text_chem_cos_test1 = memory_efficient_similarity_matrix_custom(cosine_similarity, text_embeddings_test1, all_chem_embbedings1)


In [None]:
# Split sizes and the total number of candidates.
n_train = len(cids_train1)
n_val = len(cids_val1)
n_test = len(cids_test1)
n = n_train + n_val + n_test

# Position of the first val / test molecule in the concatenated (train, val, test) pool.
offset_val = n_train
offset_test = n_train + n_val

In [None]:
#For space 1:

num_top = 10


def _rank_queries(sim_chunks, query_cids, offset, split_name, keep_scores=True):
    """Rank every candidate for each query row of a chunked similarity matrix.

    Args:
        sim_chunks: iterable of 2-D similarity blocks (queries x candidates),
            covering the queries of one split in order.
        query_cids: cids of the queries, aligned with the concatenated rows.
        offset: position of this split's first molecule inside ``cids_all``.
        split_name: word used in the progress message.
        keep_scores: also collect the ``num_top`` best similarity values.

    Returns:
        ``(top_cids, top_scores, ranks)``: dict cid -> list of the ``num_top``
        best candidate cids, dict cid -> their scores (empty when
        ``keep_scores`` is False) and an array with the 1-based rank of each
        query's own molecule.
    """
    top_cids = {}
    top_scores = {}
    own_ranks = []
    j = 0 #keep track of all loops
    for emb in sim_chunks:
        for k in range(emb.shape[0]):
            cid_locs = np.argsort(emb[k,:])[::-1]
            ranks = np.argsort(cid_locs) #rank is actually double argsort...

            best_locs = cid_locs[:num_top]
            top_cids[query_cids[j]] = [cids_all[loc] for loc in best_locs]
            if keep_scores:
                # Index with the already-sorted positions: yields the same
                # values as sorting the row again, but as a small copy rather
                # than a view that keeps a full sorted row alive per query.
                top_scores[query_cids[j]] = emb[k, best_locs]

            own_ranks.append(ranks[j + offset] + 1)

            j += 1
            if j % 1000 == 0: print(j, split_name, "processed.")

    return top_cids, top_scores, np.array(own_ranks)


def _report_ranks(mean_label, mrr_label, ranks):
    """Print mean rank, hits@{1,10,100,500,1000} and MRR for an array of ranks."""
    print()
    print(mean_label, np.mean(ranks))
    for cutoff in (1, 10, 100, 500, 1000):
        print("Hits at {}:".format(cutoff), np.mean(ranks <= cutoff))

    print(mrr_label, np.mean(1/ranks))


# Training queries start at position 0 of the pool; their scores are not kept.
top_cids1, _train_scores_unused, ranks1 = _rank_queries(text_chem_cos1, cids_train1, 0, "train", keep_scores=False)
_report_ranks("Training Mean rank:", "Trainng MRR:", ranks1)

top_cids_val1, scores_val1, ranks_val1 = _rank_queries(text_chem_cos_val1, cids_val1, offset_val, "val")
_report_ranks("Val Mean rank:", "Validation MRR:", ranks_val1)

top_cids_test1, scores_test1, ranks_test1 = _rank_queries(text_chem_cos_test1, cids_test1, offset_test, "test")
_report_ranks("Test Mean rank:", "Test MRR:", ranks_test1)

#Get Attention Association Rules

In [None]:
import pickle 

path = "input/"
# Attention weights, keyed by cid in the cells below.
# pickle.load can execute arbitrary code: only load files from a trusted source.
with open(path + "mha_weights.pkl", 'rb') as fp:
  mha_weights = pickle.load(fp)

In [None]:
# Vocabularies of every molecule token and text token seen for the cids in mha_weights.
all_mol_tokens = set()
all_text_tokens = set()

import zipfile
# The archive stays open on purpose: later cells read graph files from it.
archive = zipfile.ZipFile(graph_data_path, 'r')

for i, cid in enumerate(mha_weights):
  attn_weights = mha_weights[cid]
  text_input = gt.text_tokenizer(gt.descriptions[cid], truncation=True, padding = 'max_length', 
                                    max_length=gt.text_trunc_length - 1)
  # Number of non-padding positions; only those ids are converted back to tokens.
  text_length = np.sum(text_input['attention_mask'])
  text_tokens = gt.text_tokenizer.convert_ids_to_tokens(text_input['input_ids'][:text_length])
  
  gfile = archive.open(cid + '.graph').read().decode('ascii')
  mol_tokens = {}
  idx = False
  for line in gfile.split('\n'):
    line = line.strip()
    # Everything after this marker line is read as "<idx> <identifier>" pairs.
    if line == 'idx to identifier:': 
      idx = True
      continue
    if idx and len(line) != 0: 
      id, idf = line.split(" ")
      mol_tokens[id] = idf
      
  mol_tokens = list(mol_tokens.values())

  all_mol_tokens.update(mol_tokens)
  all_text_tokens.update(text_tokens)  

# Token -> integer id and the reverse lookups; ids follow set iteration order.
mol_token_ids = {}
text_token_ids = {} 

mol_token_ids_rev = {}
text_token_ids_rev = {}
for i, k in enumerate(all_mol_tokens):
  mol_token_ids[k] = i
  mol_token_ids_rev[i] = k
for i, k in enumerate(all_text_tokens):
  text_token_ids[k] = i
  text_token_ids_rev[i] = k

In [None]:

# support[t, m]: attention mass accumulated between text token t and molecule token m.
# conf[t, m]: support normalised so that every text-token row sums to 1.
support = np.zeros((len(all_text_tokens), len(all_mol_tokens)))
conf = np.zeros((len(all_text_tokens), len(all_mol_tokens)))

# Held-out cids as one set: a constant-time lookup per cid instead of scanning two lists.
held_out_cids = set(gt.validation_cids) | set(gt.test_cids)

for i, cid in enumerate(mha_weights):
  if cid in held_out_cids: continue
  attn_weights = mha_weights[cid]
  text_input = gt.text_tokenizer(gt.descriptions[cid], truncation=True, padding = 'max_length', 
                                    max_length=gt.text_trunc_length - 1)
  # Number of non-padding positions; only those ids are converted back to tokens.
  text_length = np.sum(text_input['attention_mask'])
  text_tokens = gt.text_tokenizer.convert_ids_to_tokens(text_input['input_ids'][:text_length])

  gfile = archive.open(cid + '.graph').read().decode('ascii')
  mol_tokens = {}
  in_identifier_section = False
  for line in gfile.split('\n'):
    line = line.strip()
    # Everything after this marker line is read as "<idx> <identifier>" pairs.
    if line == 'idx to identifier:':
      in_identifier_section = True
      continue
    if in_identifier_section and len(line) != 0:
      node_idx, identifier = line.split(" ")
      mol_tokens[node_idx] = identifier
  # Keep at most mol_trunc_length molecule tokens.
  mol_tokens = list(mol_tokens.values())[:gt.mol_trunc_length]

  # Resolve the column ids once per molecule instead of once per (text, mol) pair.
  mol_cols = [mol_token_ids[molt] for molt in mol_tokens]

  for j, text in enumerate(text_tokens):
    row = text_token_ids[text]
    for k, col in enumerate(mol_cols):
      support[row, col] += attn_weights[j,k] #* mol_length # mol_length to normalize


  if (i+1) % 1000 == 0: print(i+1)

print("Support calculation finished.")

for j, text in enumerate(all_text_tokens):
  row = text_token_ids[text]
  row_total = np.sum(support[row, :])
  if row_total == 0:
    # Text token without any support: leave its confidences at zero.
    conf[row, :] = 0.0
  else:
    conf[row, :] = support[row, :] / row_total

  if (j+1) % 1000 == 0: print(j+1)

print("Confidence calculation finished.")

#FPGrowth Pattern Mining

In [None]:
from collections import defaultdict

# NOTE(review): this cell rebinds all_mol_tokens / all_text_tokens and the
# *_token_ids maps that the attention `conf` matrix above was indexed with.
# Cells that run after it and look up `conf` through these maps will use the
# new ids - confirm this is intended.
all_mol_tokens = set()
all_text_tokens = set()

for i, cid in enumerate(gt.training_cids):
  # GenerateData stores every split's descriptions in `gt.descriptions`;
  # it defines no `descriptions_train` attribute.
  text_input = gt.text_tokenizer(gt.descriptions[cid], truncation=True, padding = 'max_length', 
                                    max_length=gt.text_trunc_length - 1)
  # Number of non-padding positions; only those ids are converted back to tokens.
  text_length = np.sum(text_input['attention_mask'])
  text_tokens = gt.text_tokenizer.convert_ids_to_tokens(text_input['input_ids'][:text_length])

  # Molecule "sentence" tokens, prefixed with a [CLS] marker.
  mol_tokens = ['[CLS]']
  mol_tokens.extend(gt.molecule_sentences[cid].split())

  all_mol_tokens.update(mol_tokens)
  all_text_tokens.update(text_tokens)

# Unknown tokens map to -1 (a lookup of a missing key also inserts it).
mol_token_ids = defaultdict(lambda : -1)
text_token_ids = defaultdict(lambda : -1)

mol_token_ids_rev = {}
text_token_ids_rev = {}
for i, k in enumerate(all_mol_tokens):
  mol_token_ids[k] = i
  mol_token_ids_rev[i] = k
for i, k in enumerate(all_text_tokens):
  text_token_ids[k] = i
  text_token_ids_rev[i] = k

In [None]:
#Create database

# One transaction per training molecule: its text tokens followed by its molecule tokens.
database = []

validation_cid_set = set(gt.validation_cids)  # constant-time membership test

for cid in gt.training_cids:
  if cid in validation_cid_set: continue
  # GenerateData stores every split's descriptions in `gt.descriptions`;
  # it defines no `descriptions_train` attribute.
  text_input = gt.text_tokenizer(gt.descriptions[cid], truncation=True, padding = 'max_length', 
                                    max_length=gt.text_trunc_length - 1)
  text_length = np.sum(text_input['attention_mask'])
  text_tokens = gt.text_tokenizer.convert_ids_to_tokens(text_input['input_ids'][:text_length])
  text_tokens = text_tokens[1:-1] #skip [CLS], [SEP]

  mol_tokens = gt.molecule_sentences[cid].split()

  all_tokens = []
  all_tokens.extend(text_tokens)
  all_tokens.extend(mol_tokens)

  database.append(all_tokens)


In [None]:
# Size of the transaction database and the average number of items per transaction.
print("Transactions:", len(database))
print("Item/Transaction", np.mean([len(a) for a in database]))

In [None]:
!pip install mlxtend==0.17.0

from mlxtend.frequent_patterns import fpgrowth

In [None]:
import sys
# Raise Python's recursion limit to 1500 frames.
# NOTE(review): presumably needed for the FP-growth run below - confirm.
sys.setrecursionlimit(1500)


In [None]:
from mlxtend.preprocessing import TransactionEncoder
import pandas as pd

# Encode the transactions as a table with one column per distinct item.
te = TransactionEncoder()
te_ary = te.fit(database).transform(database)
df = pd.DataFrame(te_ary, columns=te.columns_)


In [None]:
# Frequent itemsets with a minimum support of 0.3, reported by column name.
fp = fpgrowth(df, min_support=0.3, use_colnames=True)

In [None]:
from mlxtend.frequent_patterns import association_rules

# Association rules derived from the frequent itemsets, keeping those with confidence >= 0.99.
rules = association_rules(fp, metric="confidence", min_threshold=0.99)

In [None]:
# Show at most four columns so the printed rule table is not wrapped.
pd.set_option('display.max_columns', 4)

print(rules[['antecedents', 'consequents', 'support', 'confidence']])

#Rerank - Attention

In [None]:
from itertools import combinations, chain


def all_subsets(ss):#skip empty set
    """Iterate over all non-empty combinations of ``ss``, shortest first."""
    sizes = range(1, len(ss) + 1)
    return chain.from_iterable(combinations(ss, size) for size in sizes)


def generate_rules(text_tokens, mol_tokens):
  """Build every 1 -> 1 rule between a text token and a molecule token.

  Returns a list of ``(frozenset({text_id}), frozenset({mol_id}))`` pairs,
  text-major, with ids taken from ``text_token_ids`` / ``mol_token_ids``.
  """
  # All text ids are resolved before any molecule id.
  antecedents = [frozenset((text_token_ids[token],)) for token in text_tokens]
  consequents = [frozenset((mol_token_ids[token],)) for token in mol_tokens]

  return [(ante, cons) for ante in antecedents for cons in consequents]


def ar_score(text_cid, mol_cid, top_num=10):
  """Attention association-rule score between a description and a molecule.

  Looks up ``conf`` for every (text token, molecule token) pair of the
  description of ``text_cid`` and the graph of ``mol_cid``, and averages the
  ``top_num`` largest confidences.

  Args:
    text_cid: cid whose description supplies the text tokens.
    mol_cid: cid whose ``.graph`` file supplies the molecule tokens.
    top_num: number of highest confidences that are averaged.

  Returns:
    Mean of the top confidences, or 0.0 when either side has no tokens.
  """
  text_input = gt.text_tokenizer(gt.descriptions[text_cid], truncation=True, padding = 'max_length', 
                                    max_length=gt.text_trunc_length - 1)
  # Number of non-padding positions; only those ids are converted back to tokens.
  text_length = np.sum(text_input['attention_mask'])
  text_tokens = gt.text_tokenizer.convert_ids_to_tokens(text_input['input_ids'][:text_length])

  # Close the archive member handle as soon as the file has been read.
  with archive.open(mol_cid + '.graph') as graph_file:
    gfile = graph_file.read().decode('ascii')

  mol_tokens = {}
  in_identifier_section = False
  for line in gfile.split('\n'):
    line = line.strip()
    # Everything after this marker line is read as "<idx> <identifier>" pairs.
    if line == 'idx to identifier:':
      in_identifier_section = True
      continue
    if in_identifier_section and len(line) != 0:
      node_idx, identifier = line.split(" ")
      mol_tokens[node_idx] = identifier
  mol_tokens = list(mol_tokens.values())

  text_ids = [text_token_ids[token] for token in text_tokens]
  mol_ids = [mol_token_ids[token] for token in mol_tokens]

  # No pairs to score: np.partition would fail on an empty array.
  if len(text_ids) == 0 or len(mol_ids) == 0:
    return 0.0

  # One fancy-indexing lookup gives the confidence of every pair in
  # text-major order, the same order as the 1 -> 1 rule list.
  tmp = conf[np.ix_(text_ids, mol_ids)].ravel()

  mx = np.min((top_num, len(tmp)))
  top_confs = -np.partition(-tmp, mx-1)[:mx]


  return np.mean(top_confs)


In [None]:

import operator
from collections import defaultdict

# Weight of the embedding similarity in the combined score; 0.0 means the
# candidates are re-ranked by the association-rule score alone.
alpha = 0.0


# ar_scores[i, j]: association-rule score of validation query i against its j-th candidate.
ar_scores = np.zeros((len(top_cids_val1), num_top))

new_ranks_val = []
for i, cid in enumerate(top_cids_val1):

  # ar_score tokenizes the description and reads the candidate's graph file
  # itself, so nothing has to be prepared here.
  score = np.zeros((num_top))
  for j, cid2 in enumerate(top_cids_val1[cid]):
    tmp = ar_score(cid, cid2)
    ar_scores[i,j] = tmp
    score[j] = alpha * scores_val1[cid][j] + (1 - alpha) * tmp

  try:
    # Position of the true molecule among the candidates (ValueError if absent).
    old_loc = top_cids_val1[cid].index(cid)

    # Candidate positions ordered by descending combined score; ties keep their order.
    order = np.argsort(-score, kind='stable')

    new_rank = np.where(order == old_loc)[0][0] + 1

  except ValueError:
    # True molecule is not among the candidates: keep its original rank.
    new_rank = ranks_val1[i]

  new_ranks_val.append(new_rank)


  if (i+1) % 200 == 0: print(i+1)

new_ranks_val = np.array(new_ranks_val)

print()
print("Val Mean rank:", np.mean(new_ranks_val))
print("Hits at 1:", np.mean(new_ranks_val <= 1))
print("Hits at 10:", np.mean(new_ranks_val <= 10))
print("Hits at 100:", np.mean(new_ranks_val <= 100))

print("Validation MRR:", np.mean(1/np.array(new_ranks_val)))

In [None]:

# Sweep the mixing weight alpha over [0, 1] on the validation split, reusing
# the cached association-rule scores.
x = np.linspace(0.0,1,101)
MRRs = []
hits1 = []
hits10 = []


# Iterate with `alpha` directly so the global `n` (total candidate count) is
# not overwritten by the loop variable.
for alpha in x:

  hits_at_one = 0
  hits_at_ten = 0
  hits_at_100 = 0

  tmp_ranks = []
  for i, cid in enumerate(top_cids_val1):

    # Combined score of all candidates at once; the explicit float64 cast
    # keeps the arithmetic identical to combining the entries one by one.
    score = alpha * np.asarray(scores_val1[cid], dtype=np.float64) + (1 - alpha) * ar_scores[i, :]

    try:
      # Position of the true molecule among the candidates (ValueError if absent).
      old_loc = top_cids_val1[cid].index(cid)

      # Candidate positions ordered by descending combined score; ties keep their order.
      order = np.argsort(-score, kind='stable')

      new_rank = np.where(order == old_loc)[0][0] + 1

    except ValueError:
      # True molecule is not among the candidates: keep its original rank.
      new_rank = ranks_val1[i]

    tmp_ranks.append(new_rank)

    if new_rank <= 1:
        hits_at_one += 1
    if new_rank <= 10:
        hits_at_ten += 1
    if new_rank <= 100:
        hits_at_100 += 1

  MRRs.append(np.mean(1/np.array(tmp_ranks)))
  hits1.append(hits_at_one/cids_val1.size)
  hits10.append(hits_at_ten/cids_val1.size)


# The figures below belong to the last alpha of the sweep.
print("Val Mean rank:", np.mean(tmp_ranks))
print("Hits at 1:", hits_at_one/cids_val1.size)
print("Hits at 10:", hits_at_ten/cids_val1.size)
print("Hits at 100:", hits_at_100/cids_val1.size)

print("Validation MRR:", np.mean(1/np.array(tmp_ranks)))

In [None]:

import operator
from collections import defaultdict

# Mixing weight with the best validation MRR from the sweep.
alpha = x[np.argmax(MRRs)]

# ar_scores_test[i, j]: association-rule score of test query i against its j-th candidate.
ar_scores_test = np.zeros((len(top_cids_test1), num_top))

new_ranks_test = []
for i, cid in enumerate(top_cids_test1):

  # ar_score tokenizes the description and reads the candidate's graph file
  # itself, so nothing has to be prepared here.
  score = np.zeros((num_top))
  for j, cid2 in enumerate(top_cids_test1[cid]):
    tmp = ar_score(cid, cid2)
    ar_scores_test[i,j] = tmp
    score[j] = alpha * scores_test1[cid][j] + (1 - alpha) * tmp

  try:
    # Position of the true molecule among the candidates (ValueError if absent).
    old_loc = top_cids_test1[cid].index(cid)

    # Candidate positions ordered by descending combined score; ties keep their order.
    order = np.argsort(-score, kind='stable')

    new_rank = np.where(order == old_loc)[0][0] + 1

  except ValueError:
    # True molecule is not among the candidates: keep its original rank.
    new_rank = ranks_test1[i]

  new_ranks_test.append(new_rank)


  if (i+1) % 200 == 0: print(i+1)

new_ranks_test = np.array(new_ranks_test)

print()
print("Test Mean rank:", np.mean(new_ranks_test))
print("Hits at 1:", np.mean(new_ranks_test <= 1))
print("Hits at 10:", np.mean(new_ranks_test <= 10))
print("Hits at 100:", np.mean(new_ranks_test <= 100))

print("Test MRR:", np.mean(1/np.array(new_ranks_test)))

#1->1 normal pattern mining and rerank

In [None]:
#based off FP-growth for 1->1
# normal_support[t, m]: number of (text token, molecule token) co-occurrences.
# normal_conf[t, m]: normal_support normalised so that every text-token row sums to 1.
normal_support = np.zeros((len(all_text_tokens), len(all_mol_tokens)))
normal_conf = np.zeros((len(all_text_tokens), len(all_mol_tokens)))

#Create database

# Held-out cids as one set: a constant-time lookup per cid instead of scanning two lists.
held_out_cids = set(gt.validation_cids) | set(gt.test_cids)

for i, cid in enumerate(gt.training_cids):

  if cid in held_out_cids: continue
  text_input = gt.text_tokenizer(gt.descriptions[cid], truncation=True, padding = 'max_length', 
                                    max_length=gt.text_trunc_length - 1)
  # Number of non-padding positions; only those ids are converted back to tokens.
  text_length = np.sum(text_input['attention_mask'])
  text_tokens = gt.text_tokenizer.convert_ids_to_tokens(text_input['input_ids'][:text_length])

  gfile = archive.open(cid + '.graph').read().decode('ascii')
  mol_tokens = {}
  in_identifier_section = False
  for line in gfile.split('\n'):
    line = line.strip()
    # Everything after this marker line is read as "<idx> <identifier>" pairs.
    if line == 'idx to identifier:':
      in_identifier_section = True
      continue
    if in_identifier_section and len(line) != 0:
      node_idx, identifier = line.split(" ")
      mol_tokens[node_idx] = identifier

  # Keep at most mol_trunc_length molecule tokens.
  mol_tokens = list(mol_tokens.values())[:gt.mol_trunc_length]

  # Resolve the column ids once per molecule instead of once per (text, mol) pair.
  mol_cols = [mol_token_ids[molt] for molt in mol_tokens]

  for j, text in enumerate(text_tokens):
    row = text_token_ids[text]
    for col in mol_cols:
      normal_support[row, col] += 1


  if (i+1) % 1000 == 0: print(i+1)

print("Support calculation finished.")

for j, text in enumerate(all_text_tokens):
  row = text_token_ids[text]
  row_total = np.sum(normal_support[row, :])
  if row_total == 0:
    # Text token without any support: leave its confidences at zero instead
    # of dividing 0 by 0 (same guard as the attention-based confidence).
    normal_conf[row, :] = 0.0
  else:
    normal_conf[row, :] = normal_support[row, :] / row_total

  if (j+1) % 1000 == 0: print(j+1)

print("Confidence calculation finished.")

In [None]:
from itertools import combinations, chain


def all_subsets(ss):#skip empty set
    """Iterate over all non-empty combinations of ``ss``, shortest first."""
    per_size = (combinations(ss, size) for size in range(1, len(ss) + 1))
    return chain.from_iterable(per_size)


def generate_rules(text_tokens, mol_tokens):
  """Pair every text token with every molecule token as a 1 -> 1 rule.

  Tokens are mapped to integer ids through the module-level `text_token_ids`
  and `mol_token_ids` lookups (a missing token raises KeyError).

  Returns a list of `(frozenset({text_id}), frozenset({mol_id}))` tuples,
  ordered text-token-major; repeated tokens produce repeated rules.
  """
  antecedents = [frozenset((text_token_ids[token],)) for token in text_tokens]
  consequents = [frozenset((mol_token_ids[token],)) for token in mol_tokens]

  return [(lhs, rhs) for lhs in antecedents for rhs in consequents]


def ar_score(text_cid, mol_cid, top_num=10):
  """Score a (description, molecule) pair with the 1 -> 1 rule confidences.

  The description of `text_cid` is tokenized, the molecule tokens of `mol_cid`
  are read from `<mol_cid>.graph` in `archive`, and the confidence
  `normal_conf[text_id, mol_id]` is looked up for every (text token,
  molecule token) pair, repeated tokens included.

  Args:
    text_cid: key into `gt.descriptions` for the text side.
    mol_cid: cid whose graph file provides the molecule tokens.
    top_num: number of highest confidences to average.

  Returns:
    The mean of the `top_num` largest confidences (of all of them when there
    are fewer pairs), or 0.0 when there are no pairs to score.

  Raises:
    KeyError: if a token is missing from `text_token_ids` / `mol_token_ids`.
  """

  text_input = gt.text_tokenizer(gt.descriptions[text_cid], truncation=True, padding = 'max_length',
                                    max_length=gt.text_trunc_length - 1)
  # Only positions with attention_mask == 1 are kept; the rest is padding.
  text_length = np.sum(text_input['attention_mask'])
  text_tokens = gt.text_tokenizer.convert_ids_to_tokens(text_input['input_ids'][:text_length])

  # Close the archive member as soon as its content has been read.
  with archive.open(mol_cid + '.graph') as graph_file:
    gfile = graph_file.read().decode('ascii')

  # Every non-empty line after the 'idx to identifier:' header is "<idx> <identifier>".
  mol_tokens = {}
  in_idx_section = False
  for line in gfile.split('\n'):
    line = line.strip()
    if line == 'idx to identifier:':
      in_idx_section = True
      continue
    if in_idx_section and len(line) != 0:
      node_idx, identifier = line.split(" ")
      mol_tokens[node_idx] = identifier
  # NOTE(review): the support-counting cell truncates molecule tokens to
  # gt.mol_trunc_length; no truncation is applied here — confirm that is intended.
  mol_tokens = list(mol_tokens.values())

  # One fancy-indexing lookup replaces the per-pair Python loop. np.ix_ builds
  # the full text x molecule grid and ravel() flattens it text-token-major,
  # the same order in which generate_rules enumerates its pairs.
  text_rows = np.array([text_token_ids[token] for token in text_tokens], dtype=np.intp)
  mol_cols = np.array([mol_token_ids[token] for token in mol_tokens], dtype=np.intp)
  confidences = normal_conf[np.ix_(text_rows, mol_cols)].ravel()

  # Without any pair np.partition would be asked for element -1 of an empty
  # array and raise; no evidence is scored as 0.
  if confidences.size == 0:
    return 0.0

  mx = min(top_num, confidences.size)
  # Partitioning the negated values moves the mx largest confidences to the front.
  top_confs = -np.partition(-confidences, mx-1)[:mx]

  return np.mean(top_confs)


In [None]:

import operator
from collections import defaultdict

# Re-rank the validation candidates.
# alpha weights the stored retrieval score (scores_val1) and (1 - alpha) the
# association-rule score; with alpha = 0.0 only ar_score decides the order.
alpha = 0.0


hits_at_one = 0
hits_at_ten = 0
hits_at_100 = 0

# ar_score results are stored per (query, candidate) so the alpha sweep in a
# later cell can reuse them instead of calling ar_score again.
ar_scores = np.zeros((len(top_cids_val1), num_top))

new_ranks_val = []
for i, cid in enumerate(top_cids_val1):

  score = np.zeros((num_top))
  for j, cid2 in enumerate(top_cids_val1[cid]):
    tmp = ar_score(cid, cid2)
    ar_scores[i,j] = tmp
    score[j] = alpha * scores_val1[cid][j] + (1 - alpha) * tmp

  # Only the list lookup can raise ValueError, so only it is guarded.
  try:
    old_loc = top_cids_val1[cid].index(cid)
  except ValueError:
    # cid is not among its own candidates, so re-ranking cannot place it:
    # fall back to the stored rank.
    new_rank = ranks_val1[i]
  else:
    # `order` (not `sorted`) so the builtin sorted() stays usable in the kernel.
    order = np.argsort(-score, kind='stable')
    # 1-based position of the true candidate in the descending-score order.
    new_rank = np.where(order == old_loc)[0][0] + 1

  new_ranks_val.append(new_rank)

  if new_rank <= 1:
    hits_at_one += 1
  if new_rank <= 10:
    hits_at_ten += 1
  if new_rank <= 100:
    hits_at_100 += 1

  if (i+1) % 200 == 0: print(i+1)

print()
print("Val Mean rank:", np.mean(new_ranks_val))
print("Hits at 1:", hits_at_one/cids_val1.size)
print("Hits at 10:", hits_at_ten/cids_val1.size)
print("Hits at 100:", hits_at_100/cids_val1.size)

print("Validation MRR:", np.mean(1/np.array(new_ranks_val)))

In [None]:

# Sweep alpha over 101 evenly spaced values in [0, 1] and record the validation
# metrics for each one. ar_scores (filled by the validation re-rank cell) is
# reused, so ar_score itself is not called again here.
x = np.linspace(0.0,1,101)
MRRs = []
hits1 = []
hits10 = []


for n in x:
  alpha = n

  hits_at_one = 0
  hits_at_ten = 0
  hits_at_100 = 0

  tmp_ranks = []
  for i, cid in enumerate(top_cids_val1):

    score = np.zeros((num_top))
    for j, cid2 in enumerate(top_cids_val1[cid]):
      score[j] = alpha * scores_val1[cid][j] + (1 - alpha) * ar_scores[i,j]

    # Only the list lookup can raise ValueError, so only it is guarded.
    try:
      old_loc = top_cids_val1[cid].index(cid)
    except ValueError:
      # cid is not among its own candidates: fall back to the stored rank.
      new_rank = ranks_val1[i]
    else:
      # `order` (not `sorted`) so the builtin sorted() stays usable in the kernel.
      order = np.argsort(-score, kind='stable')
      # 1-based position of the true candidate in the descending-score order.
      new_rank = np.where(order == old_loc)[0][0] + 1

    tmp_ranks.append(new_rank)

    if new_rank <= 1:
      hits_at_one += 1
    if new_rank <= 10:
      hits_at_ten += 1
    if new_rank <= 100:
      hits_at_100 += 1

  # One entry per alpha, aligned with x.
  MRRs.append(np.mean(1/np.array(tmp_ranks)))
  hits1.append(hits_at_one/cids_val1.size)
  hits10.append(hits_at_ten/cids_val1.size)

# These prints run after the loop, so they describe only the last alpha
# tried (x[-1] == 1.0), not the best one.
print("Val Mean rank:", np.mean(tmp_ranks))
print("Hits at 1:", hits_at_one/cids_val1.size)
print("Hits at 10:", hits_at_ten/cids_val1.size)
print("Hits at 100:", hits_at_100/cids_val1.size)

print("Validation MRR:", np.mean(1/np.array(tmp_ranks)))

In [None]:

import operator
from collections import defaultdict

# Re-rank the test candidates.
# alpha weights the stored retrieval score (scores_test1) and (1 - alpha) the
# association-rule score; with alpha = 0.0 only ar_score decides the order.
alpha = 0.0


hits_at_one = 0
hits_at_ten = 0
hits_at_100 = 0

# ar_score results are stored per (query, candidate) so the tuned-alpha cell
# below can reuse them instead of calling ar_score again.
ar_scores_test = np.zeros((len(top_cids_test1), num_top))

new_ranks_test = []
for i, cid in enumerate(top_cids_test1):

  score = np.zeros((num_top))
  for j, cid2 in enumerate(top_cids_test1[cid]):

    tmp = ar_score(cid, cid2)
    ar_scores_test[i,j] = tmp
    score[j] = alpha * scores_test1[cid][j] + (1 - alpha) * tmp

  # Only the list lookup can raise ValueError, so only it is guarded.
  try:
    old_loc = top_cids_test1[cid].index(cid)
  except ValueError:
    # cid is not among its own candidates, so re-ranking cannot place it:
    # fall back to the stored rank.
    new_rank = ranks_test1[i]
  else:
    # `order` (not `sorted`) so the builtin sorted() stays usable in the kernel.
    order = np.argsort(-score, kind='stable')
    # 1-based position of the true candidate in the descending-score order.
    new_rank = np.where(order == old_loc)[0][0] + 1

  new_ranks_test.append(new_rank)

  if new_rank <= 1:
    hits_at_one += 1
  if new_rank <= 10:
    hits_at_ten += 1
  if new_rank <= 100:
    hits_at_100 += 1

  if (i+1) % 200 == 0: print(i+1)

print()
print("Test Mean rank:", np.mean(new_ranks_test))
print("Hits at 1:", hits_at_one/cids_test1.size)
print("Hits at 10:", hits_at_ten/cids_test1.size)
print("Hits at 100:", hits_at_100/cids_test1.size)

print("Test MRR:", np.mean(1/np.array(new_ranks_test)))

In [None]:

import operator
from collections import defaultdict

# Choose alpha from the validation sweep: the midpoint between the first and the
# last alpha in x that reach the maximum validation MRR.
first = np.argmax(MRRs)
# np.argmax on the reversed list gives the distance of the last maximum from
# the end, so its index is len - 1 - distance. Without the "- 1" the index
# points one past the last maximum and overruns x when the maximum is the
# final entry.
last = len(MRRs) - 1 - np.argmax(MRRs[::-1])

alpha = (x[first] + x[last])/2
print(alpha)

hits_at_one = 0
hits_at_ten = 0
hits_at_100 = 0

new_ranks_test = []
for i, cid in enumerate(top_cids_test1):

  score = np.zeros((num_top))
  for j, cid2 in enumerate(top_cids_test1[cid]):

    # ar_scores_test was filled by the alpha = 0 test cell; it is reused here.
    score[j] = alpha * scores_test1[cid][j] + (1 - alpha) * ar_scores_test[i,j]

  # Only the list lookup can raise ValueError, so only it is guarded.
  try:
    old_loc = top_cids_test1[cid].index(cid)
  except ValueError:
    # cid is not among its own candidates, so re-ranking cannot place it:
    # fall back to the stored rank.
    new_rank = ranks_test1[i]
  else:
    # `order` (not `sorted`) so the builtin sorted() stays usable in the kernel.
    order = np.argsort(-score, kind='stable')
    # 1-based position of the true candidate in the descending-score order.
    new_rank = np.where(order == old_loc)[0][0] + 1

  new_ranks_test.append(new_rank)

  if new_rank <= 1:
    hits_at_one += 1
  if new_rank <= 10:
    hits_at_ten += 1
  if new_rank <= 100:
    hits_at_100 += 1

  if (i+1) % 200 == 0: print(i+1)

print()
print("Test Mean rank:", np.mean(new_ranks_test))
print("Hits at 1:", hits_at_one/cids_test1.size)
print("Hits at 10:", hits_at_ten/cids_test1.size)
print("Hits at 100:", hits_at_100/cids_test1.size)

print("Test MRR:", np.mean(1/np.array(new_ranks_test)))