In [1]:
from dataloader import GraphTextDataset, GraphDataset, TextDataset, AddRWStructEncoding
from torch_geometric.loader import DataLoader
from torch.utils.data import DataLoader as TorchDataLoader
from Model import Model, W2VEncoder
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModel
from sentence_transformers import SentenceTransformer
from nltk import word_tokenize
import gensim.downloader as api
import torch
from torch import optim
from torch import nn
import torch.nn.functional as F
import time
import os
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from torch_geometric.data import Dataset 
from torch_geometric.data import Data
import os.path as osp

class GraphTextDataset(Dataset):
    def __init__(self, root, gt, split, tokenizer=None, nltk_tokenizer=None, vocab=None, graph_transform=None, transform=None, pre_transform=None):
        self.root = root
        self.gt = gt
        self.split = split
        self.tokenizer = tokenizer
        self.nltk_tokenizer = nltk_tokenizer
        self.description = pd.read_csv(os.path.join(self.root, split+'.tsv'), sep='\t', header=None)   
        self.description = self.description.set_index(0).to_dict()
        self.cids = list(self.description[1].keys())
        
        self.graph_transform = graph_transform
        
        self.idx_to_cid = {}
        i = 0
        for cid in self.cids:
            self.idx_to_cid[i] = cid
            i += 1

        if nltk_tokenizer:
            if vocab:
                self.word2idx, self.idx2word = vocab
            else:
                self.word2idx, self.idx2word = self.build_vocab()
    
        super(GraphTextDataset, self).__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        return [str(cid) + ".graph" for cid in self.cids]

    @property
    def processed_file_names(self):
        return ['data_{}.pt'.format(cid) for cid in self.cids]
    
    @property
    def raw_dir(self) -> str:
        return osp.join(self.root, 'raw')

    @property
    def processed_dir(self) -> str:
        return osp.join(self.root, 'processed/', self.split)

    def download(self):
        pass
        
    def process_graph(self, raw_path):
      edge_index  = []
      x = []
      with open(raw_path, 'r') as f:
        next(f)
        for line in f: 
          if line != "\n":
            edge = *map(int, line.split()), 
            edge_index.append(edge)
          else:
            break
        next(f)
        for line in f: #get mol2vec features:
          substruct_id = line.strip().split()[-1]
          if substruct_id in self.gt.keys():
            x.append(self.gt[substruct_id])
          else:
            x.append(self.gt['UNK'])
        return torch.LongTensor(edge_index).T, torch.FloatTensor(x)

    def process(self):
        i = 0        
        for raw_path in self.raw_paths:
            cid = int(raw_path.split('/')[-1][:-6])
            if self.tokenizer:
                text_input = self.tokenizer([self.description[1][cid]],
                                    return_tensors="pt", 
                                    truncation=True, 
                                    max_length=256,
                                    padding="max_length",
                                    add_special_tokens=True,)
                edge_index, x = self.process_graph(raw_path)
                data = Data(x=x, edge_index=edge_index, input_ids=text_input['input_ids'], attention_mask=text_input['attention_mask'])
                
            elif self.nltk_tokenizer:
                tokenized_text = word_tokenize(self.description[1][cid])
                indexed_text = [self.word2idx.get(w, self.word2idx['UNK']) for w in tokenized_text]
                input_ids = [torch.LongTensor(text) for text in indexed_text]
                data = Data(x=x, edge_index=edge_index, input_ids=input_ids)
               
            else:
               edge_index, x = self.process_graph(raw_path)
               data = Data(x=x, edge_index=edge_index, text=self.description[1][cid])
            if self.graph_transform is not None:
               data = self.graph_transform(data)

            torch.save(data, osp.join(self.processed_dir, 'data_{}.pt'.format(cid)))
            i += 1

    def len(self):
        return len(self.processed_file_names)

    def get(self, idx):
        data = torch.load(osp.join(self.processed_dir, 'data_{}.pt'.format(self.idx_to_cid[idx])))
        return data

    def get_cid(self, cid):
        data = torch.load(osp.join(self.processed_dir, 'data_{}.pt'.format(cid)))
        return data
    
    def build_vocab(self, count_threshold=1):
        corpus = list(self.description[1].values())
        word_index = {}
        idx_word = {}
        word_counts = {}
        newIndex = 1
            
        for text in corpus:
            tokens = word_tokenize(text)
            for token in tokens:
                if not token in word_counts.keys():
                    word_counts[token] = 1
                else:
                    word_counts[token] += 1
                        
                if word_counts[token] >= count_threshold and not token in word_index.keys():
                    word_index[token] = newIndex
                    idx_word[newIndex] = token
                    newIndex += 1
            
        word_index['UNK'] = newIndex
        idx_word[newIndex] = 'UNK'
            
        return word_index, idx_word
    
    def get_vocab(self):
        return self.word2idx, self.idx2word
    
gt = np.load("./data/token_embedding_dict.npy", allow_pickle=True)[()]
train_dataset = GraphTextDataset(root='./data/', gt=gt, split='train', nltk_tokenizer=word_tokenize, graph_transform=None)

Processing...


UnboundLocalError: local variable 'x' referenced before assignment

# Train W2V model on given corpus

In [5]:
train_sentences = pd.read_csv(os.path.join('./data/', 'train'+'.tsv'), sep='\t', header=None)
train_sentences = train_sentences.set_index(0).to_dict()
corpus = list(train_sentences[1].values())

In [40]:
from gensim import utils
from gensim.models import Word2Vec

class MyCorpus():
    def __iter__(self):
        c = corpus
        c.append('UNK')
        for sent in corpus:
            yield word_tokenize(sent)

mycorpus = MyCorpus()
w2v = Word2Vec(sentences=mycorpus, min_count=1, vector_size=300, epochs=20)

In [43]:
w2v.wv.save_word2vec_format('w2v_model.txt')

In [44]:
corpus[0]

'UDP-alpha-D-galactofuranose(2-) is a UDP-D-galactofuranose(2-) in which the anomeric centre of the galactofuranose moiety has alpha-configuration. It is a conjugate base of an UDP-alpha-D-galactofuranose.'

In [57]:
tokenized_text = word_tokenize(corpus[0])
indexed_text = [w2v.wv.key_to_index.get(w, w2v.wv.key_to_index['UNK'])+1 for w in tokenized_text[:256]]
input_ids = torch.zeros(256, dtype=torch.long)
input_ids[:len(indexed_text)] = torch.LongTensor(indexed_text)

In [58]:
input_ids

tensor([17560,     6,    45,     7,     3,     1, 25679,     6,    45,     7,
           26,    30,    11,   243,   389,     5,    11, 25667,   169,    13,
          960,     2,     4,     3,     1,    22,    28,     5,    10, 17560,
            2,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [19]:
model_w2v = w2v.wv
word2vec_embeddings = np.zeros((len(model_w2v.vectors)+1, model_w2v.vectors.shape[1]), dtype=np.float32)
word2vec_embeddings[1:] = model_w2v.vectors
word2vec_embeddings.shape

(52645, 300)

In [24]:
np.save('w2v_embeddings', word2vec_embeddings)

# Load pre-trained W2V model

In [18]:
import gensim

txt_file = 'patent_w2v.txt'

chem_patent_w2v = gensim.models.KeyedVectors.load_word2vec_format(txt_file)

In [28]:
chem_patent_w2v.vectors.shape, chem_patent_w2v.key_to_index['UNK']

((1252586, 200), 116010)

In [30]:
training_word2idx, training_idx2word = train_dataset.get_vocab()
len(training_word2idx)

52644

In [32]:
def get_glove_adapted_embeddings(glove_model, input_voc):
    keys = {i: glove_model.key_to_index.get(w, None) for w, i in input_voc.items()}
    index_dict = {i: key for i, key in keys.items() if key is not None}
    embeddings = np.zeros((len(input_voc)+1,glove_model.vectors.shape[1]))
    for i, ind in index_dict.items():
        embeddings[i] = glove_model.vectors[ind]
    return embeddings

chem_patent_embeddings = get_glove_adapted_embeddings(chem_patent_w2v, training_word2idx)

In [33]:
unkown_count = 0
for w in training_word2idx.keys():
    if chem_patent_w2v.key_to_index.get(w, None) is None:
        unkown_count += 1
unkown_count

31398

# Test text encoder

In [2]:
with open('config.json') as f:
    config = json.load(f)

with open('graph_config.json') as f:
    graph_config = json.load(f)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = config['model_name']
nout = config['nout']
nhid = config['nhid']
nb_epochs = config['nb_epochs']
batch_size_train = config['batch_size_train']
batch_size_test = config['batch_size_test']
learning_rate = config['learning_rate']
load_graph_pretrained = config['load_graph_pretrained']

walk_length = graph_config['walk_length']

In [3]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
#tokenizer = None
#nltk_tokenizer=word_tokenize
nltk_tokenizer=None
gt = np.load("./data/token_embedding_dict.npy", allow_pickle=True)[()]

val_dataset = GraphTextDataset(root='./data/', gt=gt, split='val', tokenizer=tokenizer, 
                               nltk_tokenizer=nltk_tokenizer, graph_transform=AddRWStructEncoding(walk_length))
train_dataset = GraphTextDataset(root='./data/', gt=gt, split='train', tokenizer=tokenizer, 
                                 nltk_tokenizer=nltk_tokenizer, graph_transform=AddRWStructEncoding(walk_length))

val_loader = DataLoader(val_dataset, batch_size=batch_size_test, shuffle=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size_train, shuffle=True)

Downloading tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 722kB/s]
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 962kB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 2.64MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 804kB/s]
Processing...
  return torch.LongTensor(edge_index).T, torch.FloatTensor(x)
  return torch.LongTensor(edge_index).T, torch.FloatTensor(x)
Done!
Processing...
Done!


In [6]:
corpus[0]

'UDP-alpha-D-galactofuranose(2-) is a UDP-D-galactofuranose(2-) in which the anomeric centre of the galactofuranose moiety has alpha-configuration. It is a conjugate base of an UDP-alpha-D-galactofuranose.'

In [11]:
tokenizer.encode(corpus[0])

[101,
 20904,
 2361,
 1011,
 6541,
 1011,
 1040,
 1011,
 16122,
 6593,
 11253,
 4648,
 15460,
 2063,
 1006,
 1016,
 1011,
 1007,
 2003,
 1037,
 20904,
 2361,
 1011,
 1040,
 1011,
 16122,
 6593,
 11253,
 4648,
 15460,
 2063,
 1006,
 1016,
 1011,
 1007,
 1999,
 2029,
 1996,
 2019,
 8462,
 7277,
 2803,
 1997,
 1996,
 16122,
 6593,
 11253,
 4648,
 15460,
 2063,
 25175,
 27405,
 2038,
 6541,
 1011,
 9563,
 1012,
 2009,
 2003,
 1037,
 9530,
 9103,
 5867,
 2918,
 1997,
 2019,
 20904,
 2361,
 1011,
 6541,
 1011,
 1040,
 1011,
 16122,
 6593,
 11253,
 4648,
 15460,
 2063,
 1012,
 102]

In [12]:
tokenizer.decode([20904])

'ud'

In [10]:
tokenizer

Using bos_token, but it is not set yet.
Using eos_token, but it is not set yet.


BertTokenizerFast(name_or_path='sentence-transformers/all-MiniLM-L6-v2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [37]:
encoded_input = tokenizer([corpus[0], corpus[1]], return_tensors="pt", truncation=True, max_length=256, padding="max_length", add_special_tokens=True,)

In [38]:
encoded_input

{'input_ids': tensor([[  101, 20904,  2361,  1011,  6541,  1011,  1040,  1011, 16122,  6593,
         11253,  4648, 15460,  2063,  1006,  1016,  1011,  1007,  2003,  1037,
         20904,  2361,  1011,  1040,  1011, 16122,  6593, 11253,  4648, 15460,
          2063,  1006,  1016,  1011,  1007,  1999,  2029,  1996,  2019,  8462,
          7277,  2803,  1997,  1996, 16122,  6593, 11253,  4648, 15460,  2063,
         25175, 27405,  2038,  6541,  1011,  9563,  1012,  2009,  2003,  1037,
          9530,  9103,  5867,  2918,  1997,  2019, 20904,  2361,  1011,  6541,
          1011,  1040,  1011, 16122,  6593, 11253,  4648, 15460,  2063,  1012,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [14]:
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model

Downloading config.json: 100%|██████████| 612/612 [00:00<00:00, 5.22MB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:17<00:00, 5.08MB/s]


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [21]:
print(f'{sum(p.numel() for p in model.parameters()):,} parameters')

22,713,216 parameters


In [40]:
out = model(encoded_input['input_ids'], attention_mask=encoded_input['attention_mask']).last_hidden_state

In [41]:
attention_mask = encoded_input['attention_mask']

In [42]:
input_mask_expanded = attention_mask.unsqueeze(-1).expand(out.size()).float()
input_mask_expanded.shape

torch.Size([2, 256, 384])

In [43]:
out_mean = torch.sum(out * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
out_mean.shape

torch.Size([2, 384])

In [44]:
sentence_embeddings = F.normalize(out_mean, p=2, dim=1)
sentence_embeddings.shape

torch.Size([2, 384])

In [None]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [5]:
class TextEncoder(nn.Module):
    def __init__(self, model_name):
        super(TextEncoder, self).__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        
    def forward(self, input_ids, attention_mask):
        encoded_text = self.bert(input_ids, attention_mask=attention_mask)
        #print(encoded_text.last_hidden_state.size())
        return encoded_text.last_hidden_state[:,0,:]

In [5]:
text_encoder = TextEncoder(model_name)
text_encoder

TextEncoder(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [10]:
text_encoder.to(device)

TextEncoder(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affin

In [11]:
print(f'{sum(p.numel() for p in text_encoder.parameters()):,} parameters')

109,482,240 parameters


In [12]:
batch = next(iter(train_loader))
input_ids = batch.input_ids
attention_mask = batch.attention_mask

In [13]:
out = text_encoder(input_ids.to(device), attention_mask.to(device))

In [14]:
out.shape

torch.Size([16, 768])

In [5]:
model = SentenceTransformer('menadsa/S-BioELECTRA')
sentences = ["This is an example sentence", "Each sentence is converted"]
out = model.encode(sentences)
out

array([[-0.02665125,  0.09909892, -0.03884244, ...,  0.0345562 ,
         0.01290261,  0.077338  ],
       [ 0.10659089,  0.20882852,  0.07350729, ..., -0.01763117,
        -0.00997985,  0.24828516]], dtype=float32)

In [8]:
model = model.to(device)

In [12]:
sentences = ["This is an example sentence", "Each sentence is converted"]
out = model(sentences)

TypeError: list indices must be integers or slices, not str

AttributeError: 'numpy.ndarray' object has no attribute 'device'