In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from molfeat.trans.pretrained.hf_transformers import HFExperiment
from molfeat.trans.pretrained.hf_transformers import HFModel
from molfeat.store import ModelInfo
from molfeat.store import ModelStore

### GPT2 Zinc 87M parameters

In [3]:
# Identifier of the GPT2 checkpoint; the same value is passed for both leading
# positional arguments of `register_pretrained` below.
# NOTE(review): presumably these are the model and tokenizer identifiers —
# confirm against HFModel.register_pretrained.
gpt2_hub_id = "entropy/gpt2_zinc_87m"

# Metadata recorded in the model store for this checkpoint.
gpt2_card_fields = dict(
    name="GPT2-Zinc480M-87M",
    version=0,
    type="pretrained",
    group="huggingface",
    submitter="Datamol",
    authors=["Karl Heyer"],
    reference="https://github.com/kheyer/gpt2_zinc_87m",
    inputs="smiles",
    representation="line-notation",
    require_3D=False,
    tags=["smiles", "huggingface", "transformers", "GPT2"],
    description=(
        "This is a GPT2 style autoregressive language model trained on ~480m SMILES "
        "strings from the ZINC database available. The model has ~87m parameters and "
        "was trained for 175000 iterations with a batch size of 3072 to a validation "
        "loss of ~.615."
    ),
)
molgpt2_card = ModelInfo(**gpt2_card_fields)

# Attempt to register the model. The captured log of this cell shows the model
# being saved and then registered under the card's name.
model = HFModel.register_pretrained(gpt2_hub_id, gpt2_hub_id, molgpt2_card)

Downloading (…)lve/main/config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/350M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/40.5k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/24.4k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]



  0%|          | 0/8 [00:00<?, ?it/s]

[32m2023-05-14 09:19:08.629[0m | [1mINFO    [0m | [36mmolfeat.trans.pretrained.hf_transformers[0m:[36msave[0m:[36m50[0m - [1mModel saved to gs://molfeat-store-prod/artifacts/huggingface/GPT2-Zinc480M-87M/0/model.save[0m
[32m2023-05-14 09:19:12.377[0m | [1mINFO    [0m | [36mmolfeat.store.modelstore[0m:[36mregister[0m:[36m124[0m - [1mSuccessfuly registered model GPT2-Zinc480M-87M ![0m


### Roberta Zinc 480M

In [5]:
# Identifier of the Roberta checkpoint; the same value is passed for both
# leading positional arguments of `register_pretrained` below.
# NOTE(review): presumably these are the model and tokenizer identifiers —
# confirm against HFModel.register_pretrained.
roberta_hub_id = "entropy/roberta_zinc_480m"

# Metadata recorded in the model store for this checkpoint.
roberta_card_fields = dict(
    name="Roberta-Zinc480M-102M",
    version=0,
    type="pretrained",
    group="huggingface",
    submitter="Datamol",
    authors=["Karl Heyer"],
    reference="https://github.com/kheyer/roberta_zinc_480m",
    inputs="smiles",
    representation="line-notation",
    require_3D=False,
    tags=["smiles", "huggingface", "transformers", "Roberta"],
    description=(
        "This is a Roberta style masked language model trained on ~480m SMILES "
        "strings from the ZINC database. The model has ~102m parameters and was "
        "trained for 150000 iterations with a batch size of 4096 to a validation "
        "loss of ~0.122."
    ),
)
molroberta_card = ModelInfo(**roberta_card_fields)

# Attempt to register the model. The captured log of this cell shows the model
# being saved and then registered under the card's name.
model = HFModel.register_pretrained(roberta_hub_id, roberta_hub_id, molroberta_card)

  0%|          | 0/7 [00:00<?, ?it/s]

[32m2023-05-14 09:29:43.553[0m | [1mINFO    [0m | [36mmolfeat.trans.pretrained.hf_transformers[0m:[36msave[0m:[36m50[0m - [1mModel saved to gs://molfeat-store-prod/artifacts/huggingface/Roberta-Zinc480M-102M/0/model.save[0m
[32m2023-05-14 09:29:46.841[0m | [1mINFO    [0m | [36mmolfeat.store.modelstore[0m:[36mregister[0m:[36m124[0m - [1mSuccessfuly registered model Roberta-Zinc480M-102M ![0m


### Evaluating the embeddings

In [26]:
import torch
from molfeat.trans.pretrained import PretrainedHFTransformer

#### Roberta

In [27]:
import torch
from transformers import RobertaTokenizerFast, RobertaForMaskedLM, DataCollatorWithPadding

# Reference embeddings computed directly with Hugging Face transformers, used
# below as the ground truth for the molfeat featurizer.
tokenizer = RobertaTokenizerFast.from_pretrained("entropy/roberta_zinc_480m", max_len=128)
model = RobertaForMaskedLM.from_pretrained('entropy/roberta_zinc_480m')
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

# Small set of closely related SMILES; reused by the GPT2 section further down.
smiles = ['Brc1cc2c(NCc3ccccc3)ncnc2s1',
 'Brc1cc2c(NCc3ccccn3)ncnc2s1',
 'Brc1cc2c(NCc3cccs3)ncnc2s1',
 'Brc1cc2c(NCc3ccncc3)ncnc2s1',
 'Brc1cc2c(Nc3ccccc3)ncnc2s1']

inputs = collator(tokenizer(smiles))

# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# Select the hidden states by name. Integer indexing into a model output skips
# fields that are None, so the position of `hidden_states` in the tuple form
# depends on which optional outputs are present.
full_embeddings = outputs.hidden_states[-1]

# Masked mean pooling: zero out padded positions, sum over the sequence axis,
# then divide by the number of real tokens of each sequence.
mask = inputs['attention_mask']
embeddings = ((full_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1))

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [28]:
# Build the molfeat featurizer for the Roberta model registered above, with
# the same maximum length (128) as the reference tokenizer.
# NOTE(review): `layer=-1` presumably selects the last hidden layer, matching
# the reference computation — confirm against PretrainedHFTransformer.
transformer = PretrainedHFTransformer(
    "Roberta-Zinc480M-102M",
    max_len=128,
    layer=-1,
    dtype=torch.float,
)

In [29]:
# Featurize the same SMILES list with molfeat so the result can be compared
# against the reference `embeddings`.
molfeat_embeddings = transformer(smiles)

In [30]:
# Element-wise exact comparison of the molfeat output with the reference
# embeddings, reduced to a single boolean tensor.
torch.eq(molfeat_embeddings, embeddings).all()

tensor(True)

#### GPT2

In [31]:
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from transformers import DataCollatorWithPadding

# Reference embeddings computed directly with Hugging Face transformers, used
# below as the ground truth for the molfeat featurizer. Reuses the `smiles`
# list defined in the Roberta section.
tokenizer = GPT2TokenizerFast.from_pretrained("entropy/gpt2_zinc_87m", max_len=256)
model = GPT2LMHeadModel.from_pretrained('entropy/gpt2_zinc_87m')
collator = DataCollatorWithPadding(tokenizer, padding=True, return_tensors='pt')

inputs = collator(tokenizer(smiles))

# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# Select the hidden states by name. Integer indexing into a model output skips
# fields that are None, so the position of `hidden_states` in the tuple form
# depends on which optional outputs are present.
full_embeddings = outputs.hidden_states[-1]

# Masked mean pooling: zero out padded positions, sum over the sequence axis,
# then divide by the number of real tokens of each sequence.
mask = inputs['attention_mask']
embeddings = ((full_embeddings * mask.unsqueeze(-1)).sum(1) / mask.sum(-1).unsqueeze(-1))


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [32]:
# Build the molfeat featurizer for the GPT2 model registered above, with the
# same maximum length (256) as the reference tokenizer.
# NOTE(review): `layer=-1` presumably selects the last hidden layer, matching
# the reference computation — confirm against PretrainedHFTransformer.
transformer = PretrainedHFTransformer(
    "GPT2-Zinc480M-87M",
    max_len=256,
    layer=-1,
    dtype=torch.float,
)

In [35]:
# Featurize the same SMILES list with molfeat so the result can be compared
# against the reference `embeddings`.
molfeat_embeddings = transformer(smiles)

In [36]:
# Element-wise exact comparison of the molfeat output with the reference
# embeddings, reduced to a single boolean tensor.
torch.eq(molfeat_embeddings, embeddings).all()

tensor(True)

Note: for GPT2, it is likely better to use the GPT pooling layer (`pooling="gpt"`) instead of the default pooling used above.

In [37]:
# Same GPT2 featurizer as before, but with the "gpt" pooling strategy
# requested explicitly instead of the default.
transformer = PretrainedHFTransformer(
    "GPT2-Zinc480M-87M",
    max_len=256,
    pooling="gpt",
    layer=-1,
    dtype=torch.float,
)



In [38]:
# Display the embeddings produced with GPT pooling.
transformer(smiles)  # gives different results from the mean-pooled `embeddings` shown below

tensor([[-0.7721, -1.3579,  1.0738,  ..., -2.9465, -0.6193, -1.7630],
        [-0.5820, -1.8309,  0.6659,  ..., -2.7409, -1.0297, -1.7621],
        [-0.7349, -1.5854,  0.8986,  ..., -2.7796, -0.5494, -1.7704],
        [-0.6369, -1.7601,  0.6885,  ..., -2.5800, -1.0640, -1.7046],
        [-0.7219, -1.4894,  1.1671,  ..., -3.0910, -0.3174, -1.7954]])

In [39]:
# Display the mean-pooled reference embeddings computed with transformers in
# the GPT2 cell above, for comparison with the GPT-pooled output.
embeddings

tensor([[-2.3539, -1.1362, -0.8619,  ..., -1.7240, -0.0970, -0.1031],
        [-2.1634, -1.2579, -0.8267,  ..., -1.7632,  0.0584, -0.2343],
        [-2.2977, -1.1809, -0.7014,  ..., -1.8313,  0.1073, -0.2144],
        [-2.2383, -1.0197, -0.8010,  ..., -1.7786, -0.0261, -0.1898],
        [-2.3104, -1.1759, -0.5705,  ..., -1.4688, -0.1083, -0.2210]],
       grad_fn=<DivBackward0>)