In [1]:
# Enable IPython's autoreload extension in mode 2 for this kernel session.
%load_ext autoreload
%autoreload 2

In [1]:
from molfeat.trans.pretrained.hf_transformers import HFExperiment
from molfeat.trans.pretrained.hf_transformers import HFModel
from molfeat.store import ModelInfo
from molfeat.store import ModelStore

In [2]:
from transformers import AutoConfig
from transformers import AutoModelForSequenceClassification

In [3]:
# import datamol as dm
# from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer

# data = dm.freesolv().iloc[:100]
# transformer = PretrainedHFTransformer(kind="ChemGPT-4.7M", notation="selfies")
# features = transformer(data["smiles"])

### ChemGPT

In [15]:
# Metadata shared by every ChemGPT checkpoint registered in this notebook.
_CHEMGPT_AUTHORS = [
    "Nathan Frey",
    "Ryan Soklaski",
    "Simon Axelrod",
    "Siddharth Samsi",
    "Rafael Gomez-Bombarelli",
    "Connor Coley",
    "Vijay Gadepally",
]
_CHEMGPT_REFERENCE = "10.26434/chemrxiv-2022-3s512"


def _chemgpt_model_info(size, size_tag):
    """Build the ``ModelInfo`` card for one ChemGPT checkpoint.

    The three cards differ only in the parameter-count label (used in the
    name and the description) and in the trailing size tag; every other field
    is shared, so it is written once here instead of being copy-pasted.

    Args:
        size: parameter-count label, e.g. ``"4.7M"``; the card is named
            ``ChemGPT-<size>``.
        size_tag: size tag appended after the shared tags, e.g. ``"small"``.

    Returns:
        The populated ``ModelInfo`` instance.
    """
    return ModelInfo(
        name=f"ChemGPT-{size}",
        inputs="selfies",
        type="pretrained",
        group="huggingface",
        version=0,
        submitter="Datamol",
        # Same wording for every checkpoint (the 19M card used to say
        # "transformers model" while the other two said "transformer model").
        description=(
            f"ChemGPT ({size} params) is a transformer model for generative molecular modeling, "
            "which was pretrained on the PubChem10M dataset."
        ),
        representation="line-notation",
        require_3D=False,
        tags=["ChemGPT", "huggingface", "transformers", "GPTNeo", "PubChem", "selfies", size_tag],
        # Pass a copy so the cards never share one mutable list object.
        authors=list(_CHEMGPT_AUTHORS),
        reference=_CHEMGPT_REFERENCE,
    )


chemgpt_4M = _chemgpt_model_info("4.7M", "small")
chemgpt_1B = _chemgpt_model_info("1.2B", "huge")
chemgpt_19M = _chemgpt_model_info("19M", "large")



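For ChemGPT, we need to patch the tokenizers so that their special tokens (unk, cls, pad, bos, sep, eos, mask) and the matching token ids are set explicitly.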

In [16]:
from transformers import AutoTokenizer, AutoModel
# Load the pretrained tokenizer of each ChemGPT checkpoint (4.7M, 1.2B, 19M).
tokenizer_4M, tokenizer_1B, tokenizer_19M = [
    AutoTokenizer.from_pretrained(repo_id)
    for repo_id in ("ncfrey/ChemGPT-4.7M", "ncfrey/ChemGPT-1.2B", "ncfrey/ChemGPT-19M")
]


In [17]:
# model = AutoModel.from_pretrained("ncfrey/ChemGPT-4.7M")
# model.config


In [18]:
def patch_tokenizer(tokenizer):
    """Set the bracketed special tokens, and their vocabulary ids, on ``tokenizer``.

    For each role the ``<role>_token`` attribute is assigned its string, then
    ``<role>_token_id`` is assigned whatever ``tokenizer.vocab`` maps that
    token to (``None`` when the token is not in the vocabulary). ``bos``
    reuses ``[CLS]`` and ``eos`` reuses ``[SEP]``.

    Args:
        tokenizer: object exposing a ``vocab`` mapping and assignable
            special-token attributes.

    Returns:
        The same ``tokenizer`` object, modified in place.
    """
    # (role, token string) pairs, applied in this exact order.
    special_tokens = (
        ("unk", "[UNK]"),
        ("cls", "[CLS]"),
        ("pad", "[PAD]"),
        ("bos", "[CLS]"),
        ("sep", "[SEP]"),
        # EN: My guess is that the EOS token is the one that is wrong
        ("eos", "[SEP]"),
        ("mask", "[MASK]"),
    )

    for role, token in special_tokens:
        token_attr = f"{role}_token"
        setattr(tokenizer, token_attr, token)
        # Resolve the id from the attribute just written, not from the literal,
        # so whatever the tokenizer stored is what gets looked up.
        token_id = tokenizer.vocab.get(getattr(tokenizer, token_attr))
        setattr(tokenizer, f"{role}_token_id", token_id)

    return tokenizer

In [19]:
# Patch the special tokens of all three tokenizers (4.7M, then 19M, then 1.2B).
tokenizer_4M, tokenizer_19M, tokenizer_1B = map(
    patch_tokenizer, (tokenizer_4M, tokenizer_19M, tokenizer_1B)
)

In [20]:
from transformers import AutoModelForCausalLM
# Load the pretrained causal-LM weights of each ChemGPT checkpoint (4.7M, 1.2B, 19M).
model_4M, model_1B, model_19M = [
    AutoModelForCausalLM.from_pretrained(repo_id)
    for repo_id in ("ncfrey/ChemGPT-4.7M", "ncfrey/ChemGPT-1.2B", "ncfrey/ChemGPT-19M")
]


In [21]:

def patch_model_from_tokenizer(model, tokenizer):
    """Copy the tokenizer's special-token settings onto ``model.config``.

    This is copied from Factory.patch_hgf_config_from_tokenizer.

    Only keys that already exist as attributes of the config are written.
    Special-token keys take the tokenizer attribute of the same name;
    ``forced_eos_token_id`` and ``decoder_start_token_id`` both take the
    tokenizer's ``eos_token_id``.

    Args:
        model: object with a ``config`` attribute whose value supports
            ``update(dict)``.
        tokenizer: object exposing the special-token attributes listed below.

    Returns:
        The same ``model``, with its config updated in place.
    """
    cfg = model.config

    # Keys mirrored one-to-one from the tokenizer.
    mirrored_keys = (
        "bos_token_id",
        "bos_token",
        "eos_token_id",
        "eos_token",
        "pad_token_id",
        "pad_token",
        "unk_token_id",
        "unk_token",
        "mask_token_id",
        "mask_token",
        "sep_token_id",
        "sep_token",
        "cls_token_id",
        "cls_token",
    )
    # Keys that are all driven by the tokenizer's EOS id.
    eos_driven_keys = ("forced_eos_token_id", "decoder_start_token_id")

    overrides = {key: getattr(tokenizer, key) for key in mirrored_keys if hasattr(cfg, key)}
    overrides.update(
        (key, tokenizer.eos_token_id) for key in eos_driven_keys if hasattr(cfg, key)
    )

    # The vocab-size consistency check is intentionally left disabled:
    # overrides["vocab_size"] = len(tokenizer)
    # assert overrides["vocab_size"] == cfg.vocab_size, "Vocab size mismatch"

    cfg.update(overrides)
    model.config = cfg
    return model


In [22]:
# Align each model's config with its own patched tokenizer (4.7M, then 19M, then 1.2B).
model_4M, model_19M, model_1B = (
    patch_model_from_tokenizer(model, tokenizer)
    for model, tokenizer in (
        (model_4M, tokenizer_4M),
        (model_19M, tokenizer_19M),
        (model_1B, tokenizer_1B),
    )
)


In [23]:
from transformers import PreTrainedModel
from transformers import PreTrainedTokenizer
from transformers import PreTrainedTokenizerFast
# Check whether tokenizer_4M is a PreTrainedTokenizerFast instance; the cell displays the boolean.
isinstance(tokenizer_4M, PreTrainedTokenizerFast)

True

In [24]:
# Register the ChemGPT-4.7M model, its tokenizer and its ModelInfo card.
# NOTE(review): the recorded log shows this run saving under gs://molfeat-store-prod — confirm the target store before re-running.
# NOTE(review): "chempgtp" looks like a misspelling of "chemgpt"; the name is left unchanged here.
chempgtp_4M_model = HFModel.register_pretrained(model_4M, tokenizer_4M, chemgpt_4M)

  0%|          | 0/6 [00:00<?, ?it/s]

[32m2023-05-04 07:39:40.359[0m | [1mINFO    [0m | [36mmolfeat.trans.pretrained.hf_transformers[0m:[36msave[0m:[36m51[0m - [1mModel saved to gs://molfeat-store-prod/artifacts/huggingface/ChemGPT-4.7M/0/model.save[0m
[32m2023-05-04 07:39:43.717[0m | [1mINFO    [0m | [36mmolfeat.store.modelstore[0m:[36mregister[0m:[36m124[0m - [1mSuccessfuly registered model ChemGPT-4.7M ![0m


In [25]:
# Register the ChemGPT-19M model, its tokenizer and its ModelInfo card.
# NOTE(review): the recorded log shows this run saving under gs://molfeat-store-prod — confirm the target store before re-running.
# NOTE(review): "chempgtp" looks like a misspelling of "chemgpt"; kept because a later cell displays this variable.
chempgtp_19M_model = HFModel.register_pretrained(model_19M, tokenizer_19M, chemgpt_19M)

  0%|          | 0/6 [00:00<?, ?it/s]

[32m2023-05-04 07:41:40.578[0m | [1mINFO    [0m | [36mmolfeat.trans.pretrained.hf_transformers[0m:[36msave[0m:[36m51[0m - [1mModel saved to gs://molfeat-store-prod/artifacts/huggingface/ChemGPT-19M/0/model.save[0m
[32m2023-05-04 07:41:43.940[0m | [1mINFO    [0m | [36mmolfeat.store.modelstore[0m:[36mregister[0m:[36m124[0m - [1mSuccessfuly registered model ChemGPT-19M ![0m


In [26]:
# Register the ChemGPT-1.2B model, its tokenizer and its ModelInfo card.
# NOTE(review): the recorded log shows this run saving under gs://molfeat-store-prod — confirm the target store before re-running.
# NOTE(review): "chempgtp" looks like a misspelling of "chemgpt"; the name is left unchanged here.
chempgtp_1B_model = HFModel.register_pretrained(model_1B, tokenizer_1B, chemgpt_1B)

  0%|          | 0/6 [00:00<?, ?it/s]

[32m2023-05-04 08:38:27.942[0m | [1mINFO    [0m | [36mmolfeat.trans.pretrained.hf_transformers[0m:[36msave[0m:[36m51[0m - [1mModel saved to gs://molfeat-store-prod/artifacts/huggingface/ChemGPT-1.2B/0/model.save[0m
[32m2023-05-04 08:38:32.808[0m | [1mINFO    [0m | [36mmolfeat.store.modelstore[0m:[36mregister[0m:[36m124[0m - [1mSuccessfuly registered model ChemGPT-1.2B ![0m


In [27]:
# Display the object returned by the ChemGPT-19M registration call.
chempgtp_19M_model

<molfeat.trans.pretrained.hf_transformers.HFModel at 0x1819435e0>

In [28]:
import datamol as dm
import platformdirs

# Remove the local cache directory of every ChemGPT checkpoint registered above.
# Only ChemGPT-4.7M used to be cleared, while the validation cell further down
# loads ChemGPT-1.2B, so a stale local copy could have masked a bad upload.
for chemgpt_name in ("ChemGPT-4.7M", "ChemGPT-19M", "ChemGPT-1.2B"):
    chemgpt_local_dir = dm.fs.join(platformdirs.user_cache_dir("molfeat"), chemgpt_name)
    try:
        mapper = dm.fs.get_mapper(chemgpt_local_dir)
        mapper.fs.delete(chemgpt_local_dir, recursive=True)
    except FileNotFoundError:
        # Nothing is cached locally for this checkpoint, so there is nothing to remove.
        pass

# make sure we clear the cache of the function
from molfeat.trans.pretrained.hf_transformers import HFModel
HFModel._load_or_raise.cache_clear()


In [None]:
! ls /Users/manu/Library/Caches/molfeat

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[1m[36mChemBERTa-77M-MLM[m[m          [1m[36mgin_supervised_edgepred[m[m
[1m[36mDeepChem-ChemBERTa-77M-MLM[m[m [1m[36mgin_supervised_infomax[m[m
[1m[36m_lock_files[m[m                [1m[36mmaccs[m[m
[1m[36mconformers[m[m                 [1m[36mpcqm4mv2_graphormer_base[m[m
[1m[36mcustom_model_store[m[m         [1m[36mprecomputed[m[m
[1m[36mgin_supervised_contextpred[m[m [1m[36mtreedecomp[m[m


In [None]:
import datamol as dm
import os
from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer

# Turn tokenizers parallelism off to silence the annoying huggingface warning.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

# Featurize the first 100 FreeSolv rows with the ChemGPT-1.2B transformer.
data = dm.freesolv().iloc[:100]
transformer = PretrainedHFTransformer(notation="selfies", kind="ChemGPT-1.2B")
features = transformer(data["smiles"])

  0%|          | 0.00/738 [00:00<?, ?B/s]

  0%|          | 0/7 [00:00<?, ?it/s]



In [53]:
# Show the shape of the computed feature matrix.
# NOTE(review): this cell's recorded execution count is 53 while the featurization cell above has none,
# so the displayed shape may come from a different run/model — re-run both cells to confirm.
features.shape

(100, 256)