In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from dianna.utils.tokenizers import SpacyTokenizer

ModuleNotFoundError: No module named 'transformers'

# Three different tokenizers
- a Hugging Face tokenizer for natural language
- a Hugging Face tokenizer for chemistry (molecule strings)
- a custom, word-based tokenizer

In [None]:
# Checkpoint identifiers passed to `from_pretrained` below.
model_name_language = "nlptown/bert-base-multilingual-uncased-sentiment"
model_name_chemistry = "DeepChem/ChemBERTa-77M-MLM"

# Natural-language pair: tokenizer and model are loaded from the same checkpoint.
tokenizer_language = AutoTokenizer.from_pretrained(model_name_language)
model_language = AutoModelForSequenceClassification.from_pretrained(model_name_language)

# Chemistry pair, loaded the same way.
# NOTE(review): the checkpoint name ends in "MLM" -- confirm that loading it
# with a sequence-classification head is what is intended here.
tokenizer_chemistry = AutoTokenizer.from_pretrained(model_name_chemistry)
model_chemistry = AutoModelForSequenceClassification.from_pretrained(model_name_chemistry)

# Custom word-based tokenizer; it is not tied to either model above.
tokenizer_default = SpacyTokenizer(name='en_core_web_sm')

NameError: name 'AutoTokenizer' is not defined

# Tokenizing

In [None]:
# Example inputs: one natural-language sentence and one molecule string.
sentence, molecule = "This movie is shit.", "CC(C)CC(=O)"

['this', 'movie', 'is', 'shi', '##t', '.']
['This', 'movie', 'is', 'shit']
['C', 'C', '(', 'C', ')', 'C', 'C', '(', '=', 'O', ')']


In [None]:
# Split each input into tokens with its matching tokenizer; the sentence is
# tokenized twice (Hugging Face and word-based) so the results can be compared.
tokens_language = tokenizer_language.tokenize(sentence)
tokens_default = tokenizer_default.tokenize(sentence)
tokens_chemistry = tokenizer_chemistry.tokenize(molecule)

# One token list per output line.
print(tokens_language, tokens_default, tokens_chemistry, sep="\n")

['this', 'movie', 'is', 'shi', '##t', '.']
['This', 'movie', 'is', 'shit']
['C', 'C', '(', 'C', ')', 'C', 'C', '(', '=', 'O', ')']


# De-tokenizing

In [None]:
# Join every token list back into a single string, using the tokenizer that
# produced it.
sentence_hf = tokenizer_language.convert_tokens_to_string(tokens_language)
sentence_custom = tokenizer_default.convert_tokens_to_string(tokens_default)
molecule_decoded = tokenizer_chemistry.convert_tokens_to_string(tokens_chemistry)

# One reconstructed string per output line.
print(sentence_hf, sentence_custom, molecule_decoded, sep="\n")

this movie is shit.
This movie is shit
CC(C)CC(=O)


# Masking

In [None]:
# Copy the token lists so the unmasked originals are left unmodified.
masked_language = list(tokens_language)
masked_default = list(tokens_default)
masked_chemistry = list(tokens_chemistry)

# Replace the token at index 3 (the fourth token) of every sequence with the
# mask token of the corresponding tokenizer.
masked_language[3] = tokenizer_language.mask_token
masked_default[3] = tokenizer_default.mask_token
masked_chemistry[3] = tokenizer_chemistry.mask_token

# One masked token list per output line.
print(masked_language, masked_default, masked_chemistry, sep="\n")

['this', 'movie', 'is', '[MASK]', '##t', '.']
['This', 'movie', 'is', 'UNKWORDZ']
['C', 'C', '(', '[MASK]', ')', 'C', 'C', '(', '=', 'O', ')']


## Check that de-tokenizing and then re-tokenizing the masked tokens is the identity

In [None]:
# Round trip per tokenizer: masked tokens -> string -> tokens again.
masked_decoded_language = tokenizer_language.tokenize(
    tokenizer_language.convert_tokens_to_string(masked_language)
)
masked_decoded_default = tokenizer_default.tokenize(
    tokenizer_default.convert_tokens_to_string(masked_default)
)
masked_decoded_chemistry = tokenizer_chemistry.tokenize(
    tokenizer_chemistry.convert_tokens_to_string(masked_chemistry)
)

# Per tokenizer: tokens before the round trip, tokens after it, and whether
# the two lists are equal.
print(masked_language, masked_decoded_language, masked_language == masked_decoded_language)
print(masked_default, masked_decoded_default, masked_default == masked_decoded_default)
print(masked_chemistry, masked_decoded_chemistry, masked_chemistry == masked_decoded_chemistry)

['this', 'movie', 'is', '[MASK]', '##t', '.'] ['this', 'movie', 'is', '[MASK]', 't', '.'] False
['This', 'movie', 'is', 'UNKWORDZ'] ['This', 'movie', 'is', 'UNKWORDZ'] True
['C', 'C', '(', '[MASK]', ')', 'C', 'C', '(', '=', 'O', ')'] ['C', 'C', '(', '[MASK]', ')', 'C', 'C', '(', '=', 'O', ')'] True


In [None]:
# Display the string the language tokenizer builds from the masked tokens.
# The word piece following the mask ('##t') is joined directly onto the mask
# token, so the string no longer records that it was a continuation piece.
tokenizer_language.convert_tokens_to_string(masked_language)

'this movie is [MASK]t.'

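# NOTE: the round trip is NOT the identity for the Hugging Face language tokenizer! Masking the word piece 'shi' leaves '##t' behind; the de-tokenized string 'this movie is [MASK]t.' is then re-tokenized with 't' instead of '##t'.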