In [1]:
import pysolr, os

# Base URL of the `mesh3` Solr core. The host name is read from the SOLR_HOST
# environment variable (a KeyError is raised if it is not set).
MESH_URL = f"http://{os.environ['SOLR_HOST']}:8983/solr/mesh3"

# Client for the core above, created with always_commit=False.
mesh_solr = pysolr.Solr(MESH_URL, always_commit=False)

# Last expression of the cell: the ping response is displayed as the cell output.
mesh_solr.ping()

'{\n  "responseHeader":{\n    "zkConnected":null,\n    "status":0,\n    "QTime":6,\n    "params":{\n      "q":"{!lucene}*:*",\n      "distrib":"false",\n      "df":"_text_",\n      "rows":"10",\n      "echoParams":"all"}},\n  "status":"OK"}\n'

In [2]:
import torch
from transformers import BertTokenizer, BertModel

# Identifier of the pretrained BERT checkpoint handed to init_model() below.
BERT_MODEL='bert-base-cased'

In [3]:
def init_model(model_path):
    """Load a BERT model and its tokenizer from ``model_path``.

    Args:
        model_path: Value forwarded to ``from_pretrained`` of both
            ``BertTokenizer`` and ``BertModel``.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is created with
        ``do_lower_case=False``; the model is created with
        ``output_hidden_states=True`` and has had ``eval()`` called on it.
    """
    bert_tokenizer = BertTokenizer.from_pretrained(
        model_path,
        do_lower_case=False,
    )
    bert_model = BertModel.from_pretrained(
        model_path,
        output_hidden_states=True,
    )

    # Put the model in evaluation mode before handing it back.
    bert_model.eval()

    return bert_model, bert_tokenizer

In [4]:
def get_hidden_states(model, tokenizer, marked_text):
    """Run ``marked_text`` through ``model`` and return its hidden states.

    The text is tokenized exactly as given: this function adds no special
    tokens, so any markers must already be part of ``marked_text``
    (presumably ``[CLS]``/``[SEP]`` -- TODO confirm against callers).

    Args:
        model: Callable model. Its result must expose the hidden states at
            index 2, which is where ``BertModel`` puts them when created with
            ``output_hidden_states=True`` (see ``init_model``).
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
        marked_text: The string to encode.

    Returns:
        ``outputs[2]`` of the model call, computed with gradients disabled.
    """
    tokens = tokenizer.tokenize(marked_text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Batch of one sequence.
    tokens_tensor = torch.tensor([token_ids])

    # All-ones tensor, one entry per token. This was previously passed as the
    # second *positional* argument; in transformers' BertModel.forward that
    # position is `attention_mask`, so it is passed by keyword here to make
    # the (unchanged) behaviour explicit.
    attention_mask = torch.tensor([[1] * len(tokens)])

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        outputs = model(tokens_tensor, attention_mask=attention_mask)

    return outputs[2]

## Get all MeSH concepts

In [5]:
# Fetch every document of the MeSH core, one page at a time, and keep only the
# descriptor id and its entry terms.
i = 0          # index of the next page to request
nrows = 1000   # page size used for every Solr request
numFound = 0   # total number of hits as reported by Solr (known after 1st page)

concepts = []

while True:
    # Use `nrows` for both the page size and the offset so they cannot drift
    # apart if the page size is changed.
    results = mesh_solr.search('*:*', rows=nrows, start=i * nrows)

    numFound = results.raw_response['response']['numFound']
    i = i + 1

    for result in results:
        concepts.append({
            'concept_id': result['DescriptorUI'],
            'entry_terms': list(result['EntryTerm']),
        })

    # Stop as soon as the next page would start at or past the last hit.
    # (A `<=` comparison here would issue one extra, empty request whenever
    # numFound is an exact multiple of nrows.)
    if i * nrows >= numFound:
        break

# print(len(concepts))
# print(concepts)

In [6]:
model, tokenizer = init_model(BERT_MODEL)

# Concepts having at least one entry term that the tokenizer keeps as a single
# token.
bert_concepts = []

for concept in concepts:
    for term in concept['entry_terms']:
        tokenized_text = tokenizer.tokenize(term)

        if len(tokenized_text) == 1:
            # NOTE: this annotates the dict in place, so the matching entry in
            # `concepts` gains the 'bert_token' key as well. The value is the
            # one-element token list returned by the tokenizer.
            concept['bert_token'] = tokenized_text
            bert_concepts.append(concept)

            # Keep only the first entry term that is a single token. Useful in
            # cases like 'institute' and 'academy', which are entry terms of
            # the same concept.
            break

In [7]:
# Number of concepts that have a single-token entry term.
print(len(bert_concepts))

815


In [8]:
# Print the stored token list of every retained concept, one per line.
for concept in bert_concepts:
    print(concept['bert_token'])

['Institutes']
['Falls']
['Accounting']
['Farmers']
['Wars']
['Beijing']
['Vampires']
['Chocolate']
['Lotus']
['Pilot']
['Gay']
['Calendar']
['Farm']
['Gardens']
['Rowing']
['Sugar']
['UNESCO']
['Clay']
['Sitting']
['Bacon']
['Sand']
['Achievement']
['Acoustic']
['AIDS']
['ATP']
['Administrator']
['Teen']
['Adult']
['Advertising']
['Afghanistan']
['Africa']
['Sahara']
['Agriculture']
['Air']
['Aircraft']
['Alabama']
['Alaska']
['Albania']
['Alberta']
['Algeria']
['Muse']
['Ambulance']
['Americas']
['Anatomy']
['Anger']
['Angola']
['Animals']
['Anniversary']
['Antarctica']
['Monkey']
['Anthropology']
['Schedule']
['Talent']
['Arabia']
['Archaeology']
['Architecture']
['Archives']
['Argentina']
['Arizona']
['Arkansas']
['Arm']
['Armenia']
['Art']
['Asia']
['Association']
['Australia']
['Austria']
['Automobile']
['Aviation']
['Prize']
['Azerbaijan']
['Back']
['Bahamas']
['Bahrain']
['Bangladesh']
['Barbados']
['Baseball']
['Basketball']
['Bass']
['Bath']
['Bears']
['Beauty']
['Bed']
['Bee