In [27]:
pip install pysolr
pip install spacy==2.2.4

Note: you may need to restart the kernel to use updated packages.


In [3]:
# Fetch the small English spaCy model that `spacy.load('en_core_web_sm')` expects.
!spacy download en_core_web_sm

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')


In [7]:
import spacy
import pysolr, os
import pandas as pd

# Load the small English spaCy pipeline.
nlp = spacy.load('en_core_web_sm')
# Build a tokenizer from the pipeline's language defaults.
# NOTE(review): no cell visible here reads `tokenizer`; `tokenize` goes through
# `nlp.tokenizer` instead — confirm whether this variable is still needed.
tokenizer = nlp.Defaults.create_tokenizer(nlp)

In [8]:
# Both cores are addressed on port 8983 of the host named by the SOLR_HOST
# environment variable; only the core name differs.
PMC_URL = 'http://{}:8983/solr/pmc'.format(os.environ['SOLR_HOST'])
MESH_URL = 'http://{}:8983/solr/mesh3'.format(os.environ['SOLR_HOST'])

# One client per core.
pmc_solr = pysolr.Solr(PMC_URL, always_commit=False)
mesh_solr = pysolr.Solr(MESH_URL, always_commit=False)

# Health check: ping each core. The response of the last call is what the
# cell displays.
mesh_solr.ping()
pmc_solr.ping()

'{\n  "responseHeader":{\n    "zkConnected":null,\n    "status":0,\n    "QTime":326,\n    "params":{\n      "q":"{!lucene}*:*",\n      "distrib":"false",\n      "df":"_text_",\n      "rows":"10",\n      "echoParams":"all"}},\n  "status":"OK"}\n'

In [9]:
def tokenize(text):
    """Split ``text`` into word strings using the spaCy tokenizer.

    ``nlp.tokenizer.explain`` yields ``(rule, piece)`` pairs describing how
    each piece of the text was produced. Only pieces whose rule is
    ``'TOKEN'`` are kept; pieces attributed to any other rule are dropped.

    Args:
        text: The raw text to split.

    Returns:
        A list of the kept pieces, in their original order.
    """
    words = []
    for rule, piece in nlp.tokenizer.explain(text):
        if rule == 'TOKEN':
            words.append(piece)
    return words
    

In [10]:
def get_n_grams(doc, max_n=3):
    """Build, for every position, the window of items starting there.

    Args:
        doc: Indexable sequence of tokens (anything supporting ``len`` and
            integer indexing).
        max_n: Maximum number of items per window. Defaults to 3, which
            reproduces the original trigram behaviour.

    Returns:
        A list with exactly one entry per position in ``doc``. Entry ``i`` is
        the list ``[doc[i], doc[i+1], ...]`` holding at most ``max_n`` items;
        windows near the end of ``doc`` are shorter because they stop at the
        last item. An empty ``doc`` gives an empty list.

    Raises:
        ValueError: If ``max_n`` is smaller than 1.
    """
    if max_n < 1:
        raise ValueError('max_n must be at least 1')

    total = len(doc)
    return [
        # Clamp the window end so the trailing windows simply get shorter.
        [doc[j] for j in range(start, min(start + max_n, total))]
        for start in range(total)
    ]
     

In [11]:
def search(n_gram):
    """Find the longest leading part of ``n_gram`` that matches a MeSH concept.

    The words are joined with single spaces and looked up as an exact phrase
    in the ``ConceptName`` field of the MeSH Solr core. If nothing matches,
    the last word is dropped and the lookup is repeated, down to one word.

    Args:
        n_gram: List of word strings, longest candidate phrase first.

    Returns:
        A ``(concept, length)`` tuple. On a match, ``concept`` is the
        ``ConceptName`` value of the first returned document and ``length``
        is the number of leading words that matched. When nothing matches,
        when ``n_gram`` is empty, or when the lookup fails, ``('', 0)`` is
        returned.
    """
    import warnings

    # Longest candidate first, then progressively shorter prefixes.
    for size in range(len(n_gram), 0, -1):
        phrase = ' '.join(n_gram[:size])
        # Inside a quoted phrase, backslash and double quote must be escaped,
        # otherwise a token containing them yields a malformed query.
        escaped = phrase.replace('\\', '\\\\').replace('"', '\\"')
        query = '{}:\"{}\"'.format('ConceptName', escaped)
        try:
            results = mesh_solr.search(query)
            if len(results) != 0:
                first_hit = next(iter(results))
                return (first_hit['ConceptName'], size)
        except Exception as exc:
            # Best effort: a failed lookup is treated as "no concept", but it
            # is reported instead of being swallowed silently. Catching
            # Exception (not a bare except) lets a kernel interrupt through.
            warnings.warn(
                'MeSH lookup failed ({}); treating n-gram as unmatched'.format(
                    type(exc).__name__
                )
            )
            return ('', 0)

    return ('', 0)

In [12]:
def get_labels(n_grams):
    """Label each position with the concept name that covers it, or ``'O'``.

    ``n_grams[k]`` is looked up with ``search``. When a concept is found, the
    positions it spans are labelled with the concept value, and the positions
    after the first one are not looked up again.

    Args:
        n_grams: One n-gram (list of words) per token position.

    Returns:
        A list of labels with the same length as ``n_grams``.
    """
    labels = ['O'] * len(n_grams)
    # Length reported by the most recent lookup; while it is above 1 the
    # current position is still inside that match and is skipped.
    span = 0
    for position, n_gram in enumerate(n_grams):
        if span > 1:
            span -= 1
            continue

        concept, span = search(n_gram)
        if concept:
            for offset in range(span):
                labels[position + offset] = concept
    return labels

In [39]:
def get_labels_uni_class(n_grams):
    """Label each position as ``'B-MESH'``, ``'I-MESH'`` or ``'O'``.

    ``n_grams[k]`` is looked up with ``search``. When a concept is found, the
    first position it spans gets ``'B-MESH'`` and the remaining ones get
    ``'I-MESH'``; those remaining positions are not looked up again.
    Positions not covered by any match keep ``'O'``.

    Args:
        n_grams: One n-gram (list of words) per token position.

    Returns:
        A list of labels with the same length as ``n_grams``.
    """
    labels = ['O'] * len(n_grams)
    # Length reported by the most recent lookup; while it is above 1 the
    # current position is still inside that match and is skipped.
    span = 0
    for position, n_gram in enumerate(n_grams):
        if span > 1:
            span -= 1
            continue

        concept, span = search(n_gram)
        if concept:
            for offset in range(span):
                if offset == 0:
                    labels[position + offset] = 'B-MESH'
                else:
                    labels[position + offset] = 'I-MESH'
    return labels

# A simple sentence example

In [35]:
# Sample clinical note used to try out `tokenize` and the labelling functions.
# NOTE(review): "failled" in the last sentence looks like a typo — confirm
# whether it is intentional before changing the text.
original_sentence = "A 58-year-old African-American woman presents to the ER with episodic pressing/burning anterior chest pain that began two days earlier for the first time in her life. The pain started while she was walking, radiates to the back, and is accompanied by nausea, diaphoresis and mild dyspnea, but is not increased on inspiration. The latest episode of pain ended half an hour prior to her arrival. She is known to have hypertension and obesity. She denies smoking, diabetes, hypercholesterolemia, or a family history of heart disease. She currently takes no medications. Physical examination is normal. The EKG shows nonspecific changes. The urogenital system failled."
# Keep only the pieces the tokenizer reports as 'TOKEN'.
tokens = tokenize(original_sentence)

# Show the resulting word list.
print(tokens)

['A', '58-year', 'old', 'African', 'American', 'woman', 'presents', 'to', 'the', 'ER', 'with', 'episodic', 'pressing', 'burning', 'anterior', 'chest', 'pain', 'that', 'began', 'two', 'days', 'earlier', 'for', 'the', 'first', 'time', 'in', 'her', 'life', 'The', 'pain', 'started', 'while', 'she', 'was', 'walking', 'radiates', 'to', 'the', 'back', 'and', 'is', 'accompanied', 'by', 'nausea', 'diaphoresis', 'and', 'mild', 'dyspnea', 'but', 'is', 'not', 'increased', 'on', 'inspiration', 'The', 'latest', 'episode', 'of', 'pain', 'ended', 'half', 'an', 'hour', 'prior', 'to', 'her', 'arrival', 'She', 'is', 'known', 'to', 'have', 'hypertension', 'and', 'obesity', 'She', 'denies', 'smoking', 'diabetes', 'hypercholesterolemia', 'or', 'a', 'family', 'history', 'of', 'heart', 'disease', 'She', 'currently', 'takes', 'no', 'medications', 'Physical', 'examination', 'is', 'normal', 'The', 'EKG', 'shows', 'nonspecific', 'changes', 'The', 'urogenital', 'system', 'failled']


In [40]:
# Label the example tokens with the single-class scheme (B-MESH / I-MESH / O)
# and print one "label token" pair per line.
n_grams = get_n_grams(tokens)
labels = get_labels_uni_class(n_grams)
for label, token in zip(labels, tokens):
    print(label, token)

O A
O 58-year
O old
O African
O American
O woman
O presents
O to
O the
O ER
O with
O episodic
O pressing
O burning
O anterior
O chest
O pain
O that
O began
O two
O days
O earlier
O for
O the
O first
O time
O in
O her
B-MESH life
O The
O pain
O started
O while
O she
O was
O walking
O radiates
O to
O the
B-MESH back
O and
O is
O accompanied
O by
B-MESH nausea
O diaphoresis
O and
O mild
B-MESH dyspnea
O but
O is
O not
O increased
O on
O inspiration
O The
O latest
O episode
O of
O pain
O ended
O half
O an
O hour
O prior
O to
O her
O arrival
O She
O is
O known
O to
O have
B-MESH hypertension
O and
B-MESH obesity
O She
O denies
O smoking
O diabetes
B-MESH hypercholesterolemia
O or
O a
O family
O history
O of
B-MESH heart
B-MESH disease
O She
O currently
O takes
O no
O medications
O Physical
O examination
O is
O normal
O The
O EKG
O shows
O nonspecific
O changes
O The
B-MESH urogenital
I-MESH system
O failled


In [None]:
# Same example, but matched tokens are labelled with the value returned for
# the matched concept (via `get_labels`) instead of B-MESH / I-MESH.
n_grams = get_n_grams(tokens)
labels = get_labels(n_grams)
for label, token in zip(labels, tokens):
    print(label, token)

# Sentences from PMC

In [41]:
# Collect abstract texts from the PMC core, one page of results at a time.
sentences = []
i = 0             # zero-based page counter
nrows = 1000      # page size, used for both `rows` and the `start` offset
max_start = 2000  # largest offset requested (pages start at 0, 1000, 2000)
numFound = 0      # NOTE(review): never updated in this cell — confirm whether anything still reads it
while i*nrows <= max_start:
    # `rows` must match the step used for `start`, otherwise pages would
    # overlap or skip documents.
    results = pmc_solr.search('*:*', rows=nrows, start=i*nrows)
    i = i+1

    # An empty page means there is nothing at or beyond this offset.
    if len(results) == 0:
        break

    for result in results:
        # Documents without an 'abstract' field contribute nothing.
        if 'abstract' in result:
            # Concatenate the abstract's parts, each followed by one space.
            sentences.append(''.join(part + ' ' for part in result['abstract']))

In [42]:
# Tokenise and tag every collected text, producing one record per token.
# `sentence_idx` is 1-based and identifies the text the token came from.
dicts = []
for i, sentence in enumerate(sentences, start=1):
    words = tokenize(sentence)
    n_grams = get_n_grams(words)
    tags = get_labels_uni_class(n_grams)
    dicts.extend(
        {'sentence_idx': i, 'word': word, 'tag': tag}
        for word, tag in zip(words, tags)
    )

In [43]:
# One row per token record, with columns sentence_idx, word and tag.
df = pd.DataFrame(dicts)

# Write the table to mesh.csv in the working directory. `to_csv` is called
# with its defaults, so the DataFrame index is written as the first column.
df.to_csv('mesh.csv')