## Pubmed Abstract Extraction and Entity Recognition

#### PubMed Data Extraction

In [1]:
#pip install metapub  https://pypi.org/project/metapub/
from metapub import PubMedFetcher
fetch = PubMedFetcher()

# get the first 10 pmids matching "breast neoplasm" keyword search
pmids = fetch.pmids_for_query('breast neoplasm', retmax=10)

# get abstract for each article:
abstracts = []
authorname={}
for pmid in pmids:
    abstracts.append(fetch.article_by_pmid(pmid).abstract)
    authorname[pmid]=fetch.article_by_pmid(pmid).authors

#print(article.title)
#print(article.journal, article.year, article.volume, article.issue)
#print(article.authors)
#print(article.citation)
print(abstracts[0])
#print(authorname)
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')




BACKGROUND: Apparent diffusion coefficient (ADC) measurements are not incorporated in BI-RADS classification.
PURPOSE: To assess the probability of malignancy of breast lesions at magnetic resonance mammography (MRM) at 3 T, by combining ADC measurements with the BI-RADS score, in order to improve the specificity of MRM.
MATERIAL AND METHODS: A total of 296 biopsy-proven breast lesions were included in this prospective study. MRM was performed at 3 T, using a standard protocol with dynamic sequence (DCE-MRI) and an extra echo-planar diffusion-weighted sequence. A freehand region of interest was drawn inside the lesion, and ADC values were calculated. Each lesion was categorized according to the BI-RADS classification. Logistic regression analysis was employed to predict the probability of malignancy of a lesion. The model combined the BI-RADS classification and the ADC value. Sensitivity, specificity, positive predictive value, negative predictive value, and diagnostic accuracy were ca

#### Installing modules and Model - Don't Run this Cell if already installed

In [2]:
!pip install scispacy
!pip install swifter
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz      #scispacy medium model
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_md-0.4.0.tar.gz      #scispacy medium model
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz  #biomedical NER model trained on BC5CDR corpus
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bionlp13cg_md-0.4.0.tar.gz  #biomedical NER model trained on BIONLP13CG corpus
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_craft_md-0.4.0.tar.gz    #biomedical NER model trained on CRAFT corpus
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_jnlpba_md-0.4.0.tar.gz     #biomedical NER model trained on JNLPBA corpus





ERROR: Invalid requirement: '#scispacy'
ERROR: Invalid requirement: '#scispacy'
ERROR: Invalid requirement: '#biomedical'
ERROR: Invalid requirement: '#biomedical'
ERROR: Invalid requirement: '#biomedical'
ERROR: Invalid requirement: '#biomedical'


In [5]:
import spacy
import scispacy
import swifter
import pandas as pd
from spacy import displacy
import en_core_sci_sm
#import en_core_sci_md
# import en_ner_bc5cdr_md
# import en_ner_jnlpba_md
# import en_ner_craft_md
# import en_ner_bionlp13cg_md
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from collections import OrderedDict,Counter
from pprint import pprint
from tqdm import tqdm
tqdm.pandas()

In [11]:
sample_text=abstracts[0]
sample_text

'BACKGROUND: Apparent diffusion coefficient (ADC) measurements are not incorporated in BI-RADS classification.\nPURPOSE: To assess the probability of malignancy of breast lesions at magnetic resonance mammography (MRM) at 3 T, by combining ADC measurements with the BI-RADS score, in order to improve the specificity of MRM.\nMATERIAL AND METHODS: A total of 296 biopsy-proven breast lesions were included in this prospective study. MRM was performed at 3 T, using a standard protocol with dynamic sequence (DCE-MRI) and an extra echo-planar diffusion-weighted sequence. A freehand region of interest was drawn inside the lesion, and ADC values were calculated. Each lesion was categorized according to the BI-RADS classification. Logistic regression analysis was employed to predict the probability of malignancy of a lesion. The model combined the BI-RADS classification and the ADC value. Sensitivity, specificity, positive predictive value, negative predictive value, and diagnostic accuracy were

In [27]:
#### Display Entities Function

In [28]:
def display_entities(model,document):
    """ A function that returns a tuple of displacy image of named or unnamed word entities and
        a set of unique entities recognized based on scispacy model in use
        Args: 
            model: A pretrained model from spaCy or ScispaCy
            document: text data to be analysed"""
    nlp = model.load()
    doc = nlp(document)
    displacy_image = displacy.render(doc, jupyter=True,style='ent')
    entity_and_label = set([(X.text, X.label_) for X in doc.ents])
    return  displacy_image, entity_and_label

#### Running en_ner_bc5dr_md Model

In [14]:
#!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz
bc5dr_entities = display_entities(en_ner_bc5cdr_md,sample_text)

In [15]:
bc5dr_entities_dataframe = pd.DataFrame(bc5dr_entities[1],columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
bc5dr_entities_dataframe['Ner_model'] = 'bc5dr'  #include a column with constant value of NER model
bc5dr_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,breast lesions,DISEASE,bc5dr
1,malignancy,DISEASE,bc5dr


#### Running en_ner_bionlp13cg_md Model

In [16]:
bionlp13cg_entities = display_entities(en_ner_bionlp13cg_md,sample_text)

In [17]:
bionlp13cg_entities_dataframe = pd.DataFrame(bionlp13cg_entities[1],columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
bionlp13cg_entities_dataframe['Ner_model'] = 'bionlp13cg'  #include a column with constant value of NER model
bionlp13cg_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,lesions,CANCER,bionlp13cg
1,biopsy-proven breast lesions,CANCER,bionlp13cg
2,breast lesions,CANCER,bionlp13cg
3,DCE-MRI,CANCER,bionlp13cg
4,extra,IMMATERIAL_ANATOMICAL_ENTITY,bionlp13cg
5,malignancy,CANCER,bionlp13cg
6,MRM,CANCER,bionlp13cg


#### Running en_ner_craft_md Model

In [18]:
craft_entities = display_entities(en_ner_craft_md,sample_text)

In [19]:
craft_entities_dataframe = pd.DataFrame(craft_entities[1],columns=['Entity','Label'])  #save returned values of entities and their labels in a pandas dataframe
craft_entities_dataframe['Ner_model'] = 'craft' #include a column with constant value of NER model
craft_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,echo-planar,SO,craft
1,sequence,SO,craft
2,breast,CL,craft


#### Running en_ner_jnlpba_md Model

In [20]:
jnlpba_entities = display_entities(en_ner_jnlpba_md,sample_text)

In [21]:
jnlpa_entities_dataframe = pd.DataFrame(jnlpba_entities[1],columns=['Entity','Label']) #save returned values of entities and their labels in a pandas dataframe
jnlpa_entities_dataframe['Ner_model'] = 'jnlpa' # include a column with constant value of NER model
jnlpa_entities_dataframe

Unnamed: 0,Entity,Label,Ner_model
0,DCE-MRI,PROTEIN,jnlpa
1,echo-planar diffusion-weighted sequence,DNA,jnlpa


#### Combining all entities Dataframe

In [22]:
entities_and_label_from_4_NER_model_dataframe = pd.concat([bc5dr_entities_dataframe,bionlp13cg_entities_dataframe,craft_entities_dataframe,jnlpa_entities_dataframe])
#Concatenate all pandas dataframe into one.
entities_and_label_from_4_NER_model_dataframe.to_csv('entities_and_label_from_4_scispacy_NER_models.csv', index=False) #Save dataframe to csv
entities_and_label_from_4_NER_model_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14 entries, 0 to 1
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Entity     14 non-null     object
 1   Label      14 non-null     object
 2   Ner_model  14 non-null     object
dtypes: object(3)
memory usage: 448.0+ bytes


In [25]:
#Load pre extracted dataset into pandas
entities_and_label_from_4_NER_model_dataframe = pd.read_csv('entities_and_label_from_4_scispacy_NER_models.csv')

In [26]:
#pd.options.display.max_colwidth = 1000      #increase display width of the pandas dataframe
entities_and_label_from_4_NER_model_dataframe.head() 

Unnamed: 0,Entity,Label,Ner_model
0,breast lesions,DISEASE,bc5dr
1,malignancy,DISEASE,bc5dr
2,lesions,CANCER,bionlp13cg
3,biopsy-proven breast lesions,CANCER,bionlp13cg
4,breast lesions,CANCER,bionlp13cg


### References:
#### 1. https://oyewusiwuraola.medium.com/how-to-use-scispacy-entity-linkers-for-biomedical-named-entities-7cf13b29ef67
#### 2. https://medium.com/@maheshdmahi/scispacy-for-bio-medical-named-entity-recognition-ner-63ed548f1df0
#### 3. https://spacy.io/
#### 4. Model Download https://allenai.github.io/scispacy/