# PubMed Extracts - Fibromatosis

In [1]:
desmoid_keywords = ['APC','CTNNB1','alpha catenin',
                    'beta catenin','CD34','CD9','ETV6',
                    'NTRK3','MDM2','MTX','MUTYH','PDGFRB',
                    'S45F','tyrosine kinase inhibitor',
                    'Wnt','STAT','ETV6','NTRK(3)',
                    'IHC','NAB2','TFE3 beta catenin','TFE3','Cyclin D']

In [2]:
# import libraries
import pandas as pd
from tqdm import tqdm
from collections import deque, OrderedDict
import time
import gensim
from gensim.corpora import Dictionary
import spacy
import scispacy
import nltk
from nltk import ngrams, FreqDist
from corextopic import corextopic as ct
from negspacy.negation import Negex
import numpy as np
from scipy.spatial.distance import cosine
import scipy.sparse as ss

In [3]:
%%time
# I will use SciSpacy model to clean the text 
# and extract entity for topic modeling
#nlp_eng = spacy.load('en_core_web_lg')
nlp_sci = spacy.load('en_core_sci_lg')
#nlp_craft = spacy.load('en_ner_craft_md')
#nlp_jnlpba = spacy.load('en_ner_jnlpba_md')
nlp_bionlp = spacy.load('en_ner_bionlp13cg_md')
#nlp_bc5cdr = spacy.load('en_ner_bc5cdr_md')

CPU times: user 11.4 s, sys: 996 ms, total: 12.4 s
Wall time: 12.4 s


In [4]:
# The following file contains all the extracts from PubMed
# that has mentioned "fibromatosis"
FILE = 'fibromatosis_pubmed_extract.csv'

In [5]:
%%time
df = pd.read_csv(FILE)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22020 entries, 0 to 22019
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    22020 non-null  object
dtypes: object(1)
memory usage: 172.2+ KB
CPU times: user 97.1 ms, sys: 12.6 ms, total: 110 ms
Wall time: 108 ms


In [6]:
df.to_csv('fibromatosis_pubmed_extract.csv', index=False)

In [7]:
# Since abstract usually contains a bunch of words/phrases that are
# non-informative to the analysis.  The following list contains
# those words/phrases to remove before further analysis
word_to_remove = ['- ','ABSTRACT:','BACKGROUND ','CASE: ',
                  'CASE PRESENTATION:','CASE REPORT ',
                  'CASE SUMMARY:','CLINICAL QUESTION/LEVEL OF EVIDENCE: ',
                  'CONCLUSIONS:','CONCLUSIONS.-: ','CONCLUSIONS: - ','Conclusion: ',
                  'Conclusions: ','CONCLUSION:','DATA SOURCES.-: ','DATA SOURCES: - ',
                  'DIAGNOSES: ','DIAGNOSES AND OUTCOMES: ','DISCUSSION:',
                  'INTERPRETATION:','INTERVENTIONS: ','FUNDING: ','LESSONS: ',
                  'MATERIALS AND METHODS: ','METHODS:','METHODS: ','Methods:',
                  'METHOD:','OBJECTIVES:','OBJECTIVE:','OBJECTIVE AND METHOD:',
                  'OBJECTIVE.-: ','OBJECTIVE: - ','OUTCOMES: ','PATIENT CONCERNS: ',
                  'PRESENTATION OF CASE: ','RESULTS:','RESULT:',
                  'MATERIALS AND METHODS:', '(',')','MEDLINE', 'FINDINGS']

In [8]:
%%time
# clean the original abstract by removing the non-informative words/phrases
# I also remove the abstract that is too short to be useful
t = deque()
for i in range(len(df)):
    text = df.loc[i,'text']
    for word in word_to_remove:
        text = text.replace(word,'')
    if len(text.split(' '))>40:
        t.append(text)
print (len(t))

12330
CPU times: user 530 ms, sys: 7.67 ms, total: 538 ms
Wall time: 537 ms


In [22]:
# Helper functions
def extract_specific_tokens(nlp, paragraph): # using English common POS as starting point
    POS_to_remove = ['ADP','ADV','AUX','CONJ','SCONJ','SPACE','DET','INTJ','NUM','PRON','CCONJ','PUNCT','SYM','X','VERB','PART']
    doc = nlp(paragraph)
    pr = []
    for token in doc:
        if token.pos_ not in POS_to_remove:
            if '%' not in token.text:
                pr.append(token.text)
            else:
                pass
    return pr

def extract_keyword(text):
    start_pos = text.find('"')
    if text[start_pos+1:-2]=='':
        pass
    else:
        return text[start_pos+1:-1]

# Topic Modeling

In [10]:
%%time
# Set of language libraries from different domains 
# to clean up the text and extract entities
nlp_eng = spacy.load('en_core_web_lg')
nlps = [nlp_bionlp]
nlp_names  = ['nlp_bionlp']
label_to_remove = ['DISEASE','CANCER','MULTI_TISSUE_STRUCTURE','PATHOLOGICAL_FORMATION','ORGAN','TISSUE','ORGANISM_SUBDIVISION','CL','CELL_TYPE','CELL','SO','GO','CELLULAR_COMPONENT','ORGANISM_SUBSTANCE','TAXON','ORGANISM']

# Process to extract entities for topic analysis
doc_list = []
for paragraph in tqdm(t): 
    text = ' '.join(extract_specific_tokens(nlp_eng, paragraph)) # remove common words
    doc_list.append(text)

new_doc = []
for paragraph in tqdm(doc_list): 
    for nlp in nlps: # use different biomedical domain corpus to enrich the document informative content
        doc = nlp(paragraph)
        pr = [ent.text for ent in doc.ents if ent.label_ not in label_to_remove] # extract biomedical domain relevant entity
        new_doc.append(pr)

len(new_doc) # print out the total number of documents in the corpus

100%|██████████| 12330/12330 [03:52<00:00, 53.11it/s]
100%|██████████| 12330/12330 [02:28<00:00, 82.77it/s] 

CPU times: user 6min 19s, sys: 3.27 s, total: 6min 23s
Wall time: 6min 22s





12330

In [11]:
word = Dictionary(new_doc)
corpus = [word.doc2bow(doc) for doc in new_doc]

In [12]:
%%time
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=word,
                                           num_topics=10, 
                                           random_state=1,
                                           update_every=1,
                                           passes=200,
                                           alpha='auto',
                                           per_word_topics=True)

CPU times: user 2min 39s, sys: 266 ms, total: 2min 39s
Wall time: 2min 39s


In [13]:
# Below are 10 topics with keywords identified from PubMed abstracts
for topic in lda_model.print_topics(num_words=10):
    print (topic)
    print ()

(0, '0.085*"APC" + 0.066*"FAP" + 0.018*"alcohol" + 0.016*"p53" + 0.013*"PCNA" + 0.012*"adenomatous polyposis coli" + 0.011*"FAP patients" + 0.008*"intracytoplasmic" + 0.007*"PTEN" + 0.006*"ANTXR2"')

(1, '0.026*"imatinib" + 0.022*"Ki-67" + 0.016*"cytokeratin" + 0.016*"elastin" + 0.014*"S100" + 0.013*"type III collagen" + 0.012*"keratin" + 0.009*"WT1" + 0.009*"HRPT2" + 0.009*"KCNH1"')

(2, '0.022*"acrylamide" + 0.019*"actin" + 0.014*"progesterone" + 0.013*"insulin" + 0.010*"MDM2" + 0.010*"Arm" + 0.009*"EGF" + 0.007*"collagen fibrils" + 0.007*"oxygen" + 0.007*"CD99"')

(3, '0.069*"CD34" + 0.021*"CCH" + 0.016*"amino acid" + 0.008*"collagenase clostridium" + 0.008*"BRAF" + 0.008*"GnRH" + 0.007*"sulindac" + 0.006*"NSAIDs" + 0.006*"BMP" + 0.006*"tendon"')

(4, '0.180*"collagen" + 0.031*"S-100" + 0.011*"intravenous" + 0.009*"calcium" + 0.008*"estrogen" + 0.007*"TNF" + 0.006*"c myc" + 0.005*"VEGF" + 0.005*"MMP-2" + 0.005*"GCF"')

(5, '0.062*"electron" + 0.044*"vimentin" + 0.036*"tamoxifen" + 0

In [23]:
%%time
# Extract keywords for further analysis
keyword_lst = []
for topic in tqdm(lda_model.print_topics(num_words=20)):
    index, formula = topic
    components = formula.split(" + ")
    for component in components:
        keyword = extract_keyword(component)
        keyword_lst.append(keyword)
len(list(set(keyword_lst)))

100%|██████████| 10/10 [00:00<00:00, 42755.39it/s]

CPU times: user 3 ms, sys: 1.25 ms, total: 4.25 ms
Wall time: 3.16 ms





200

In [65]:
# Extract gene keys/phrases
shortlisted = [key for key in keyword_lst if (len(key)>2 and len(key)<7)]

In [66]:
len(shortlisted), shortlisted

(91,
 ['APC',
  'FAP',
  'p53',
  'PCNA',
  'PTEN',
  'ANTXR2',
  'TSC',
  'SATB2',
  'COX-2',
  'AKT2',
  'XIIIa',
  'CA125',
  'Ki-67',
  'S100',
  'WT1',
  'HRPT2',
  'KCNH1',
  'KIT',
  'STAT6',
  'MET',
  'actin',
  'MDM2',
  'Arm',
  'EGF',
  'oxygen',
  'CD99',
  'PHF1',
  'NTRK3',
  'FOXL2',
  'Feed',
  'CD34',
  'CCH',
  'BRAF',
  'GnRH',
  'NSAIDs',
  'BMP',
  'tendon',
  'CD10',
  'S-100',
  'TNF',
  'c myc',
  'VEGF',
  'MMP-2',
  'GCF',
  'MMP',
  'CPF',
  'CMG2',
  'desmin',
  'Ki67',
  'p63',
  'DMBA',
  'CDC73',
  'IGF II',
  'Fine',
  'EGFR',
  'needle',
  'IGF',
  'PTCH',
  'PTCH1',
  'H3F3A',
  'GLI1',
  'optic',
  'RHAMM',
  'RANKL',
  'CFs',
  'FNAC',
  'PDGFRB',
  'PIP',
  'FTS',
  'ALK',
  'bFGF',
  'MACF',
  'PNA',
  'CTNNB1',
  'Wnt',
  'FDG',
  'CO2',
  'CA-125',
  'FNA',
  'ATP',
  'EED',
  'MUC4',
  'HGF',
  'CMF',
  'USP6',
  'SOS1',
  'NF1',
  'PDGFRA',
  'c kit',
  'hand',
  'INI-1'])

In [67]:
shortlisted.remove('Fine')
shortlisted.remove('Feed')
shortlisted.remove('optic')
shortlisted.remove('hand')
shortlisted.remove('Arm')
shortlisted.remove('oxygen')
shortlisted.remove('needle')
shortlisted.remove('tendon')
shortlisted.remove('FAP')

In [68]:
len(shortlisted), shortlisted

(82,
 ['APC',
  'p53',
  'PCNA',
  'PTEN',
  'ANTXR2',
  'TSC',
  'SATB2',
  'COX-2',
  'AKT2',
  'XIIIa',
  'CA125',
  'Ki-67',
  'S100',
  'WT1',
  'HRPT2',
  'KCNH1',
  'KIT',
  'STAT6',
  'MET',
  'actin',
  'MDM2',
  'EGF',
  'CD99',
  'PHF1',
  'NTRK3',
  'FOXL2',
  'CD34',
  'CCH',
  'BRAF',
  'GnRH',
  'NSAIDs',
  'BMP',
  'CD10',
  'S-100',
  'TNF',
  'c myc',
  'VEGF',
  'MMP-2',
  'GCF',
  'MMP',
  'CPF',
  'CMG2',
  'desmin',
  'Ki67',
  'p63',
  'DMBA',
  'CDC73',
  'IGF II',
  'EGFR',
  'IGF',
  'PTCH',
  'PTCH1',
  'H3F3A',
  'GLI1',
  'RHAMM',
  'RANKL',
  'CFs',
  'FNAC',
  'PDGFRB',
  'PIP',
  'FTS',
  'ALK',
  'bFGF',
  'MACF',
  'PNA',
  'CTNNB1',
  'Wnt',
  'FDG',
  'CO2',
  'CA-125',
  'FNA',
  'ATP',
  'EED',
  'MUC4',
  'HGF',
  'CMF',
  'USP6',
  'SOS1',
  'NF1',
  'PDGFRA',
  'c kit',
  'INI-1'])

In [69]:
df1 = pd.DataFrame(list(set(shortlisted)), columns=['keyword'])
df1.to_csv('fibromatosis_pubmed_keywords.csv', index=False)