In [4]:
import pandas as pd
from pymed import PubMed

In [34]:
%%time

# Create a PubMed client (pymed) used to query the PubMed API.
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
# The client gets its own name so that `pubmed` refers only to the resulting
# DataFrame and this cell can be re-run without clobbering the client.
pubmed_client = PubMed(tool="MyTool", email="my@email.address")

# PubMed search query in plain text: the [dp] tag restricts the date range.
query = '(2020/01/15:2021/06/15[dp])'

# Execute the query against the API (capped at 10000 results).
results = pubmed_client.query(query, max_results=10000)

# Column order of the resulting frame; also used when no article qualifies.
article_columns = ["article_id", "title", "publication_date", "abstract"]

# Loop over the retrieved articles and keep those that have an abstract.
records = []
for counter, article in enumerate(results, start=1):
    # Progress indicator: print the running total every 1000 articles.
    if counter % 1000 == 0:
        print(counter)

    abstract = article.abstract
    if abstract is None:
        continue

    # NOTE(review): pymed is understood to join every PubMed id found in the
    # record (including cited references) with newlines, the article's own id
    # first. Keep only that first id — confirm against the installed pymed.
    raw_id = article.pubmed_id
    article_id = raw_id.split("\n", 1)[0] if raw_id else raw_id

    records.append({
        "article_id": article_id,
        "title": article.title,
        "publication_date": article.publication_date,
        "abstract": abstract,
    })

# Build the frame once from the collected records; passing the columns
# explicitly keeps the schema even when `records` is empty.
pubmed = pd.DataFrame(records, columns=article_columns)

1000
2000
3000
4000
5000
6000
7000
8000
9000
CPU times: user 9.46 s, sys: 192 ms, total: 9.65 s
Wall time: 1min 21s


In [35]:
# Display the article DataFrame built in the previous cell.
pubmed

Unnamed: 0,article_id,title,publication_date,abstract
0,34130369,Stress Fracture of the Femoral Shaft in Paget'...,2021-06-16,Paget's disease of bone (PDB) is a progressive...
1,34130368,No Detectable Alteration of Inorganic Allogene...,2021-06-16,"During major bone substance loss, secured allo..."
2,34130367,Bone Loss and Radiographic Damage Profile in R...,2021-06-16,Rheumatoid arthritis (RA) is a known cause of ...
3,34130366,A Network Meta-Analysis Comparing Osteoporotic...,2021-06-16,There are limited studies comparing the risk o...
4,34130365,Comparison of Bone Mineral Density and Markers...,2021-06-16,"In a randomized controlled trial, we compared ..."
...,...,...,...,...
8960,34119913,Association between mesenteric panniculitis an...,2021-06-14,To assess the relationship between MP and coex...
8961,34119912,Potential therapeutic benefits of cannabinoid ...,2021-06-14,The utility of cannabinoids and cannabinoid-ba...
8962,34119911,"Mood during pregnancy: Trends, structure, and ...",2021-06-14,Mood dynamics during pregnancy are important i...
8963,34119910,Disturbed craving regulation to gaming cues in...,2021-06-14,The ability to control craving for games is ve...


In [41]:
import re

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import Word

# Fetch the NLTK resources this notebook relies on (tokenizer model, stop-word
# lists, WordNet). Only needs to succeed once per machine.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests.
english_stop_list = stopwords.words('english')
stop_words = set(english_stop_list)

[nltk_data] Downloading package punkt to /home/dglubokov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dglubokov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/dglubokov/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [45]:
%%time
def preprocess_text(text):
    """Turn raw text into a list of lowercase, lemmatized content tokens.

    Processing steps, in order:

    1. Every run of characters other than ASCII letters is replaced by a
       single space. This drops digits and punctuation; it is NOT an HTML
       stripper — the letters of a tag name survive as an ordinary word.
    2. The text is lowercased and split with NLTK's ``word_tokenize``.
    3. Tokens present in the module-level ``stop_words`` set are dropped.
    4. Remaining tokens are lemmatized with ``WordNetLemmatizer`` (called
       without a part-of-speech tag).
    5. Lemmas shorter than three characters are dropped.

    Stop words are removed *before* lemmatization, so a lemma that happens
    to equal a stop word is kept as long as it has three or more letters.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. ``None`` or a pandas NaN from a
        missing cell) yields an empty list instead of raising inside ``re``.

    Returns
    -------
    list of str
        The surviving lemmas, in their original order.
    """
    if not isinstance(text, str):
        return []

    # One pass is enough: a run of non-letters (spaces included) -> one space.
    letters_only = re.sub('[^a-zA-Z]+', ' ', text)
    tokens = word_tokenize(letters_only.lower())

    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in tokens if token not in stop_words)
    return [lemma for lemma in lemmas if len(lemma) >= 3]

# Run every abstract through preprocess_text and keep the token lists in a new column.
pubmed['tokenized'] = pubmed['abstract'].map(preprocess_text)
# Preview the tokens of the row labelled 0 as a sanity check.
pubmed.loc[0, 'tokenized']

CPU times: user 9.23 s, sys: 23.9 ms, total: 9.25 s
Wall time: 9.25 s


['paget',
 'disease',
 'bone',
 'pdb',
 'progressive',
 'bone',
 'disorder',
 'characterized',
 'increased',
 'osteoclast',
 'mediated',
 'bone',
 'resorption',
 'abnormal',
 'bone',
 'formation',
 'incomplete',
 'atypical',
 'femoral',
 'fracture',
 'appearing',
 'radiographically',
 'stress',
 'fracture',
 'lateral',
 'aspect',
 'femur',
 'uncommon',
 'low',
 'trauma',
 'fracture',
 'frequently',
 'seen',
 'association',
 'long',
 'term',
 'bisphosphonate',
 'therapy',
 'describe',
 'case',
 'year',
 'old',
 'female',
 'patient',
 'pdb',
 'developed',
 'stress',
 'fracture',
 'lateral',
 'femoral',
 'cortex',
 'dos',
 'intravenous',
 'bisphosphonate',
 'conservative',
 'treatment',
 'plan',
 'included',
 'discontinuation',
 'bisphosphonate',
 'continuation',
 'calcium',
 'vitamin',
 'supplementation',
 'limited',
 'weight',
 'bearing',
 'month',
 'patient',
 'pain',
 'level',
 'gradually',
 'improved',
 'switching',
 'new',
 'treatment',
 'plan',
 'latest',
 'follow',
 'approximately

In [51]:
from collections import defaultdict
from collections import Counter

# Count how often each token occurs across all tokenized abstracts.
frequency = Counter(
    token for tokens in pubmed['tokenized'] for token in tokens
)

# most_common() lists (word, count) pairs by descending count; words with
# equal counts stay in first-seen order. Explicit column names keep the
# header in the output file even if there are no tokens at all.
df_freqs = pd.DataFrame(frequency.most_common(), columns=['word', 'frequency'])
df_freqs.to_csv('pubmed_freqs.tsv', sep='\t', index=False)

In [66]:
# Load the tab-separated company corpus with explicit column names, discard
# the first two parsed rows, and renumber the remaining rows from 0.
# NOTE(review): the two dropped rows are presumably header/preamble lines —
# confirm against companies.tsv.
companies = (
    pd.read_csv("companies.tsv", sep='\t', names=['company', 'text'], index_col=None)
    .iloc[2:]
    .reset_index(drop=True)
)
companies

Unnamed: 0,company,text
0,Merck,Merck Pipeline Q1 2021 Reflecting Pipeline to ...
1,Novartis,Novartis Hematology: Relentless Pursuit of a C...
2,Johnson & Johnson,Consumer Health | Medical Devices | Pharmaceut...
3,Bristol-Myers Squibb,The safety and efficacy of investigational age...
4,Abbott Laboratories,CANCER Our combination of comprehensive tumor ...
5,AstraZeneca,"AstraZeneca care with our portfolio, our pipel..."
6,Pfizer,Pfizer cancer biology and to translate this k...
7,GlaxoSmithKline,We invest in scientific and technical excellen...
8,Lilly,Ely lily Care Beyond Treatment At Lilly Onco...
9,Sanofi,Sanofi Sanofi Genzyme recognizes the incredib...


In [68]:
# Run every company text through preprocess_text and keep the token lists in a new column.
companies['tokenized'] = companies['text'].map(preprocess_text)
# Preview the tokens of the row labelled 0 as a sanity check.
companies.loc[0, 'tokenized']

['merck',
 'pipeline',
 'reflecting',
 'pipeline',
 'may',
 'lead',
 'language',
 'chart',
 'reflects',
 'company',
 'research',
 'pipeline',
 'may',
 'candidate',
 'shown',
 'phase',
 'include',
 'specific',
 'product',
 'date',
 'candidate',
 'entered',
 'phase',
 'development',
 'candidate',
 'shown',
 'phase',
 'include',
 'advanced',
 'compound',
 'specific',
 'mechanism',
 'listed',
 'compound',
 'mechanism',
 'currently',
 'intended',
 'commercialization',
 'given',
 'therapeutic',
 'area',
 'small',
 'molecule',
 'biologics',
 'given',
 'number',
 'designation',
 'vaccine',
 'candidate',
 'given',
 'number',
 'designation',
 'except',
 'otherwise',
 'noted',
 'candidate',
 'phase',
 'additional',
 'indication',
 'therapeutic',
 'area',
 'respect',
 'cancer',
 'certain',
 'indication',
 'additional',
 'claim',
 'line',
 'extension',
 'formulation',
 'line',
 'product',
 'shown',
 'merck',
 'pipeline',
 'may',
 'developed',
 'collaboration',
 'developed',
 'combination',
 'keytru

In [69]:
from collections import defaultdict
from collections import Counter

# Count how often each token occurs across all tokenized company texts.
frequency = Counter(
    token for tokens in companies['tokenized'] for token in tokens
)

# most_common() lists (word, count) pairs by descending count; words with
# equal counts stay in first-seen order. Explicit column names keep the
# header in the output file even if there are no tokens at all.
df_freqs = pd.DataFrame(frequency.most_common(), columns=['word', 'frequency'])
df_freqs.to_csv('companies_freqs.tsv', sep='\t', index=False)

In [70]:
# Display the word-frequency table that the previous cell wrote to companies_freqs.tsv.
df_freqs

Unnamed: 0,word,frequency
0,cancer,270
1,patient,184
2,company,122
3,treatment,115
4,johnson,114
...,...,...
3310,nanobody,1
3311,synthorins,1
3312,laying,1
3313,conquer,1
