# Scraper

Collect publications from Pubmed for a given keyword within a time window and save data in CSV format

In [1]:
import pandas as pd
import os

from Bio import Entrez, Medline

keyword = 'HIV'
date_range = ('2020/01/01', '2020/08/30')

#keyword = 'medulloblastoma'
#date_range = ('2020/01/01', '2020/12/30')

email = 'djh.shih@gmail.com'
db_dir = 'db'
retmax = 1000
csv_fname = os.path.join(db_dir, keyword) + '.csv'

In [2]:
if not os.path.exists(db_dir):
    os.makedirs(db_dir)

Get Pubmed ID list

In [3]:
Entrez.email = email
handle = Entrez.esearch(db='pubmed', term=keyword,
                        mindate=date_range[0], maxdate=date_range[1],
                        retmax=retmax)
record = Entrez.read(handle)
handle.close()
pids = record['IdList']
print(len(pids))

1000


Retrieve MEDLINE abstracts and save to file

In [4]:
handle = Entrez.efetch(db='pubmed', id=pids, rettype='medline',
                       retmode='text')
records = Medline.parse(handle)
df = pd.DataFrame.from_dict(records)
df.to_csv(csv_fname)

In [5]:
df.columns

Index(['PMID', 'OWN', 'STAT', 'LR', 'IS', 'VI', 'DP', 'TI', 'PG', 'LID', 'AB',
       'CI', 'FAU', 'AU', 'AD', 'LA', 'PT', 'DEP', 'PL', 'TA', 'JT', 'JID',
       'SB', 'OTO', 'OT', 'COIS', 'EDAT', 'MHDA', 'CRDT', 'PHST', 'AID', 'PST',
       'SO', 'DCOM', 'MH', 'IP', 'RN', 'PMC', 'GR', 'CN', 'CIN', 'IR', 'FIR',
       'AUID', 'MID', 'PMCR', 'TT', 'SI', 'EIN', 'CON', 'UOF', 'EFR', 'OAB',
       'OABL', 'UIN', 'DA', 'CTDT', 'PB', 'BTI', 'CDAT', 'CTI'],
      dtype='object')

In [6]:
df['PMID'].count()

1000