<a href="https://colab.research.google.com/github/felipeodorcyk/Medium_Tutorials/blob/main/PubMedScrapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 5.1 MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79


In [2]:
from Bio import Entrez
import pandas as pd
import numpy as np
import random
# Fix the seed of the `random` module so that the random.sample() subsample
# taken further down selects the same IDs on every run over the same ID list.
random.seed(10)

In [24]:
#Using e-search API to find PubMed search results
def search(query, retmax='250000', email='example@email.com'):
    """Run an Entrez esearch query against the 'pubmed' database.

    Parameters
    ----------
    query : str
        Search term passed to esearch as ``term``.
    retmax : str or int, optional
        Maximum number of IDs requested from esearch (default '250000').
    email : str, optional
        Address stored in ``Entrez.email`` before the request is made.

    Returns
    -------
    The object produced by ``Entrez.read`` for the esearch response; callers
    in this notebook read its 'IdList' entry.
    """
    Entrez.email = email
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax=retmax,
                            retmode='xml',
                            term=query)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()

In [25]:
#Performing the search for the term 'COVID-19'
studies = search('COVID-19')
# Keep only the list of matching record IDs from the search result
studiesIdList = studies['IdList']
# Number of IDs returned by the search
len(studiesIdList)

215556

In [7]:
#Reducing the number of papers to 5000 for better computational performance
SAMPLE_SIZE = 5000
# random.sample raises ValueError when asked for more items than are available,
# so cap the sample size at the number of IDs the search actually returned.
# NOTE: this rebinds studiesIdList, so re-running the cell reshuffles the
# already-reduced list instead of sampling from the full search result again.
studiesIdList = random.sample(studiesIdList, min(SAMPLE_SIZE, len(studiesIdList)))
len(studiesIdList)

5000

In [8]:
#Using e-fetch to get details from the papers obtained from e-search
def fetch_details(id_list, email='example@email.com'):
    """Fetch the PubMed records for the given IDs with Entrez efetch.

    Parameters
    ----------
    id_list : iterable
        Record IDs; each one is converted with ``str`` and the results are
        joined with commas into the single ``id`` argument of efetch.
    email : str, optional
        Address stored in ``Entrez.email`` before the request is made.

    Returns
    -------
    The object produced by ``Entrez.read`` for the efetch response; callers
    in this notebook iterate over its 'PubmedArticle' entry.
    """
    # str() lets integer IDs through as well as the string IDs used so far.
    ids = ','.join(str(record_id) for record_id in id_list)
    Entrez.email = email
    handle = Entrez.efetch(db='pubmed',
                           retmode='xml',
                           id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying connection even when parsing fails.
        handle.close()

In [13]:
#Making a DF with article information

def _lookup(container, *path, default):
    """Follow `path` through nested mappings/sequences.

    Returns the value found at the end of the path, or `default` when any step
    is missing (KeyError / IndexError) or not subscriptable (TypeError).
    """
    node = container
    try:
        for step in path:
            node = node[step]
    except (KeyError, IndexError, TypeError):
        return default
    return node


title_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
pubdate_month_list = []

# The records are fetched only inside the chunk loop below. An additional
# up-front fetch_details(studiesIdList) call used to sit here; its result was
# never read, so it only repeated the whole download.

# Upper bound on the number of IDs sent to fetch_details in one request
chunk_size = 10000
for chunk_i in range(0, len(studiesIdList), chunk_size):
    chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
    papers = fetch_details(chunk)
    for paper in papers['PubmedArticle']:
        article = paper['MedlineCitation']['Article']
        # Title, journal and language are looked up directly: a record without
        # them raises instead of being filled with a placeholder.
        title_list.append(article['ArticleTitle'])
        # Only the first element of 'AbstractText' is kept.
        abstract_list.append(
            _lookup(article, 'Abstract', 'AbstractText', 0, default='No Abstract'))
        journal_list.append(article['Journal']['Title'])
        language_list.append(article['Language'][0])
        pubdate_year_list.append(
            _lookup(article, 'Journal', 'JournalIssue', 'PubDate', 'Year', default='No Data'))
        pubdate_month_list.append(
            _lookup(article, 'Journal', 'JournalIssue', 'PubDate', 'Month', default='No Data'))

df = pd.DataFrame({
    'Title': title_list,
    'Abstract': abstract_list,
    'Journal': journal_list,
    'Language': language_list,
    'Year': pubdate_year_list,
    'Month': pubdate_month_list,
})
df.shape

(4999, 6)

In [14]:
# Preview the first ten rows of the assembled article table
df.head(10)

Unnamed: 0,Title,Abstract,Journal,Language,Year,Month
0,The Coronavirus 2019 Pandemic and Diabetes: An...,No Abstract,Journal of diabetes science and technology,eng,2020,07
1,Epigenetic Lens to Visualize the Severe Acute ...,"In <20 years, we have witnessed three differen...",Frontiers in genetics,eng,2021,No Data
2,Hunting the main protease of SARS-CoV-2 by pli...,COVID-19 has shaken all the countries across t...,Amino acids,eng,2021,Nov
3,COVID-19 infection and diffusion among the hea...,Backgroud: Since the beginning of the coronavi...,La Medicina del lavoro,eng,2020,Jun
4,Helmet Continuous Positive Airway Pressure in ...,No Abstract,Journal of epidemiology and global health,eng,2020,Sep
5,"Effectiveness of mRNA-1273 against delta, mu, ...",To evaluate the effectiveness of the mRNA-1273...,BMJ (Clinical research ed.),eng,2021,12
6,Lessons of the month 1: Longitudinal extensive...,Longitudinal extensive transverse myelitis (LE...,"Clinical medicine (London, England)",eng,2021,Sep
7,COVID-19 and Myositis: What We Know So Far.,Myositis as a rare manifestation of COVID-19 i...,Current rheumatology reports,eng,2021,07
8,Linking COVID-19 and Parkinson's disease: Targ...,COVID-19 pandemic has a major effect on world ...,Biochemical and biophysical research communica...,eng,2021,Oct
9,The Impact of Coronavirus Disease 2019 on Bari...,The global outbreak of the 2019 novel coronavi...,"Obesity (Silver Spring, Md.)",eng,2020,06


In [16]:
#Standardizing months
# Month abbreviations become two-digit strings; the 'No Data' placeholder
# becomes NaN. Values that are not keys of this mapping are left as they are.
MONTH_REPLACEMENTS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
    'No Data': np.nan,
}
# Assign the result back to the column rather than calling
# df['Month'].replace(..., inplace=True): an in-place method on a column
# selection is a chained assignment, which pandas warns about and which leaves
# df unchanged when Copy-on-Write is enabled.
df['Month'] = df['Month'].replace(MONTH_REPLACEMENTS)

In [17]:
# Preview the first ten rows again, now that the Month column is standardized
df.head(10)

Unnamed: 0,Title,Abstract,Journal,Language,Year,Month
0,The Coronavirus 2019 Pandemic and Diabetes: An...,No Abstract,Journal of diabetes science and technology,eng,2020,7.0
1,Epigenetic Lens to Visualize the Severe Acute ...,"In <20 years, we have witnessed three differen...",Frontiers in genetics,eng,2021,
2,Hunting the main protease of SARS-CoV-2 by pli...,COVID-19 has shaken all the countries across t...,Amino acids,eng,2021,11.0
3,COVID-19 infection and diffusion among the hea...,Backgroud: Since the beginning of the coronavi...,La Medicina del lavoro,eng,2020,6.0
4,Helmet Continuous Positive Airway Pressure in ...,No Abstract,Journal of epidemiology and global health,eng,2020,9.0
5,"Effectiveness of mRNA-1273 against delta, mu, ...",To evaluate the effectiveness of the mRNA-1273...,BMJ (Clinical research ed.),eng,2021,12.0
6,Lessons of the month 1: Longitudinal extensive...,Longitudinal extensive transverse myelitis (LE...,"Clinical medicine (London, England)",eng,2021,9.0
7,COVID-19 and Myositis: What We Know So Far.,Myositis as a rare manifestation of COVID-19 i...,Current rheumatology reports,eng,2021,7.0
8,Linking COVID-19 and Parkinson's disease: Targ...,COVID-19 pandemic has a major effect on world ...,Biochemical and biophysical research communica...,eng,2021,10.0
9,The Impact of Coronavirus Disease 2019 on Bari...,The global outbreak of the 2019 novel coronavi...,"Obesity (Silver Spring, Md.)",eng,2020,6.0
