# Setup

In [1]:
from Bio import Entrez
import pandas as pd
import json # for pretty printing

In [2]:
Entrez.email = "g.carbone8@campus.unimib.it"

# PubMed Articles IDs Search and Retrieval

## Batch Search Idea with for loop and 10000+ search entries (not working)

In [None]:
search_record = {}
counter = 0
NUM_ARTICLES = 9999
ret_start = 0
lenght_batch = 1
while (lenght_batch != 0):
    handle = Entrez.esearch(db = "pubmed", retmax = NUM_ARTICLES, retstart = ret_start, 
                            term = '"toxoplasma+gondii"[All]', idtype = "acc")
    search_record[counter] = Entrez.read(handle)
    print(counter)
    lenght_batch = len(search_record[counter]["IdList"])
    counter += 1
    ret_start += NUM_ARTICLES
    handle.close()

## Batch Articles IDs Search per Data Range

For PubMed, ESearch can only retrieve the first 10,000 records matching the query. To obtain more than 10,000 PubMed records, consider using <EDirect> that contains additional logic to batch PubMed search results automatically so that an arbitrary number can be retrieved.

To avoid the maximum number of articles retrieved limitation, i decided to search them in batches of maximum 9999 articles, using a date range as limiter.

Fetching the most recent 9934 articles published from **2009** to **2023**:

In [13]:
NUM_ARTICLES = 10000
search_record = {}
handle = Entrez.esearch(db = "pubmed", 
                        retmax = NUM_ARTICLES, 
                        retstart = 0, 
                        term = '(("toxoplasma gondii"[Title/Abstract]) AND ("2009/01/01"[Date - Publication] : "2023/03/22"[Date - Publication]))', 
                        idtype = "acc",
                        retmode = "xml")
search_record[">2009"] = Entrez.read(handle)
handle.close()

In [18]:
print(search_record[">2009"].keys())
print(search_record[">2009"]["QueryTranslation"])
print(len(search_record[">2009"]["IdList"]))

"toxoplasma gondii"[Title/Abstract] AND 2009/01/01:2023/03/22[Date - Publication]
9934


Fetching older 7770 articles published **before 2009**:

In [16]:
NUM_ARTICLES = 10000
handle = Entrez.esearch(db = "pubmed", 
                        retmax = NUM_ARTICLES, 
                        retstart = 0, 
                        term = '(("toxoplasma gondii"[Title/Abstract]) AND ("1900/01/01"[Date - Publication] : "2008/12/31"[Date - Publication]))', 
                        idtype = "acc",
                        retmode = "xml")
search_record["<2009"] = Entrez.read(handle)
handle.close()

In [24]:
print(search_record["<2009"].keys())
print(search_record["<2009"]["WarningList"])
print(len(search_record["<2009"]["IdList"]))

{'QuotedPhraseNotFound': [], 'OutputMessage': ['Restrictions achieved. start and count adjusted to 0, 9999'], 'PhraseIgnored': []}
7770


## Merging the batches and Remove Duplicates

Now I'll merge the IDs Batches obtaining a single list of 10000+ IDs.

In [26]:
search_record_full = list(search_record[">2009"]["IdList"]) + list(search_record["<2009"]["IdList"])

Number of articles to fetch

In [None]:
# Number of articles
print(len(search_record_full))
# Number of unique articles
print(len(set(search_record_full)))

Removing the duplicates

In [44]:
search_record_full = list(set(search_record_full))

# Fetch Bibliographic Information from retrieved IDs

Also `Entrez.efetch` has a maximum number of 'IDs' which can be retrieved, it is important to slice the list of IDs during fetching.

Fetch detailed information about every article in the list

In [45]:
# Retrieve only `top_n` most recent articles
records = {}
half = len(search_record_full) // 2
handle = Entrez.efetch(db = "pubmed", 
                       # Convert list to string with comma as separator 
                       # to pass to Entrez.efetch
                       id = ','.join(search_record_full[:half]),
                       retmode = "xml")
records['1stHalf'] = Entrez.read(handle)
handle.close()
handle = Entrez.efetch(db = "pubmed", 
                       # Convert list to string with comma as separator 
                       # to pass to Entrez.efetch
                       id = ','.join(search_record_full[half:]),
                       retmode = "xml")
records['2ndHalf'] = Entrez.read(handle)
handle.close()

Record dictionary details:

In [49]:
# First Half
print(records['1stHalf'].keys())

print(len(records['1stHalf']["PubmedArticle"]))
print(type(records['1stHalf']["PubmedArticle"]))
print(type(records['1stHalf']["PubmedArticle"][1]))

# Second Half
print(len(records['2ndHalf']["PubmedArticle"]))
print(type(records['2ndHalf']["PubmedArticle"]))
print(type(records['2ndHalf']["PubmedArticle"][1]))

dict_keys(['PubmedArticle', 'PubmedBookArticle'])
8814
<class 'list'>
<class 'Bio.Entrez.Parser.DictionaryElement'>
8815
<class 'list'>
<class 'Bio.Entrez.Parser.DictionaryElement'>


Merging the two records dictionaries in a single one:

In [51]:
full_records = {}
full_records['PubmedArticle'] = records['1stHalf']["PubmedArticle"] + records['2ndHalf']["PubmedArticle"]
full_records['PubmedBookArticle'] = records['1stHalf']["PubmedBookArticle"] + records['2ndHalf']["PubmedBookArticle"]

In [52]:
print(len(full_records["PubmedArticle"]))
print(type(full_records["PubmedArticle"]))
print(type(full_records["PubmedArticle"][1]))

17629
<class 'list'>
<class 'Bio.Entrez.Parser.DictionaryElement'>


In [53]:
print(json.dumps(full_records['PubmedArticle'][1], indent = 2))

{
  "MedlineCitation": {
    "OtherID": [],
    "OtherAbstract": [],
    "KeywordList": [],
    "GeneralNote": [],
    "CitationSubset": [
      "IM"
    ],
    "SpaceFlightMission": [],
    "PMID": "12197139",
    "DateCompleted": {
      "Year": "2002",
      "Month": "09",
      "Day": "17"
    },
    "DateRevised": {
      "Year": "2006",
      "Month": "11",
      "Day": "15"
    },
    "Article": {
      "ArticleDate": [],
      "ELocationID": [],
      "Language": [
        "eng"
      ],
      "Journal": {
        "ISSN": "0022-3395",
        "JournalIssue": {
          "Volume": "88",
          "Issue": "4",
          "PubDate": {
            "Year": "2002",
            "Month": "Aug"
          }
        },
        "Title": "The Journal of parasitology",
        "ISOAbbreviation": "J Parasitol"
      },
      "ArticleTitle": "Development and evaluation of an enzyme-linked immunosorbent assay with recombinant SAG2 for diagnosis of Toxoplasma gondii infection in cats.",
      "P

# Parsing retrieved articles and build a dataframe

## JSON Parsing

We create multiple dataframes containing information about:
-  Retrieved Article
-  ~~Journal that published the articles~~

First of all we'll extract some basic information about the retrieved articles:
- PubMed ID
- Title
- Abstract Text
- Dates
  - Completed Date
  - Revised Date
  - Published Year
- Language
- Authors List
- PublicationTypeList
- Journal Country

In [None]:
# Create empty dictionary to store pubmed_id its info
id_article = {}
counter = 0
for record in full_records['PubmedArticle']:
    print(counter)
    counter += 1
    # Root element to access all data about a given article
    root = record['MedlineCitation']['Article']
    # Get pubmed id to be used for dictionary key access
    uid = record['MedlineCitation']['PMID']
    print(uid)
    # Create empty dictionary for each given article id
    id_article[uid] = {}
    # Title
    try:
        id_article[uid]['Title'] = root['ArticleTitle']
    except:
        id_article[uid]['Title'] = None
    # Country
    try:
        id_article[uid]['Journal Country'] = record['MedlineCitation']['MedlineJournalInfo']['Country']
    except:
        id_article[uid]['Journal Country'] = None
    # Abstract text different sections like results, conclusions, 
    # that are in separate elements of the list are joined together
    try:
        id_article[uid]['Abstract'] = "\t".join(root['Abstract']['AbstractText']) if 'Abstract' in root.keys() else None
    except:
        id_article[uid]['Abstract'] = None
    # Date (submission?) as MM/DD/YYYY
    try: 
        id_article[uid]['ArticleDate'] = "/".join(root['ArticleDate'][0].values()) if len(root['ArticleDate']) != 0 else None 
    except:
        id_article[uid]['ArticleDate'] = None
    # Completed date as MM/DD/YYYY
    try: 
        id_article[uid]['CompletedDate'] = "/".join(record['MedlineCitation']['DateCompleted'].values()) if len(record['MedlineCitation']['DateCompleted']) != 0 else None
    except:
        id_article[uid]['CompletedDate'] = None
    # Revised date as MM/DD/YYYY
    try:
        id_article[uid]['RevisedDate'] = "/".join(record['MedlineCitation']['DateRevised'].values()) if len(record['MedlineCitation']['DateRevised']) != 0 else None
    except:
        id_article[uid]['RevisedDate'] = None
    # Publication year
    try:
        id_article[uid]['PublicationYear'] = record['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']["Year"]
    except:
        id_article[uid]['PublicationYear'] = None
    # Language(s)
    try:
        id_article[uid]['Language'] = " | ".join(root['Language'])
    except:
        id_article[uid]['Language'] = None
    # Authors list
    try:
        id_article[uid]["AuthorList"] = " | ".join([i['LastName'] + ", " + i['Initials']\
                                                for i in root['AuthorList'] if 'LastName' and 'Initials' in i.keys()]) if 'AuthorList' in root.keys() else None
    except:
        id_article[uid]["AuthorList"] = None
    # List of publication types
    try: 
        id_article[uid]['PublicationTypeList'] = " | ".join(root['PublicationTypeList'])
    except:
        id_article[uid]['PublicationTypeList'] = None
        

Example of the parsed data:

In [63]:
print(json.dumps(id_article["12197139"], indent = 2))

{
  "Title": "Development and evaluation of an enzyme-linked immunosorbent assay with recombinant SAG2 for diagnosis of Toxoplasma gondii infection in cats.",
  "Journal Country": "United States",
  "Abstract": "Cats are pivotal in the transmission of Toxoplasma gondii. To develop a sensitive and specific serodiagnostic method for feline toxoplasmosis, surface antigen 2 (SAG2) of T. gondii was expressed in Escherichia coli and its diagnostic potential evaluated in an enzyme-linked immunosorbent assay (ELISA). The ELISA with recombinant SAG2 (rSAG2) was able to differentiate very clearly between sera from cats experimentally infected with T. gondii and sera from normal cats. Serum samples collected from domestic cats in Japan were investigated by the ELISA, and the results were compared with those of a commercially available latex agglutination test (LAT) kit. Of the 192 samples screened, 42 (21.9%) were positive by ELISA. Among the 42 ELISA-positive samples, 39 were positive by LAT. Th

Create the dataframe

In [64]:
id_article_df = pd.DataFrame.from_dict(id_article, orient = "index")
id_article_df.head()

Unnamed: 0,Title,Journal Country,Abstract,ArticleDate,CompletedDate,RevisedDate,PublicationYear,Language,AuthorList,PublicationTypeList
22946380,Granulomatous meningoencephalitis due to toxop...,United States,This report describes toxoplasmosis infection ...,,2012/09/25,2012/09/05,2011,eng,"Antoniassi, NA | Boabaid, FM | Souza, RL | Nak...",Case Reports | Journal Article
12197139,Development and evaluation of an enzyme-linked...,United States,Cats are pivotal in the transmission of Toxopl...,,2002/09/17,2006/11/15,2002,eng,"Huang, X | Xuan, X | Kimbita, EN | Battur, B |...",Comparative Study | Journal Article | Research...
19578625,Neurotoxoplasmosis diagnosis for HIV-1 patient...,Brazil,Encephalitis caused by Toxoplasma gondii is th...,,2009/08/27,2019/10/27,2009,eng,"Nogui, FL | Mattas, S | Turcato Júnior, G | Le...","Journal Article | Research Support, Non-U.S. G..."
18703054,Neospora caninum: detection in wild rabbits an...,United States,Neospora caninum is an important pathogen of c...,2008/07/30,2008/10/24,2022/04/08,2008,eng,"Hughes, JM | Thomasson, D | Craig, PS | Georgi...","Journal Article | Research Support, Non-U.S. G..."
29469258,[Investigation on <i>Toxoplasma gondii</i> inf...,China,To investigate the prevalence of <i>Toxoplasma...,,2018/07/12,2018/12/02,2016,chi,"Yun, F | Hui-Fang, L | Min-Yuan, S",Journal Article


## Abstract Text Cleaning

In [66]:
import re 

In [65]:
# Copy abstract to new column
id_article_df['PreprocessedAbstract'] = id_article_df['Abstract']

Convert all None elements to wmpty strings (to be able to use re functions)

In [71]:
id_article_df['PreprocessedAbstract'] = id_article_df['PreprocessedAbstract'].fillna('')

Remove Puntuaction

In [79]:
def remove_punctuation(text):
    return re.sub('[^\w\s]', '', text)
id_article_df['PreprocessedAbstract'] = id_article_df['PreprocessedAbstract'].map(remove_punctuation)

Lower the text

In [81]:
def to_lower(text):
    return text.lower()
id_article_df['PreprocessedAbstract'] = id_article_df['PreprocessedAbstract'].map(to_lower)

In [82]:
id_article_df['PreprocessedAbstract'][100]

'apicomplexans are successful parasites responsible for severe human diseases including malaria toxoplasmosis and cryptosporidiosis for many years it has been discussed whether these parasites are in possession of peroxisomes highly variable eukaryotic organelles usually involved in fatty acid degradation and cellular detoxification conflicting experimental data has been published with the age of genomics ever more high quality apicomplexan genomes have become available that now allow a new assessment of the dispute here we provide bioinformatic evidence for the presence of peroxisomes in toxoplasma gondii and other coccidians for these organisms we have identified a complete set of peroxins probably responsible for peroxisome biogenesis division and protein import moreover via a global screening for peroxisomal targeting signals we were able to show that a complete set of fatty acid βoxidation enzymes is equipped with either pts1 or pts2 sequences most likely mediating transport of th

## Convert Authors and PublicationTypeList in lists

In [83]:
# Definizione della funzione per trasformare una stringa in una lista di stringhe utilizzando il separatore '|'
def string_to_list(text):
    return text.split('|')

# Applicazione della funzione alla colonna 'my_column' tramite apply()
id_article_df['AuthorList'] = id_article_df['AuthorList'].fillna('')
id_article_df['AuthorList'] = id_article_df['AuthorList'].apply(string_to_list)

id_article_df['PublicationTypeList'] = id_article_df['PublicationTypeList'].fillna('')
id_article_df['PublicationTypeList'] = id_article_df['PublicationTypeList'].apply(string_to_list)

In [84]:
id_article_df.head()

Unnamed: 0,Title,Journal Country,Abstract,ArticleDate,CompletedDate,RevisedDate,PublicationYear,Language,AuthorList,PublicationTypeList,PreprocessedAbstract
22946380,Granulomatous meningoencephalitis due to toxop...,United States,This report describes toxoplasmosis infection ...,,2012/09/25,2012/09/05,2011,eng,"[Antoniassi, NA , Boabaid, FM , Souza, RL , ...","[Case Reports , Journal Article]",this report describes toxoplasmosis infection ...
12197139,Development and evaluation of an enzyme-linked...,United States,Cats are pivotal in the transmission of Toxopl...,,2002/09/17,2006/11/15,2002,eng,"[Huang, X , Xuan, X , Kimbita, EN , Battur,...","[Comparative Study , Journal Article , Resea...",cats are pivotal in the transmission of toxopl...
19578625,Neurotoxoplasmosis diagnosis for HIV-1 patient...,Brazil,Encephalitis caused by Toxoplasma gondii is th...,,2009/08/27,2019/10/27,2009,eng,"[Nogui, FL , Mattas, S , Turcato Júnior, G ,...","[Journal Article , Research Support, Non-U.S....",encephalitis caused by toxoplasma gondii is th...
18703054,Neospora caninum: detection in wild rabbits an...,United States,Neospora caninum is an important pathogen of c...,2008/07/30,2008/10/24,2022/04/08,2008,eng,"[Hughes, JM , Thomasson, D , Craig, PS , Ge...","[Journal Article , Research Support, Non-U.S....",neospora caninum is an important pathogen of c...
29469258,[Investigation on <i>Toxoplasma gondii</i> inf...,China,To investigate the prevalence of <i>Toxoplasma...,,2018/07/12,2018/12/02,2016,chi,"[Yun, F , Hui-Fang, L , Min-Yuan, S]",[Journal Article],to investigate the prevalence of itoxoplasma g...


## Save the dataset

In [85]:
id_article_df.to_csv("Data/toxoplasma_gondii_pubmed.csv", index_label = "pubmed_id", encoding = 'utf-8')

In [89]:
print(id_article_df.columns)

Index(['Title', 'Journal Country', 'Abstract', 'ArticleDate', 'CompletedDate',
       'RevisedDate', 'PublicationYear', 'Language', 'AuthorList',
       'PublicationTypeList', 'PreprocessedAbstract'],
      dtype='object')
