# Retracted papers in SwissProt

Emma Hatton-Ellis, 06/11/2020

In [1]:
import requests
import urllib

import pandas as pd
import numpy as np
import xml.etree.ElementTree as et

The cell below specifies parameters which can be set when running the notebook from the command line with papermill.

In [2]:
# Parameters that papermill can override when the notebook is run from the command line.
email = None  # sent to NCBI as the `email` request parameter; None leaves it unset
report_file = 'uncurated_retractions.tsv'  # path the final tab-separated report is written to

In [3]:
# NCBI Entrez E-utilities endpoints: ESearch runs the query, ESummary fetches the records.
esearch_url, esummary_url = (
    f'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/{tool}.fcgi'
    for tool in ('esearch', 'esummary')
)

In [4]:
# ESearch query for every PubMed record typed as a retracted publication.
# retmax=0 asks for no IDs in the response; with usehistory='yes' the response
# carries a webenv/querykey pair that later requests use to refer to the result set.
search_params = dict(
    db='pubmed',
    term='Retracted publication[PT]',
    email=email,
    retmax=0,
    usehistory='yes',
    retmode='json',
)

In [5]:
# Run the ESearch query and stop on an HTTP error rather than parsing an error page.
# The timeout keeps an unattended (papermill) run from hanging forever on a stalled connection.
r = requests.get(esearch_url, params=search_params, timeout=60)
r.raise_for_status()
res = r.json()

In [6]:
# Show the raw ESearch response (hit count plus the webenv/querykey handles used below)
res

{'header': {'type': 'esearch', 'version': '0.3'},
 'esearchresult': {'count': '8276',
  'retmax': '0',
  'retstart': '0',
  'querykey': '1',
  'webenv': 'MCID_5fda26524911161ab6524703',
  'idlist': [],
  'translationset': [],
  'translationstack': [{'term': 'Retracted publication[PT]',
    'field': 'PT',
    'count': '8276',
    'explode': 'Y'},
   'GROUP'],
  'querytranslation': 'Retracted publication[PT]'}}

Number of retracted publications in PubMed.

In [7]:
# ESearch reports the count as a string; convert it once so it can drive paging and the sanity check
search_hits = int(res['esearchresult']['count'])
search_hits

8276

In [8]:
# Handles identifying the stored ESearch result set, passed on to ESummary
webenv, qkey = (res['esearchresult'][field] for field in ('webenv', 'querykey'))

In [9]:
# Base ESummary parameters; the download loop sets 'retstart' again for every page it requests
summary_params = dict(
    db='pubmed',
    email=email,
    webenv=webenv,
    query_key=qkey,
    retstart=1,
    retmax=10000,  # max allowed for xml
    retmode='xml',
)

In [10]:
def extract_pubmed_data(xml):
    """Extracts data from an Entrez ESummary xml document of PubMed citations.

    Returns a list of dictionaries (keys are pmid, title and retraction_notice_pmid) which
    can then be converted into a dataframe.

    Only ``<DocSum>`` children of the root element are read; any other top-level
    element (for example an ``<ERROR>`` message) is skipped instead of crashing the parse.

    Args:
        xml (str): string in Entrez ESummary xml format.

    Returns:
        rows: list of dictionaries, one per DocSum. ``pmid`` and ``title`` are the element
            text (None if the element is missing or empty). ``retraction_notice_pmid`` is a
            list with one item per "Retraction in" reference: the text following
            ``'PMID: '`` with surrounding whitespace removed, or None when the reference
            contains no PMID.

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml`` is not well-formed.
    """

    def _text(node):
        # find() returns None when nothing matches; treat that like an empty element
        return None if node is None else node.text

    rows = []
    root = et.fromstring(xml)
    for docsum in root.findall('DocSum'):
        notice_pmids = []
        for notice in docsum.findall('./Item[@Name="References"]/Item[@Name="Retraction in"]'):
            # An empty <Item/> has text None; substitute '' so split() is always safe
            parts = (notice.text or '').split('PMID: ')
            # parts[1] only exists when the 'PMID: ' marker was present in the citation text
            notice_pmids.append((parts[1].strip() or None) if len(parts) > 1 else None)
        rows.append({
            'pmid': _text(docsum.find('Id')),
            'title': _text(docsum.find('./Item[@Name="Title"]')),
            'retraction_notice_pmid': notice_pmids,
        })
    return rows

In [11]:
# Download the ESummary records for the stored result set, one page at a time.
pm_rows = []
page_size = summary_params['retmax']  # step by the page size that is actually requested
with requests.Session() as session:
    # NOTE(review): paging starts at retstart=0 while summary_params is initialised with
    # retstart=1 — confirm which base ESummary expects before hits exceed one page.
    for start in range(0, search_hits, page_size):
        # Per-request copy so the shared summary_params dict is left unmodified
        page_params = {**summary_params, 'retstart': start}
        # Timeout so an unattended run cannot hang forever on a stalled connection
        r = session.get(esummary_url, params=page_params, timeout=120)
        r.raise_for_status()
        pm_rows.extend(extract_pubmed_data(r.text))

In [12]:
# One row per retracted paper; retraction_notice_pmid still holds a list of PMIDs here
columns = ['pmid', 'title', 'retraction_notice_pmid']
retracted_papers = pd.DataFrame(data=pm_rows, columns=columns)
retracted_papers

Unnamed: 0,pmid,title,retraction_notice_pmid
0,33074103,Amino acid synthesis loss in parasitoid wasps ...,[33258773]
1,33040737,Globalization and vulnerable populations in ti...,[33317564]
2,32983561,Retracted: Exploring the microbiome and mindfu...,[32986044]
3,32922604,Circulating miR-134 is a potential biomarker f...,[33284900]
4,32884299,lncRNA LSINCT5 Regulates miR-20a-5p/XIAP to In...,[33273824]
...,...,...,...
8271,4122370,Peripheral thymus-dependent (T) lymphocytes in...,[1079321]
8272,4120258,Chromosome assignments in man of the genes for...,"[1088818, 1203488]"
8273,5659639,Effect of x-rays on the electrical conductance...,[5409980]
8274,14043337,UNMINERALIZED FOSSIL BACTERIA.,[5644050]


In [13]:
# Double-check that the correct number of references has been retrieved
assert len(retracted_papers) == search_hits, (
    f'ESummary returned {len(retracted_papers)} records but ESearch reported {search_hits} hits'
)

In [14]:
# Unpack the PMID lists: one row per (paper, retraction notice) pair
retracted_papers = retracted_papers.explode(column='retraction_notice_pmid')

Retracted papers with no retraction notice PMID (the row count of this table is the number of such papers).

In [15]:
# Normalise missing notice PMIDs (None) to NaN. The result is assigned back to the column:
# calling fillna(inplace=True) on an attribute-accessed column is chained assignment, which
# pandas warns about and which does not update the frame when Copy-on-Write is enabled.
retracted_papers['retraction_notice_pmid'] = retracted_papers['retraction_notice_pmid'].fillna(value=np.nan)
retracted_papers[retracted_papers['retraction_notice_pmid'].isnull()]

Unnamed: 0,pmid,title,retraction_notice_pmid
94,32196579,"LncRNA FEZF1-AS1 promoted chemoresistance, aut...",
144,32016976,Effect of miR-200c on migration and proliferat...,
145,32016952,FBW7 inhibits nucleus pulposus cells prolifera...,
146,32016949,Effect of exosomes derived from mir-126-modifi...,
156,31981186,RETRACTED ARTICLE: Long non-coding RNAs in cer...,
...,...,...,...
7597,10597557,Prevention of postoperative vomiting with gran...,
7717,9698964,Modified double burst stimulation of varying s...,
7793,9241335,Evaluation of residual neuromuscular blockade ...,
8175,2993656,"A mos oncogene-containing retrovirus, myelopro...",


In [16]:
# Exploded table: a paper with several retraction notices now appears once per notice,
# repeating its index label
retracted_papers

Unnamed: 0,pmid,title,retraction_notice_pmid
0,33074103,Amino acid synthesis loss in parasitoid wasps ...,33258773
1,33040737,Globalization and vulnerable populations in ti...,33317564
2,32983561,Retracted: Exploring the microbiome and mindfu...,32986044
3,32922604,Circulating miR-134 is a potential biomarker f...,33284900
4,32884299,lncRNA LSINCT5 Regulates miR-20a-5p/XIAP to In...,33273824
...,...,...,...
8272,4120258,Chromosome assignments in man of the genes for...,1088818
8272,4120258,Chromosome assignments in man of the genes for...,1203488
8273,5659639,Effect of x-rays on the electrical conductance...,5409980
8274,14043337,UNMINERALIZED FOSSIL BACTERIA.,5644050


In [17]:
def extract_swissprot_data(xml_string):
    """Extracts publication data from UniProt xml.

    Every child of the root element that has an ``<accession>`` is treated as an entry.
    Children without one are skipped, as are entries whose first accession contains '-'
    (isoforms). One row is produced per ``<reference>`` of each remaining entry.

    Args:
        xml_string (str): UniProt entries in xml format.

    Returns:
        rows: list of dictionaries with the following keys: accession, scope, pmid.
            ``accession`` is the text of the entry's first ``<accession>`` element,
            ``scope`` is the text of the reference's first ``<scope>`` element (None if
            the reference has none) and ``pmid`` is the ``id`` attribute of the
            citation's PubMed ``<dbReference>`` (None if the citation has no PubMed
            cross-reference).

    Raises:
        xml.etree.ElementTree.ParseError: if ``xml_string`` is not well-formed.
    """

    ns = {'up': 'http://uniprot.org/uniprot'}
    rows = []

    for entry in et.fromstring(xml_string):
        accession_node = entry.find('up:accession', ns)
        if accession_node is None or accession_node.text is None:
            # Not an entry (no accession to attribute references to)
            continue
        accession = accession_node.text
        if '-' in accession:  # ignore isoforms; checked once per entry, not per reference
            continue

        for reference in entry.findall('up:reference', ns):
            # Only the first <scope> of a reference is recorded
            scope_node = reference.find('up:scope', ns)
            pubmed_node = reference.find('up:citation/up:dbReference[@type="PubMed"]', ns)
            rows.append({
                'accession': accession,
                'scope': None if scope_node is None else scope_node.text,
                'pmid': None if pubmed_node is None else pubmed_node.get('id'),
            })
    return rows

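Query the EBI Proteins API, in batches of PMIDs, for reviewed (Swiss-Prot) entries that cite a retracted paper. Every reference of each matching entry is collected by accession; the retracted citations are picked out by the merge further down.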

In [18]:
url = 'https://www.ebi.ac.uk/proteins/api/proteins?'
headers = {'accept': 'application/xml'}
batch_size = 20  # number of PMIDs sent per request

# After explode() a paper with several retraction notices occupies several rows,
# so de-duplicate to avoid sending the same PMID more than once.
pmid_list = retracted_papers['pmid'].dropna().drop_duplicates()
sp_rows = []

with requests.Session() as session:
    session.headers.update(headers)
    for i in range(0, len(pmid_list), batch_size):
        params = {
            'pubmed': ','.join(pmid_list.iloc[i:i + batch_size]),
            'size': -1,
            'reviewed': 'true',
        }
        # Timeout so an unattended run cannot hang forever on a stalled connection
        r = session.get(url, params=params, timeout=120)
        # Surface HTTP failures here instead of as a KeyError on the pagination header below
        r.raise_for_status()
        hits = int(r.headers['X-Pagination-TotalRecords'])
        if hits > 0:
            # Only parse the body when the batch matched at least one entry
            sp_rows += extract_swissprot_data(r.text)

In [19]:
# Build the reference table with explicit columns so that it has the expected shape even
# when no entry was found. Missing PMIDs are normalised to NaN by assignment rather than
# a chained fillna(inplace=True), which does not update the frame under Copy-on-Write.
# An entry matched by more than one PMID batch contributes all of its references once per
# batch, so exact duplicate rows are dropped to keep them out of the report.
sp_refs = (
    pd.DataFrame(sp_rows, columns=['accession', 'scope', 'pmid'])
    .assign(pmid=lambda refs: refs['pmid'].fillna(value=np.nan))
    .drop_duplicates()
    .reset_index(drop=True)
)
sp_refs

Unnamed: 0,accession,scope,pmid
0,P62993,NUCLEOTIDE SEQUENCE [MRNA] (ISOFORM 1),1322798
1,P62993,NUCLEOTIDE SEQUENCE [MRNA] (ISOFORM 1),1384039
2,P62993,NUCLEOTIDE SEQUENCE [MRNA] (ISOFORM 2),8178156
3,P62993,NUCLEOTIDE SEQUENCE [GENOMIC DNA],10051406
4,P62993,NUCLEOTIDE SEQUENCE [LARGE SCALE MRNA] (ISOFOR...,
...,...,...,...
5991,P42166,LACK OF INVOLVEMENT IN DILATED CARDIOMYOPATHY,27896284
5992,P42166,INTERACTION WITH CMTM6,28813417
5993,P42166,STRUCTURE BY NMR OF 1-169,11500367
5994,P42166,STRUCTURE BY NMR OF 1-57 AND 103-159,11435115


In [20]:
# Inner join on PMID: keeps only the Swiss-Prot references whose paper is in the retracted set
retracted_papers_in_swissprot = sp_refs.merge(retracted_papers, on='pmid')
retracted_papers_in_swissprot

Unnamed: 0,accession,scope,pmid,title,retraction_notice_pmid
0,P62993,INTERACTION WITH ZDHHC19,31462771,Fatty acids and cancer-amplified ZDHHC19 promo...,32555452
1,Q8WVZ1,FUNCTION,31462771,Fatty acids and cancer-amplified ZDHHC19 promo...,32555452
2,P40763,FUNCTION,31462771,Fatty acids and cancer-amplified ZDHHC19 promo...,32555452
3,Q8N3U4,VARIANTS MKMS 743-TRP--PHE-1231 DEL AND 1033-A...,30765867,Nonsense variants in STAG2 result in distinct ...,32536687
4,P23874,FUNCTION,25848049,Stochastic induction of persister cells by Hip...,31110010
...,...,...,...,...,...
306,P15115,NUCLEOTIDE SEQUENCE [GENOMIC DNA],2684782,Nucleotide sequences of genes encoding heat-st...,2227448
307,P00362,NUCLEOTIDE SEQUENCE [GENOMIC DNA],2684782,Nucleotide sequences of genes encoding heat-st...,2227448
308,Q89933,NUCLEOTIDE SEQUENCE [GENOMIC RNA],2303032,Infectious measles virus from cloned cDNA.,1915308
309,P62157,RETRACTED PAPER,3058479,Heat-resistant inhibitors of protein kinase C ...,2180696


In [21]:
# Distinct retracted PMIDs cited in Swiss-Prot, whether or not they are already flagged
retracted_papers_in_swissprot['pmid'].nunique()

138

In [22]:
# Keep the citations whose scope is not exactly 'RETRACTED PAPER'
uncurated_retractions = retracted_papers_in_swissprot.loc[
    retracted_papers_in_swissprot['scope'].ne('RETRACTED PAPER')
]

Number of retracted publications for removal.

In [23]:
# nunique() rather than len(): the same paper can be cited by several entries
uncurated_retractions['pmid'].nunique()

98

Number of SwissProt accessions affected.

In [24]:
# nunique() rather than len(): the table can hold more than one row for the same accession
uncurated_retractions['accession'].nunique()

203

In [25]:
# Write the report as tab-separated text, fixing the column order and omitting the index
cols = ['accession', 'scope', 'pmid', 'title', 'retraction_notice_pmid']
uncurated_retractions[cols].to_csv(report_file, sep='\t', index=False)