# Retracted papers in SwissProt

Emma Hatton-Ellis, 06/11/2020

In [1]:
import requests
import urllib

import pandas as pd
import numpy as np
import xml.etree.ElementTree as et

from pathlib import Path
from posixpath import join as urljoin # force fowards slash on Windows

In [2]:
esearch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
esummary_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'

In [3]:
sprot = 'C:\sprot' # directory location of SwissProt flat files
report = 'uncurated_retractions.tsv'

In [4]:
search_params = {
    'db': 'pubmed',
    'term': 'Retracted publication[PT]',
    'email': 'ehatton@ebi.ac.uk',
    'retmax': 0,
    'usehistory': 'yes',
    'retmode': 'json'
}

In [5]:
r = requests.get(esearch_url, params=search_params)
r.raise_for_status()
res = r.json()

In [6]:
res

{'header': {'type': 'esearch', 'version': '0.3'},
 'esearchresult': {'count': '8189',
  'retmax': '0',
  'retstart': '0',
  'querykey': '1',
  'webenv': 'MCID_5fb7d0f617e6617f3250d2b0',
  'idlist': [],
  'translationset': [],
  'translationstack': [{'term': 'Retracted publication[PT]',
    'field': 'PT',
    'count': '8189',
    'explode': 'Y'},
   'GROUP'],
  'querytranslation': 'Retracted publication[PT]'}}

Number of retracted publications in PubMed.

In [7]:
search_hits = int(res['esearchresult']['count'])
search_hits

8189

In [8]:
webenv = res['esearchresult']['webenv']
qkey = res['esearchresult']['querykey']

In [9]:
summary_params = {
    'db': 'pubmed',
    'email': 'ehatton@ebi.ac.uk',
    'webenv': webenv,
    'query_key': qkey,
    'retstart': 1,
    'retmax': 10000, # max allowed for xml
    'retmode': 'xml'
}

In [10]:
def extract_pubmed_data(xml):
    """Extracts data from an Entrez ESummary xml document of PubMed citations.

    Returns a list of dictionaries (keys are pmid, title and retraction_notice_pmid) which
    can then be converted into a dataframe.

    Args:
        xml (str): string in Entrez ESummary xml format.

    Returns:
        rows: list of dictionaries.
    """

    rows = []
    tree = et.fromstring(xml)
    for docsum in tree:
        pmid = docsum.find('Id').text
        title = docsum.find('./Item[@Name="Title"]').text
        retractions = [x.text for x in docsum.findall('./Item[@Name="References"]/Item[@Name="Retraction in"]')]
        try:
            retraction_notice_pmid = [x.split('PMID: ')[1] for x in retractions]
        except IndexError:
            retraction_notice_pmid = None
        rows.append({'pmid': pmid, 'title': title, 'retraction_notice_pmid': retraction_notice_pmid})
    return rows

In [11]:
pm_rows = []
for i in range(0, search_hits, 10000):
    summary_params['retstart'] = i
    r = requests.get(esummary_url, params=summary_params)
    r.raise_for_status()
    pm_rows += extract_pubmed_data(r.text)

In [13]:
columns = ['pmid', 'title', 'retraction_notice_pmid']
retracted_papers = pd.DataFrame(pm_rows, columns=columns)
retracted_papers

Unnamed: 0,pmid,title,retraction_notice_pmid
0,32983561,Retracted: Exploring the microbiome and mindfu...,[32986044]
1,32868265,WITHDRAWN:Childhood violence exposure and soci...,[32888891]
2,32848418,The Potential Tumor Promotional Role of circVA...,[33116579]
3,32822247,<b><i>Retracted:</i></b> A Novel CircRNA <i>Ci...,[32985893]
4,32801858,Long Non-Coding RNA AGAP2-AS1/miR-628-5p/PTN A...,[33116829]
...,...,...,...
8184,4122370,Peripheral thymus-dependent (T) lymphocytes in...,[1079321]
8185,4120258,Chromosome assignments in man of the genes for...,"[1088818, 1203488]"
8186,5659639,Effect of x-rays on the electrical conductance...,[5409980]
8187,14043337,UNMINERALIZED FOSSIL BACTERIA.,[5644050]


In [14]:
# Double-check that the correct number of references has been retrieved
assert len(retracted_papers) == search_hits

Number of retracted papers with no retraction notice PMID.

In [15]:
retracted_papers.retraction_notice_pmid.fillna(value=np.nan, inplace=True)
retracted_papers[retracted_papers['retraction_notice_pmid'].isnull()]

Unnamed: 0,pmid,title,retraction_notice_pmid
77,32196579,"LncRNA FEZF1-AS1 promoted chemoresistance, aut...",
89,32141578,Clinical efficacy and prognosis of aspirin com...,
125,32016976,Effect of miR-200c on migration and proliferat...,
126,32016952,FBW7 inhibits nucleus pulposus cells prolifera...,
127,32016949,Effect of exosomes derived from mir-126-modifi...,
...,...,...,...
7510,10597557,Prevention of postoperative vomiting with gran...,
7630,9698964,Modified double burst stimulation of varying s...,
7706,9241335,Evaluation of residual neuromuscular blockade ...,
8088,2993656,"A mos oncogene-containing retrovirus, myelopro...",


In [87]:
retracted_papers = retracted_papers.explode('retraction_notice_pmid')

In [58]:
def extract_swissprot_data(xml_string):
    ns = {'up': 'http://uniprot.org/uniprot'}
    tree = et.fromstring(xml_string)
    rows = []

    for elem in tree:
        accession = elem.find('up:accession', ns).text
        references = elem.findall('up:reference', ns)
        for r in references:
            scope = r.find('up:scope', ns).text
            try:
                pmid = r.find('up:citation/up:dbReference[@type="PubMed"]', ns).attrib['id']
            except AttributeError:
                pmid = None
            if '-' not in accession and pmid is not None:
                rows.append({'accession': accession, 'scope':scope, 'pmid': pmid})
    return rows

Get a table of all retracted citations in SwissProt by accession.

In [72]:
url = 'https://www.ebi.ac.uk/proteins/api/proteins?'
headers = {'accept': 'application/xml'}

pmid_list = retracted_papers['pmid']
sp_rows = []

with requests.Session() as session: 
    session.headers.update(headers)
    for i in range(0, len(pmid_list), 20):
        params = {
            'pubmed': ','.join(j for j in pmid_list[i:i+20]),
            'size': -1,
            'reviewed': 'true',
        }
        r = session.get(url, params=params)
        hits = int(r.headers['X-Pagination-TotalRecords'] )
        if hits > 0:
            rows = extract_swissprot_data(r.text)
            sp_rows += rows

In [74]:
sp_refs = pd.DataFrame(sp_rows)
sp_refs

Unnamed: 0,accession,scope,pmid
0,P62993,NUCLEOTIDE SEQUENCE [MRNA] (ISOFORM 1),1322798
1,P62993,NUCLEOTIDE SEQUENCE [MRNA] (ISOFORM 1),1384039
2,P62993,NUCLEOTIDE SEQUENCE [MRNA] (ISOFORM 2),8178156
3,P62993,NUCLEOTIDE SEQUENCE [GENOMIC DNA],10051406
4,P62993,NUCLEOTIDE SEQUENCE [LARGE SCALE GENOMIC DNA],16625196
...,...,...,...
5647,P42166,LACK OF INVOLVEMENT IN DILATED CARDIOMYOPATHY,27896284
5648,P42166,INTERACTION WITH CMTM6,28813417
5649,P42166,STRUCTURE BY NMR OF 1-169,11500367
5650,P42166,STRUCTURE BY NMR OF 1-57 AND 103-159,11435115


In [76]:
retracted_papers_in_swissprot = pd.merge(left=sp_refs, right=retracted_papers, on='pmid')
retracted_papers_in_swissprot

Unnamed: 0,accession,scope,pmid,title,retraction_notice_pmid
0,P62993,INTERACTION WITH ZDHHC19,31462771,Fatty acids and cancer-amplified ZDHHC19 promo...,[32555452]
1,Q8WVZ1,FUNCTION,31462771,Fatty acids and cancer-amplified ZDHHC19 promo...,[32555452]
2,P40763,FUNCTION,31462771,Fatty acids and cancer-amplified ZDHHC19 promo...,[32555452]
3,Q8N3U4,VARIANTS MKMS 743-TRP--PHE-1231 DEL AND 1033-A...,30765867,Nonsense variants in STAG2 result in distinct ...,[32536687]
4,Q3ZCV2,RETRACTED PAPER,25883318,T cell metabolism. The protein LEM promotes CD...,[27980177]
...,...,...,...,...,...
300,Q89933,NUCLEOTIDE SEQUENCE [GENOMIC RNA],2303032,Infectious measles virus from cloned cDNA.,[1915308]
301,P15115,NUCLEOTIDE SEQUENCE [GENOMIC DNA],2684782,Nucleotide sequences of genes encoding heat-st...,[2227448]
302,P00362,NUCLEOTIDE SEQUENCE [GENOMIC DNA],2684782,Nucleotide sequences of genes encoding heat-st...,[2227448]
303,P62157,RETRACTED PAPER,3058479,Heat-resistant inhibitors of protein kinase C ...,[2180696]


In [77]:
retracted_papers_in_swissprot['pmid'].nunique()

138

In [97]:
retracted_papers_in_swissprot = retracted_papers_in_swissprot[retracted_papers_in_swissprot['scope'] != 'RETRACTED PAPER']

In [101]:
retracted_papers_in_swissprot['pmid'].nunique()

98

In [104]:
sp_refs[sp_refs['scope'].str.contains('12397363')]

Unnamed: 0,accession,scope,pmid
2712,Q9VW15,RETRACTION NOTICE OF PUBMED:12397363,25874679
3971,Q9VW15,RETRACTION NOTICE OF PUBMED:12397363,25874679


In [83]:
retracted_papers_in_swissprot[retracted_papers_in_swissprot['scope'].str.contains('RETRACTED PAPER')]

Unnamed: 0,accession,scope,pmid,title,retraction_notice_pmid
4,Q3ZCV2,RETRACTED PAPER,25883318,T cell metabolism. The protein LEM promotes CD...,[27980177]
5,A2AVQ5,RETRACTED PAPER,25883318,T cell metabolism. The protein LEM promotes CD...,[27980177]
7,A0A452E9Y6,RETRACTED PAPER,25760705,Mode of binding of the antithyroid drug propyl...,[26057817]
12,Q6UXH0,RETRACTED PAPER,23623304,Betatrophin: a hormone that controls pancreati...,[28038792]
13,Q8R1L8,RETRACTED PAPER,23623304,Betatrophin: a hormone that controls pancreati...,[28038792]
...,...,...,...,...,...
281,Q14789,RETRACTED PAPER,7511208,Molecular genetic analyses of a 376-kilodalton...,[7799969]
282,P33124,RETRACTED PAPER,1654331,Molecular cloning and sequencing of cDNA encod...,[1460058]
298,P0C518,RETRACTED PAPER,2143016,Nucleotide sequences of mitochondrial ATPase s...,[2235528]
299,P18203,RETRACTED PAPER,2253615,Amino acid sequence of a 12-kDa inhibitor of p...,[1915353]


In [87]:
sp_refs[sp_refs['scope'].str.contains('RETRACTION')]

Unnamed: 0,accession,scope,pmid
166,Q3ZCV2,RETRACTION NOTICE OF PUBMED:25883318,27980177
170,A2AVQ5,RETRACTION NOTICE OF PUBMED:25883318,27980177
197,A0A452E9Y6,RETRACTION NOTICE OF PUBMED:25760705,26057817
228,Q6UXH0,RETRACTION NOTICE OF PUBMED:23623304,28038792
245,Q8R1L8,RETRACTION NOTICE OF PUBMED:23623304,28038792
...,...,...,...
5511,Q14789,RETRACTION NOTICE OF PUBMED:7511208,7799969
5525,P33124,RETRACTION NOTICE OF PUBMED:1654331,1460058
5572,P0C518,RETRACTION NOTICE OF PUBMED:2143016,2235528
5575,P18203,RETRACTION NOTICE OF PUBMED:2253615,1915353


In [70]:
uncurated_retractions['pmid'].nunique()

79

Number of SwissProt accessions affected.

In [71]:
uncurated_retractions['accession'].nunique()

165

In [72]:
cols = ['accession', 'entry_name', 'pmid', 'title', 'retraction_notice_pmid']

report_path = Path(report)
if report_path.suffix == '.tsv':
    uncurated_retractions.to_csv(report_path.name, sep='\t', index=False, columns=cols)
elif report_path.suffix == '.xslx':
    uncurated_retractions.to_excel(report_path.name, index=False, columns=cols)
else:
    raise ValueError(f'Unexpected file suffix: {report_path.suffix}. Valid file types are *.tsv and *.xslx.')