# Retracted papers in SwissProt

Emma Hatton-Ellis, 06/11/2020

In [21]:
import requests
import pandas as pd
import numpy as np
import xml.etree.ElementTree as et

In [2]:
# NCBI E-utilities endpoints used below: one for searching, one for document summaries.
eutils_base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
esearch_url = eutils_base_url + '/esearch.fcgi'
esummary_url = eutils_base_url + '/esummary.fcgi'

In [None]:
# File name for the tab-separated report, and the base URL of the UniProt website.
report_file = 'uncurated_retractions.tsv'
uniprot_url = 'https://www.uniprot.org/uniprot'

In [3]:
# Query parameters for the ESearch request: all PubMed records with the
# publication type "Retracted publication". The response is requested as JSON;
# its 'webenv' and 'querykey' values are reused for the ESummary request.
search_params = dict(
    db='pubmed',
    term='Retracted publication[PT]',
    email='ehatton@ebi.ac.uk',
    retmax=0,
    usehistory='yes',
    retmode='json',
)

In [6]:
# Run the ESearch query. A timeout is set so that a stalled connection raises
# instead of blocking the notebook indefinitely; HTTP errors raise immediately.
r = requests.get(esearch_url, params=search_params, timeout=60)
r.raise_for_status()
res = r.json()

In [12]:
# TODO: handle situation where more than 10000 search hits found
# Until then, stop here if the hit count does not stay below 10000.
esearch_result = res['esearchresult']
search_hits = int(esearch_result['count'])
assert 10000 > search_hits

In [13]:
# Keep the identifiers of the stored search so the ESummary request can refer to it.
esearch_result = res['esearchresult']
webenv, qkey = esearch_result['webenv'], esearch_result['querykey']

In [14]:
# Query parameters for the ESummary request: document summaries (as XML) for
# the search identified by webenv / query_key.
summary_params = dict(
    db='pubmed',
    email='ehatton@ebi.ac.uk',
    webenv=webenv,
    query_key=qkey,
    retstart=1,
    retmax=10000,  # max allowed for xml
    retmode='xml',
)

In [15]:
# Fetch the document summaries. As with the ESearch request, fail fast on HTTP
# errors rather than trying to parse an error page as XML, and set a timeout so
# a stalled download cannot hang the notebook (the response is large).
r = requests.get(esummary_url, params=summary_params, timeout=300)
r.raise_for_status()

In [16]:
# Parse the raw bytes so the XML parser applies the document's own encoding,
# instead of relying on the charset requests guesses when building r.text.
res = et.fromstring(r.content)

Display a single Entrez summary.

In [17]:
# Serialise the first child of the parsed response and show it as text.
first_docsum = res[0]
first_docsum_xml = et.tostring(first_docsum, encoding='utf-8')
print(first_docsum_xml.decode('utf-8'))

<DocSum>
	<Id>32848418</Id>
	<Item Name="PubDate" Type="Date">2020</Item>
	<Item Name="EPubDate" Type="Date">2020 Aug 5</Item>
	<Item Name="Source" Type="String">Onco Targets Ther</Item>
	<Item Name="AuthorList" Type="List">
		<Item Name="Author" Type="String">Xu Q</Item>
	</Item>
	<Item Name="LastAuthor" Type="String">Xu Q</Item>
	<Item Name="Title" Type="String">The Potential Tumor Promotional Role of circVAPA in Retinoblastoma via Regulating miR-615-3p and SMARCE1.</Item>
	<Item Name="Volume" Type="String">13</Item>
	<Item Name="Issue" Type="String" />
	<Item Name="Pages" Type="String">7839-7849</Item>
	<Item Name="LangList" Type="List">
		<Item Name="Lang" Type="String">English</Item>
	</Item>
	<Item Name="NlmUniqueID" Type="String">101514322</Item>
	<Item Name="ISSN" Type="String" />
	<Item Name="ESSN" Type="String">1178-6930</Item>
	<Item Name="PubTypeList" Type="List">
		<Item Name="PubType" Type="String">Journal Article</Item>
		<Item Name="PubType" Type="String">Retracted Publ

In [18]:
# Build one record per document summary: the paper's PMID, its title, and the
# PMIDs of its retraction notices (None when no notice PMID could be found).
columns = ['pmid', 'title', 'retraction_notice_pmid']
rows = []

for docsum in res:
    pmid = docsum.find('Id').text

    # A summary without a Title item gets a missing title instead of raising.
    title_item = docsum.find('./Item[@Name="Title"]')
    title = title_item.text if title_item is not None else None

    notice_items = docsum.findall('./Item[@Name="References"]/Item[@Name="Retraction in"]')
    notice_pmids = []
    for notice_item in notice_items:
        # The notice PMID is whatever follows the 'PMID: ' marker in the item text.
        # Entries without the marker (or without text) are skipped individually,
        # so one malformed entry no longer discards the other notices of the paper.
        pieces = (notice_item.text or '').split('PMID: ')
        if len(pieces) > 1:
            notice_pmids.append(pieces[1])

    # Use None (not an empty list) when nothing was found, so that papers
    # without a notice PMID are picked up by a null check on this column.
    retraction_notice_pmid = notice_pmids or None
    rows.append({'pmid': pmid, 'title': title, 'retraction_notice_pmid': retraction_notice_pmid})


In [19]:
# Turn the collected records into a table, with the columns in the declared order.
retracted_papers = pd.DataFrame(data=rows, columns=columns)
retracted_papers

Unnamed: 0,pmid,title,retraction_notice_pmid
0,32848418,The Potential Tumor Promotional Role of circVA...,[33116579]
1,32822247,<b><i>Retracted:</i></b> A Novel CircRNA <i>Ci...,[32985893]
2,32801858,Long Non-Coding RNA AGAP2-AS1/miR-628-5p/PTN A...,[33116829]
3,32782982,Retracted Article: Application of 3D printing ...,[32954041]
4,32764880,Cordycepin Alleviates Anterior Cruciate Ligame...,[33116402]
...,...,...,...
8122,4122370,Peripheral thymus-dependent (T) lymphocytes in...,[1079321]
8123,4120258,Chromosome assignments in man of the genes for...,"[1088818, 1203488]"
8124,5659639,Effect of x-rays on the electrical conductance...,[5409980]
8125,14043337,UNMINERALIZED FOSSIL BACTERIA.,[5644050]


Retracted papers with no retraction notice PMID (the number of rows shown is the number of such papers).

In [22]:
# Normalise missing notice PMIDs to NaN. The result is assigned back to the
# column: calling fillna(..., inplace=True) on a column accessed from the frame
# is not guaranteed to update the frame itself.
retracted_papers['retraction_notice_pmid'] = retracted_papers['retraction_notice_pmid'].fillna(value=np.nan)
retracted_papers[retracted_papers['retraction_notice_pmid'].isnull()]

Unnamed: 0,pmid,title,retraction_notice_pmid
117,32016976,Effect of miR-200c on migration and proliferat...,
118,32016952,FBW7 inhibits nucleus pulposus cells prolifera...,
119,32016949,Effect of exosomes derived from mir-126-modifi...,
126,31981186,RETRACTED ARTICLE: Long non-coding RNAs in cer...,
129,31957830,Long noncoding RNA ROR1-AS1 induces tumor meta...,
...,...,...,...
7457,10597557,Prevention of postoperative vomiting with gran...,
7575,9698964,Modified double burst stimulation of varying s...,
7650,9241335,Evaluation of residual neuromuscular blockade ...,
8026,2993656,"A mos oncogene-containing retrovirus, myelopro...",


In [23]:
# One row per (paper, retraction notice PMID) pair.
retracted_papers = retracted_papers.explode(column='retraction_notice_pmid')

['32848418',
 '32822247',
 '32801858',
 '32782982',
 '32764880',
 '32709401',
 '32699314',
 '32696949',
 '32683951',
 '32678639']

Get a table of all SwissProt citations by accession.

In [28]:
def _split_pmids(cell):
    """Split a '; '-separated citation cell into a list of PMID strings.

    Blank entries (an empty cell, or a stray trailing separator) are dropped so
    that the resulting list contains only real PMIDs.
    """
    return [pmid for pmid in cell.split('; ') if pmid.strip()]


# Download accession, entry name and citation PMIDs for every reviewed entry.
# The request is built from the `uniprot_url` constant rather than repeating the address.
# NOTE(review): this uses the legacy UniProt website query syntax - confirm the
# endpoint still serves it before relying on a fresh run.
sp_query = '/?query=reviewed:yes&columns=id,entry+name,citation&format=tab&compress=yes'
sp_refs = pd.read_csv(
    uniprot_url + sp_query,
    sep='\t',
    compression='gzip',
    names=['accession', 'entry_name', 'pmid'],
    header=0,
    converters={'pmid': _split_pmids},
)
sp_refs

Unnamed: 0,accession,entry_name,pmid
0,P49122,3SOF7_NAJAT,"[8679666, 15587986]"
1,C5VZW3,ACCD_STRSE,[19603075]
2,P0C2L1,A3X1_LOXLA,[8819009]
3,A6TDH2,AAS_KLEP7,[]
4,B5EWT7,ACPH_SALA4,[21602358]
...,...,...,...
563547,Q10032,YQ93_CAEEL,[9851916]
563548,Q6P4U6,ZNRF1_DANRE,[]
563549,Q1R778,YQGF_ECOUT,[16585510]
563550,P18017,YPI6_CLOPF,[2901768]


Copy the PMID list to a new column as we need to check this list later for a retraction notice PMID.

In [31]:
# Keep each entry's full citation list alongside 'pmid' under the name 'pmid_list'.
sp_refs = sp_refs.assign(pmid_list=sp_refs['pmid'])

In [32]:
# Display the table to check that 'pmid_list' mirrors 'pmid'.
sp_refs

Unnamed: 0,accession,entry_name,pmid,pmid_list
0,P49122,3SOF7_NAJAT,"[8679666, 15587986]","[8679666, 15587986]"
1,C5VZW3,ACCD_STRSE,[19603075],[19603075]
2,P0C2L1,A3X1_LOXLA,[8819009],[8819009]
3,A6TDH2,AAS_KLEP7,[],[]
4,B5EWT7,ACPH_SALA4,[21602358],[21602358]
...,...,...,...,...
563547,Q10032,YQ93_CAEEL,[9851916],[9851916]
563548,Q6P4U6,ZNRF1_DANRE,[],[]
563549,Q1R778,YQGF_ECOUT,[16585510],[16585510]
563550,P18017,YPI6_CLOPF,[2901768],[2901768]


In [33]:
# One row per (accession, cited PMID) pair; 'pmid_list' still holds the full list.
sp_refs = sp_refs.explode(column='pmid')

In [60]:
# Clean up empty strings by converting to NaN.
# The result is assigned back to the column: replace(..., inplace=True) on a
# column accessed from the frame is not guaranteed to update the frame itself.
sp_refs['pmid'] = sp_refs['pmid'].replace('', np.nan)

In [61]:
# Preview the first five rows after exploding and cleaning.
sp_refs.head(5)

Unnamed: 0,accession,entry_name,pmid,pmid_list
0,P49122,3SOF7_NAJAT,8679666.0,"[8679666, 15587986]"
0,P49122,3SOF7_NAJAT,15587986.0,"[8679666, 15587986]"
1,C5VZW3,ACCD_STRSE,19603075.0,[19603075]
2,P0C2L1,A3X1_LOXLA,8819009.0,[8819009]
3,A6TDH2,AAS_KLEP7,,[]


In [79]:
# Keep only the SwissProt citations whose PMID belongs to a retracted paper.
retracted_papers_in_swissprot = sp_refs.merge(retracted_papers, on='pmid')
retracted_papers_in_swissprot

Unnamed: 0,accession,entry_name,pmid,pmid_list,title,retraction_notice_pmid
0,Q94F62,BAK1_ARATH,20876109,"[11706164, 12150929, 20064227, 10617198, 27862...",Autophosphorylation of Tyr-610 in the receptor...,27325779
1,Q94F62,BAK1_ARATH,21350342,"[11706164, 12150929, 20064227, 10617198, 27862...",Functional importance of BAK1 tyrosine phospho...,27603314
2,Q6UXH0,ANGL8_HUMAN,23623304,"[15352036, 12975309, 15057824, 22569073, 22809...",Betatrophin: a hormone that controls pancreati...,28038792
3,Q8R1L8,ANGL8_MOUSE,23623304,"[19468303, 15489334, 20562862, 22569073, 22809...",Betatrophin: a hormone that controls pancreati...,28038792
4,P10844,BXB_CLOBO,10932255,"[1514783, 8408542, 3139097, 4030755, 3856850, ...",Cocrystal structure of synaptobrevin-II bound ...,19578378
...,...,...,...,...,...,...
276,O94875,SRBS2_HUMAN,15784622,"[9211900, 11786189, 9872452, 14702039, 1581562...",ArgBP2gamma interacts with Akt and p21-activat...,27825083
277,Q9UL45,BL1S6_HUMAN,21665000,"[10610180, 14702039, 15489334, 11936273, 12191...",A BLOC-1 mutation screen reveals that PLDN is ...,28475864
278,A9QXE0,CHT_ASPNG,21210990,"[17061133, 23455586, 21210990, 23870008, 21892...","Heterologous expression, purification and char...",23870008
279,P35222,CTNB1_HUMAN,11279024,"[7806582, 14702039, 16641997, 15489334, 120191...",Regulation of beta-catenin structure and activ...,27226643


In [86]:
def missing_retraction_notice(row):
    """Return True when the row's retraction notice PMID is not in its 'pmid_list'.

    `row` is any record that can be indexed by the keys 'retraction_notice_pmid'
    and 'pmid_list' (for example a DataFrame row passed by `apply(..., axis=1)`).
    """
    notice_pmid = row['retraction_notice_pmid']
    cited_pmids = row['pmid_list']
    if notice_pmid in cited_pmids:
        return False
    return True

In [89]:
# Flag, row by row, the citations whose entry does not also cite the retraction
# notice, then keep only those rows.
notice_is_missing = retracted_papers_in_swissprot.apply(missing_retraction_notice, axis=1)
uncurated_retractions = retracted_papers_in_swissprot[notice_is_missing]

uncurated_retractions

Unnamed: 0,accession,entry_name,pmid,pmid_list,title,retraction_notice_pmid
0,Q94F62,BAK1_ARATH,20876109,"[11706164, 12150929, 20064227, 10617198, 27862...",Autophosphorylation of Tyr-610 in the receptor...,27325779
1,Q94F62,BAK1_ARATH,21350342,"[11706164, 12150929, 20064227, 10617198, 27862...",Functional importance of BAK1 tyrosine phospho...,27603314
6,Q8BGV9,ATG4D_MOUSE,12446702,"[12446702, 16141072, 15489334, ]","Human autophagins, a family of cysteine protei...",30808002
7,Q8WYN0,ATG4A_HUMAN,12446702,"[12446702, 15169837, 14702039, 15772651, 15489...","Human autophagins, a family of cysteine protei...",30808002
8,Q8BGE6,ATG4B_MOUSE,12446702,"[12446702, 16141072, 15489334, 14621295, 21183...","Human autophagins, a family of cysteine protei...",30808002
...,...,...,...,...,...,...
270,Q4QQV3,F162A_RAT,19520982,"[15489334, 19520982]",Proapoptotic role of human growth and transfor...,32272861
272,P28715,ERCC5_HUMAN,9096355,"[8483504, 7510366, 8413238, 11266544, 15057823...",A common mutational pattern in Cockayne syndro...,17179216
273,Q8RWY6,CLASP_ARATH,22500804,"[10617197, 27862469, 14593172, 18042620, 17272...",A PLETHORA-auxin transcription module controls...,24267897
275,P31645,SC6A4_HUMAN,18227069,"[8452685, 7681602, 7684072, 16601320, 15489334...",Serotonin transamidates Rab4 and facilitates i...,31201246


Number of retracted papers in SwissProt which have not been flagged.

In [90]:
# Count the distinct retracted PMIDs (missing values are not counted).
uncurated_retractions['pmid'].nunique(dropna=True)

79

Number of SwissProt accessions affected.

In [91]:
# Count the distinct accessions (missing values are not counted).
uncurated_retractions['accession'].nunique(dropna=True)

165

In [93]:
# Write the report as a tab-separated file without the index. 'pmid_list' is
# left out by listing the exported columns explicitly. The destination comes
# from the `report_file` constant rather than a repeated literal, so the output
# path is configured in one place.
cols = ['accession', 'entry_name', 'pmid', 'title', 'retraction_notice_pmid']
uncurated_retractions.to_csv(report_file, sep='\t', index=False, columns=cols)