# XML Final Analysis
### Daina Bouquin, Daniel Chivvis

Scripts below were used to generate all .csv files in XML_RESULTS
Full repo of files: https://github.com/dbouquin/cite_astro_software_2019

In [1]:
import pandas as pd
import numpy as np
import sys
import csv

In [2]:
XML_results = pd.read_csv("XML_CLEAN_INPUT_082019.csv") 

In [3]:
list(XML_results.columns.values)

['Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'DOI',
 'Journal_Title',
 'Article_ID',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Author(s)',
 'Publisher',
 'Title']

In [4]:
XML_results.head(5)

Unnamed: 0,Alias,Software_Package,Identifier,Pub_Year,DOI,Journal_Title,Article_ID,File_Name,Parent1_Tag,Parent2_Tag,Parent3_Tag,Parent4_Tag,Parent1_Content,Parent2_Content,Parent3_Content,Author(s),Publisher,Title
0,astroblend,AstroBlend,0,2016,10.3847/0004-637X/818/2/115,The Astrophysical Journal,"apj521773, 10.3847/0004-637X/818/2/115, 521773...",apj_818_2_115.xml,sc,p,sec,sec,['astroblend'],['We note that our example scripts only explor...,['\n<label>3.5.</label>\n<title>From <sans-ser...,Vogt Frédéric P. A. Owen Chris I. Verdes-Mon...,The American Astronomical Society,ADVANCED DATA VISUALIZATION IN ASTROPHYSICS: T...
1,astroblend.com,AstroBlend,0,2016,10.3847/0004-637X/818/2/115,The Astrophysical Journal,"apj521773, 10.3847/0004-637X/818/2/115, 521773...",apj_818_2_115.xml,ext-link,p,fn,p,['http://www.astroblend.com'],"['\n<ext-link ext-link-type=""uri"" xlink:href=""...",['\n<label><sup>26</sup></label>\n<p>\n<ext-li...,Vogt Frédéric P. A. Owen Chris I. Verdes-Mon...,The American Astronomical Society,ADVANCED DATA VISUALIZATION IN ASTROPHYSICS: T...
2,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/1/13,The Astronomical Journal,"aj493368, ANJOAA, 10.1088/0004-6256/148/1/13, ...",aj_148_1_13.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Rodney Steven A.Riess Adam G.Strolger Louis-Gr...,The American Astronomical Society,TYPE Ia SUPERNOVA RATE MEASUREMENTS TO REDSHIF...
3,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/1/14,The Astronomical Journal,"aj495229, ANJOAA, 10.1088/0004-6256/148/1/14, ...",aj_148_1_14.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Bañados E.Venemans B. P.Morganson E.Decarli R....,The American Astronomical Society,DISCOVERY OF EIGHT z ∼ 6 QUASARS FROM Pan-STARRS1
4,10.1051/0004-6361/201322068,Astropy,1,2014,10.1088/0004-6256/148/3/53,The Astronomical Journal,"aj499538, ANJOAA, 10.1088/0004-6256/148/3/53, ...",aj_148_3_53.xml,ext-link,nlm-citation,ref,ref-list,['10.1051/0004-6361/201322068'],"['\n<person-group person-group-type=""author"">\...","['\n<nlm-citation citation-type=""journal"">\n<p...",Gullikson KevinDodson-Robinson SarahKraus Adam...,The American Astronomical Society,"CORRECTING FOR TELLURIC ABSORPTION: METHODS, C..."


In [5]:
# Create column for bibliography section

# If the tag lable or tag content contain any of the following reference elements it will be marked "yes":
# bib
# bibr
# citation-alternatives
# collab
# contrib-group
# element-citation
# mixed-citation
# nlm-citation
# person-group
# pub-id
# ref
# ref-list
# source
# xref

bibliography = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']

XML_results["bib"] = np.where((XML_results[bibliography] == "bib").any(axis=1) | (XML_results[bibliography]== "bibr").any(axis=1) | (XML_results[bibliography]== "citation-alternatives").any(axis=1) | (XML_results[bibliography]== "collab").any(axis=1) | (XML_results[bibliography]== "contrib-group").any(axis=1) | (XML_results[bibliography]== "element-citation").any(axis=1) | (XML_results[bibliography]== "mixed-citation").any(axis=1) | (XML_results[bibliography]== "nlm-citation").any(axis=1) | (XML_results[bibliography]== "person-group").any(axis=1) | (XML_results[bibliography]== "pub-id").any(axis=1) | (XML_results[bibliography]== "ref").any(axis=1) | (XML_results[bibliography]== "ref-list").any(axis=1) | (XML_results[bibliography]== "source").any(axis=1) | (XML_results[bibliography]== "xref").any(axis=1), "yes", "no")

In [6]:
# Create column for acknowledgements
# If the tag lable or tag content contains "ack" it will be marked "yes"

acknowledgements = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["ack"] = np.where((XML_results[acknowledgements] == "ack").any(axis=1), "yes", "no")

In [7]:
# Create column for footnotes
# If the tag lable or tag content contains "fn" or "fn-group" it will be marked "yes"

footnotes = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["fn"] = np.where((XML_results[footnotes] == "fn").any(axis=1) | (XML_results[footnotes]== "fn-group").any(axis=1), "yes", "no")

In [8]:
# Create column for attempt at recognizable credit (bib + ack + fn + "ext-link" + "back")

# ack
# back
# bib
# bibr
# citation-alternatives
# collab
# contrib-group
# element-citation
# ex-link
# fn
# fn-group
# mixed-citation
# nlm-citation
# person-group
# pub-id
# ref
# ref-list
# source
# xref

recognizable = ['Parent1_Tag','Parent2_Tag','Parent3_Tag','Parent4_Tag','Parent1_Content','Parent2_Content']
XML_results["rec_credit"] = np.where((XML_results[recognizable] == "bib").any(axis=1) | (XML_results[recognizable]== "bibr").any(axis=1) | (XML_results[recognizable]== "citation-alternatives").any(axis=1) | (XML_results[recognizable]== "collab").any(axis=1) | (XML_results[recognizable]== "contrib-group").any(axis=1) | (XML_results[recognizable]== "element-citation").any(axis=1) | (XML_results[recognizable]== "mixed-citation").any(axis=1) | (XML_results[recognizable]== "nlm-citation").any(axis=1) | (XML_results[recognizable]== "person-group").any(axis=1) | (XML_results[recognizable]== "pub-id").any(axis=1) | (XML_results[recognizable]== "ref").any(axis=1) | (XML_results[recognizable]== "ref-list").any(axis=1) | (XML_results[recognizable]== "source").any(axis=1) | (XML_results[recognizable]== "xref").any(axis=1) | (XML_results[recognizable]== "fn").any(axis=1) | (XML_results[recognizable]== "fn-group").any(axis=1) | (XML_results[recognizable]== "ack").any(axis=1) | (XML_results[recognizable]== "back").any(axis=1) | (XML_results[recognizable]== "ex-link").any(axis=1), "yes", "no")

In [9]:
# Check new cols

list(XML_results.columns.values)

['Alias',
 'Software_Package',
 'Identifier',
 'Pub_Year',
 'DOI',
 'Journal_Title',
 'Article_ID',
 'File_Name',
 'Parent1_Tag',
 'Parent2_Tag',
 'Parent3_Tag',
 'Parent4_Tag',
 'Parent1_Content',
 'Parent2_Content',
 'Parent3_Content',
 'Author(s)',
 'Publisher',
 'Title',
 'bib',
 'ack',
 'fn',
 'rec_credit']

In [10]:
XML_results.to_csv("XML_FINAL_ANALYSIS_082819.csv")

## Summary of Results

In [11]:
# number of unique aliases found in each paper
XML_alias_per_paper = pd.DataFrame({'count' : XML_results.groupby(["Software_Package","File_Name"])['Alias'].nunique()})
XML_alias_per_paper.to_csv("XML_alias_per_paper_082819.csv")
# XML_alias_per_paper

In [12]:
# How many total papers did we find for each software package?

total_papers = XML_results.groupby('Software_Package')['File_Name'].nunique()
total_papers.to_csv("total_papers_082819.csv")
total_papers

Software_Package
AstroBlend        1
Astropy         538
RADMC-3D        214
SAOImage DS9    341
Spec2d          304
Stingray          2
TARDIS            4
WCSTools        123
Name: File_Name, dtype: int64

In [13]:
# Total number of unique XML files containing aliases
XML_results.File_Name.nunique()

1469

In [14]:
# Unique aliases per package
alias_per_package = XML_results.groupby('Software_Package')['Alias'].nunique()
alias_per_package.to_csv("alias_per_package_082819.csv")
alias_per_package

Software_Package
AstroBlend       2
Astropy         21
RADMC-3D        18
SAOImage DS9    21
Spec2d          22
Stingray         5
TARDIS           3
WCSTools        17
Name: Alias, dtype: int64

In [15]:
# All software mentions per journal

mentions_per_journal = XML_results.groupby('Journal_Title')['File_Name'].nunique()
mentions_per_journal.to_csv("mentions_per_journal_082819.csv")
mentions_per_journal

Journal_Title
The Astronomical Journal                       214
The Astrophysical Journal                      979
The Astrophysical Journal Letters              128
The Astrophysical Journal Supplement Series    148
Name: File_Name, dtype: int64

In [16]:
# software mentions per journal by package
mentions_per_package_by_journal = pd.DataFrame({'count' : XML_results.groupby(["Journal_Title", "Software_Package"])['File_Name'].nunique()})
mentions_per_package_by_journal.to_csv("mentions_per_package_by_journal_082819.csv")
mentions_per_package_by_journal

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Journal_Title,Software_Package,Unnamed: 2_level_1
The Astronomical Journal,Astropy,76
The Astronomical Journal,RADMC-3D,7
The Astronomical Journal,SAOImage DS9,68
The Astronomical Journal,Spec2d,25
The Astronomical Journal,WCSTools,43
The Astrophysical Journal,AstroBlend,1
The Astrophysical Journal,Astropy,364
The Astrophysical Journal,RADMC-3D,168
The Astrophysical Journal,SAOImage DS9,196
The Astrophysical Journal,Spec2d,229


In [17]:
# For each package count number of articles that mentioned their identifiers
ID_only = XML_results.loc[XML_results['Identifier'] == 1]
ID_only = pd.DataFrame({'count' : ID_only.groupby(["Software_Package", "Alias"])['File_Name'].nunique()})
ID_only.to_csv("ID_only_082819.csv")
ID_only

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Software_Package,Alias,Unnamed: 2_level_1
Astropy,10.1051/0004-6361/201322068,449
Astropy,2013A&A...558A..33A,441
Astropy,doi.org/10.1051/0004-6361/201322068,2
RADMC-3D,10.1051/0004-6361:20031768,96
RADMC-3D,10.1051/0004-6361:20040017,11
RADMC-3D,2004A&A...417..159D,96
RADMC-3D,2004A&A...417..793P,11
RADMC-3D,2011ascl.soft08016D,1
RADMC-3D,2012ascl.soft02015D,8
RADMC-3D,ascl:1202.015,2


In [18]:
# For each package count number of articles that mentioned their aliases that aren't identifiers
non_ID_only = XML_results.loc[XML_results['Identifier'] == 0]
non_ID_only = pd.DataFrame({'count' : non_ID_only.groupby(["Software_Package", "Alias"])['File_Name'].nunique()})
non_ID_only.to_csv("non_ID_only_082819.csv")
non_ID_only

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Software_Package,Alias,Unnamed: 2_level_1
AstroBlend,astroblend,1
AstroBlend,astroblend.com,1
Astropy,AstroPy,27
Astropy,Astropy,429
Astropy,Astropy Collaboration,433
Astropy,Astropy Collaboration 2013,1
Astropy,astropy,97
Astropy,astropy cosmology,1
Astropy,astropy.cosmology,5
Astropy,astropy.org,74


In [19]:
# For each package count total number of articles that mentioned each alias

XML_alias_paper = pd.DataFrame({'count' : XML_results.groupby(["Software_Package", "Alias"])['File_Name'].nunique()})
XML_alias_paper.to_csv("XML_alias_paper_082819.csv")
XML_alias_paper

Unnamed: 0_level_0,Unnamed: 1_level_0,count
Software_Package,Alias,Unnamed: 2_level_1
AstroBlend,astroblend,1
AstroBlend,astroblend.com,1
Astropy,10.1051/0004-6361/201322068,449
Astropy,2013A&A...558A..33A,441
Astropy,AstroPy,27
Astropy,Astropy,429
Astropy,Astropy Collaboration,433
Astropy,Astropy Collaboration 2013,1
Astropy,astropy,97
Astropy,astropy cosmology,1


In [20]:
#Tags per package

XML_tags = pd.DataFrame({'count' : XML_results.groupby(["Software_Package", "Parent1_Tag", "Parent2_Tag", "Parent3_Tag", "Parent4_Tag"])['File_Name'].nunique()})
XML_tags.to_csv("XML_tags_082819.csv")
XML_tags

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,count
Software_Package,Parent1_Tag,Parent2_Tag,Parent3_Tag,Parent4_Tag,Unnamed: 5_level_1
AstroBlend,ext-link,p,fn,p,1
AstroBlend,sc,p,sec,sec,1
Astropy,collab,element-citation,ref,ref-list,1
Astropy,collab,person-group,element-citation,ref,346
Astropy,collab,person-group,nlm-citation,ref,39
Astropy,comment,element-citation,ref,ref-list,40
Astropy,conf-name,element-citation,ref,ref-list,1
Astropy,disp-formula,p,sec,sec,1
Astropy,ext-link,element-citation,ref,ref-list,397
Astropy,ext-link,nlm-citation,ref,ref-list,44


In [21]:
# Total number of unique papers with software aliases by year
XML_unique_paper_per_year = pd.DataFrame(XML_results.groupby(['Software_Package', 'Pub_Year'])['File_Name'].nunique())
XML_unique_paper_per_year.to_csv("XML_unique_paper_per_year_082819.csv")
XML_unique_paper_per_year

Unnamed: 0_level_0,Unnamed: 1_level_0,File_Name
Software_Package,Pub_Year,Unnamed: 2_level_1
AstroBlend,2016,1
Astropy,2012,1
Astropy,2013,3
Astropy,2014,37
Astropy,2015,63
Astropy,2016,117
Astropy,2017,199
Astropy,2018,118
RADMC-3D,2004,1
RADMC-3D,2005,1


In [22]:
# Total number of unique papers with software aliases in the bibliography section by package
bib_only = XML_results.loc[XML_results['bib'] == "yes"]
bib_count = pd.DataFrame({'count' : bib_only.groupby(["Software_Package"])['File_Name'].nunique()})
bib_count.to_csv("bib_count_082819.csv")
bib_count

Unnamed: 0_level_0,count
Software_Package,Unnamed: 1_level_1
Astropy,463
RADMC-3D,146
SAOImage DS9,57
Spec2d,142
Stingray,1
TARDIS,4
WCSTools,37


In [23]:
# Total number of unique papers with software aliases in acknowledgements section by package
ack_only = XML_results.loc[XML_results['ack'] == "yes"]
ack_count = pd.DataFrame({'count' : ack_only.groupby(["Software_Package"])['File_Name'].nunique()})
ack_count.to_csv("ack_count_082819.csv")
ack_count

Unnamed: 0_level_0,count
Software_Package,Unnamed: 1_level_1
Astropy,486
RADMC-3D,22
SAOImage DS9,166
Spec2d,42
WCSTools,10


In [24]:
# Total number of unique papers with software aliases in footnotes section by package
fn_only = XML_results.loc[XML_results['fn'] == "yes"]
fn_count = pd.DataFrame({'count' : fn_only.groupby(["Software_Package"])['File_Name'].nunique()})
fn_count.to_csv("fn_count_082819.csv")
fn_count

Unnamed: 0_level_0,count
Software_Package,Unnamed: 1_level_1
AstroBlend,1
Astropy,48
RADMC-3D,54
SAOImage DS9,48
Spec2d,74
Stingray,1
WCSTools,32


In [25]:
# Total number of unique papers with software aliases that gave a recognizable form of credit by package
rec_credit_only = XML_results.loc[XML_results['rec_credit'] == "yes"]
rec_credit_count = pd.DataFrame({'count' : rec_credit_only.groupby(["Software_Package"])['File_Name'].nunique()})
rec_credit_count.to_csv("rec_credit_count_082819.csv")
rec_credit_count

Unnamed: 0_level_0,count
Software_Package,Unnamed: 1_level_1
AstroBlend,1
Astropy,531
RADMC-3D,189
SAOImage DS9,239
Spec2d,226
Stingray,1
TARDIS,4
WCSTools,72


In [26]:
# What papers mentioned software and gave a recognizable form of credit?
rec_credit_only_files = pd.DataFrame(rec_credit_only['File_Name'])
rec_credit_only_files.reset_index(drop=True, inplace=True)
rec_credit_only_files.to_csv('rec_credit_only_files_082819.csv', index=False)

In [27]:
# Total number of unique papers with software aliases that gave no recognizable form of credit by package
no_rec_credit_only = XML_results.loc[XML_results['rec_credit'] == "no"]
no_rec_credit_count = pd.DataFrame({'count' : no_rec_credit_only.groupby(["Software_Package"])['File_Name'].nunique()})
# count of files with alias mentions that aren't clearly recognizable credit
no_rec_credit_count.to_csv("rec_credit_count_082819.csv")
# subset of files with alias mentions that aren't clearly recognizable
no_rec_credit_only_files = pd.DataFrame(no_rec_credit_only['File_Name'])
no_rec_credit_only_files.reset_index(drop=True, inplace=True)

# of the "no-recognizable credit" files, which ones have no aliases whatsoever that point to recogniable credit? 
no_rec_credit_only_files = no_rec_credit_only_files[~no_rec_credit_only_files['File_Name'].isin(rec_credit_only_files['File_Name'])].dropna()
no_rec_credit_only_files.to_csv('rec_credit_only_files_082819.csv', index=False)
no_rec_credit_only_files

Unnamed: 0,File_Name
80,apj_826_2_191.xml
104,apj_846_1_24.xml
106,apj_847_1_44.xml
124,apjl_835_1_L8.xml
144,apjs_231_1_13.xml
170,apj_762_2_113.xml
171,apj_808_2_165.xml
172,apj_819_1_61.xml
179,10.1086_427688.xml
180,10.1086_508051.xml


Example article that mentions "AstroPy" without any attribution: https://iopscience.iop.org/article/10.3847/0004-637X/826/2/191/pdf

In [28]:
# Trends over time for each AAS Journal

XML_journal_year = pd.DataFrame(XML_results.groupby(['Journal_Title', 'Software_Package', 'Pub_Year'])['File_Name'].nunique())
XML_journal_year.to_csv("XML_journal_year_082819.csv")
XML_journal_year

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,File_Name
Journal_Title,Software_Package,Pub_Year,Unnamed: 3_level_1
The Astronomical Journal,Astropy,2014,4
The Astronomical Journal,Astropy,2015,5
The Astronomical Journal,Astropy,2016,14
The Astronomical Journal,Astropy,2017,32
The Astronomical Journal,Astropy,2018,21
The Astronomical Journal,RADMC-3D,2010,2
The Astronomical Journal,RADMC-3D,2016,1
The Astronomical Journal,RADMC-3D,2017,3
The Astronomical Journal,RADMC-3D,2018,1
The Astronomical Journal,SAOImage DS9,1998,3
