# Extracting Headings from the Publications about Exoplanets in ArXiv

## Loading the packages

In [60]:
import pandas as pd
import re
import bs4 as bs
from collections import *

## Loading the dataframe into Python

When loading, it is important to pass `orient='table'`: the dataframe was exported with this option (so that the exported file is valid JSON), and it has to be read back the same way. Rows without an XML path are dropped, and the head of the resulting dataframe is shown below.

In [34]:
# Load the exported table, keep only the columns used below and drop the rows
# whose xmlpath is the literal string 'None'.
# The trailing .copy() makes the filtered frame independent, so the column
# assignment that follows does not write into a slice of another frame
# (which is what triggers pandas' SettingWithCopyWarning).
dfExoplanets = (
    pd.read_json('./data/dfExoplanetsArxiv_v1.json', orient='table')
    .loc[:, ['authors', 'title', 'abstract', 'xmlpath']]
    .loc[lambda frame: frame.xmlpath != 'None']
    .copy()
)
# Normalise the stored paths: every '..' becomes '.' and every 'xml/' is removed.
dfExoplanets['xmlpath'] = [
    path.replace('..', '.').replace('xml/', '') for path in dfExoplanets.xmlpath
]

In [35]:
# Display the first rows of the filtered dataframe as a quick sanity check.
dfExoplanets.head()

Unnamed: 0,authors,title,abstract,xmlpath
0,[Gerard R. Lemaitre],Active Optics in Astonomy - Modeling of freefo...,Active optics techniques on large telescopes a...,./data/paper/1901/05650v1.tei.xml
1,"[R. Poleski, B. S. Gaudi, Xiaojia Xie, A. Udal...",A Wide Orbit Exoplanet OGLE-2012-BLG-0838Lb,We present the discovery of a planet on a very...,./data/paper/1901/05466v1.tei.xml
2,"[L. Affer, M. Damasso, G. Micela, E. Poretti, ...",HADES RV programme with HARPS-N at TNG. X. A s...,The HArps-n red Dwarf Exoplanet Survey is prov...,./data/paper/1901/05338v1.tei.xml
3,"[B. Fuhrmeister, S. Czesla, J. H. M. M. Schmit...",The CARMENES search for exoplanets around M dw...,"We use spectra from CARMENES, the Calar Alto h...",./data/paper/1901/05173v1.tei.xml
4,"[Gabriel A. Caceres, Eric D. Feigelson, G. Jog...",AutoRegressive Planet Search: Methodology,The detection of periodic signals from transit...,./data/paper/1901/05116v1.tei.xml


## Extracting headings of papers

This function uses Beautiful Soup to search the whole XML document and collects the text of every `<head>` element found inside a `<div>` of the document's `<body>`.

In [36]:
def findHeadingsXML(soup):
    """Collect the text of every <head> element nested in a <div> of a <body>.

    Parameters
    ----------
    soup : object exposing ``find_all(tag_name)``
        The parsed XML document (a ``BeautifulSoup`` object in this notebook).

    Returns
    -------
    list of str
        Heading texts in the order they are encountered. Each <head> element
        contributes exactly once, even when its <div> is nested inside
        another <div>. An empty list is returned when nothing matches or when
        the document cannot be traversed (e.g. ``soup`` is ``None``).
    """
    headings = []
    seen = set()
    try:
        for body in soup.find_all('body'):
            for div in body.find_all('div'):
                for head in div.find_all('head'):
                    # find_all searches all descendants, so a <head> inside a
                    # nested <div> is reached once per enclosing <div>.
                    # Identity (not equality) is used for de-duplication so
                    # that two distinct headings with identical text are
                    # both kept.
                    if id(head) in seen:
                        continue
                    seen.add(id(head))
                    headings.append(head.text)
    except Exception:
        # Best effort: a document that cannot be traversed yields no headings
        # (partial results are discarded), while KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return []
    return headings

Applying the function to the XML file of every paper:

In [37]:
# Parse the XML file of every paper and collect its headings.
# headinglist ends up with one list of headings per row of dfExoplanets,
# in the same order as dfExoplanets.xmlpath.
headinglist = []
for xml_path in dfExoplanets.xmlpath:
    # The context manager closes each file as soon as it has been read;
    # without it, one open handle per paper would be left behind.
    with open(xml_path, "r", encoding="utf8") as infile:
        contents = infile.read()
    soup = bs.BeautifulSoup(contents, 'xml')

    headinglist.append(findHeadingsXML(soup))

Appending the list of headings to the dataframe as a new column and showing the head of the new dataframe:

In [38]:
# headinglist was built by iterating over dfExoplanets.xmlpath, so it holds one
# entry per row in row order and can be assigned directly as a new column.
dfExoplanets['headings'] = headinglist

In [40]:
# Display the first rows again to confirm that the 'headings' column was added.
dfExoplanets.head()

Unnamed: 0,authors,title,abstract,xmlpath,headings
0,[Gerard R. Lemaitre],Active Optics in Astonomy - Modeling of freefo...,Active optics techniques on large telescopes a...,./data/paper/1901/05650v1.tei.xml,"[Introduction, Optical design with a reflectiv..."
1,"[R. Poleski, B. S. Gaudi, Xiaojia Xie, A. Udal...",A Wide Orbit Exoplanet OGLE-2012-BLG-0838Lb,We present the discovery of a planet on a very...,./data/paper/1901/05466v1.tei.xml,"[INTRODUCTION, EPOXI imaging, VVV photometry, ..."
2,"[L. Affer, M. Damasso, G. Micela, E. Poretti, ...",HADES RV programme with HARPS-N at TNG. X. A s...,The HArps-n red Dwarf Exoplanet Survey is prov...,./data/paper/1901/05338v1.tei.xml,"[INTRODUCTION, STELLAR PROPERTIES OF Gl 686, D..."
3,"[B. Fuhrmeister, S. Czesla, J. H. M. M. Schmit...",The CARMENES search for exoplanets around M dw...,"We use spectra from CARMENES, the Calar Alto h...",./data/paper/1901/05173v1.tei.xml,"[Introduction, Observations and data reduction..."
4,"[Gabriel A. Caceres, Eric D. Feigelson, G. Jog...",AutoRegressive Planet Search: Methodology,The detection of periodic signals from transit...,./data/paper/1901/05116v1.tei.xml,"[INTRODUCTION, AUTOREGRESSIVE MODELING, Overvi..."


## Count the headings

In [75]:
# Flatten the per-paper heading lists into one list, lower-casing each heading
# and spelling out '&' so that variants such as 'Results & Discussion' and
# 'results and discussion' are counted together.
listHeadings = [
    heading.lower().replace('&', 'and')
    for paper_headings in headinglist
    for heading in paper_headings
]
# Count only headings that are longer than three characters and consist solely
# of ASCII letters and whitespace (this drops numbered or symbol-laden ones).
# The pattern is a raw string so that '\s' reaches the regex engine without
# being treated as an invalid string escape by the Python parser.
# most_common() orders the pairs by descending count, keeping ties in
# first-seen order.
countHeadings = OrderedDict(
    Counter(
        heading
        for heading in listHeadings
        if len(heading) > 3 and not re.search(r'[^a-zA-Z\s]', heading)
    ).most_common()
)

In [76]:
# Display the heading counts, ordered from most to least frequent.
countHeadings

OrderedDict([('introduction', 3730),
             ('conclusions', 1177),
             ('discussion', 1147),
             ('results', 915),
             ('conclusion', 613),
             ('observations', 416),
             ('summary', 410),
             ('observations and data reduction', 241),
             ('methods', 197),
             ('discussion and conclusions', 184),
             ('summary and conclusions', 170),
             ('luvoir', 168),
             ('results and discussion', 164),
             ('analysis', 163),
             ('data reduction', 135),
             ('summary and discussion', 125),
             ('acknowledgments', 118),
             ('method', 112),
             ('appendix', 108),
             ('data analysis', 98),
             ('parameter', 96),
             ('stellar parameters', 81),
             ('discussion and conclusion', 81),
             ('acknowledgements', 77),
             ('model', 77),
             ('overview', 69),
             ('data', 66),
  