# Handling the MAGIC/PIC FITS database to homogenize and mine some features

In what follows, the idea is to download the (FITS) data offered at the [MAGIC fits database](http://vobs.magic.pic.es/fits/) and republish it in a somewhat similar way.

The processing steps below should provide a detailed description of the decisions taken and of the data access/handling issues found during the workflow.

**Information to have**:
  * files not available for download: from the webpage, a considerable number of files are not available;
  * files not able to open: can happen, and we should know which and possibly why;
  

In [1]:
import pandas
import wget
import bs4

def eprint(string):
    """Print an error message to standard output.

    The message is written as a blank line followed by ``ERROR:`` and the
    text of *string* (formatted with ``str.format``).  Despite the name,
    the output goes to stdout, not stderr.

    The single-argument ``print(...)`` form is used so the line parses
    under both Python 2 and Python 3 and produces the same output.
    """
    print("\nERROR:{}".format(string))

In [2]:
class Web(object):
    """Fetch and parse the HTML index table of the MAGIC FITS web page.

    Class attributes:
      url    -- page to download (can be overridden per instance).
      table  -- attribute filter handed to ``soup.find('table', ...)``.
      _html  -- ``_HTML`` wrapper of the located table; ``None`` until
                ``get_table()`` has been called.
    """
    url   = 'http://vobs.magic.pic.es/fits/'
    table = {'class':'mytable'}
    _html = None

    class _HTML(object):
        """Wrapper around the parsed table element.

        ``fields`` maps each expected header label to its column index.
        The class-level dict is only a template (all ``None``); every
        instance works on its own copy so that indices found in one table
        can never leak into the header check of another one.
        """
        fields = {'Source'   :None,
                  'Article'  :None,
                  'Year'     :None,
                  'Reference':None,
                  'Download' :None}

        def __init__(self,html):
            self._html = html
            # Per-instance copy, reset to None: the header check mutates it.
            self.fields = dict.fromkeys(type(self).fields)

        def html(self):
            """Return the wrapped table element as given to the constructor."""
            return self._html

        def extract_fields(self):
            """Extract source name, article URL and FITS link of each table row.

            The first ``<tr>`` must hold ``<th>`` cells naming every key of
            ``fields``; otherwise ``None`` is returned.  Rows that do not have
            exactly five ``<td>`` cells are skipped.

            Returns a dict ``{'SOURCE': [...], 'URL': [...], 'FITS': [...]}``
            of equally long lists; a 'FITS' entry is ``None`` when the
            download cell carries no link.
            """
            def check_header(row,fields):
                """Record the column index of each known header label.

                Returns True only when every label in *fields* was found.
                """
                for i,cell in enumerate(row.findAll('th')):
                    txt = cell.find(text=True)
                    if txt in fields:
                        fields[txt] = i
                return all(v is not None for v in fields.values())

            def get_doi_url(url):
                """Try to turn an ADS/CDS/arXiv reference URL into a DOI URL.

                The page at *url* is downloaded and its first table row
                mentioning 'doi' is used.  *url* is returned unchanged when it
                is not one of the recognised services or when no such row is
                found.
                """
                from bs4 import BeautifulSoup as BS
                import re
                try:
                    from urllib2 import urlopen           # Python 2
                except ImportError:
                    from urllib.request import urlopen    # Python 3
                if not any(tag in url for tag in ('adsabs','cdsads','arxiv')):
                    return url
                soup = BS(urlopen(url))
                doi_rows = [ tr for tr in soup.findAll('tr')
                             if 'doi' in tr.get_text().lower() ]
                if not doi_rows:
                    # Nothing to resolve; keep the original reference.
                    return url
                doi = doi_rows[0].get_text()
                return 'http://dx.doi.org/' + re.sub('DOI:','',re.sub('\n','',doi))

            def process_row(row,get_doi=False):
                """Return ``(source, url, fits_href)`` for a data row, else None."""
                cells = row.findAll('td')
                if len(cells)!=5:
                    return None
                # Object source name(s) (can be more than one, comma separated)
                src = cells[self.fields['Source']].find(text=True).strip()
                # Article reference (url), usually a ref to ads
                art = cells[self.fields['Article']].find('a',href=True)
                url = art['href']
                if get_doi:
                    url = get_doi_url(url)
                # The year of publication and the bibcode reference columns
                # are not used.
                # FITS file link, to be downloaded later; rows without a link
                # yield None.
                link = cells[self.fields['Download']].find('a',href=True)
                ffile = link['href'] if link is not None else None
                return (src,url,ffile)

            _table = {'SOURCE':[], 'URL':[], 'FITS':[]}
            for i,row in enumerate(self._html.findAll('tr')):
                if i==0 and not check_header(row,self.fields):
                    return None
                vals = process_row(row)
                if vals is not None:
                    src,url,ffile = vals
                    _table['SOURCE'].append(src)
                    _table['URL'].append(url)
                    _table['FITS'].append(ffile)
            return _table


    def __init__(self,url=None):
        if url is not None:
            self.url = url

    def __str__(self):
        return str(self.html)

    def get_table(self,table=None):
        """Download ``self.url`` and keep the first table matching the filter.

        *table*, when given as a dict, replaces the attribute filter stored
        in ``self.table``.  The result of ``soup.find`` is wrapped as
        ``self._html`` even when no table was found (it is then ``None``
        inside the wrapper).
        """
        from bs4 import BeautifulSoup as BS
        try:
            from urllib2 import urlopen           # Python 2
        except ImportError:
            from urllib.request import urlopen    # Python 3
        if table is not None and isinstance(table,dict):
            self.table = table
        soup = BS(urlopen( self.url ).read(),"html.parser")
        table = soup.find('table', self.table )
        self._html = self._HTML(table)

    @property
    def html(self):
        """The located table element, or None before ``get_table()`` was called."""
        if self._html is None:
            return None
        return self._html.html()

    def get_table_fields(self):
        """Shortcut to ``_HTML.extract_fields()``; requires ``get_table()`` first."""
        return self._html.extract_fields()
    

In [3]:
# Download the index page, locate its table, and render that table inline
# in the notebook (the last expression of the cell is what gets displayed).
web = Web()
web.get_table()
from IPython.display import HTML
HTML(unicode(web.html))

Source,Article,Year,Reference,Download
Mrk 421,Unprecedented study of the broadband emission of Mrk 421 during flaring activity in March 2010,2015,(2014arXiv1412.3576T),FITS
Crab Nebula,Measurement of the Crab nebula spectrum over three decades in energy with the MAGIC telescopes,2015,(2014arXiv1406.6892M),FITS
PG 1553+113,Probing the very-high-energy γ-ray spectral curvature in the blazar PG 1553+113 with the MAGIC telescopes,2015,(2014arXiv1408.1975M),FITS
1ES 0806+524,MAGIC detection of short-term variability of the high-peaked BL Lac object 1ES 0806+524,2015,"(J. Aleksić et al., MNRAS 451, 5258-5269)",FITS
HESS J1857+026,MAGIC reveals a complex morphology within the unidentified gamma-ray source HESS J1857+026,2014,"(J. Aleksić et al., A&A; 571, A96)",FITS
J2001+439,First broadband characterization and redshift determination of the VHE blazar MAGIC J2001+439,2014,(2014arXiv1409.3389M),FITS
Mrk 501,Multiwavelength Observations of Mrk 501 in 2008,2014,(2014arXiv1410.6391M),FITS
1ES 0033+595,Discovery of very high energy gamma-ray emission from the blazar 1ES 0033+595 by the MAGIC telescopes,2014,(2014arXiv1410.7059A),FITS
IC 310,Black Hole lightning due to particle acceleration at sub-horizon scales,2014,"(J. Aleksić et al., Science 346, 1080-1084)",FITS
PKS 1424+240,MAGIC long-term study of the distant TeV blazar PKS 1424+240 in a multiwavelength context,2014,"(J. Aleksić et al., A&A; 567, A135)",FITS


In [4]:
# Column lists ('SOURCE', 'URL', 'FITS') extracted from the HTML table;
# None when the table header does not carry the expected labels.
f = web.get_table_fields()

In [5]:
class Local(pandas.DataFrame):
    """``pandas.DataFrame`` whose ``describe`` prints a summary and a null report."""
    def __init__(self,table):
        super(Local,self).__init__(table)

    def describe(self, *args, **kwargs):
        """Print the standard ``describe`` summary followed by a null report.

        Any arguments are forwarded to ``pandas.DataFrame.describe``.  After
        the summary, the per-column "has any null" flags are printed and,
        for each column that has nulls, the index values of the null rows.

        Nothing is returned (unlike the parent method): the report is only
        printed.
        """
        print(super(Local,self).describe(*args, **kwargs))
        print("\n-> Has Nil?")
        hows_nil = self.isnull().any()
        print(hows_nil)
        for c in hows_nil.index:
            if not hows_nil[c]:
                continue
            print("\n-> Indexes where column '{}' is null:".format(c))
            print(self[self[c].isnull()].index.values)


In [6]:
# Wrap the scraped columns in the DataFrame subclass, print its summary and
# null report, then print the whole table.
table = Local(f)
table.describe()
print table

                                       FITS         SOURCE  \
count                                    67             99   
unique                                   67             68   
top     mfits/base/MAGIC_2010_CygnusX3.fits  Markarian 421   
freq                                      1              6   

                                                      URL  
count                                                  99  
unique                                                 99  
top     http://www.sciencemag.org/content/320/5884/175...  
freq                                                    1  

-> Has Nil?
FITS       True
SOURCE    False
URL       False
dtype: bool

-> Indexes where column 'FITS' is null:
[ 0  2  4  5  6  7 12 17 18 28 29 33 34 35 36 37 41 42 43 49 50 53 57 67 73
 83 84 88 90 93 96 98]
                                      FITS                    SOURCE  \
0                                     None                   Mrk 421   
1    mfits/base/MAGIC_2015_CrabN

In [7]:
def clean_dir(dir,ext):
    """Make sure directory *dir* exists and delete the files in it matching *ext*.

    dir -- directory path; created (including missing parents) when absent.
    ext -- glob pattern relative to *dir*, e.g. ``"*"`` or ``"*.fits"``.

    Only regular files and symbolic links are removed; sub-directories that
    match the pattern are left untouched (``os.remove`` cannot delete them
    and would abort the clean-up half way).  If *dir* exists but is not a
    directory nothing is done.
    """
    import os
    from glob import glob
    if not os.path.exists(dir):
        os.makedirs(dir)
    if os.path.isdir(dir):
        for path in glob(os.path.join(dir,ext)):
            if os.path.isfile(path) or os.path.islink(path):
                os.remove(path)

class Download(object):
    """Download files into one directory and keep an md5 listing of them.

    outdir -- directory receiving the downloads; the listing is the file
              ``md5sum.txt`` inside it.
    clean  -- when true, the directory is (created and) emptied on
              construction.
    """
    def __init__(self,outdir,clean=True):
        import os
        self._outdir = outdir
        if clean:
            self.clean_outdir()
        self._md5 = os.path.join(self._outdir,'md5sum.txt')

    def download(self,url):
        """Fetch *url* into the output directory; return what ``wget.download`` returns."""
        import wget
        return wget.download(url,out=self._outdir)

    def clean_outdir(self,ext="*"):
        """Delete the files matching glob *ext* from the output directory."""
        # Pass the instance's directory (not the builtin ``dir``).
        clean_dir(self._outdir,ext)

    def create_md5sum_file(self,files_list):
        """Hash every file of *files_list* and write the md5 listing.

        Entries are normally names relative to the output directory.  An
        entry that is not found there but exists as given (for instance a
        path that already includes the output directory) is read as given
        and recorded under its base name.

        Returns the ``{name: md5 hex digest}`` dict that was written, one
        ``"<hash>    <name>"`` line per entry.
        """
        import hashlib
        import os
        outdir = self._outdir
        md5 = {}
        for f in files_list:
            name = f
            fname = os.path.join(outdir,f) if outdir else f
            if outdir and not os.path.isfile(fname) and os.path.isfile(f):
                fname,name = f,os.path.basename(f)
            with open(fname,'rb') as fp:
                md5[name] = hashlib.md5(fp.read()).hexdigest()
        with open(self._md5,'w') as fp:
            for _file,_hash in md5.items():
                fp.write("%s    %s\n"%(_hash,_file))
        return md5

    def is_exist_files(self,files_list):
        """Tell whether every name of *files_list* is already available locally.

        When the md5 listing exists, the answer is whether all names are
        listed in it (hashes are not re-computed).  Otherwise the ``*.fits``
        files present in the output directory are inspected and, if all
        names are found, the md5 listing is created for them.
        """
        import os
        names = list(files_list)

        # First we see if there is a file list (md5sum) to look for
        if os.path.isfile(self._md5):
            listed = set(read_md5sum_file(self._md5))
            return all(name in listed for name in names)

        # No md5-file: check if the files are actually there (inside the dir)
        present = set(read_dir_content(self._outdir,'*.fits'))
        found = all(name in present for name in names)
        if found:
            self.create_md5sum_file(names)
        return found

def read_dir_content(dir,ext='*.fits'):
    """Return the base names of the entries of *dir* matching glob pattern *ext*.

    The order is whatever ``glob`` yields.  ``os`` is imported locally so the
    function does not depend on a module-level import made elsewhere.
    """
    import os
    from glob import glob
    return [ os.path.basename(f) for f in glob(os.path.join(dir,ext)) ]

def read_md5sum_file(md5txt):
    """Read an md5 listing and return it as ``{filename: hash}``.

    Each line is expected to be ``<hash><whitespace><filename>``.  Only the
    first run of whitespace separates the two fields, so file names
    containing spaces are kept whole.  Blank lines and lines without a file
    name are skipped.  When a file name appears more than once the last
    line wins.

    *md5txt* must be an existing file (checked with an assertion).
    """
    import os
    assert os.path.isfile(md5txt)

    md5 = {}
    with open(md5txt,'r') as mdf:
        for line in mdf:
            parts = line.split(None,1)
            if len(parts) != 2:
                continue
            _h,_f = parts
            md5[_f.strip()] = _h.strip()
    return md5

In [8]:
# Local directory for the FITS files; clean=False keeps whatever was
# downloaded by a previous run.
download_dir = 'FITS_pic/'
download_handler = Download(download_dir,clean=False)

In [9]:
import os

# Base names of the FITS files advertised on the web page (rows without a
# link are dropped here and end up as NaN in the table below).
fits_files = table.FITS.dropna().apply(os.path.basename)
if download_handler.is_exist_files(fits_files):
    print("FITS files exist locally. Passing by download step..")
    fits_files = fits_files.apply(lambda f: os.path.join(download_dir,f))
else:
    print("FITS files do not exist locally. Downloading them...")
    furls = web.url + table.FITS.dropna()
    fits_files = furls.apply(download_handler.download)
    # download() hands back paths that already include the download
    # directory, whereas the md5 helper joins that directory itself:
    # give it the bare file names.
    md5s = download_handler.create_md5sum_file(fits_files.apply(os.path.basename))
    del furls

# Replace the remote links by the local paths (index-aligned assignment).
table['FITS'] = fits_files

FITS files exist locally. Passing by download step..


In [10]:
# Show the table now that the FITS column holds local paths (NaN where no file was offered).
print table

                                    FITS                    SOURCE  \
0                                    NaN                   Mrk 421   
1    FITS_pic/MAGIC_2015_CrabNebula.fits               Crab Nebula   
2                                    NaN               PG 1553+113   
3       FITS_pic/MAGIC_2015_1ES0806.fits              1ES 0806+524   
4                                    NaN            HESS J1857+026   
5                                    NaN                 J2001+439   
6                                    NaN                   Mrk 501   
7                                    NaN              1ES 0033+595   
8         FITS_pic/MAGIC_2014_Ic310.fits                    IC 310   
9   FITS_pic/MAGIC_2014_PKS1424+240.fits              PKS 1424+240   
10         FITS_pic/MAGIC_2014_3C58.fits                     3C 58   
11        FITS_pic/MAGIC_2014_AEaqr.fits                AE Aquarii   
12                                   NaN              PKS 1510-089   
13        FITS_pic/M

#### Header keywords
 First, regarding MAGIC data in particular, we should take a look at chapter 4 of their [FITS format for MAGIC data](http://vobs.magic.pic.es/fits/mfits/tdas/tdas-fits.pdf) document. There we find the following list of keywords:

__FITS standard__, _highlighted by MAGIC_:
 * AUTHOR
 * DATE
 * DATE-OBS
 * EXTNAME
 * OBJECT
 * ORIGIN
 * TELESCOP
 
__MAGIC__, _to describe extension content_:
 * CONFLEVE
 * EFFICIEN
 * EMAX
 * EMIN
 * PERIOD
 * PHIMAX
 * PHIMIN
 * REFPAPER
 * SIZEMIN
 * SIZEMAX
 * TOBS
 * ZMIN
 * ZMAX
 * TZERO
 * VERSION

__MAGIC__, _SPECTRUM extension_:
 * ISINTEGR : 'F' means density flux; 'I' means integrated flux.
 
On the other hand, there is the [FITS standard](http://heasarc.gsfc.nasa.gov/docs/fcg/standard_dict.html) and the [FITS commons](http://heasarc.gsfc.nasa.gov/docs/fcg/common_dict.html) set of keywords.


```
This data dictionary lists the 53 keywords currently defined in the
FITS Standard:

(blank)  CROTAn   EQUINOX  NAXISn   TBCOLn   TUNITn
AUTHOR   CRPIXn   EXTEND   OBJECT   TDIMn    TZEROn
BITPIX   CRVALn   EXTLEVEL OBSERVER TDISPn   XTENSION
BLANK    CTYPEn   EXTNAME  ORIGIN   TELESCOP
BLOCKED  DATAMAX  EXTVER   PCOUNT   TFIELDS
BSCALE   DATAMIN  GCOUNT   PSCALn   TFORMn
BUNIT    DATE     GROUPS   PTYPEn   THEAP
BZERO    DATE-OBS HISTORY  PZEROn   TNULLn
CDELTn   END      INSTRUME REFERENC TSCALn
COMMENT  EPOCH    NAXIS    SIMPLE   TTYPEn
```

```
     Dictionary of Commonly Used FITS Keywords

This data dictionary contains FITS keywords that have been widely used
within the astronomical community.  It is recommended that these
keywords only be used as defined here.  These keywords may be grouped
within the following 7 broad categories:

1. Keywords that describe the data or the FITS file itself:

    TITLE FILENAME FILETYPE ROOTNAME
    PROGRAM CREATOR CONFIGUR
    NEXTEND HDUNAME HDUVER HDULEVEL
    TLMINn TLMAXn TDMINn TDMAXn TDBINn
    TSORTKEY PROGRAM CREATOR CONFIGUR
    HDUCLASS HDUDOC HDUVERS HDUCLASn

2.  Keywords that describe the observation:

    SUNANGLE MOONANGL
    RA DEC RA_NOM DEC_NOM
    RA_OBJ DEC_OBJ RA_PNT DEC_PNT PA_PNT
    RA_SCX DEC_SCX RA_SCY DEC_SXY RA_SCZ DEC_SCZ
    ORIENTAT AIRMASS LATITUDE
    OBJNAME OBS_ID

3.  Keywords that describe the instrument that took the data:

    OBS_MODE DATAMODE
    APERTURE DETNAM FILTER FILTERn GRATING GRATINGn
    SATURATE

4.  Keywords that give the date or duration of the observation:

    TIME-OBS TIME-END DATE-END
    EXPOSURE EXPTIME TELAPSE ELAPTIME ONTIME LIVETIME

5.  Keywords that denote non-standard FITS keyword format conventions:

    HIERARCH INHERIT CONTINUE

6.  File checksum keywords:

    CHECKSUM DATASUM CHECKVER

7.  Hierarchical file grouping keywords:

    GRPNAME GRPIDn GRPLCn
```

---

#### Improvement

 * Given all those lists of keywords, I will just add `RA` and `DEC`.


In [11]:
# The following keywords may be missing from the extension (BinTableHDU) header
# and be present only in the PrimaryHDU header.
# That is because the MAGIC data handling system chose not to repeat such data unnecessarily.
# See the 'General Keywords' section (4.1) of the document 'FITS Format for MAGIC data', version 0.2.
#
# FITS-standard keywords highlighted by the MAGIC document.
MAGIC_FITS_STANDARD_KEYWORDS = [
    'AUTHOR',
    'DATE',
    'DATE-OBS',
    'EXTNAME',
    'OBJECT',
    'ORIGIN',
    'TELESCOP']
# MAGIC-specific keywords describing the content of an extension.
MAGIC_FITS_EXTENSION_KEYWORDS = [
    'CONFLEVE',
    'EFFICIEN',
    'EMAX',
    'EMIN',
    'PERIOD',
    'PHIMAX',
    'PHIMIN',
    'REFPAPER',
    'SIZEMIN',
    'SIZEMAX',
    'TOBS',
    'ZMIN',
    'ZMAX',
    'TZERO',
    'VERSION']
# These keywords will be used to update the SPECTRUM extensions with *all* of them
# (this is the whitelist consulted by merge_header_keywords).
FITS_KEYWORDS = MAGIC_FITS_STANDARD_KEYWORDS + MAGIC_FITS_EXTENSION_KEYWORDS

In [12]:
# Now we can process the fits files themselves.
# We start by noting that we want the SPECTRUM Data Unit(s)
#  available (or not) in the fits files; discard the other DU.
# Things we want to do:
# - get the OBJECT name
# - get the each object position
# - get the observation date
# - transform the data vectors (x) to frequency(Hz) and (y) to flux(erg/s/cm2)
# Then we should follow the following workflow:
# - open the fits file
# - find the necessary data unit (SPECTRUM)
# - open its header
#  - get some keywords from the header
# - open its data; data here are vectors
#  - it can be from 2 to 4 vectors
#   - energy
#   - flux
#   - Denergy
#   - Dflux
#  - convert the ?energy vectors to 'Hz' units
#  - convert the ?flux vectors to 'erg/s/cm2' units

# Here we just define the functions we'll need..
def resolve_name(name):
    """Resolve an object *name* to ``(ra, dec)`` using astropy's name resolver.

    Returns the tuple ``(icrs.ra.value, icrs.dec.value)`` of the coordinate
    returned by ``astropy.coordinates.get_icrs_coordinates``, or ``None``
    when the lookup fails for any reason (best effort).

    ``Exception`` is caught rather than everything, so that Ctrl-C and
    interpreter shutdown are not swallowed during the lookup.
    """
    from astropy.coordinates import get_icrs_coordinates as get_coords
    try:
        icrs = get_coords(name)
        pos = (icrs.ra.value,icrs.dec.value)
    except Exception:
        pos = None
    return pos

def fix_dateobs(date):
    """Normalise a date-like value to the string ``YYYY-MM-DD``.

    *date* is converted with ``str`` and split on ``-``.  A time part
    separated by ``T`` or whitespace is dropped first, so
    ``'2011-03-15T12:00:00'`` keeps its day.

    - If the year is missing or not an integer, ``'1999-01-01'`` is returned.
    - A missing or non-integer month or day defaults to 1.

    No range check is made on the components; they are only zero-padded.
    """
    import re
    text = re.split(r'[T\s]', str(date).strip(), maxsplit=1)[0]
    fields = text.split('-')
    try:
        y = int(fields[0])
    except ValueError:
        return '1999-01-01'

    def _component(i):
        # Month/day: fall back to 1 when absent or not a number.
        try:
            return int(fields[i])
        except (IndexError, ValueError):
            return 1

    m = _component(1)
    d = _component(2)
    return '{:04d}-{:02d}-{:02d}'.format(y,m,d)

def merge_header_keywords(header_p,header_s):
    """Merge the relevant keywords of two headers into a plain dict.

    Only keywords listed in ``FITS_KEYWORDS`` are copied.  *header_p* is
    processed first and *header_s* second, so a keyword present in both
    ends up with the value of *header_s* (the extension's header has the
    highest priority).  The 'COMMENT' entries of both headers are
    concatenated, *header_p* first, under the 'COMMENT' key, which is
    always present (possibly as an empty list).
    """
    merged = {'COMMENT':[]}
    for hdr in (header_p,header_s):
        for key in list(set(hdr.keys()).intersection(FITS_KEYWORDS)):
            merged[key] = hdr[key]
        if 'COMMENT' in hdr.keys():
            merged['COMMENT'].extend(hdr['COMMENT'])
    return merged
    
def trans_data(table):
    """Convert, in place, the spectrum columns of *table* to 'Hz' and 'erg s-1 cm-2'.

    *table* is indexed by column name, exposes ``colnames`` and its columns
    carry a ``unit`` (presumably an ``astropy.table.Table`` -- it is what
    ``Spectrum.transform_data`` passes in).

    The 'energy' and 'flux' columns are replaced by their converted values.
    'Denergy' / 'Dflux', when present, are converted and renamed to
    'energy_error' / 'flux_error'; when absent, those columns are created
    filled with the null value -999.  Values equal to 0.0 (and fluxes
    above 0.001) are replaced by -999.

    Returns True on success, and False (after printing a message) when the
    converter selected for the flux unit yields None.  A unit without an
    entry in ``conv`` fails at the dictionary lookup.
    """
    import numpy as np
    from astropy import units
    Unit = units.Unit
    
    # NOTE(review): this call is not scoped to the function -- presumably it
    # changes astropy's enabled equivalencies for the whole session; confirm.
    units.set_enabled_equivalencies(units.spectral())
    uEn = Unit('Hz')
    uFn = Unit('erg s-1 cm-2')
    uEc = Unit('TeV')
    # Converters keyed by the unit of the input column.
    #  - flux converters take (flux, energy); the 'TeV-1' (differential) one
    #    multiplies the flux by the squared energy expressed in TeV;
    #  - 'ph s-1 cm-2' deliberately maps to None, i.e. "cannot be transformed";
    #  - the only energy converter is the one for 'GeV' and it takes a single
    #    argument.
    conv = {Unit('ph TeV s-1 cm-2') : lambda x,y: (x/Unit('ph')).to(uFn),
            Unit('ph TeV-1 s-1 cm-2') : lambda x,y: ((y.to(uEc)**2)*(x/Unit('ph'))).to(uFn),
            Unit('ph s-1 cm-2') : lambda x,y: None,
            Unit('GeV') : lambda x: x.to(uEn, equivalencies=units.spectral())}

    # Energy column: converted with the one-argument converter of its unit.
    vE = table['energy']
    uE = vE.unit
    vEn = conv[uE](vE)

    # Flux column: converted using the *original* energy column (vE).
    vF = table['flux']
    uF = vF.unit
    vFn = conv[uF](vF,vE)

    if vFn is None:
        print("Flux data could not be transformed. Unrecognised units ({})?".format(uF.to_string()))
        return False

    def set_null(column,null_expression,new_null_value=-999):
        # Overwrite the entries selected by null_expression(column) with the
        # null marker and record that marker on the column.
        _idx = np.where(null_expression(column))
        column[_idx] = new_null_value
        column.null = new_null_value
        
    nullval = -999
    # NOTE(review): new_nullval is assigned but not used below.
    new_nullval = nullval
    
    table['energy'] = vEn
    table['energy'].unit = vEn.unit
    set_null( table['energy'], lambda x:x==0.0)
    table['flux'] = vFn
    table['flux'].unit = vFn.unit
    set_null( table['flux'], lambda x:x==0.0)
    # NOTE(review): fluxes above 0.001 are also flagged as null; the reason
    # for this threshold is not stated here -- confirm.
    set_null( table['flux'], lambda x:x>0.001)

    if 'Denergy' in table.colnames:
        # Energy uncertainty: same one-argument conversion as the energy.
        vDE = table['Denergy']
        uDE = vDE.unit
        vDEn = conv[uDE](vDE)
        table['energy_error'] = vDEn
        table['energy_error'].unit = vDEn.unit
        set_null( table['energy_error'], lambda x:x==0.0)
        del table['Denergy']
    else:
        # No uncertainty given: integer column filled with the null value,
        # carrying the unit of the (converted) energy column.
        uDE = table['energy'].unit
        vDEn = np.asarray([nullval]*len(vE),dtype=int)
        table['energy_error'] = vDEn
        table['energy_error'].unit = uDE
        table['energy_error'].null = nullval

    if 'Dflux' in table.colnames:
        vDF = table['Dflux']
        uDF = vDF.unit
        vDFn = conv[uDF](vDF,vE) # Notice we use the energy bin/value of the measurement.
        table['flux_error'] = vDFn
        table['flux_error'].unit = vDFn.unit
        set_null( table['flux_error'], lambda x:x==0.0)
        del table['Dflux']
    else:
        # No uncertainty given: integer column filled with the null value,
        # carrying the unit of the (converted) flux column.
        uDF = table['flux'].unit
        vDFn = np.asarray([nullval]*len(vE),dtype=int)
        table['flux_error'] = vDFn
        table['flux_error'].unit = uDF
        table['flux_error'].null = nullval

    return True

# def header_to_dict(header):
#     from collections import OrderedDict
#     out = OrderedDict()
#     for card in header.cards:
#         k = card[0]
#         v = card[1]
#         c = card[2]
#         out[k] = v   # 'c' is out for the time being
#     return out

In [13]:
class SourceFits(object):
    """
    A source name together with its FITS file and the spectra found in it.

    ``len()`` is the number of spectra loaded so far.  Truthiness follows
    the underlying ``Fits`` object, on Python 2 (``__nonzero__``) and on
    Python 3 (``__bool__``) alike; without the latter Python 3 would fall
    back to ``__len__`` and report False before any spectrum is loaded.
    """
    def __init__(self,source_name,filename):
        self._source = source_name
        self._file = Fits(filename)
        self._spectra = []

    def __len__(self):
        return len(self._spectra)

    def __nonzero__(self):
        return bool(self._file)

    __bool__ = __nonzero__

    def hasSpectra(self):
        """Return False for an invalid file, else what ``hasSpectrum()`` of the file reports."""
        if not self._file.isValid():
            return False
        return self._file.hasSpectrum()

    def loadSpectra(self):
        """Load the spectra from the file when it has any."""
        if self.hasSpectra():
            self._spectra = self._file.get_spectra()
        else:
            assert len(self._spectra)==0

    def getSpectra(self):
        """Return the list of spectra, loading them on first use."""
        if not self._spectra:
            self.loadSpectra()
        return self._spectra

    
class Fits(object):
    """
    Handles the file's information set.

    The file is opened on construction; when opening fails the object stays
    usable but invalid (``isValid()`` is False and the object is falsy).
    Data units whose name contains ``_specHDU`` are the spectra.
    """
    _file = None
    _hdulist = None
    _specHDU = 'SPECTRUM'

    def __init__(self, filename):
        self._file = filename
        self.read_file()

    def __nonzero__(self):
        return self._file is not None and self._hdulist is not None

    # Python 3 spelling of the truth-value hook.
    __bool__ = __nonzero__

    @property
    def name(self):
        """The file name given to the constructor."""
        return self._file

    @property
    def header(self):
        """Header of the first (primary) data unit."""
        return self._hdulist[0].header

    def read_file(self):
        """Open the file with ``astropy.io.fits``; on any error keep ``None``."""
        from astropy.io import fits
        try:
            hdulist = fits.open(self._file)
        except Exception:
            # Best effort: an unreadable (or missing, e.g. NaN) file name
            # simply yields an invalid object.
            hdulist = None
        self._hdulist = hdulist

    def isValid(self):
        """True when the file could be opened."""
        return self._hdulist is not None

    def hasSpectrum(self):
        """None for an invalid file, else whether any data-unit name contains ``_specHDU``."""
        if not self.isValid():
            return None
        duname = self._specHDU
        return any(duname in du.name for du in self._hdulist)

    def get_spectra(self):
        """Return one ``Spectrum`` per matching data unit, or None when there is none."""
        if not self.hasSpectrum():
            return None
        duname = self._specHDU
        return [ Spectrum(self,hdu=i)
                 for i,du in enumerate(self._hdulist) if duname in du.name ]

    def get_hdu(self,hdu):
        """Return data unit *hdu* of the opened file, or None for an invalid file."""
        if not self.isValid():
            return None
        return self._hdulist[hdu]

    
class Spectrum(object):
    """
    Handles fits Spectrum HDU

    Wraps data unit number *hdu* of a ``Fits`` object.  The data table and
    the merged metadata are filled in by ``transform_data()`` and
    ``upgrade_metadata()``; until a table is set the object is falsy and
    has length 0.
    """
    def __init__(self,fits,hdu):
        assert isinstance(fits,Fits)
        self._hdu = fits.get_hdu(hdu)
        self._file = fits
        self._i = hdu
        self._table = None
        self._meta = None

    def __len__(self):
        return len(self._table) if self else 0

    def __nonzero__(self):
        return bool(self._table)

    # Python 3 spelling of the truth-value hook.
    __bool__ = __nonzero__

    def __str__(self):
        return str(self._table)

    @property
    def label(self):
        """The 'EXTNAME' of the wrapped data unit."""
        return self.header['EXTNAME']

    @property
    def header(self):
        """Header of the wrapped data unit."""
        return self._hdu.header

    def upgrade_metadata(self, metadata):
        """Build the metadata dict of this spectrum; return True on success.

        The primary and extension headers are merged through
        ``merge_header_keywords``; then:

        - 'OBJECT' must be defined, else False is returned;
        - 'RA'/'DEC' come from 'SRCPOS1'/'SRCPOS2' of the extension header,
          else of the primary header, else from resolving the object name;
          False is returned when no position is obtained;
        - 'DATE-OBS' is normalised with ``fix_dateobs``;
        - every item of *metadata* is added under its upper-cased key.

        The source-position keywords are looked up in the raw headers
        because the merged dict only carries the whitelisted keywords.
        """
        phdu_header = self._file.header
        shdu_header = self.header
        header = merge_header_keywords(phdu_header,shdu_header)
        assert isinstance(header,dict)

        if 'OBJECT' not in header:
            header['OBJECT'] = None
            print("\n'OBJECT' keyword not defined.")
            return False

        # Extension header first (highest priority), then the primary one.
        pos = None
        for hdr in (shdu_header,phdu_header):
            if ('SRCPOS1' in hdr) and ('SRCPOS2' in hdr):
                pos = hdr['SRCPOS1'],hdr['SRCPOS2']
                break
        else:
            pos = resolve_name(header['OBJECT'])
        if pos is None:
            print("\nObject position not defined, (RA,DEC) unresolved.")
            return False

        header['RA'],header['DEC'] = pos

        # A missing 'DATE-OBS' is handled by fix_dateobs (default date).
        header['DATE-OBS'] = fix_dateobs(header.get('DATE-OBS'))

        for k,v in metadata.items():
            header[k.upper()] = v

        self._meta = header
        return True

    def transform_data(self):
        """Read the data unit into a table and convert it with ``trans_data``.

        The table is kept only when the conversion reports success.
        """
        # Read the extension directly into an astropy.table
        from astropy.table import Table
        table = Table.read(self._file.name,hdu=self._i)
        if trans_data(table):
            self._table = table

    def retrieve_table(self):
        """Return a copy of the table with the metadata merged into ``meta``.

        Returns None when there is no (non-empty) table.  The metadata is
        merged into the internal table as well; it is skipped when
        ``upgrade_metadata`` has not produced any.
        """
        if not self._table:
            return None
        table = self._table
        if self._meta:
            table.meta.update(self._meta)
        return table.copy()

    def suggest_output_filename(self,output_dir=None):
        """Return ``<output_dir>/<basename>_<EXTNAME>.fits`` for this spectrum.

        The base name is the one of the input file with every '+' replaced
        by 'p'; its trailing '.fits' (only a literal, final one) is replaced
        by ``_<EXTNAME>.fits``, which is appended when the name has no such
        suffix.  *output_dir* defaults to './'.
        """
        import os
        odir = output_dir if output_dir is not None else './'
        suffix = '.fits'
        basename = os.path.basename(self._file.name).replace('+','p')
        extname = '_' + self._table.meta['EXTNAME'] + suffix
        if basename.endswith(suffix):
            basename = basename[:-len(suffix)]
        return os.path.join(odir,basename + extname)

    def dropna(self, columns, na_value, rows=True):
        """Remove the rows where any of *columns* equals *na_value*.

        Only row-wise removal is implemented (``rows`` must be True).
        Returns the sorted list of removed row indexes.
        """
        assert rows==True
        from numpy import where
        table = self._table
        idx = set()
        for c in columns:
            idx.update(where(table[c] == na_value)[0].tolist())
        idx = sorted(idx)
        table.remove_rows(idx)
        return idx

    
def write_to_fits(spectrum, output_dir=None, votable=False):
    """Write the table of *spectrum* to a FITS file and return the file name.

    The table comes from ``spectrum.retrieve_table()`` and the file name
    from ``spectrum.suggest_output_filename(output_dir)``.  With *votable*
    set, the table is additionally written in 'votable' format to the same
    name with its last five characters replaced by '.vot'.
    """
    table = spectrum.retrieve_table()
    fits_path = spectrum.suggest_output_filename(output_dir)
    table.write(fits_path,format='fits')
    if votable:
        table.write(fits_path[:-5] + '.vot',format='votable')
    return fits_path


In [14]:
def proc_fits_file(fn,source,url_article):
    """
    Returns a (plain) list with all valid spectra in it.

    fn          -- FITS file name (a missing file, e.g. NaN, fails to open).
    source      -- source name, used for the report and stored in SourceFits.
    url_article -- article URL; attached to each spectrum's metadata under
                   the key 'refurl' (upper-cased by ``upgrade_metadata``).

    None is returned when the file cannot be opened or has no SPECTRUM
    data unit.  A spectrum is left out of the list when its metadata check
    fails, when its data cannot be transformed, or when removing the rows
    whose 'energy' or 'flux' equals -999 leaves its table empty.

    Progress is reported on standard output; messages are built with
    %-formatting and single-argument ``print(...)`` calls so the output is
    the same under Python 2 and Python 3.
    """

    print("\n======================================================================")
    print("Taking file:  %s" % (fn,))
    print("Source name:  %s" % (source,))
    print("Article:  %s" % (url_article,))

    src = SourceFits(source,fn)

    if not src:
        eprint("***: File opening failed. Moving on.")
        return None

    if not src.hasSpectra():
        eprint("***: File has no SPECTRUM data-unit in it.")
        return None

    spectra = src.getSpectra()
    assert len(spectra) > 0

    out = []
    for spec in spectra:
        print("\n----------------------------------------------------------------------")
        print("\nProcessing spectrum (DataUnit):  %s" % (spec.label,))

        res = spec.upgrade_metadata(metadata = {'refurl':url_article})
        if not res:
            eprint("***: Metadata (Headers) verification failed. Continue to next spectrum.")
            continue

        spec.transform_data()

        print("")
        print(spec)
        if not spec:
            eprint("***: Spectrum data could not be properly transofrmed. Continue to next spectrum.")
            continue

        del_rows = spec.dropna(columns=['energy','flux'],na_value=-999)
        if del_rows:
            print("\n{} rows eliminated from table: {}.".format(len(del_rows),del_rows))
            if len(spec)==0:
                eprint("***: Table is empty; not to be writen. Continue to next spectrum.")
                continue
        print("\nOutput filename: {}".format(spec.suggest_output_filename(output_dir='')))
        print("**********************************************************************")
        out.append(spec)

    return out

In [15]:
# Process every row's FITS file; the new column holds, per row, the list of
# valid spectra (or None when the file could not be opened or has no spectrum).
table['SPECTRUM'] = table.apply(lambda x: proc_fits_file(x.FITS,x.SOURCE,x.URL), axis=1)


Taking file:  nan
Source name:  Mrk 421
Article:  http://cdsads.u-strasbg.fr/abs/2014arXiv1412.3576T

ERROR:***: File opening failed. Moving on.

Taking file:  FITS_pic/MAGIC_2015_CrabNebula.fits
Source name:  Crab Nebula
Article:  http://cdsads.u-strasbg.fr/abs/2014arXiv1406.6892M

----------------------------------------------------------------------

Processing spectrum (DataUnit):  SPECTRUM

      energy             flux       energy_error     flux_error   
        Hz          erg / (cm2 s)        Hz        erg / (cm2 s)  
----------------- ----------------- ------------ -----------------
1.56025356963e+25 3.62805869585e-13         -999 1.13904928544e-15
2.50087802451e+25 9.43132530586e-13         -999  1.9694134259e-15
3.94770613174e+25  2.3715794771e-12         -999 4.13655965554e-14
 6.1126770758e+25 5.03369606776e-12         -999 8.14063738269e-14
9.51868105355e+25 1.16224204472e-11         -999 1.84607595297e-13
 1.4848630797e+26 2.31956872225e-11         -999  3.8266271044e-

In [16]:
# Display the table, now including the SPECTRUM column.
table

Unnamed: 0,FITS,SOURCE,URL,SPECTRUM
0,,Mrk 421,http://cdsads.u-strasbg.fr/abs/2014arXiv1412.3...,
1,FITS_pic/MAGIC_2015_CrabNebula.fits,Crab Nebula,http://cdsads.u-strasbg.fr/abs/2014arXiv1406.6...,[ energy flux energy_er...
2,,PG 1553+113,http://cdsads.u-strasbg.fr/abs/2014arXiv1408.1...,
3,FITS_pic/MAGIC_2015_1ES0806.fits,1ES 0806+524,http://cdsads.u-strasbg.fr/abs/2015arXiv150406...,[ energy flux energy_er...
4,,HESS J1857+026,http://cdsads.u-strasbg.fr/abs/2014A%26A...571...,
5,,J2001+439,http://cdsads.u-strasbg.fr/abs/2014arXiv1409.3...,
6,,Mrk 501,http://cdsads.u-strasbg.fr/abs/2014arXiv1410.6...,
7,,1ES 0033+595,http://cdsads.u-strasbg.fr/abs/2014arXiv1410.7...,
8,FITS_pic/MAGIC_2014_Ic310.fits,IC 310,http://cdsads.u-strasbg.fr/abs/2014Sci...346.1...,[ energy flux energy_er...
9,FITS_pic/MAGIC_2014_PKS1424+240.fits,PKS 1424+240,http://cdsads.u-strasbg.fr/abs/2014arXiv1401.0...,[ energy flux energy_er...


In [17]:
def fix_degeneracy(group):
    """
    Expand a group of rows into one output row per spectrum.

    Only the first row of `group` is read. Its 'SPECTRUM' entry is iterated
    and every item found there yields one output row holding:
     - 'RA', 'DEC', 'OBJECT' and 'DATE-OBS', taken from the `meta` mapping of
       the table returned by the item's `retrieve_table()` method;
     - 'SPECTRUM', the item itself;
     - every other column of that first row, repeated unchanged.

    Input:
     - group : DataFrame-like object with a 'SPECTRUM' column whose first-row
               value is an iterable of spectrum objects

    Output:
     - `Local` instance built from the ordered column -> list mapping
       (zero rows when the 'SPECTRUM' entry is empty)
    """
    from collections import OrderedDict

    # `.iloc[0]` is the supported spelling of the deprecated `.irow(0)`,
    # which warns on older pandas and no longer exists in newer releases.
    row = group.iloc[0]
    columns = row.to_dict()
    specs = row['SPECTRUM']
    # 'SPECTRUM' is rebuilt item by item below, so it must not be replicated
    # together with the plain columns.
    columns.pop('SPECTRUM')

    meta_keys = ('RA', 'DEC', 'OBJECT', 'DATE-OBS')

    # Fix the leading column order of the result up front.
    tdf = OrderedDict()
    for key in ('OBJECT', 'RA', 'DEC', 'DATE-OBS', 'SPECTRUM'):
        tdf[key] = []

    for spec in specs:
        meta = spec.retrieve_table().meta
        for key in meta_keys:
            tdf[key].append(meta[key])
        tdf['SPECTRUM'].append(spec)

    # One output row per spectrum; the remaining columns are copied verbatim
    # from the first row of the group onto each of them.
    n_rows = len(tdf['SPECTRUM'])
    for c in columns:
        tdf[c] = [row[c]] * n_rows
    return Local(tdf)

# Discard rows containing missing values, run fix_degeneracy on each group of
# rows sharing a URL, and renumber the combined result from 0.
table_proc = (
    table.dropna()
         .groupby('URL', group_keys=False)
         .apply(fix_degeneracy)
         .reset_index(drop=True)
)

  app.launch_new_instance()


In [18]:
# Display table_proc, the result of applying fix_degeneracy to each URL group.
table_proc

Unnamed: 0,OBJECT,RA,DEC,DATE-OBS,SPECTRUM,FITS,SOURCE,URL
0,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
1,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
2,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
3,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
4,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
5,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
6,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
7,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
8,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A
9,Segue 1,151.766700,16.081900,2011-01-01,energy flux energy_err...,FITS_pic/MAGIC_2014_Segue1.fits,Segue 1,http://adsabs.harvard.edu/abs/2014JCAP...02..008A


In [19]:
# Directory receiving the re-published files.
outdir = 'FITS_out/'
# NOTE(review): presumably removes everything matching '*' inside outdir;
# confirm against the definition of clean_dir.
clean_dir(outdir, '*')

# apply() is used here only to call write_to_fits row-wise on the SPECTRUM
# entries; the object it returns is of no interest and is dropped right away.
_discard = table_proc.apply(lambda rec: write_to_fits(rec.SPECTRUM, outdir), axis=1)
del _discard

# Summary statistics of table_proc as the cell's displayed value.
table_proc.describe()



Unnamed: 0,RA,DEC
count,124.0,124.0
mean,159.500735,31.600207
std,95.101722,20.146307
min,2.67556,-30.225588
25%,83.633076,16.1482
50%,151.7667,36.64665
75%,199.722804,41.5131
max,356.765,71.3434


In [20]:
import pandas as pd
# Widen the pandas display limits before dumping the whole table as text.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 10)
pd.set_option('display.width', 500)

# Parenthesised form prints the same text under the print statement.
print(table_proc)


                     OBJECT          RA        DEC    DATE-OBS                                           SPECTRUM                                         FITS                                SOURCE                                                URL
0                   Segue 1  151.766700  16.081900  2011-01-01        energy             flux       energy_err...              FITS_pic/MAGIC_2014_Segue1.fits                               Segue 1  http://adsabs.harvard.edu/abs/2014JCAP...02..008A
1                   Segue 1  151.766700  16.081900  2011-01-01        energy             flux       energy_err...              FITS_pic/MAGIC_2014_Segue1.fits                               Segue 1  http://adsabs.harvard.edu/abs/2014JCAP...02..008A
2                   Segue 1  151.766700  16.081900  2011-01-01        energy             flux       energy_err...              FITS_pic/MAGIC_2014_Segue1.fits                               Segue 1  http://adsabs.harvard.edu/abs/2014JCAP...02..008A
3       

In [21]:
# Keep only complete rows and the columns wanted in the final table,
# in this order.
table_final = table_proc.dropna()[
    ['OBJECT', 'RA', 'DEC', 'URL', 'FITS', 'DATE-OBS']
]
print(table_final)

                     OBJECT          RA        DEC                                                URL                                         FITS    DATE-OBS
0                   Segue 1  151.766700  16.081900  http://adsabs.harvard.edu/abs/2014JCAP...02..008A              FITS_pic/MAGIC_2014_Segue1.fits  2011-01-01
1                   Segue 1  151.766700  16.081900  http://adsabs.harvard.edu/abs/2014JCAP...02..008A              FITS_pic/MAGIC_2014_Segue1.fits  2011-01-01
2                   Segue 1  151.766700  16.081900  http://adsabs.harvard.edu/abs/2014JCAP...02..008A              FITS_pic/MAGIC_2014_Segue1.fits  2011-01-01
3                   Segue 1  151.766700  16.081900  http://adsabs.harvard.edu/abs/2014JCAP...02..008A              FITS_pic/MAGIC_2014_Segue1.fits  2011-01-01
4                   Segue 1  151.766700  16.081900  http://adsabs.harvard.edu/abs/2014JCAP...02..008A              FITS_pic/MAGIC_2014_Segue1.fits  2011-01-01
5                   Segue 1  151.766700  16.08