# Reprocessing data from MAGIC/PIC base

In [1]:
# We start with some code to access the table at http://vobs.magic.pic.es/fits/
def get_table_from_url(url):
    """Download an HTML page and return its first table of class ``mytable``.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    The first matching ``table`` element as returned by
    ``BeautifulSoup.find`` (``None`` when the page has no such table).
    """
    from bs4 import BeautifulSoup as BS
    import urllib2
    response = urllib2.urlopen(url)
    try:
        soup = BS(response.read())
    finally:
        # Release the connection even when reading or parsing fails.
        response.close()
    # The attribute filter has to be a mapping; the former ``{'class','mytable'}``
    # was a set literal, not the intended ``class == mytable`` constraint.
    table = soup.find('table',{'class':'mytable'})
    return table

# Listing page of the MAGIC FITS files. `url` stays a notebook-level global and
# is used again further down to build the absolute download addresses.
url = 'http://vobs.magic.pic.es/fits/'
# Fetch the page once and keep the parsed HTML table for the following cells.
table_pic_html = get_table_from_url(url)

In [2]:
# Render the scraped table inline, as a visual check of what was fetched.
from IPython.display import HTML
HTML(unicode(table_pic_html))

Source,Article,Year,Reference,Download
Mrk 421,Unprecedented study of the broadband emission of Mrk 421 during flaring activity in March 2010,2015,(2014arXiv1412.3576T),FITS
Crab Nebula,Measurement of the Crab nebula spectrum over three decades in energy with the MAGIC telescopes,2015,(2014arXiv1406.6892M),FITS
PG 1553+113,Probing the very-high-energy γ-ray spectral curvature in the blazar PG 1553+113 with the MAGIC telescopes,2015,(2014arXiv1408.1975M),FITS
1ES 0806+524,MAGIC detection of short-term variability of the high-peaked BL Lac object 1ES 0806+524,2015,"(J. Aleksić et al., MNRAS 451, 5258-5269)",FITS
HESS J1857+026,MAGIC reveals a complex morphology within the unidentified gamma-ray source HESS J1857+026,2014,"(J. Aleksić et al., A&A 571, A96)",FITS
J2001+439,First broadband characterization and redshift determination of the VHE blazar MAGIC J2001+439,2014,(2014arXiv1409.3389M),FITS
Mrk 501,Multiwavelength Observations of Mrk 501 in 2008,2014,(2014arXiv1410.6391M),FITS
1ES 0033+595,Discovery of very high energy gamma-ray emission from the blazar 1ES 0033+595 by the MAGIC telescopes,2014,(2014arXiv1410.7059A),FITS
IC 310,Black Hole lightning due to particle acceleration at sub-horizon scales,2014,"(J. Aleksić et al., Science 346, 1080-1084)",FITS
PKS 1424+240,MAGIC long-term study of the distant TeV blazar PKS 1424+240 in a multiwavelength context,2014,"(J. Aleksić et al., A&A 567, A135)",FITS


In [3]:
# This table is now parsed to get the columns
def get_table_fields(table):
    """Extract source name, article link and FITS link from the listing table.

    Parameters
    ----------
    table : parsed HTML table (BeautifulSoup element)
        Only its ``tr`` rows holding exactly five ``td`` cells are used; any
        other row is ignored.

    Returns
    -------
    dict
        ``{'SOURCE': [...], 'DOI': [...], 'FITS': [...]}`` with one entry per
        accepted row.  ``DOI`` is the article link (rewritten to a
        ``dx.doi.org`` address when possible, ``None`` when the row has no
        article link) and ``FITS`` is the ``href`` of the download cell
        (``None`` when that cell has no link).
    """

    def get_doi_url(url):
        """Turn an ADS/CDS/arXiv reference page address into a DOI link.

        Addresses of any other kind are returned untouched, and so is an
        ADS/CDS/arXiv address whose page has no table row mentioning a DOI.
        """
        import re
        if not any(tag in url for tag in ('adsabs', 'cdsads', 'arxiv')):
            return url
        # Only needed when a page really has to be fetched.
        from bs4 import BeautifulSoup as BS
        import urllib2
        response = urllib2.urlopen(url)
        try:
            soup = BS(response)
        finally:
            response.close()
        for tr in soup.findAll('tr'):
            text = tr.get_text()
            if 'doi' in text.lower():
                # First row mentioning a DOI: drop line breaks and the label.
                doi = re.sub('DOI:','',re.sub('\n','',text))
                return 'http://dx.doi.org/' + doi
        # No DOI row on the page: keep the reference address instead of failing.
        return url

    def process_row(row):
        """Return ``(source, article_url, fits_href)`` or ``None`` for non-data rows."""
        cells = row.findAll('td')
        if len(cells) != 5:
            return None
        # Object source name(s) (can be more than one, comma separated)
        src = cells[0].find(text=True)
        src = src.strip()
        # Article reference (url), usually a ref to ads
        art = cells[1].find('a',href=True)
        durl = get_doi_url(art['href']) if art is not None else None
        # We skip year of publication (third column)
        # as well as bibcode reference (fourth column)
        # FITS file link (relative href); absent for some rows
        link = cells[4].find('a',href=True)
        fits_href = link['href'] if link is not None else None
        return (src,durl,fits_href)

    magic_table = {'SOURCE':[], 'DOI':[], 'FITS':[]}
    for row in table.findAll('tr'):
        vals = process_row(row)
        if vals is None:
            continue
        src,durl,fits_href = vals
        magic_table['SOURCE'].append(src)
        magic_table['DOI'].append(durl)
        magic_table['FITS'].append(fits_href)
    return magic_table
    
# Parse the scraped table into plain lists keyed by SOURCE / DOI / FITS.
table_pic_dict = get_table_fields(table_pic_html)

# The raw HTML is dropped here, so re-running this cell alone fails until the
# scraping cell has been executed again.
del table_pic_html
#tf = table_filtered
#for i in range(len(tf['source'])):
#    print("%s : %s : %s"%(tf['source'][i],tf['doi'][i],tf['fits'][i]))

In [4]:
# Re-run guard: `table_pic_dict` is deleted at the end of this cell, so when the
# cell is executed a second time the data is recovered from the `table_pic`
# built by the previous execution.
try:
    table_pic_dict = table_pic.copy()
    del table_pic
except NameError:
    # First execution: `table_pic` does not exist yet and `table_pic_dict`
    # still holds the output of the parsing step.
    pass

import pandas as pd
table_pic = pd.DataFrame(table_pic_dict)

del table_pic_dict

def print_describe(table):
    """Print summary statistics of `table` and report where it holds nulls.

    Parameters
    ----------
    table : pandas.DataFrame
        Frame to summarise.

    Prints ``table.describe()``, then a per-column "has null" flag and, for
    every column with at least one null, the index labels of the null rows.
    Returns nothing.

    Every ``print`` call takes a single argument, so the output is the same
    under the Python 2 print statement and the Python 3 print function.
    """
    print(table.describe())
    print("\n-> Has Nil?")
    has_null = table.isnull().any()
    print(has_null)
    for column in has_null.index:
        if not has_null[column]:
            continue
        print("\n-> Indexes where column '{}' is null:".format(column))
        print(table[table[column].isnull()].index.values)

# Summary of the parsed listing; in the recorded output FITS is the only column with nulls.
print_describe(table_pic)

                                                   DOI  \
count                                               99   
unique                                              99   
top     http://dx.doi.org/10.1088/0004-637X/705/2/1624   
freq                                                 1   

                                       FITS         SOURCE  
count                                    67             99  
unique                                   67             68  
top     mfits/base/MAGIC_2010_CygnusX3.fits  Markarian 421  
freq                                      1              6  

-> Has Nil?
DOI       False
FITS       True
SOURCE    False
dtype: bool

-> Indexes where column 'FITS' is null:
[ 0  2  4  5  6  7 12 17 18 28 29 33 34 35 36 37 41 42 43 49 50 53 57 67 73
 83 84 88 90 93 96 98]


In [5]:
# Now we want to download the fits files
def download_fits(url,out):
    """Fetch `url` through the ``wget`` module and return the local file name.

    `out` is handed to ``wget.download`` as its ``out`` argument; the value
    returned by that call is passed back unchanged.
    """
    import wget
    return wget.download(url,out=out)

def clean_dir(dir,ext="*"):
    """Make sure directory `dir` exists and holds no file matching `ext`.

    Parameters
    ----------
    dir : str
        Directory to create (when missing) or to empty.
    ext : str, optional
        Glob pattern, relative to `dir`, selecting the files to delete.
        Defaults to ``"*"`` (note that glob does not match dot-files with it).

    Returns
    -------
    bool
        ``True`` when `dir` was created or cleaned, ``False`` when `dir`
        exists but is not a directory.
    """
    import os
    from glob import glob
    if not os.path.exists(dir):
        # makedirs rather than mkdir, so nested output paths work as well.
        os.makedirs(dir)
        return True
    if not os.path.isdir(dir):
        return False
    for path in glob(os.path.join(dir,ext)):
        # The pattern can also match sub-directories, which os.remove cannot
        # delete; they are left alone instead of aborting the clean-up.
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
    return True
    
# Local folder receiving the downloads; created, or emptied, on every run.
fits_download_dir = 'FITS_pic/'
clean_dir(fits_download_dir)

#files_list = table_pic.FITS.dropna()
#furls = [ url+f for f,_ in files_list ]
# Absolute download addresses for the rows that do have a FITS link
# (`url` is the listing page address defined in the first cell).
furls = url + table_pic.FITS.dropna()
#del files_list

#try:
#    from IPython.parallel import Client
#    ipc = Client()
#    dview = ipc[:]
#    print "-> Parallel mode"
#    ret = dview.map_sync(download_fits, furls,[fits_dir]*len(furls))
#except:
#    print "-> Serial mode"
#    ret = map(download_fits, furls,[fits_dir]*len(furls))

#print '\nTotal: {} files downloaded\n'.format(len(ret))
#for i,f in enumerate(ret):
#    f,s = files_list[i]
#    print 'Source: {}\t,file: {}\t, to {}'.format(s,f,ret[i])

#for i in range(len(table_cols['source'])):
#    src = table_cols['source'][i]
#    file = table_cols['fits'][i]
#    if not file is None:
#        print("Downloading %s"%(file))
#        download_fits(url+file,fits_dir)
#    else:
#        print("File for source %s not available."%(src))

# Download every file; the result holds the local file names returned by download_fits.
fits_download = furls.apply(lambda f: download_fits(f,fits_download_dir))
# NOTE: this overwrites the remote paths in FITS with local file names; in the
# recorded output the rows that had no link are still null afterwards.
table_pic['FITS'] = fits_download

del fits_download,furls

print_describe(table_pic)

                                                   DOI  \
count                                               99   
unique                                              99   
top     http://dx.doi.org/10.1088/0004-637X/705/2/1624   
freq                                                 1   

                                   FITS         SOURCE  
count                                67             99  
unique                               67             68  
top     FITS_pic//MAGIC_2009_OJ287.fits  Markarian 421  
freq                                  1              6  

-> Has Nil?
DOI       False
FITS       True
SOURCE    False
dtype: bool

-> Indexes where column 'FITS' is null:
[ 0  2  4  5  6  7 12 17 18 28 29 33 34 35 36 37 41 42 43 49 50 53 57 67 73
 83 84 88 90 93 96 98]


In [6]:
# Now we can process the fits files themselves.
# We start by noting that we want the SPECTRUM Data Unit(s)
#  available (or not) in the fits files; discard the other DU.
# Things we want to do:
# - get the OBJECT name
# - get each object's position
# - get the observation date
# - transform the data vectors (x) to frequency(Hz) and (y) to flux(erg/s/cm2)
# Then we should follow the following workflow:
# - open the fits file
# - find the necessary data unit (SPECTRUM)
# - open its header
#  - get some keywords from the header
# - open its data; data here are vectors
#  - it can be from 2 to 4 vectors
#   - energy
#   - flux
#   - Denergy
#   - Dflux
#  - convert the ?energy vectors to 'Hz' units
#  - convert the ?flux vectors to 'erg/s/cm2' units

# Here we just define the functions we'll need..

def read_file(filename):
    """Open a FITS file, returning ``None`` instead of raising on failure.

    Parameters
    ----------
    filename
        Path handed to ``astropy.io.fits.open``.  Anything that cannot be
        opened simply yields ``None``.

    Returns
    -------
    The object returned by ``fits.open`` or ``None``.
    """
    from astropy.io import fits
    try:
        return fits.open(filename)
    except Exception:
        # Deliberate best effort: the caller treats None as "could not open".
        # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit working.
        return None
    
def select_dataUnits(hdulist,du_name='SPECTRUM'):
    """Pick the data units whose name contains `du_name`.

    Parameters
    ----------
    hdulist : sequence of objects with a ``name`` attribute
    du_name : str, optional
        Substring looked for in each unit's ``name`` (default ``'SPECTRUM'``).

    Returns
    -------
    tuple
        ``(units, indexes)``: the matching units and their positions in
        `hdulist`, both as lists in file order (empty when nothing matches).
    """
    # Single pass; the former cross-check against ``filter(...)`` was a debugging
    # leftover that always fails where ``filter`` returns an iterator.
    matches = [ (du,i) for i,du in enumerate(hdulist) if du_name in du.name ]
    dus = [ du for du,_ in matches ]
    dui = [ i for _,i in matches ]
    return (dus,dui)
    
def read_header(spec,table):
    """Copy a fixed set of header keywords from `spec` into `table`.

    `table` must be a dict (checked with an assert).  The keywords EXTNAME,
    OBJECT, DATE-OBS, SRCPOS1 and SRCPOS2 are looked up in ``spec.header``;
    a keyword missing from the header is stored as ``None``.
    """
    assert isinstance(table,dict)
    header = spec.header
    for keyword in ('EXTNAME', 'OBJECT', 'DATE-OBS', 'SRCPOS1', 'SRCPOS2'):
        table[keyword] = header.get(keyword, None)

def attempt_fix_objectname(table):
    """Fill a missing OBJECT entry with the name embedded in the file name.

    `table` must have ``table['OBJECT'] is None`` (asserted) and a
    ``'filename'`` entry such as ``.../MAGIC_2015_CrabNebula.fits``.  The
    directory part, a trailing ``.fits`` and the ``MAGIC_20xx_`` prefix are
    removed and the remainder is stored as ``table['OBJECT']``.
    """
    assert table['OBJECT'] is None
    import os
    import re
    stem = os.path.basename(table['filename'])
    # Only strip a real ".fits" suffix instead of blindly dropping 5 characters.
    if stem.lower().endswith('.fits'):
        stem = stem[:-len('.fits')]
    # Anchored on the base name, so file names given without a directory
    # are handled as well.
    objname = re.sub('^MAGIC_20[0-9][0-9]_','',stem)
    table['OBJECT'] = objname
    print('*** OBJECT name got from file name.')


def resolve_name(name):
    """Look up the ICRS position of an object by name.

    Parameters
    ----------
    name
        Object name handed to ``astropy.coordinates.get_icrs_coordinates``.

    Returns
    -------
    tuple or None
        ``(ra, dec)`` taken from the ``.value`` of the resolved coordinates
        (presumably degrees; confirm against the astropy version in use), or
        ``None`` when the name could not be resolved for any reason.
    """
    from astropy.coordinates import get_icrs_coordinates as get_coords
    try:
        icrs = get_coords(name)
        return (icrs.ra.value,icrs.dec.value)
    except Exception:
        # Best effort: unresolved names are reported as None to the caller.
        # ``Exception`` (not a bare except) keeps Ctrl-C / SystemExit working.
        return None

def read_data(spec,table):
    """Copy the spectrum columns of `spec` and their units into `table`.

    `table` must be a dict (asserted).  The entries ``energy``, ``Denergy``,
    ``flux``, ``Dflux`` and their ``*_unit`` companions are first reset to
    ``None``; each of those columns present in ``spec.columns`` is then
    filled from ``spec.data`` / ``spec.columns.units``.  Any other column is
    only reported on stdout.
    """
    assert isinstance(table,dict)
    wanted = ('energy','Denergy','flux','Dflux')
    for n in wanted:
        table[n+'_unit'] = None
        table[n] = None
    for n,u in zip(spec.columns.names,spec.columns.units):
        if n in wanted:
            table[n+'_unit'] = u
            table[n] = spec.data[n]
        else:
            # Single-argument print: same output on Python 2 and Python 3.
            print("Different field: %s" % (n,))

def transf_data(table):
    """Convert a spectrum, in place, to frequency (Hz) and flux (erg/s/cm2).

    Parameters
    ----------
    table : dict
        Must hold ``energy``/``flux`` with their ``*_unit`` entries, plus
        ``Denergy``/``Dflux`` (values or ``None``) as filled by `read_data`.

    Returns
    -------
    bool
        ``True`` when every present vector was converted; ``False`` when the
        flux (or flux error) unit has no known conversion, in which case a
        message is printed and `table` is left untouched.

    Missing ``Denergy``/``Dflux`` vectors are replaced by arrays of ``-999``
    with an empty unit string.
    """
    from astropy import units
    Unit = units.Unit
    import numpy as np
    uEn = Unit('Hz')
    uFn = Unit('erg s-1 cm-2')
    uEc = Unit('TeV')
    # Flux converters keyed by input unit; each receives (flux, energy).
    # A converter returning None marks a unit that cannot be converted.
    conv = {Unit('ph TeV s-1 cm-2') : lambda x,y: (x/Unit('ph')).to(uFn),
            Unit('ph TeV-1 s-1 cm-2') : lambda x,y: ((y.to(uEc)**2)*(x/Unit('ph'))).to(uFn),
            Unit('ph s-1 cm-2') : lambda x,y: None}

    def convert_flux(values,unit,energy):
        # Unknown units are reported as None rather than raising KeyError.
        func = conv.get(unit)
        if func is None:
            return None
        # The builtin float replaces the ``np.float`` alias (gone from recent numpy).
        return func(np.array(values,float) * unit, energy)

    # --- all conversions first, so a failure leaves `table` unmodified ---
    uE = Unit(table['energy_unit'])
    vE = np.array(table['energy'],float) * uE
    vEn = vE.to(uEn, equivalencies=units.spectral())

    uF = Unit(table['flux_unit'])
    vFn = convert_flux(table['flux'],uF,vE)
    if vFn is None:
        print("vFn is None!?!: %s" % str(uF))
        return False

    if table['Denergy'] is not None:
        uDE = Unit(table['Denergy_unit'])
        vDE = np.array(table['Denergy'],float) * uDE
        denergy = vDE.to(uEn, equivalencies=units.spectral()).value
        denergy_unit = uEn.to_string(units.format.CDS)
    else:
        denergy = np.asarray([-999]*len(vE))
        denergy_unit = Unit('').to_string()

    if table['Dflux'] is not None:
        uDF = Unit(table['Dflux_unit'])
        # Notice we use the energy bin/value of the measurement.
        vDFn = convert_flux(table['Dflux'],uDF,vE)
        if vDFn is None:
            print("vDFn is None!?!: %s" % str(uDF))
            return False
        dflux = vDFn.value
        dflux_unit = uFn.to_string(units.format.CDS)
    else:
        dflux = np.asarray([-999]*len(vE))
        dflux_unit = Unit('').to_string()

    # --- everything converted: store the results ---
    table['energy_unit'] = uEn.to_string(units.format.CDS)
    table['energy'] = vEn.value
    table['flux_unit'] = uFn.to_string(units.format.CDS)
    table['flux'] = vFn.value
    table['Denergy_unit'] = denergy_unit
    table['Denergy'] = denergy
    table['Dflux_unit'] = dflux_unit
    table['Dflux'] = dflux
    return True

def proc_fits_file(fn,source):
    """Extract and convert every SPECTRUM data unit of one FITS file.

    Parameters
    ----------
    fn
        Path of the FITS file (whatever `read_file` cannot open, e.g. a NaN
        placeholder, is reported as a failure).
    source : str
        Source name from the listing, used as a fallback when the object
        name of the file cannot be resolved to a position.

    Returns
    -------
    list of OrderedDict, or None
        One dict per successfully converted spectrum (keys: filename,
        EXTNAME, OBJECT, SRCPOS1, SRCPOS2, DATE-OBS and the four data
        vectors with their units).  ``None`` when the file cannot be opened
        or has no SPECTRUM unit.  Progress is reported on stdout.
    """
    import sys
    from collections import OrderedDict
    out = []
    table = OrderedDict.fromkeys([
        'filename','EXTNAME','OBJECT','SRCPOS1','SRCPOS2','DATE-OBS',
        'energy','energy_unit','Denergy','Denergy_unit',
        'flux','flux_unit','Dflux','Dflux_unit'])
    table['filename'] = fn
    # No newline yet: the outcome is appended to this same line below.
    sys.stdout.write("\nProcessing file: %s" % (fn,))

    hdulist = read_file(fn)
    if hdulist is None:
        print("  ..failed: to open.")
        return None

    try:
        spectra,specdui = select_dataUnits(hdulist)
        if not len(spectra):
            print("  ..failed: no SPECTRUM")
            return None
        print("")

        for spec in spectra:
            read_header(spec,table)
            if table['OBJECT'] is None:
                attempt_fix_objectname(table)
            # Position: object name first, then the listing's source name,
            # and finally an out-of-range placeholder.
            pos = resolve_name(table['OBJECT'])
            if pos is None:
                pos = resolve_name(source)
                if pos is None:
                    pos = (-360,-180)
                else:
                    table['OBJECT'] = source
            if table['SRCPOS1'] is None or table['SRCPOS2'] is None:
                table['SRCPOS1'],table['SRCPOS2'] = pos
            read_data(spec,table)
            if transf_data(table) is not True:
                # Units could not be converted: skip this spectrum.
                continue
            out.append(table.copy())
    finally:
        # The file used to be left open; the converted vectors kept in `out`
        # are new arrays, so closing here does not invalidate them.
        hdulist.close()
    return out
        

In [7]:
# Process every row of the listing. Rows without a downloaded file reach
# proc_fits_file with a NaN file name (see "Processing file: nan" below) and get None.
table_pic['PROC'] = table_pic.apply(lambda x: proc_fits_file(x.FITS,x.SOURCE), axis=1)


Processing file: nan  ..failed: to open.

Processing file: FITS_pic//MAGIC_2015_CrabNebula.fits
*** OBJECT name got from file name.
*** OBJECT name got from file name.

Processing file: nan  ..failed: to open.

Processing file: FITS_pic//MAGIC_2015_1ES0806.fits
*** OBJECT name got from file name.
*** OBJECT name got from file name.
*** OBJECT name got from file name.
*** OBJECT name got from file name.

Processing file: nan  ..failed: to open.

Processing file: nan  ..failed: to open.

Processing file: nan  ..failed: to open.

Processing file: nan  ..failed: to open.

Processing file: FITS_pic//MAGIC_2014_Ic310.fits
*** OBJECT name got from file name.
*** OBJECT name got from file name.

Processing file: FITS_pic//MAGIC_2014_PKS1424+240.fits
*** OBJECT name got from file name.
*** OBJECT name got from file name.
*** OBJECT name got from file name.
*** OBJECT name got from file name.

Processing file: FITS_pic//MAGIC_2014_3C58.fits
*** OBJECT name got from file name.

Processing file: 



In [8]:

def fix_degeneracy(group):
    """Expand one group of listing rows into one row per processed spectrum.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows sharing one key; only the first row is used.  Its ``PROC`` cell
        must hold the list of spectrum dicts produced by `proc_fits_file`.

    Returns
    -------
    pandas.DataFrame
        One row per spectrum dict with columns ``PROC`` (the dict itself),
        ``OBJECT``, ``RA`` and ``DEC`` (from the dict's OBJECT, SRCPOS1 and
        SRCPOS2) plus every other column of the first row, repeated.
    """
    # .iloc replaces the deprecated (and since removed) ``irow`` accessor.
    row = group.iloc[0]
    dtabs = row['PROC']
    ddf = {
        'PROC': dtabs,
        'OBJECT': [ d['OBJECT'] for d in dtabs ],
        'RA': [ d['SRCPOS1'] for d in dtabs ],
        'DEC': [ d['SRCPOS2'] for d in dtabs ],
    }
    for c,value in row.to_dict().items():
        if c == 'PROC':
            continue
        ddf[c] = [value]*len(dtabs)
    return pd.DataFrame(ddf)

# Drop rows with any missing field, then expand each DOI group into one row
# per spectrum and renumber the rows from 0.
table_proc = table_pic.dropna().groupby('DOI',group_keys=False).apply(fix_degeneracy).reset_index(drop=True)

#import os
#from glob import glob
#fits_list = glob(os.path.join(fits_download_dir,'*.fits'))
#out_list = []

#import random
#n = random.randint(0,len(fits_list))
#inps_list = fits_list

#for fn in inps_list:
#    tables = proc_fits_file(fn)
#    if tables is None:
#        print "Oops, something went wrong."
#        continue
#    for table in tables:
#        print("There we go.. %s processing succeed." % (fn))
#        for k,v in table.iteritems():
#            print"\t",k,v
#        out_list.append(table)
#
#print len(out_list)




#ret_par = ret_ser = None
#try:
#    print aaa
#    from IPython.parallel import Client
#    ipc = Client()
#    #lview = ipc.load_balanced_view()
#    lview = ipc[:]
#    lview.block = True
#    print "-> Parallel mode"
#    ret_par = lview.map_sync(proc_fits_file,fits_list)
#except:
#    print "-> Serial mode"
#    ret_ser = map(proc_fits_file,fits_list)
#
#print ret_par
#print ret_ser


  app.launch_new_instance()


In [9]:
def dict_to_table(dict_tab):
    """Build an astropy ``Table`` from one processed-spectrum dict.

    The ``energy``, ``Denergy``, ``flux`` and ``Dflux`` entries become the
    table columns (in that order), each with the unit found under the
    matching ``*_unit`` key.  The entries filename, EXTNAME, OBJECT,
    SRCPOS1, SRCPOS2 and DATE-OBS are stored in ``Table.meta`` under their
    upper-cased names.
    """
    from astropy.table import Table,Column
    from astropy.units import Unit
    meta_keys = ('filename','EXTNAME','OBJECT','SRCPOS1','SRCPOS2','DATE-OBS')
    column_names = ('energy','Denergy','flux','Dflux')
    # Build every column before touching the table, then attach them in order.
    columns = [ Column(data=dict_tab[name],
                       name=name,
                       unit=Unit(dict_tab[name+'_unit']))
                for name in column_names ]
    result = Table()
    for column in columns:
        result.add_column(column)
    for key in meta_keys:
        result.meta[key.upper()] = dict_tab[key]
    return result


# One astropy Table per processed spectrum, stored next to its source dict.
table_proc['SPECTRUM'] = table_proc.PROC.apply(dict_to_table)

def write_table(table,odir):
    """Write `table` as FITS into `odir` and return the path written.

    The output name is the base name of ``table.meta['FILENAME']`` with its
    trailing ``.fits`` replaced by ``_<EXTNAME>.fits`` (``EXTNAME`` also
    taken from ``table.meta``).  A name that does not end in ``.fits`` is
    used unchanged.

    Parameters
    ----------
    table : object with a ``meta`` mapping and a ``write(path, format=...)`` method
    odir : str
        Output directory.
    """
    import os
    filename = os.path.basename(table.meta['FILENAME'])
    extname = '_' + table.meta['EXTNAME'] + '.fits'
    suffix = '.fits'
    # Plain suffix replacement: the former ``re.sub('.fits', ...)`` treated the
    # dot as a wildcard and rewrote every match, not just the extension.
    if filename.endswith(suffix):
        filename = filename[:-len(suffix)] + extname
    filename = os.path.join(odir,filename)
    table.write(filename,format='fits')
    return filename

# Output folder for the converted spectra; created, or emptied, on every run.
outdir = 'FITS_out/'
clean_dir(outdir)

# NOTE: FITS now holds the path of the written output file (one per spectrum).
table_proc['FITS'] = table_proc.SPECTRUM.apply(lambda d:write_table(d,outdir))



In [16]:
# Raise the pandas display limits before printing the overview below.
pd.set_option('display.max_rows',200)
pd.set_option('display.max_columns',10)
pd.set_option('display.width',100)
#del table_proc['PROC']
#del table_proc['SPECTRUM']

#print table_proc.describe(include='all')
# Final overview: object name from the file vs. source name from the listing,
# with the RA/DEC columns built by fix_degeneracy.
print table_proc[['OBJECT','SOURCE','RA','DEC']]


                 OBJECT                                       SOURCE          RA         DEC
0            CrabNebula                                  Crab Nebula   83.633212   22.014460
1            CrabNebula                                  Crab Nebula   83.633212   22.014460
2                 3C454                                     3C 454.3  342.894735   18.811129
3                 3C454                                     3C 454.3  342.894735   18.811129
4                 3C454                                     3C 454.3  342.894735   18.811129
5           PG 1553+113                                  PG 1553+113  238.929350   11.190102
6         Markarian 421                                Markarian 421  166.113808   38.208833
7         Markarian 421                                Markarian 421  166.113808   38.208833
8         Markarian 421                                Markarian 421  166.113808   38.208833
9         Markarian 421                                Markarian 421  