# Cleaning duplicates from catalog

We have a catalog that contains duplicated entries, where duplicates may present different flux values depending on the amount of overlaping observations.
We want to generate a unique sources catalog, with sources with the best signal-to-noise ratio -- the *primary sources* -- among the duplicates.

In [1]:
from xmatch import xmatch

In [2]:
help(xmatch)

Help on function xmatch in module xmatch.xmatchi:

xmatch(catalog_A, catalog_B, columns_A=None, columns_B=None, radius=None, separation_unit='arcsec', method='gc', parallel=False, nprocs=None, snr_column=None)
    Input:
     - catalog_A, catalog_B : ~pandas.DataFrame
             DFs containing (at least) the columns 'ra','dec','id'
     - columns_A, columns_B : dict mapping 'ra','dec','id' columns
            In case catalog(s) have different column names for 'ra','dec','id';
            e.g, {'ra':'RA', 'dec':'Dec', 'id':'ObjID'}
    
    Output:
     - matched_catalog : ~pandas.DataFrame



In [3]:
import pandas

df = pandas.read_csv('table_flux_detections_lamassa.csv', sep=';')
df.head()

Unnamed: 0,RA,DEC,NH,ENERGY_SLOPE,ENERGY_SLOPE_ERROR,nufnu_5keV(erg.s-1.cm-2),nufnu_error_5keV(erg.s-1.cm-2),nufnu_0.5keV(erg.s-1.cm-2),nufnu_error_0.5keV(erg.s-1.cm-2),upper_limit_0.5keV(erg.s-1.cm-2),nufnu_1.5keV(erg.s-1.cm-2),nufnu_error_1.5keV(erg.s-1.cm-2),upper_limit_1.5keV(erg.s-1.cm-2),nufnu_4.5keV(erg.s-1.cm-2),nufnu_error_4.5keV(erg.s-1.cm-2),upper_limit_4.5keV(erg.s-1.cm-2)
0,01:26:58.792,-01:14:17.856,3.67e+20,-0.02,+0.40/-0.27,2.43926e-13,4.96483e-14,3.61121e-14,1.39274e-14,-999.0,4.83625e-14,2.15075e-14,-999.0,2.18113e-13,6.13238e-14,-999.0
1,01:26:42.217,-01:14:06.962,3.71e+20,1.768,+0.46/-0.28,4.89528e-14,6.8754e-15,2.44042e-13,4.65734e-14,-999.0,1.61773e-13,3.81638e-14,-999.0,5.19485e-14,2.43538e-14,-999.0
2,01:26:32.964,-01:12:19.438,3.73e+20,0.8,-999/-999,4.16794e-14,1.43179e-14,4.63593e-14,1.96725e-14,-999.0,3.16812e-14,1.90123e-14,-999.0,1.32903e-14,-4.9055e-11,5.941e-11
3,01:26:27.323,-01:18:13.705,3.78e+20,0.8,-999/-999,4.03614e-14,1.4184e-14,6.74823e-14,2.3715e-14,-999.0,3.06514e-14,-3.22777e-11,7.90159e-11,1.28542e-14,-4.8519e-11,1.18775e-10
4,01:27:22.793,-01:15:59.595,3.62e+20,0.8,-999/-999,3.75018e-14,1.37966e-14,3.11379e-14,1.66103e-14,-999.0,4.28551e-14,2.28608e-14,-999.0,1.19985e-14,-4.74274e-11,8.90714e-11


## SNR column

Let's define the objects' overall SNR estimate to be the ratio between columns `nufnu_5keV(erg.s-1.cm-2)` and `nufnu_error_5keV(erg.s-1.cm-2)`, which are the flux and error associated to the full band emission.

In [4]:
snr = df['nufnu_5keV(erg.s-1.cm-2)']/df['nufnu_error_5keV(erg.s-1.cm-2)']
df['snr'] = snr

**xmatch** needs columns 'ra','dec','id' to be defined. As well as the search radius.

In [5]:
df.reset_index(inplace=True)
df.rename(columns={'index':'ID'}, inplace=True)

In [6]:
df.head()

Unnamed: 0,ID,RA,DEC,NH,ENERGY_SLOPE,ENERGY_SLOPE_ERROR,nufnu_5keV(erg.s-1.cm-2),nufnu_error_5keV(erg.s-1.cm-2),nufnu_0.5keV(erg.s-1.cm-2),nufnu_error_0.5keV(erg.s-1.cm-2),upper_limit_0.5keV(erg.s-1.cm-2),nufnu_1.5keV(erg.s-1.cm-2),nufnu_error_1.5keV(erg.s-1.cm-2),upper_limit_1.5keV(erg.s-1.cm-2),nufnu_4.5keV(erg.s-1.cm-2),nufnu_error_4.5keV(erg.s-1.cm-2),upper_limit_4.5keV(erg.s-1.cm-2),snr
0,0,01:26:58.792,-01:14:17.856,3.67e+20,-0.02,+0.40/-0.27,2.43926e-13,4.96483e-14,3.61121e-14,1.39274e-14,-999.0,4.83625e-14,2.15075e-14,-999.0,2.18113e-13,6.13238e-14,-999.0,4.913079
1,1,01:26:42.217,-01:14:06.962,3.71e+20,1.768,+0.46/-0.28,4.89528e-14,6.8754e-15,2.44042e-13,4.65734e-14,-999.0,1.61773e-13,3.81638e-14,-999.0,5.19485e-14,2.43538e-14,-999.0,7.119993
2,2,01:26:32.964,-01:12:19.438,3.73e+20,0.8,-999/-999,4.16794e-14,1.43179e-14,4.63593e-14,1.96725e-14,-999.0,3.16812e-14,1.90123e-14,-999.0,1.32903e-14,-4.9055e-11,5.941e-11,2.911
3,3,01:26:27.323,-01:18:13.705,3.78e+20,0.8,-999/-999,4.03614e-14,1.4184e-14,6.74823e-14,2.3715e-14,-999.0,3.06514e-14,-3.22777e-11,7.90159e-11,1.28542e-14,-4.8519e-11,1.18775e-10,2.845558
4,4,01:27:22.793,-01:15:59.595,3.62e+20,0.8,-999/-999,3.75018e-14,1.37966e-14,3.11379e-14,1.66103e-14,-999.0,4.28551e-14,2.28608e-14,-999.0,1.19985e-14,-4.74274e-11,8.90714e-11,2.718191


In [7]:
from astropy import units
cols = {'ra':'RA', 'dec':'DEC', 'id':'ID'}
radius = 6*units.arcsec

In [11]:
xcat = xmatch(df, df, columns_A=cols, columns_B=cols, radius=radius, snr_column='snr')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [12]:
xcat

Unnamed: 0_level_0,A,A,A,B,B,B,AB,AB,AB
Unnamed: 0_level_1,RA,DEC,ID,RA,DEC,ID,snr,duplicates,snrs
0,01:26:58.792,-01:14:17.856,0,01:26:58.795,-01:14:17.195,10,10.000000,0,4.913078594836078
1,01:26:42.217,-01:14:06.962,1,01:26:42.199,-01:14:07.136,11,15.704994,1,7.119993018588011
2,01:26:32.964,-01:12:19.438,2,01:26:32.618,-01:12:21.290,12,4.466154,2,2.910999518085753
3,01:26:27.323,-01:18:13.705,3,01:26:27.323,-01:18:13.705,3,2.845558,,
4,01:27:22.793,-01:15:59.595,4,01:27:22.695,-01:15:59.333,13,6.275823,4,2.718191438470348
5,01:25:44.247,-01:22:43.589,5,01:25:44.192,-01:22:44.154,15,6.100005,5,4.399994548873889
6,01:26:00.398,-01:20:38.631,6,01:26:00.389,-01:20:38.317,14,11.823797,6,8.208330494617794
7,01:25:35.964,-01:25:45.307,7,01:25:35.983,-01:25:41.160,16,7.333680,7,5.416870728074132
8,01:26:05.435,-01:26:52.958,8,01:26:05.435,-01:26:52.958,8,2.958493,,
9,01:26:11.419,-01:11:55.131,9,01:26:11.419,-01:11:55.131,9,3.750004,,


In [25]:
pcat = df.set_index('ID').loc[xcat[('B','ID')]]

In [26]:
pcat.head()

Unnamed: 0_level_0,RA,DEC,NH,ENERGY_SLOPE,ENERGY_SLOPE_ERROR,nufnu_5keV(erg.s-1.cm-2),nufnu_error_5keV(erg.s-1.cm-2),nufnu_0.5keV(erg.s-1.cm-2),nufnu_error_0.5keV(erg.s-1.cm-2),upper_limit_0.5keV(erg.s-1.cm-2),nufnu_1.5keV(erg.s-1.cm-2),nufnu_error_1.5keV(erg.s-1.cm-2),upper_limit_1.5keV(erg.s-1.cm-2),nufnu_4.5keV(erg.s-1.cm-2),nufnu_error_4.5keV(erg.s-1.cm-2),upper_limit_4.5keV(erg.s-1.cm-2),snr
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
10,01:26:58.795,-01:14:17.195,3.67e+20,-0.02,+0.17/-0.14,5.34674e-13,5.34674e-14,7.91795e-14,1.50004e-14,-999.0,1.06004e-13,2.31625e-14,-999.0,4.7803e-13,6.60676e-14,-999.0,10.0
11,01:26:42.199,-01:14:07.136,3.71e+20,1.82,+0.18/-0.14,9.83007e-14,6.2592e-15,5.69335e-13,4.693e-14,-999.0,3.31711e-13,3.62454e-14,-999.0,1.0504e-13,2.27612e-14,-999.0,15.704994
12,01:26:32.618,-01:12:21.290,3.73e+20,0.8,-999/-999,4.56758e-14,1.02271e-14,3.81165e-14,1.21148e-14,-999.0,5.20963e-14,1.65582e-14,-999.0,1.36195e-14,-3.50362e-11,5.9302e-11,4.466154
3,01:26:27.323,-01:18:13.705,3.78e+20,0.8,-999/-999,4.03614e-14,1.4184e-14,6.74823e-14,2.3715e-14,-999.0,3.06514e-14,-3.22777e-11,7.90159e-11,1.28542e-14,-4.8519e-11,1.18775e-10,2.845558
13,01:27:22.695,-01:15:59.333,3.62e+20,1.131,+0.35/-0.24,5.60677e-14,8.93392e-15,6.3445e-14,1.55279e-14,-999.0,7.94832e-14,1.94532e-14,-999.0,5.599e-14,1.94396e-14,-999.0,6.275823


In [27]:
pcat.describe()

Unnamed: 0,NH,ENERGY_SLOPE,nufnu_5keV(erg.s-1.cm-2),nufnu_error_5keV(erg.s-1.cm-2),nufnu_0.5keV(erg.s-1.cm-2),nufnu_error_0.5keV(erg.s-1.cm-2),upper_limit_0.5keV(erg.s-1.cm-2),nufnu_1.5keV(erg.s-1.cm-2),nufnu_error_1.5keV(erg.s-1.cm-2),upper_limit_1.5keV(erg.s-1.cm-2),nufnu_4.5keV(erg.s-1.cm-2),nufnu_error_4.5keV(erg.s-1.cm-2),upper_limit_4.5keV(erg.s-1.cm-2),snr
count,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0,1079.0
mean,4.597396e+20,0.745594,1.123217e-11,2.654995e-14,6.6047e-12,-9.984959e-13,-944.3744,8.864537e-12,-3.594645e-12,-932.3383,1.078167e-11,-5.504448e-12,-906.4143,inf
std,3.092824e+20,0.356392,1.674126e-10,1.76979e-13,1.1388e-10,6.269327e-12,227.233,1.411701e-10,4.566946e-11,249.4172,1.610108e-10,2.576469e-11,289.8256,
min,1.92e+20,-1.016,2.1582e-15,0.0,0.0,-9.5904e-11,-999.0,0.0,-1.42085e-09,-999.0,0.0,-3.38856e-10,-999.0,2.085714
25%,2.635e+20,0.799,1.526155e-14,3.3282e-15,6.187325e-15,2.450835e-15,-999.0,1.02014e-14,3.24463e-15,-999.0,1.31465e-14,4.021325e-15,-999.0,3.206421
50%,3.19e+20,0.8,2.81934e-14,7.1111e-15,1.50244e-14,5.96252e-15,-999.0,2.21638e-14,8.23964e-15,-999.0,2.91314e-14,1.09075e-14,-999.0,4.241186
75%,5.03e+20,0.8,6.46254e-14,1.28e-14,3.9222e-14,1.15661e-14,-999.0,5.58857e-14,1.602425e-14,-999.0,6.910105e-14,2.107255e-14,-999.0,6.547396
max,1.18e+21,3.321,3.77914e-09,2.98968e-12,3.31773e-09,2.5064e-12,1.65705e-10,3.60233e-09,3.18358e-12,2.37267e-10,3.70676e-09,4.5773e-12,3.57162e-10,inf


In [28]:
len(pcat)

1079

In [29]:
len(df)

2036

In [30]:
pcat.reset_index(inplace=True)
del pcat['snr']
pcat.head()

Unnamed: 0,ID,RA,DEC,NH,ENERGY_SLOPE,ENERGY_SLOPE_ERROR,nufnu_5keV(erg.s-1.cm-2),nufnu_error_5keV(erg.s-1.cm-2),nufnu_0.5keV(erg.s-1.cm-2),nufnu_error_0.5keV(erg.s-1.cm-2),upper_limit_0.5keV(erg.s-1.cm-2),nufnu_1.5keV(erg.s-1.cm-2),nufnu_error_1.5keV(erg.s-1.cm-2),upper_limit_1.5keV(erg.s-1.cm-2),nufnu_4.5keV(erg.s-1.cm-2),nufnu_error_4.5keV(erg.s-1.cm-2),upper_limit_4.5keV(erg.s-1.cm-2)
0,10,01:26:58.795,-01:14:17.195,3.67e+20,-0.02,+0.17/-0.14,5.34674e-13,5.34674e-14,7.91795e-14,1.50004e-14,-999.0,1.06004e-13,2.31625e-14,-999.0,4.7803e-13,6.60676e-14,-999.0
1,11,01:26:42.199,-01:14:07.136,3.71e+20,1.82,+0.18/-0.14,9.83007e-14,6.2592e-15,5.69335e-13,4.693e-14,-999.0,3.31711e-13,3.62454e-14,-999.0,1.0504e-13,2.27612e-14,-999.0
2,12,01:26:32.618,-01:12:21.290,3.73e+20,0.8,-999/-999,4.56758e-14,1.02271e-14,3.81165e-14,1.21148e-14,-999.0,5.20963e-14,1.65582e-14,-999.0,1.36195e-14,-3.50362e-11,5.9302e-11
3,3,01:26:27.323,-01:18:13.705,3.78e+20,0.8,-999/-999,4.03614e-14,1.4184e-14,6.74823e-14,2.3715e-14,-999.0,3.06514e-14,-3.22777e-11,7.90159e-11,1.28542e-14,-4.8519e-11,1.18775e-10
4,13,01:27:22.695,-01:15:59.333,3.62e+20,1.131,+0.35/-0.24,5.60677e-14,8.93392e-15,6.3445e-14,1.55279e-14,-999.0,7.94832e-14,1.94532e-14,-999.0,5.599e-14,1.94396e-14,-999.0


In [31]:
pcat.to_csv('table_flux_detections_lamassa_unique.csv', index=False)