# Matching cluster membership studies for all clusters

In [1]:
import pandas as pd
import numpy as np
from time import asctime as time

# Some custom functions:
from catalog_matching.catalog_matching import (rename_columns,
                                               get_catalogs,
                                               crossmatch_multiple_catalogs,
                                               pick_members_and_produce_k2_search_input,
                                               merge_k2_info_and_protocol,
                                               )

# Prepare ancillary information and a few cluster variables

In [2]:
# Read the paths and define some generic strings and names

paths = !ls -r downloads/clusters*/*sv
ra = 'RAJ2000'
dec = 'DEJ2000'
pmem = 'PMem'
binary = 'binary'
p_gaiac = 0.9 # assign membership probability for members identified without p-value in Gaia Collaboration (2018)
p_rebull = 0.9 # assign membership probability for members identified without p-value in Rebull et al. (2017)
crossmatch_kwargs = {'arcsec':3., 'plot':True, 'union':True, 'bijective':True} # keyword argument to pass to the cross matching function
pmem_threshold = .8 # threshold for accepted membership probability

# group paths with the same clusters

clusters = ['pleiades', 'praesepe','ngc2168','ngc6774','hyades','ngc2682']
cluster_dict = dict()
out = 'catalog_matching/matched_catalogs/membership_matches/'

for name in clusters:
    l = [path for path in paths if path.find(name) != -1 ]
    cluster_dict[name] = l

#read in cluster table characteristics

table_chars = pd.read_csv('downloads/cluster_catalog_characteristics.csv', sep=';',index_col=False)

#split usecols column into a list of column names each

table_chars.usecols = table_chars.usecols.str.split(',')

table_chars

Unnamed: 0,folder,name,ext,nskip,sep,usecols,cluster
0,clusters_Bouy,ngc2168_bouy,tsv,42,\t,"[RAJ2000, DEJ2000, DANCe, Mm]",M 35
1,clusters_CG,ngc2168,tsv,77,\t,"[RA_ICRS, DE_ICRS, PMemb, Source]",M 35
2,clusters_CG,ngc6774,tsv,77,\t,"[RA_ICRS, DE_ICRS, PMemb, Source]",Ruprecht 147
3,clusters_CG,pleiades,tsv,77,\t,"[RA_ICRS, DE_ICRS, PMemb, Source]",Pleiades
4,clusters_CG,praesepe,tsv,77,\t,"[RA_ICRS, DE_ICRS, PMemb, Source]",Praesepe
5,clusters_GaiaC,hyades_corr,csv,0,",","[RAJ2000_epoch2000, DEJ2000_epoch2000, Source]",Hyades
6,clusters_GaiaC,ngc2168,tsv,46,\t,"[RA_ICRS, DE_ICRS, Source]",M 35
7,clusters_GaiaC,ngc6774,tsv,46,\t,"[RA_ICRS, DE_ICRS, Source]",Ruprecht 147
8,clusters_GaiaC,pleiades,tsv,50,\t,"[RA_ICRS, DE_ICRS, Source]",Pleiades
9,clusters_GaiaC,praesepe,tsv,53,\t,"[RA_ICRS, DE_ICRS, Source]",Praesepe


## The clusters are treated in this order: M67, Hyades, M35, Ruprecht 147, Pleiades, Praesepe

# M 67

In [3]:
# Select the M 67 rows of the characteristics table and load the matching catalogs
name = 'M 67'
sname = 'ngc2682'
tc = table_chars.loc[table_chars['cluster'] == name]
pathlist = cluster_dict[sname]
cats = get_catalogs(tc, pathlist)

# new column names per catalog, passed to rename_columns below
cols = {
    'Gonzalez': ['EPIC_id', pmem, ra, dec],
    'Gao': ['gaia_id', ra, dec, pmem],
}
renamed_catalogs = {}

  df = pd.read_table(p,skiprows=r, usecols=row.usecols, delimiter=row['sep'])


In [4]:
# Gao: only the column names are changed
r = "Gao"
c1 = rename_columns(cats[r], r, cols[r])
renamed_catalogs[r] = c1

# Gonzalez: membership comes as a classifier string instead of a probability
r = "Gonzalez"
c2 = rename_columns(cats[r], r, cols[r])

# Map classifiers to probabilities:
gonzalezmap = {'SN':0.1,'BN':0.1,'N':0.1,'M':.9,'SM':.9,'BM':.9,'U':.5}
# Only the 'BM' classifier is flagged as binary
binarymap = {'SN':0,'BN':0,'N':0,'M':0,'SM':0,'BM':1,'U':0}

# strip the classifier once and reuse it for both lookups;
# classifiers missing from the maps fall back to 0 (binary) and 0.5 (probability)
classifier = c2["PMem_Gonzalez"].str.strip()
c2["binary"] = classifier.map(binarymap).fillna(0)
c2["PMem_Gonzalez"] = classifier.map(gonzalezmap).fillna(0.5)
renamed_catalogs[r] = c2

### Add hints about which targets have Nardiello's PSF-extracted light curves (LCs)

In [5]:
# Load the Nardiello M 67 target list
nar = pd.read_csv('downloads/superstamps/ngc2682/ngc2682_nardiello_targets.csv')

# The '  K_2MASS ID' column packs two values into one string: split it once on
# single spaces and keep element 0 (K_2MASS) and element 2 (ID).
# NOTE(review): index 2 suggests the two values are separated by two spaces -- confirm against the file.
packed_column = '  K_2MASS ID'
tokens = nar[packed_column].apply(lambda entry: str(entry).strip().split(' '))
nar['K_2MASS'] = tokens.apply(lambda parts: parts[0])
nar['ID'] = tokens.apply(lambda parts: parts[2])
nar = nar.drop(packed_column, axis=1)

# remove the padding around the remaining headers, then give the columns the "_N" suffix
nar.columns = nar.columns.str.strip()
nar = nar.rename(index=str, columns={'RA': 'RAJ2000_N',
                                     'DEC': 'DEJ2000_N',
                                     'ID': 'id_N'})

# only id and coordinates go into the cross-match
nar2 = nar[['id_N', 'RAJ2000_N', 'DEJ2000_N']]
renamed_catalogs["N"] = nar2

### Cross-match all three:

In [6]:
# Cross-match all renamed M 67 catalogs (Gao, Gonzalez, N); the shared
# crossmatch_kwargs are forwarded as keyword arguments.
cross = crossmatch_multiple_catalogs(renamed_catalogs, name, sname, **crossmatch_kwargs)

In [7]:
# Write the full cross-matched table to <out><sname>_allmembers.csv without the index
allmembers_csv = '{}{}_allmembers.csv'.format(out, sname)
cross.to_csv(allmembers_csv, index=False)

In [8]:
# Show the second-to-last column name with its first 8 characters removed
cross.columns[-2][8:]

'N_Gao_Gonzalez'

##### Pick out likely members, and write out RA, Dec file to match with K2

In [9]:
# The coordinate suffix is the second-to-last column name without its first 8 characters
# NOTE(review): 8 matches the length of 'RAJ2000_' / 'DEJ2000_' -- confirm against crossmatch output
second_last_column = cross.columns[-2]
coords = second_last_column[8:]

# Replace `cross` with the result of the member selection (pmem_threshold is the cut on the mean probability)
cross = pick_members_and_produce_k2_search_input(
    cross, sname, name,
    coords=coords,
    PMem_mean_threshold=pmem_threshold,
)

There are 4671 candidate members in M 67.
There are 1344 members in M 67 with membership probability above 0.8.


Now go to [the K2 search mask](https://archive.stsci.edu/k2/data_search/search.php) and feed it the file you just saved.

Take both long and short cadence data; the search radius is 0.05 arcmin. Sort by angular separation, then K2 ID, then Campaign. The first lines need to be shifted manually to the row after the header rows. Display coordinates in degrees. Suppress null outputs (bottom left button).

### Post K2 Search Cleaning

In [10]:
# Post-K2-search step for M 67, handled by the custom helper; the return value is not stored.
# NOTE(review): presumably expects the K2 search result file to exist already -- confirm in catalog_matching.
merge_k2_info_and_protocol(cross, sname, coords)

# HYADES

In [11]:
# Select the Hyades rows of the characteristics table and load the matching catalogs
name = 'Hyades'
sname = 'hyades'
tc = table_chars.loc[table_chars['cluster'] == name]
pathlist = cluster_dict[sname]
cats = get_catalogs(tc, pathlist)

# new column names per catalog, passed to rename_columns below
cols = {
    'Douglas': [ra, dec, pmem, binary],
    'Reino': [pmem, ra, dec, "GaiaDR1"],
    'GaiaC': ["ID", ra, dec],
}
renamed_catalogs = {}

  df = pd.read_table(p,skiprows=r, usecols=row.usecols, delimiter=row['sep'])


In [12]:
# Douglas: scale the membership column by 1e-2 and encode the binary flag as 0/1
r = "Douglas"
c1 = rename_columns(cats[r], r, cols[r])
c1["PMem_Douglas"] = c1["PMem_Douglas"] * 1e-2
binary_codes = {"N": 0, "Y": 1}
c1["binary_Douglas"] = c1["binary_Douglas"].map(binary_codes)

In [13]:
# `r` and `c1` still refer to the Douglas catalog prepared in the previous cell
renamed_catalogs[r] = c1

# Reino: only the column names are changed
r = "Reino"
renamed_catalogs[r] = c2 = rename_columns(cats[r], r, cols[r])

# Gaia Collaboration: no p-values, so every listed star gets p_gaiac
r = "GaiaC"
c3 = rename_columns(cats[r], r, cols[r])
c3["PMem_GaiaC"] = p_gaiac
renamed_catalogs[r] = c3


In [14]:
# Cross-match all renamed Hyades catalogs (Douglas, Reino, GaiaC); the shared
# crossmatch_kwargs are forwarded as keyword arguments.
cross = crossmatch_multiple_catalogs(renamed_catalogs, name, sname, **crossmatch_kwargs)

In [15]:
# Write the full cross-matched table to <out><sname>_allmembers.csv without the index
allmembers_csv = '{}{}_allmembers.csv'.format(out, sname)
cross.to_csv(allmembers_csv, index=False)

##### Pick out likely members, and write out RA, Dec file to match with K2

In [16]:
# The coordinate suffix is the second-to-last column name without its first 8 characters
# NOTE(review): 8 matches the length of 'RAJ2000_' / 'DEJ2000_' -- confirm against crossmatch output
second_last_column = cross.columns[-2]
coords = second_last_column[8:]

# Replace `cross` with the result of the member selection (pmem_threshold is the cut on the mean probability)
cross = pick_members_and_produce_k2_search_input(
    cross, sname, name,
    coords=coords,
    PMem_mean_threshold=pmem_threshold,
)

There are 896 candidate members in Hyades.
There are 655 members in Hyades with membership probability above 0.8.


Now go to [the K2 search mask](https://archive.stsci.edu/k2/data_search/search.php) and feed it the file you just saved.

Take both long and short cadence data; the search radius is 0.05 arcmin. Sort by angular separation, then K2 ID, then Campaign. The first lines need to be shifted manually to the row after the header rows. Display coordinates in degrees. Suppress null outputs (bottom left button).

### Post K2 Search Cleaning

In [17]:
# Post-K2-search step for the Hyades, handled by the custom helper; the return value is not stored.
# NOTE(review): presumably expects the K2 search result file to exist already -- confirm in catalog_matching.
merge_k2_info_and_protocol(cross, sname, coords)

# M35

In [18]:
# Select the M 35 rows of the characteristics table and load the matching catalogs
name = 'M 35'
sname = 'ngc2168'
tc = table_chars.loc[table_chars['cluster'] == name]
pathlist = cluster_dict[sname]
cats = get_catalogs(tc, pathlist)

# new column names per catalog, passed to rename_columns below
cols = {
    'Bouy': ['DANCe_id', ra, dec, pmem],
    'CG': [ra, dec, 'gaia_id', pmem],
    'GaiaC': ['gaia_id', ra, dec],
}

renamed_catalogs = {}

  df = pd.read_table(p,skiprows=r, usecols=row.usecols, delimiter=row['sep'])


In [19]:
# Bouy: remove the '+' characters from the DANCe identifiers
r = "Bouy"
c0 = rename_columns(cats[r], r, cols[r])
# '+' is a regex metacharacter and the default of `regex` differs between
# pandas versions, so request a literal replacement explicitly.
c0["DANCe_id_Bouy"] = c0["DANCe_id_Bouy"].str.replace('+', '', regex=False)
renamed_catalogs[r] = c0

# CG: only the column names are changed
r = "CG"
c1 = rename_columns(cats[r], r, cols[r])
renamed_catalogs[r] = c1

# GaiaC: no p-values, so every listed star gets p_gaiac
r = "GaiaC"
c2 = rename_columns(cats[r], r, cols[r])
c2["PMem_GaiaC"] = p_gaiac
renamed_catalogs[r] = c2

##### Add hints about which targets have PSF-detrended light curves (LCs) from Soares-Furtado

In [20]:
# Load the Soares-Furtado object summary and give its columns the "_SF" suffix
sf = pd.read_csv('downloads/superstamps/ngc2168/object-summary_c0.csv')
sf = sf.rename(index=str, columns={'ra':'RAJ2000_SF',
                                   'dec':'DEJ2000_SF',
                                   'object':'2MASS_id_SF',})

# Take an explicit copy of the column subset: a column is assigned on it below,
# and assigning on a derived frame can raise SettingWithCopyWarning.
sf = sf[['2MASS_id_SF','RAJ2000_SF','DEJ2000_SF']].copy()

# keep only the digit characters of each identifier
sf['2MASS_id_SF'] = sf['2MASS_id_SF'].apply(
    lambda identifier: ''.join(char for char in identifier if char.isdigit()))

renamed_catalogs["SF"] = sf

### Cross-match all catalogs

In [21]:
# Cross-match all renamed M 35 catalogs (Bouy, CG, GaiaC, SF); the shared
# crossmatch_kwargs are forwarded as keyword arguments.
cross = crossmatch_multiple_catalogs(renamed_catalogs, name, sname, **crossmatch_kwargs)

In [22]:
# Write the full cross-matched table to <out><sname>_allmembers.csv without the index
allmembers_csv = '{}{}_allmembers.csv'.format(out, sname)
cross.to_csv(allmembers_csv, index=False)

##### Pick out likely members, and write out RA, Dec file to match with K2

In [23]:
# The coordinate suffix is the second-to-last column name without its first 8 characters
# NOTE(review): 8 matches the length of 'RAJ2000_' / 'DEJ2000_' -- confirm against crossmatch output
second_last_column = cross.columns[-2]
coords = second_last_column[8:]

# Replace `cross` with the result of the member selection (pmem_threshold is the cut on the mean probability)
cross = pick_members_and_produce_k2_search_input(
    cross, sname, name,
    coords=coords,
    PMem_mean_threshold=pmem_threshold,
)

There are 338749 candidate members in M 35.
There are 1614 members in M 35 with membership probability above 0.8.


Now go to [the K2 search mask](https://archive.stsci.edu/k2/data_search/search.php) and feed it the file you just saved.

Take both long and short cadence data; the search radius is 0.05 arcmin. Sort by angular separation, then K2 ID, then Campaign. The first lines need to be shifted manually to the row after the header rows. Display coordinates in degrees. Suppress null outputs (bottom left button).

### Post K2 Search Cleaning

In [24]:
# Post-K2-search step for M 35, handled by the custom helper; the return value is not stored.
# NOTE(review): presumably expects the K2 search result file to exist already -- confirm in catalog_matching.
merge_k2_info_and_protocol(cross, sname, coords)

# RUPRECHT 147

In [25]:
# Select the Ruprecht 147 rows of the characteristics table and load the matching catalogs
name = 'Ruprecht 147'
sname = 'ngc6774'
tc = table_chars.loc[table_chars['cluster'] == name]
pathlist = cluster_dict[sname]
cats = get_catalogs(tc, pathlist)

# new column names per catalog, passed to rename_columns below
cols = {
    'CG': [ra, dec, 'gaia_id', pmem],
    'Curtis': [ra, dec, '2MASS_id', pmem],
    'Olivares': [ra, dec, pmem, "EPIC"],
    'GaiaC': ['gaia_id', ra, dec],
}

renamed_catalogs = {}

  df = pd.read_table(p,skiprows=r, usecols=row.usecols, delimiter=row['sep'])


In [26]:
# CG: only the column names are changed
r = "CG"
c0 = rename_columns(cats[r], r, cols[r])
renamed_catalogs[r] = c0

# GaiaC: no p-values, so every listed star gets p_gaiac
r = "GaiaC"
c1 = rename_columns(cats[r], r, cols[r])
c1["PMem_GaiaC"] = p_gaiac
renamed_catalogs[r] = c1

# Curtis: membership comes as a letter flag instead of a probability
r = "Curtis"
c2 = rename_columns(cats[r], r, cols[r])
# literal replacement: drop the '-' characters from the 2MASS identifiers
c2['2MASS_id_Curtis'] = c2['2MASS_id_Curtis'].str.replace('-', '', regex=False)
# Map the flags to probabilities in one pass, then force a numeric dtype.
# Assigning floats into the string column value by value leaves an object
# column that mixes floats with any unmapped strings.
# NOTE(review): flags other than Y/P/N/B now become NaN instead of staying
# as strings -- confirm that this is the intended treatment.
curtis_map = {'Y': 0.9, 'P': 0.7, 'N': 0.1, 'B': 0.0}
curtis_pmem = c2['PMem_Curtis'].map(lambda flag: curtis_map.get(flag, flag))
c2['PMem_Curtis'] = pd.to_numeric(curtis_pmem, errors='coerce')
renamed_catalogs[r] = c2

# Olivares: only the column names are changed
r = "Olivares"
c3 = rename_columns(cats[r], r, cols[r])
renamed_catalogs[r] = c3


In [27]:
# Cross-match all renamed Ruprecht 147 catalogs (CG, GaiaC, Curtis, Olivares); the shared
# crossmatch_kwargs are forwarded as keyword arguments.
cross = crossmatch_multiple_catalogs(renamed_catalogs, name, sname, **crossmatch_kwargs)

In [28]:
# Write the full cross-matched table to <out><sname>_allmembers.csv without the index
allmembers_csv = '{}{}_allmembers.csv'.format(out, sname)
cross.to_csv(allmembers_csv, index=False)

##### Pick out likely members, and write out RA, Dec file to match with K2

In [29]:
# The coordinate suffix is the second-to-last column name without its first 8 characters
# NOTE(review): 8 matches the length of 'RAJ2000_' / 'DEJ2000_' -- confirm against crossmatch output
second_last_column = cross.columns[-2]
coords = second_last_column[8:]

# Replace `cross` with the result of the member selection (pmem_threshold is the cut on the mean probability)
cross = pick_members_and_produce_k2_search_input(
    cross, sname, name,
    coords=coords,
    PMem_mean_threshold=pmem_threshold,
)

There are 353 candidate members in Ruprecht 147.
There are 213 members in Ruprecht 147 with membership probability above 0.8.


Now go to [the K2 search mask](https://archive.stsci.edu/k2/data_search/search.php) and feed it the file you just saved.

Take both long and short cadence data; the search radius is 0.05 arcmin. Sort by angular separation, then K2 ID, then Campaign. The first lines need to be shifted manually to the row after the header rows. Display coordinates in degrees. Suppress null outputs (bottom left button).

### Post K2 Search Cleaning

In [30]:
# Post-K2-search step for Ruprecht 147, handled by the custom helper; the return value is not stored.
# NOTE(review): presumably expects the K2 search result file to exist already -- confirm in catalog_matching.
merge_k2_info_and_protocol(cross, sname, coords)

[GO 7035](https://keplerscience.arc.nasa.gov/data/k2-programs/GO7035.txt) has 32 SC targets, of which we use 25.

# PLEIADES

In [31]:
# Select the Pleiades rows of the characteristics table and load the matching catalogs
name = 'Pleiades'
sname = 'pleiades'
tc = table_chars.loc[table_chars['cluster'] == name]
pathlist = cluster_dict[sname]
cats = get_catalogs(tc, pathlist)

# new column names per catalog, passed to rename_columns below
cols = {
    'CG': [ra, dec, 'gaia_id', pmem],
    'Rebull': ['EPIC_id', ra, dec, pmem],
    'GaiaC': ["gaia_id", ra, dec],
    'Olivares': [ra, dec, pmem, binary],
}

renamed_catalogs = {}

  df = pd.read_table(p,skiprows=r, usecols=row.usecols, delimiter=row['sep'])


In [32]:
# CG: only the column names are changed
r = "CG"
renamed_catalogs[r] = c0 = rename_columns(cats[r], r, cols[r])

# Rebull: only the column names are changed
r = "Rebull"
renamed_catalogs[r] = c1 = rename_columns(cats[r], r, cols[r])

# GaiaC: no p-values, so every listed star gets p_gaiac
r = "GaiaC"
c2 = rename_columns(cats[r], r, cols[r])
c2["PMem_GaiaC"] = p_gaiac
renamed_catalogs[r] = c2

# Olivares: only the column names are changed
r = "Olivares"
renamed_catalogs[r] = c3 = rename_columns(cats[r], r, cols[r])

In [33]:
# Cross-match all renamed Pleiades catalogs (CG, Rebull, GaiaC, Olivares); the shared
# crossmatch_kwargs are forwarded as keyword arguments.
cross = crossmatch_multiple_catalogs(renamed_catalogs, name, sname, **crossmatch_kwargs)

In [34]:
# Write the full cross-matched table to <out><sname>_allmembers.csv without the index
allmembers_csv = '{}{}_allmembers.csv'.format(out, sname)
cross.to_csv(allmembers_csv, index=False)

##### Pick out likely members, and write out RA, Dec file to match with K2

In [35]:
# The coordinate suffix is the second-to-last column name without its first 8 characters
# NOTE(review): 8 matches the length of 'RAJ2000_' / 'DEJ2000_' -- confirm against crossmatch output
second_last_column = cross.columns[-2]
coords = second_last_column[8:]

# Replace `cross` with the result of the member selection (pmem_threshold is the cut on the mean probability)
cross = pick_members_and_produce_k2_search_input(
    cross, sname, name,
    coords=coords,
    PMem_mean_threshold=pmem_threshold,
)

There are 1424987 candidate members in Pleiades.
There are 2033 members in Pleiades with membership probability above 0.8.


Now go to [the K2 search mask](https://archive.stsci.edu/k2/data_search/search.php) and feed it the file you just saved.

Take both long and short cadence data; the search radius is 0.05 arcmin. Sort by angular separation, then K2 ID, then Campaign. Display coordinates in degrees. Suppress null outputs (bottom left button).

### Post K2 Search Cleaning

In [36]:
# Post-K2-search step for the Pleiades, handled by the custom helper; the return value is not stored.
# NOTE(review): presumably expects the K2 search result file to exist already -- confirm in catalog_matching.
merge_k2_info_and_protocol(cross, sname, coords)

# PRAESEPE

In [37]:
# Select the Praesepe rows of the characteristics table and load the matching catalogs
name = 'Praesepe'
sname = 'praesepe'
tc = table_chars.loc[table_chars['cluster'] == name]
pathlist = cluster_dict[sname]
cats = get_catalogs(tc, pathlist)

# new column names per catalog, passed to rename_columns below
cols = {
    'CG': [ra, dec, 'gaia_id', pmem],
    'Douglas': [ra, dec, pmem, binary],
    'Rebull': [ra, dec, "EPIC_id"],
    'GaiaC': ['gaia_id', ra, dec],
}

renamed_catalogs = {}

  df = pd.read_table(p,skiprows=r, usecols=row.usecols, delimiter=row['sep'])


In [38]:
# CG: only the column names are changed
r = "CG"
c0 = rename_columns(cats[r], r, cols[r])
renamed_catalogs[r] = c0

# Rebull: no p-values, so every listed star gets p_rebull
r = "Rebull"
c1 = rename_columns(cats[r], r, cols[r])
c1["PMem_Rebull"] = p_rebull
renamed_catalogs[r] = c1

# GaiaC: no p-values, so every listed star gets p_gaiac
r = "GaiaC"
c2 = rename_columns(cats[r], r, cols[r])
c2["PMem_GaiaC"] = p_gaiac
renamed_catalogs[r] = c2

# Douglas: encode the binary flag as 0/1 and convert the membership text to numbers
r = "Douglas"
c3 = rename_columns(cats[r], r, cols[r])
c3["binary_Douglas"] = c3["binary_Douglas"].map({"N": 0, "Y": 1})
# PMem_Douglas holds strings with padding; blank entries become NaN directly
# (no "-999" sentinel round-trip, so a genuine -999 is no longer turned into NaN).
# Non-numeric text still raises, as before. The values are then divided by 100.
pmem_text = c3["PMem_Douglas"].str.strip()
c3["PMem_Douglas"] = pd.to_numeric(pmem_text.replace("", np.nan)) / 100
renamed_catalogs[r] = c3

In [39]:
# Cross-match all renamed Praesepe catalogs (CG, Rebull, GaiaC, Douglas); the shared
# crossmatch_kwargs are forwarded as keyword arguments.
cross = crossmatch_multiple_catalogs(renamed_catalogs, name, sname, **crossmatch_kwargs)

In [40]:
# Write the full cross-matched table to <out><sname>_allmembers.csv without the index
allmembers_csv = '{}{}_allmembers.csv'.format(out, sname)
cross.to_csv(allmembers_csv, index=False)

##### Pick out likely members, and write out RA, Dec file to match with K2

In [41]:
# The coordinate suffix is the second-to-last column name without its first 8 characters
# NOTE(review): 8 matches the length of 'RAJ2000_' / 'DEJ2000_' -- confirm against crossmatch output
second_last_column = cross.columns[-2]
coords = second_last_column[8:]

# Replace `cross` with the result of the member selection (pmem_threshold is the cut on the mean probability)
cross = pick_members_and_produce_k2_search_input(
    cross, sname, name,
    coords=coords,
    PMem_mean_threshold=pmem_threshold,
)

There are 1391 candidate members in Praesepe.
There are 1281 members in Praesepe with membership probability above 0.8.


Now go to [the K2 search mask](https://archive.stsci.edu/k2/data_search/search.php) and feed it the file you just saved.

Take only long cadence data; the search radius is 0.05 arcmin. Select all columns and only long cadence data. Sort by angular separation, then K2 ID, then Campaign. The first lines need to be shifted manually to the row after the header rows. Display coordinates in degrees. Suppress null outputs (bottom left button).

### Post K2 Search Cleaning

In [42]:
# Post-K2-search step for Praesepe, handled by the custom helper; the return value is not stored.
# NOTE(review): presumably expects the K2 search result file to exist already -- confirm in catalog_matching.
merge_k2_info_and_protocol(cross, sname, coords)