# Compile labelled data

### 2023-10-20 - 2024-04-17 Johannes Sahlmann

- compile labelled dataset as input for model training
- generate label table

Gaia Data Release 3. Stellar multiplicity, a teaser for the hidden treasure
https://ui.adsabs.harvard.edu/abs/2023A%26A...674A..34G/abstract


Table 11

In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

import logging
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from pystrometry.utils.archives import get_gaiadr_data
import matplotlib as mp
from pystrometry.utils import du437_tools

# import matplotlib.pyplot as pl

universal_helpers not available


In [2]:
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [3]:
overwrite = False
overwrite_labelled_list = False
# overwrite_labelled_list = True

# dataset_tag = 'v0.0'
dataset_tag = 'v0.6'
data_path = f'/media/team_workspaces/Gaia-Orbit-Classification/sandbox/dataset_{dataset_tag}'



# Confirmed exoplanets

In [4]:
df = pd.DataFrame()

reference = '2023AA...674A..34G'

sids = """6421118739093252224
4062446910648807168
1594127865540229888
4745373133284418816
2367734656180397952
5855730584310531200
637329067477530368
4976894960284258048
2603090003484152064"""

label = 'exoplanet'

df['source_id'] = np.array(sids.split('\n')).astype(np.int64)
df['reference'] = reference
df['label'] = label
df['id'] = 'none'

# df.loc[df['source_id']==2367734656180397952, 'id'] = 'BD-170063'
# df.loc[df['source_id']==1594127865540229888, 'id'] = 'HD132406'
# df.loc[df['source_id']==637329067477530368, 'id'] = 'HD81040'

id2 = """HD175167
HD164604
HD132406
HR810
BD−170063 
HD111232
HD81040
HD142
GJ876"""
df['id'] = np.array(id2.split('\n'))

df.loc[len(df)] = [3424193536079703808, '2023MNRAS.526.5155S', label, 'HD39392'] # mass < 20 Mjup


exoplanets = df.copy()

labelled_sources = exoplanets
labelled_sources

Unnamed: 0,source_id,reference,label,id
0,6421118739093252224,2023AA...674A..34G,exoplanet,HD175167
1,4062446910648807168,2023AA...674A..34G,exoplanet,HD164604
2,1594127865540229888,2023AA...674A..34G,exoplanet,HD132406
3,4745373133284418816,2023AA...674A..34G,exoplanet,HR810
4,2367734656180397952,2023AA...674A..34G,exoplanet,BD−170063
5,5855730584310531200,2023AA...674A..34G,exoplanet,HD111232
6,637329067477530368,2023AA...674A..34G,exoplanet,HD81040
7,4976894960284258048,2023AA...674A..34G,exoplanet,HD142
8,2603090003484152064,2023AA...674A..34G,exoplanet,GJ876
9,3424193536079703808,2023MNRAS.526.5155S,exoplanet,HD39392


# Confirmed brown-dwarf companions

In [5]:
df = pd.DataFrame()

reference = '2023AA...674A..34G'

sids = """2651390587219807744
2778298280881817984
3309006602007842048
3750881083756656128
3751763647996317056
685029558383335168
855523714036230016
824461960796102528
873616860770228352
5563001178343925376"""

label = 'brown_dwarf_companion'

df['source_id'] = np.array(sids.split('\n')).astype(np.int64)
df['reference'] = reference
df['label'] = label

id2 = """BD−004475
HD5433  
HD30246
HD91669
HD89707
HD77065
HD92320
HD82460
BD+291539 
HD52756"""
df['id'] = np.array(id2.split('\n'))

df.loc[len(df)] = [43574131143039104, '2023arXiv231007827F', label, 'LHS1610']

 



brown_dwarf_companions = df.copy()

labelled_sources = pd.concat([exoplanets, brown_dwarf_companions]).reset_index(drop=True)
labelled_sources

Unnamed: 0,source_id,reference,label,id
0,6421118739093252224,2023AA...674A..34G,exoplanet,HD175167
1,4062446910648807168,2023AA...674A..34G,exoplanet,HD164604
2,1594127865540229888,2023AA...674A..34G,exoplanet,HD132406
3,4745373133284418816,2023AA...674A..34G,exoplanet,HR810
4,2367734656180397952,2023AA...674A..34G,exoplanet,BD−170063
5,5855730584310531200,2023AA...674A..34G,exoplanet,HD111232
6,637329067477530368,2023AA...674A..34G,exoplanet,HD81040
7,4976894960284258048,2023AA...674A..34G,exoplanet,HD142
8,2603090003484152064,2023AA...674A..34G,exoplanet,GJ876
9,3424193536079703808,2023MNRAS.526.5155S,exoplanet,HD39392


# false-positive orbit solutions
https://www.cosmos.esa.int/web/gaia/dr3-known-issues#FalsePositive

In [6]:
df = pd.DataFrame()

reference = 'CosmosPages'  # Marcussen & Albrecht

sids = """4698424845771339520
5765846127180770432"""
# sids = """4698424845771339520"""
label = 'false_positive_orbit' # i.e. confirmed binary stars

df['source_id'] = np.array(sids.split('\n')).astype(np.int64)
df['id'] = ['WD 0141-675', 'HIP 64690'] 
# df['id'] = ['WD 0141-675'] 
df['reference'] = reference
df['label'] = label


# df.loc[len(df)] = [1509952656931878272, 'HIP67526', '2023MNRAS.526.5155S', label]
    
    
    

false_postive_orbits = df.copy()

false_postive_orbits


Unnamed: 0,source_id,id,reference,label
0,4698424845771339520,WD 0141-675,CosmosPages,false_positive_orbit
1,5765846127180770432,HIP 64690,CosmosPages,false_positive_orbit


In [7]:
# from astropy.table import Table
# Table.read('/media/team_workspaces/Gaia-Orbit-Classification/jsahlmann/data/unger23/tableA1.tex', format='latex')
# Table.read?



df = pd.DataFrame()

# reference = '2023arXiv231002758U'
reference = '2023AA...680A..16U'

sids = """3751763647996317056
1594127865540229888
6421118739093252224
824461960796102528
637329067477530368
855523714036230016
685029558383335168
1035000055055287680
5999024986946599808
5563001178343925376
2884087104955208064
3309006602007842048
1712614124767394816
2367734656180397952
873616860770228352
2161507648230817792
5957920668132624256
6647630950597964544
1142214430312151424
3626268998574790656
4994200964065634432
3937211745905473024
1224551770875466496
4133650458966620672
4724313637321332864
1318110830190386048
1181993180456516864
4753355209745022208
4745373133284418816
2778298280881817984
3750881083756656128
2651390587219807744"""



ids = """HD89707
HD132406
HD175167
HD82460
HD81040
HD92320
HD77065
HD68638A
CD-4610046
HD52756
HD40503
HD30246
HIP66074
BD-170063
BD+291539
HD166356
HD162020
HD164427
HD48679
HD112758
HD3277
HD114762
HD140913
HD151528
HD17289
HD148284
HD132032
HD17155
HR810
HD5433
HD91669
BD-004475"""



label = 'none'

df['id'] = np.array(ids.split('\n'))
df['source_id'] = np.array(sids.split('\n')).astype(np.int64)
df['reference'] = reference
df['label'] = label 

for id in ['HD68638A', 'CD-4610046', 'HD132032', 'HD91669']:
    df.loc[df['id']==id, 'label'] = 'brown_dwarf_companion'
    
for id in ['HD140913', 'HD17155', 'HD148284', 'HD48679', 'HD166356']:
    df.loc[df['id']==id, 'label'] = 'very_low_mass_stellar_companion'
    
for id in ['HD151528', 'HD114762', 'HD112758', 'HD164427', 'HD162020']:
    df.loc[df['id']==id, 'label'] = 'binary_star'



unger23 = df.copy()
# display(unger23)
# labelled_sources = pd.concat([labelled_sources, df]).reset_index(drop=True)
# labelled_sources
labelled_sources = pd.concat([exoplanets, brown_dwarf_companions, unger23[(unger23['label'].isin(['brown_dwarf_companion'])) &  (~unger23['source_id'].isin(labelled_sources['source_id'])) ]]).reset_index(drop=True)
labelled_sources = pd.concat([labelled_sources, false_postive_orbits])
labelled_sources



Unnamed: 0,source_id,reference,label,id
0,6421118739093252224,2023AA...674A..34G,exoplanet,HD175167
1,4062446910648807168,2023AA...674A..34G,exoplanet,HD164604
2,1594127865540229888,2023AA...674A..34G,exoplanet,HD132406
3,4745373133284418816,2023AA...674A..34G,exoplanet,HR810
4,2367734656180397952,2023AA...674A..34G,exoplanet,BD−170063
5,5855730584310531200,2023AA...674A..34G,exoplanet,HD111232
6,637329067477530368,2023AA...674A..34G,exoplanet,HD81040
7,4976894960284258048,2023AA...674A..34G,exoplanet,HD142
8,2603090003484152064,2023AA...674A..34G,exoplanet,GJ876
9,3424193536079703808,2023MNRAS.526.5155S,exoplanet,HD39392


## Check that these sources are in the provided dataset
### read selected astrometric orbits

In [8]:
out_path = data_path
outfile = os.path.join(out_path, 'nss_two_body_orbit_astrometric_orbits.parquet')
print(outfile)
nss_selected = pd.read_parquet(outfile)

gaia_source = pd.read_parquet(os.path.join(out_path, 'gaia_source_astrometric_orbits.parquet'))
gaia_source = gaia_source.drop_duplicates(subset='source_id')


nss_selected = nss_selected.merge(gaia_source, on='source_id', suffixes=('', '_gaia_source'))


logging.info('Dataset has {} unique source_ids, i.e. {} duplicate source_ids'.format(len(nss_selected['source_id'].unique()), len(nss_selected)-len(nss_selected['source_id'].unique())))
display(nss_selected['nss_solution_type'].value_counts())


/media/team_workspaces/Gaia-Orbit-Classification/sandbox/dataset_v0.6/nss_two_body_orbit_astrometric_orbits.parquet


INFO:root:Dataset has 169129 unique source_ids, i.e. 0 duplicate source_ids


nss_solution_type
Orbital                           134598
AstroSpectroSB1                    33467
OrbitalAlternative                   619
OrbitalTargetedSearch                339
OrbitalTargetedSearchValidated        96
OrbitalAlternativeValidated           10
Name: count, dtype: int64

In [9]:
assert np.all(labelled_sources['source_id'].isin(nss_selected['source_id']).values)

# Substellar-companion candidates -> read binary_masses table

In [10]:
# query archive
root_data_dir = '/media/team_workspaces/Gaia-Orbit-Classification/jsahlmann/data'
gaia_data_release = 'gaiadr3'
input_list_name = 'nss'
data_dir = os.path.join(root_data_dir, gaia_data_release)
plot_dir = os.path.join(data_dir, 'figures', input_list_name)

gaia_table_name='binary_masses'
analysis_dataset_name = '{}_all'.format(gaia_table_name)

binary_masses = get_gaiadr_data(analysis_dataset_name, data_dir, gaia_data_release=gaia_data_release, gaia_table_name=gaia_table_name)

Retrieved 195315 rows from gaiadr3.binary_masses


In [11]:
np.int64(float(np.int64(5773484949857279104)))

5773484949857278976

In [12]:
if 0:
    binary_masses[binary_masses['source_id'].isin([5773484949857279104, 2171489736355655680, 5323844651848467968])]
    # nss_selected[nss_selected['source_id'].isin([5773484949857279104, 2171489736355655680, 5323844651848467968])]

In [13]:
binary_masses

Unnamed: 0,source_id,m1,m1_lower,m1_upper,m2,m2_lower,m2_upper,fluxratio,fluxratio_lower,fluxratio_upper,combination_method,m1_ref,flag
0,5440671679301441664,0.861337,0.736825,0.915485,,0.325583,1.101632,,0.00,0.43,Orbital+M1,IsocLum,
1,5440672572654653696,0.644935,0.492264,0.694710,,0.230318,0.834767,,0.21,0.98,Orbital+M1,IsocLum,
2,5440681162589284352,0.722400,0.630926,0.772952,,0.296844,0.896350,,0.00,0.39,Orbital+M1,IsocLum,
3,5440752493405629568,0.884711,0.806696,0.941229,,0.338917,1.013381,,0.00,0.38,Orbital+M1,IsocLum,
4,5440794412286244224,0.801557,0.751420,0.851694,,0.660394,0.719544,,0.04,,Orbital+M1,IsocLum,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
195310,5440652162969615360,1.344820,1.241583,1.398992,,0.644598,1.331231,,0.00,0.23,Orbital+M1,IsocLum,
195311,5440658313363001216,1.556945,1.404454,1.612562,,0.596142,1.682539,,0.00,0.36,Orbital+M1,IsocLum,
195312,5440663398604297216,0.765894,0.630354,0.818681,,0.241529,1.164911,,0.00,0.59,Orbital+M1,IsocLum,
195313,5440667659212053760,0.893303,0.809744,0.947392,,0.429710,1.520896,,0.00,0.40,Orbital+M1,IsocLum,


In [14]:
# nss_selected.query('source_id==1916454200349735680').iloc[0]

In [15]:
masses_selected = binary_masses[binary_masses['source_id'].isin(nss_selected['source_id'])]
print(len(masses_selected))

131142


# We have to clean the binary_masses table because it contains duplicate source_ids with different mass estimates
these come from different combination_methods

In [16]:
def is_astrometric_mass_estimate(x):
    col = 'combination_method'
    # if 'SB2' in x[col]:
    if 'SB2' in x:
        return False
    elif 'Eclipsing' in x:
    # elif 'EclipsingSpectro' in x[col]:
        return False
    return True

print(len(masses_selected))
masses_selected = masses_selected[masses_selected['combination_method'].apply(is_astrometric_mass_estimate)]
print(len(masses_selected))


131142
131037


In [17]:
binary_masses = masses_selected
assert (len(binary_masses)-binary_masses['source_id'].nunique()) == 0

In [18]:
binary_masses.query('source_id==1916454200349735680')

Unnamed: 0,source_id,m1,m1_lower,m1_upper,m2,m2_lower,m2_upper,fluxratio,fluxratio_lower,fluxratio_upper,combination_method,m1_ref,flag
78007,1916454200349735680,0.629258,0.48012,0.679035,,0.14832,0.631463,,0.22,1.0,Orbital+M1,IsocLum,


In [19]:
tmp = binary_masses[['source_id']].value_counts().rename_axis('source_id').reset_index(name='counts')
binary_masses[binary_masses['source_id'].isin(tmp[tmp['counts'] > 1]['source_id'])].sort_values('source_id')[['source_id', 'm1', 'combination_method']]

Unnamed: 0,source_id,m1,combination_method


In [20]:
binary_masses['combination_method'].value_counts()

combination_method
Orbital+M1            111792
AstroSpectroSB1+M1     17578
Orbital+SB1+M1          1513
SB1+M1                   154
Name: count, dtype: int64

# Add mass information to NSS solution table

In [21]:
# set primary mass
bm_cols = ['source_id', 'm1', 'm2_lower', 'm2_upper', 'm2', 'combination_method']
logging.info(len(nss_selected))
nss_selected_with_primary_mass_from_binary_masses = nss_selected.merge(binary_masses[bm_cols], on='source_id', how='inner')
logging.info(len(nss_selected_with_primary_mass_from_binary_masses))
print(f"number of replicates: {len(nss_selected_with_primary_mass_from_binary_masses) - nss_selected_with_primary_mass_from_binary_masses['source_id'].nunique()}")


INFO:root:169129
INFO:root:131037


number of replicates: 0


In [22]:
binary_masses.columns

Index(['source_id', 'm1', 'm1_lower', 'm1_upper', 'm2', 'm2_lower', 'm2_upper',
       'fluxratio', 'fluxratio_lower', 'fluxratio_upper', 'combination_method',
       'm1_ref', 'flag'],
      dtype='object')

# Solutions without mass estimate

In [23]:
nss_selected_without_primary_mass_from_binary_masses = nss_selected[~nss_selected['source_id'].isin(nss_selected_with_primary_mass_from_binary_masses['source_id'])]
nss_selected_without_primary_mass_from_binary_masses['nss_solution_type'].value_counts()

nss_solution_type
Orbital                           24513
AstroSpectroSB1                   13254
OrbitalAlternative                  162
OrbitalTargetedSearch               111
OrbitalTargetedSearchValidated       42
OrbitalAlternativeValidated          10
Name: count, dtype: int64

# Substellar companion candidates

In [24]:
# nss_selected_with_primary_mass_from_binary_masses.columns

In [25]:
# nss_selected_with_primary_mass_from_binary_masses[nss_selected_with_primary_mass_from_binary_masses['m2_upper'] < 0.08][['source_id', 'm2_lower', 'm2_upper']]

In [26]:
nss_selected_with_primary_mass_from_binary_masses['m2_MS_estimate_lower'] = nss_selected_with_primary_mass_from_binary_masses['m2_lower']
nss_selected_with_primary_mass_from_binary_masses['m2_MS_estimate_upper'] = nss_selected_with_primary_mass_from_binary_masses['m2_upper']


substellar_companion_candidates = nss_selected_with_primary_mass_from_binary_masses.query('m2_MS_estimate_lower < 0.08')
better_substellar_companion_candidates = nss_selected_with_primary_mass_from_binary_masses.query('m2_MS_estimate_upper < 0.08')
print(len(substellar_companion_candidates))
print(len(better_substellar_companion_candidates))

1839
30


In [27]:
# substellar_companion_candidates['source_id'].value_counts()

In [28]:
# add to labelled data
df = pd.DataFrame()
reference = '2023A%26A...674A..34G'
label = 'substellar_companion_candidates'

substellar_companion_candidates_non_duplicated = substellar_companion_candidates[~substellar_companion_candidates['source_id'].isin(labelled_sources['source_id'])]


df['source_id'] = substellar_companion_candidates_non_duplicated['source_id']
df['id'] = df['source_id'].astype(str)
df['reference'] = reference
df['label'] = label


for sid in df['source_id']:
    if sid in better_substellar_companion_candidates['source_id'].values:
        df.loc[df['source_id']==sid, 'label'] = 'better_substellar_companion_candidates'

substellar_candidates = df.copy()
        

substellar_candidates['label'].value_counts()



label
substellar_companion_candidates           1792
better_substellar_companion_candidates      27
Name: count, dtype: int64

In [29]:
# better_substellar_companion_candidates['source_id']

# make sure we have a unique list

In [30]:
assert (labelled_sources['source_id'].value_counts()!=1).sum() == 0

In [31]:
# labelled_sources['source_id'].value_counts()

In [32]:
# df[df['source_id'].isin(substellar_companion_candidates_non_duplicated['source_id'])]

# Plot colour-magnitude diagram

In [33]:
if 0:
    sel1 = substellar_companion_candidates
    assert sel1['source_id'].nunique() == len(sel1)


    gaia_data_release = 'gaiadr3'
    source_id_array = sel1['source_id']
    gaia_table_name = 'gaia_source'

    nss_source_sel1 = get_gaiadr_data('dr3b', data_dir, source_id_array, gaia_data_release, gaia_table_name=gaia_table_name, overwrite_query=False)

    sel1 = sel1.merge(nss_source_sel1.drop_duplicates(subset='source_id', keep="first"), left_on='source_id', right_on='source_id', suffixes=('', '_gaia_source'))
    colour_by = 'bp_rp'
    # colour_by = 'mass_function_msun'
    norm = mp.colors.LogNorm()
    colormap='rainbow'

    fig = pl.figure()
    ax = pl.gca()

    sel1.nss.plot_cmd(title='Substellar-companion candidates', c=colour_by, colormap=colormap, ax=ax)
    # sel1.nss.plot_cmd()


    ax = pl.gca()
    sel3 = nss_selected[nss_selected['source_id'].isin(labelled_sources[labelled_sources['label'].isin(['exoplanet', 'brown_dwarf_companion'])]['source_id'])]
    sel3.nss.add_absolute_magnitude()
    sel3.nss.plot_cmd(title='Substellar-companion candidates', c='k', ax=ax)


    sel4 = nss_selected[nss_selected['source_id'].isin(labelled_sources[labelled_sources['label'].isin(['better_substellar_companion_candidates'])]['source_id'])]
    sel4.nss.add_absolute_magnitude()

    # sel3.plot('bp_rp', 'absolute_phot_g_mean_mag', kind='scatter', ax=ax, c='k')
    # pl.show()

In [34]:
if 0:
    sel2 = nss_selected_without_primary_mass_from_binary_masses
    print(sel2['source_id'].nunique())


    gaia_data_release = 'gaiadr3'
    source_id_array = sel2['source_id']
    gaia_table_name = 'gaia_source'

    nss_source_sel2 = get_gaiadr_data('dr3c', data_dir, source_id_array, gaia_data_release, gaia_table_name=gaia_table_name, overwrite_query=False)

    sel2 = sel2.merge(nss_source_sel2.drop_duplicates(subset='source_id', keep="first"), left_on='source_id', right_on='source_id', suffixes=('', '_gaia_source'))
    colour_by = 'bp_rp'
    norm = mp.colors.LogNorm()
    colormap='rainbow'

    cutoff = 1e-2
    sel2[sel2['mass_function_msun']<cutoff].nss.plot_cmd(title=f'Solutions without mass estimate and f(M)<{cutoff}', c=colour_by, colormap=colormap)
    # sel1.nss.plot_cmd()

In [35]:
if 0:
    fig = pl.figure()
    ax = pl.gca()

    sel2 = sel2.nss.add_absolute_magnitude()
    sel4 = pd.concat([sel1, sel2[sel2['mass_function_msun']<cutoff]]).reset_index(drop=True)

    sel4.nss.plot_cmd(title='Substellar-companion candidates', c=colour_by, colormap=colormap, ax=ax)
    # sel2[sel2['mass_function_msun']<cutoff].nss.plot_cmd(title=f'Solutions without mass estimate and f(M)<{cutoff}', c=colour_by, colormap=colormap, ax=ax)


In [36]:

if 0:

    fig = pl.figure()
    ax = pl.gca()
    col = 'mass_function_msun'
    # col = 'phot_g_mean_mag'
    col = 'absolute_phot_g_mean_mag'
    bins = np.logspace(-5,1,100)
    logx = True

    if 'phot_g_mean_mag' in col:
        bins = np.linspace(-5,20,100)
        logx=False
    sel1[col].plot(kind='hist', histtype='step', logx=logx, bins=bins, lw=2, ax=ax, log=True, label='Substellar-companion candidates', color='k')
    sel2[col].plot(kind='hist', histtype='step', logx=logx, bins=bins, lw=2, ax=ax, log=True, label='Solutions without mass estimate', color='g')
    nss_selected[col].plot(kind='hist', histtype='step', logx=logx, bins=bins, lw=2, ax=ax, log=True, label='all solutions', color='0.7')

    sel3 = nss_selected[nss_selected['source_id'].isin(labelled_sources[labelled_sources['label'].isin(['exoplanet', 'brown_dwarf_companion'])]['source_id'])]
    sel3[col].plot(kind='hist', histtype='step', logx=logx, bins=bins, lw=2, ax=ax, log=True, label='Exoplanets & BD companions', color='b')
    sel4 = nss_selected[nss_selected['source_id'].isin(labelled_sources[labelled_sources['label'].isin(['better_substellar_companion_candidates'])]['source_id'])]
    sel4[col].plot(kind='hist', histtype='step', logx=logx, bins=bins, lw=2, ax=ax, log=True, label='better Substellar-companion candidates', color='r')


    ax.set_xlabel(col)
    pl.legend(loc=2)
    pl.show()

In [37]:
# col = 'mass_function_msun'

# sel2[sel2[col] < 1e-2]

In [38]:
# sel3

# ruled-out substellar candidates that actually are binary stars

In [39]:
df = pd.DataFrame()

reference = '2023AJ....165..266M'  # Marcussen & Albrecht

sids = """1916454200349735680
2052469973468984192
5122670101678217728"""
label = 'binary_star' # i.e. confirmed binary stars

df['source_id'] = np.array(sids.split('\n')).astype(np.int64)
df['id'] = ['RX J2317.5+3700', 'Ross 1063', 'HD12357'] 
df['reference'] = reference
df['label'] = label


df.loc[len(df)] = [1509952656931878272, 'HIP67526', '2023MNRAS.526.5155S', label]
df.loc[len(df)] = [2843794470563210496, 'BD+244697', '2023MNRAS.526.5155S', label]
df.loc[len(df)] = [691524682806651776, 'BD+261888', '2023MNRAS.526.5155S', 'very_low_mass_stellar_companion']
df.loc[len(df)] = [273059284652294528, 'HD30339', '2023MNRAS.526.5155S', 'very_low_mass_stellar_companion']
 
df.loc[len(df)] = [1572914025633785728, 'HIP59432', '2023MNRAS.526.5155S', label]
df.loc[len(df)] = [4594158089392172928, 'HD160508', '2023MNRAS.526.5155S', label]

df.loc[len(df)] = [6672349380737067904, 'HD191760', '2023MNRAS.526.5155S', 'very_low_mass_stellar_companion']

df.loc[len(df)] = [3139814096324426112 , 'TYC 0173-02410-1', '2023MNRAS.526.5155S', label]
df.loc[len(df)] = [1510226916364001024 , 'GSC 03467-00030', '2023MNRAS.526.5155S', label]
     
df.loc[len(df)] = [5266148569447305600, 'HD42936', '2020NatAs...4..419B', 'very_low_mass_stellar_companion']    # AstroSpectroSB1
df.loc[len(df)] = [2047188847334279424, 'HD185501', '2020AJ....159..233H', label]    # AstroSpectroSB1
    
    
    

binary_stars = df.copy()

binary_stars
# labelled_sources = pd.concat([labelled_sources, df]).reset_index(drop=True)
# labelled_sources

Unnamed: 0,source_id,id,reference,label
0,1916454200349735680,RX J2317.5+3700,2023AJ....165..266M,binary_star
1,2052469973468984192,Ross 1063,2023AJ....165..266M,binary_star
2,5122670101678217728,HD12357,2023AJ....165..266M,binary_star
3,1509952656931878272,HIP67526,2023MNRAS.526.5155S,binary_star
4,2843794470563210496,BD+244697,2023MNRAS.526.5155S,binary_star
5,691524682806651776,BD+261888,2023MNRAS.526.5155S,very_low_mass_stellar_companion
6,273059284652294528,HD30339,2023MNRAS.526.5155S,very_low_mass_stellar_companion
7,1572914025633785728,HIP59432,2023MNRAS.526.5155S,binary_star
8,4594158089392172928,HD160508,2023MNRAS.526.5155S,binary_star
9,6672349380737067904,HD191760,2023MNRAS.526.5155S,very_low_mass_stellar_companion


In [40]:
binary_stars_labelled = binary_stars[binary_stars['source_id'].isin(substellar_candidates['source_id'])]


In [41]:
labelled_sources = pd.concat([labelled_sources, binary_stars_labelled]).reset_index(drop=True)

In [42]:
binary_stars[binary_stars['source_id'].isin(nss_selected_without_primary_mass_from_binary_masses['source_id'])]

Unnamed: 0,source_id,id,reference,label
8,4594158089392172928,HD160508,2023MNRAS.526.5155S,binary_star
9,6672349380737067904,HD191760,2023MNRAS.526.5155S,very_low_mass_stellar_companion


In [43]:
very_low_mass_stellar_companions = unger23[(unger23['source_id'].isin(substellar_candidates['source_id'])) & (~unger23['label'].isin(['none']))]
display(very_low_mass_stellar_companions)
labelled_sources = pd.concat([labelled_sources, very_low_mass_stellar_companions]).reset_index(drop=True)

Unnamed: 0,id,source_id,reference,label
22,HD140913,1224551770875466496,2023AA...680A..16U,very_low_mass_stellar_companion
27,HD17155,4753355209745022208,2023AA...680A..16U,very_low_mass_stellar_companion


In [44]:

substellar_candidates_labelled = substellar_candidates[~substellar_candidates['source_id'].isin(labelled_sources['source_id'])]

labelled_sources = pd.concat([labelled_sources, substellar_candidates_labelled]).reset_index(drop=True)


# Write labelled list to file

In [45]:
labelled_sources['source_id'].value_counts()

source_id
6421118739093252224    1
1990171816426178304    1
1980553185643943552    1
2058368058774622464    1
4188152322648398592    1
                      ..
4836664278068407680    1
4835956295659534720    1
4835053149936636288    1
4833052859111291904    1
4610670898614672512    1
Name: count, Length: 1845, dtype: int64

In [46]:
labelled_sources

Unnamed: 0,source_id,reference,label,id
0,6421118739093252224,2023AA...674A..34G,exoplanet,HD175167
1,4062446910648807168,2023AA...674A..34G,exoplanet,HD164604
2,1594127865540229888,2023AA...674A..34G,exoplanet,HD132406
3,4745373133284418816,2023AA...674A..34G,exoplanet,HR810
4,2367734656180397952,2023AA...674A..34G,exoplanet,BD−170063
...,...,...,...,...
1840,6253141880955998848,2023A%26A...674A..34G,substellar_companion_candidates,6253141880955998848
1841,1175731663601464064,2023A%26A...674A..34G,substellar_companion_candidates,1175731663601464064
1842,4609952707067682432,2023A%26A...674A..34G,substellar_companion_candidates,4609952707067682432
1843,4610358980909706496,2023A%26A...674A..34G,substellar_companion_candidates,4610358980909706496


In [47]:
labelled_sources[labelled_sources['source_id']==1224551770875466496]

Unnamed: 0,source_id,reference,label,id
30,1224551770875466496,2023AA...680A..16U,very_low_mass_stellar_companion,HD140913


In [48]:
if 1:
    outfile = os.path.join(data_path, 'labelled_sources.parquet')

    assert len(labelled_sources) == labelled_sources['source_id'].nunique()

    if overwrite_labelled_list or (os.path.isfile(outfile) is False):    
        labelled_sources.to_parquet(outfile, index=False)
        logging.info(f"Wrote {len(labelled_sources)} rows to  {outfile}")


In [49]:
display(labelled_sources['label'].value_counts())

label
substellar_companion_candidates           1787
better_substellar_companion_candidates      26
brown_dwarf_companion                       14
exoplanet                                   10
binary_star                                  3
very_low_mass_stellar_companion              3
false_positive_orbit                         2
Name: count, dtype: int64

In [50]:
len(labelled_sources[labelled_sources['label'].isin(['substellar_companion_candidates', 'better_substellar_companion_candidates','brown_dwarf_companion', 'exoplanet', 'very_low_mass_stellar_companion', 'binary_star'])])

1843

In [51]:
print(labelled_sources['label'].value_counts().to_latex())

\begin{tabular}{lr}
\toprule
 & count \\
label &  \\
\midrule
substellar_companion_candidates & 1787 \\
better_substellar_companion_candidates & 26 \\
brown_dwarf_companion & 14 \\
exoplanet & 10 \\
binary_star & 3 \\
very_low_mass_stellar_companion & 3 \\
false_positive_orbit & 2 \\
\bottomrule
\end{tabular}



In [52]:
len(labelled_sources)

1845

# for paper

In [53]:
lbl1 = labelled_sources[~labelled_sources['label'].isin(['substellar_companion_candidates', 'better_substellar_companion_candidates'])]
lbl1 = lbl1.sort_values('label').rename(columns={'source_id': 'Gaia DR3 source\_id', 'id': 'Name'})
lbl1['Reference'] = [f"\cite{{{s}}}" for s in lbl1['reference']]
lbl1['label'] = lbl1['label'].replace({'binary_star': '\\bs', 'brown_dwarf_companion': '\\bdc', 'exoplanet': '\\exoplanet', 'false_positive_orbit': '\\fpo', 'very_low_mass_stellar_companion': '\\vlmsc'})
display(lbl1)
cols = ['Gaia DR3 source\_id', 'Name', 'label', 'Reference']
print(lbl1[cols].to_latex(index=False))

Unnamed: 0,Gaia DR3 source\_id,reference,label,Name,Reference
29,2047188847334279424,2020AJ....159..233H,\bs,HD185501,\cite{2020AJ....159..233H}
27,5122670101678217728,2023AJ....165..266M,\bs,HD12357,\cite{2023AJ....165..266M}
26,2052469973468984192,2023AJ....165..266M,\bs,Ross 1063,\cite{2023AJ....165..266M}
15,685029558383335168,2023AA...674A..34G,\bdc,HD77065,\cite{2023AA...674A..34G}
14,3751763647996317056,2023AA...674A..34G,\bdc,HD89707,\cite{2023AA...674A..34G}
13,3750881083756656128,2023AA...674A..34G,\bdc,HD91669,\cite{2023AA...674A..34G}
12,3309006602007842048,2023AA...674A..34G,\bdc,HD30246,\cite{2023AA...674A..34G}
11,2778298280881817984,2023AA...674A..34G,\bdc,HD5433,\cite{2023AA...674A..34G}
10,2651390587219807744,2023AA...674A..34G,\bdc,BD−004475,\cite{2023AA...674A..34G}
19,5563001178343925376,2023AA...674A..34G,\bdc,HD52756,\cite{2023AA...674A..34G}


\begin{tabular}{rlll}
\toprule
Gaia DR3 source\_id & Name & label & Reference \\
\midrule
2047188847334279424 & HD185501 & \bs & \cite{2020AJ....159..233H} \\
5122670101678217728 & HD12357 & \bs & \cite{2023AJ....165..266M} \\
2052469973468984192 & Ross 1063 & \bs & \cite{2023AJ....165..266M} \\
685029558383335168 & HD77065 & \bdc & \cite{2023AA...674A..34G} \\
3751763647996317056 & HD89707 & \bdc & \cite{2023AA...674A..34G} \\
3750881083756656128 & HD91669 & \bdc & \cite{2023AA...674A..34G} \\
3309006602007842048 & HD30246 & \bdc & \cite{2023AA...674A..34G} \\
2778298280881817984 & HD5433   & \bdc & \cite{2023AA...674A..34G} \\
2651390587219807744 & BD−004475 & \bdc & \cite{2023AA...674A..34G} \\
5563001178343925376 & HD52756 & \bdc & \cite{2023AA...674A..34G} \\
824461960796102528 & HD82460 & \bdc & \cite{2023AA...674A..34G} \\
43574131143039104 & LHS1610 & \bdc & \cite{2023arXiv231007827F} \\
1035000055055287680 & HD68638A & \bdc & \cite{2023AA...680A..16U} \\
5999024986946599808 & 