In [1]:
import sys
sys.path.append('../../scripts/')
from query import *
import pandas as pd

# Locations of the two input spreadsheets, relative to this notebook.
ralf_targets_path = '../../data/Ralf/2ES_targetlist_astrid_export_2024Nov_comments.xlsx'
combined_query_path = '../../results/combined_query_with_mass_detection_limit.xlsx'

# Identifier columns are loaded as text rather than numbers
# (presumably so the long Gaia source IDs keep every digit -- TODO confirm).
id_columns_as_text = {
    column: str
    for column in ('source_id', 'source_id_dr2', 'source_id_dr3', 'HIP Number')
}

# header=1: the column names are taken from the second row of the sheet.
df_Ralf = pd.read_excel(ralf_targets_path, header=1, engine='openpyxl')
merged_df = pd.read_excel(combined_query_path, dtype=id_columns_as_text)

# Preview the first rows of each table.
for frame in (df_Ralf, merged_df):
    display(frame.head())

"Important:  Gaia archive will be intermittently unavailable due to scheduled maintenance on 10-12-2024 from 08:00 to 10:00 (CET)"


Unnamed: 0,star_ID,prio,sys,rad,ram,ras,ded,dem,des,dis,...,i_dd_sig,RV_Prec(390-870) 10m,RV_Prec(390-870) 30m,HZ Rin,HZ Rout,P(HZ),mdl(hz) 10min,mdl(hz) 30min,Notes RL,Notes MH
0,HD224953A,3,Multiple system,0,2,8.728,-68,16,50.751,15.195652,...,-999.0,0.66,0.38,0.27,0.39,90.14,9.75,5.61,Close stellar comp.,
1,HD55,0,Unknown system,0,5,17.689,-67,49,57.323,16.391723,...,-999.0,0.48,0.27,0.36,0.52,126.51,8.98,5.05,-999,Big jump from pre to post upgrade datasets. Ta...
2,HD693,2,Unknown system,0,11,15.858,-15,28,4.72,18.886134,...,-999.0,1.19,0.69,1.64,2.36,923.99,62.99,36.52,mdl(hz) > 10mE,
3,HD739,2,Unknown system,0,11,44.021,-35,7,59.213,21.719064,...,-999.0,2.36,1.36,1.66,2.39,926.66,127.69,73.59,mdl(hz) > 10mE,
4,HD1237,3,Unknown system,0,16,12.678,-79,51,4.245,17.497813,...,-999.0,0.27,0.15,0.76,1.1,339.67,8.41,4.67,"Warm Jupiter at 134d, PMS star <100Myr","One known planet, b at 133.7d, 3.37 MJ (Naef e..."


Unnamed: 0,source_id,source_id_dr2,source_id_dr3,RA,DEC,V_mag,Phot G Mean Mag,Phot BP Mean Mag,Phot RP Mean Mag,BP-RP,...,Radius [R_Sun],HZ_limit [AU],RV precision [m/s],HZ Detection Limit [M_Earth],Spectral Type,HD Number,GJ Number,HIP Number,Object Type,HZ Detection Limit Simplified [Earth Mass]
0,22269508511466624,22269508511466624,22269508511466624,41.482248,10.666519,12.687016,11.891757,12.94054,10.87849,2.06205,...,0.659257,0.259182,0.912258,3.888545,M,,,,HighPM*,3.888505
1,1173206811240350592,1173206811240350592,1173206811240350592,221.961398,7.416476,11.617188,10.907482,11.867918,9.918146,1.949772,...,0.87265,0.343274,0.557978,2.912921,M,,,,HighPM*,2.912901
2,6349915534307041152,6349915534307041152,6349915534307041152,312.940241,-79.31548,11.810019,10.907413,12.067825,9.833761,2.234064,...,0.367781,0.144694,0.610223,1.535877,M,,GJ 808,,HighPM*,1.535867
3,2683023811628007296,2683023811628007296,2683023811628007296,330.540802,1.398992,9.136045,8.403359,9.387209,7.420015,1.967194,...,0.702667,0.276365,0.178149,0.795958,M,HD 209290,GJ 846,108782.0,HighPM*,0.795956
4,3919427007885527680,3919427007885527680,3919427007885527680,179.383579,11.82881,11.797612,10.899825,12.05527,9.827121,2.228149,...,0.548498,0.21799,0.608185,2.008597,M,,GJ 3695,58321.0,HighPM*,2.008582


Merge

In [2]:
# ---------------------------------------------------------------- #
#  Helpers
# ---------------------------------------------------------------- #
def _split_pair(names, pattern, repl):
    """Split a 'A, B' designation column into two whitespace-normalised Series.

    Parameters
    ----------
    names : pd.Series
        Text column whose entries hold one designation or two separated by ', '.
    pattern, repl : str
        Regular expression and replacement applied to both halves.

    Returns
    -------
    (pd.Series, pd.Series)
        First designation (missing values stay missing) and second designation
        (missing values become '').
    """
    parts = (
        names.str.split(', ', n=1, expand=True)
        # If no entry contains ', ' the split yields a single column; reindex
        # guarantees that both columns exist so the assignment cannot fail.
        .reindex(columns=[0, 1])
        .astype(object)
    )
    primary = parts[0].str.replace(pattern, repl, regex=True)
    secondary = parts[1].fillna('').str.replace(pattern, repl, regex=True)
    return primary, secondary


def _format_hip(value):
    """Return a 'HIP<number>' designation; missing or empty values pass through.

    A trailing '.0' (e.g. '108782.0') is removed and an existing 'HIP' prefix is
    not duplicated, so re-running this cell leaves the column unchanged.
    """
    if pd.isna(value) or value == '':
        return value
    number = str(value).strip()
    if number.upper().startswith('HIP'):
        number = number[3:].strip()
    if number.endswith('.0'):
        number = number[:-2]
    if not number:
        return value
    return f'HIP{number}'


# ---------------------------------------------------------------- #
#  HD
# ---------------------------------------------------------------- #
# Split 'HD Number' into 'HD Number 1' / 'HD Number 2' and drop the
# whitespace after 'HD' ('HD 209290' -> 'HD209290').
merged_df['HD Number 1'], merged_df['HD Number 2'] = _split_pair(
    merged_df['HD Number'], r'HD\s+', 'HD'
)

# ---------------------------------------------------------------- #
#  HIP
# ---------------------------------------------------------------- #
# Values read from the spreadsheet can look like '108782.0'; normalise them
# to 'HIP108782' so they can match the identifiers in 'star_ID  '.
merged_df['HIP Number'] = merged_df['HIP Number'].apply(_format_hip)

# ---------------------------------------------------------------- #
#  GJ
# ---------------------------------------------------------------- #
# Split 'GJ Number' into 'GJ Number 1' / 'GJ Number 2' and remove all
# whitespace ('GJ 846' -> 'GJ846').
merged_df['GJ Number 1'], merged_df['GJ Number 2'] = _split_pair(
    merged_df['GJ Number'], r'\s+', ''
)

# ---------------------------------------------------------------- #
#  Merge
# ---------------------------------------------------------------- #
# One left merge per identifier column, stacked on top of each other.
merge_keys = ['HD Number 1', 'HD Number 2', 'HIP Number', 'GJ Number 1', 'GJ Number 2']
merged_RJ = (
    pd.concat(
        [
            df_Ralf.merge(merged_df, left_on='star_ID  ', right_on=key, how='left')
            for key in merge_keys
        ]
    )
    # Descending sort places rows without a 'source_id' (no match) last ...
    .sort_values(by='source_id', ascending=False)
    # ... so keeping the first row per star prefers a matched row.
    .drop_duplicates(subset='star_ID  ', keep='first')
    .reset_index(drop=True)
)

# Save the final DataFrame to an Excel file
filename = '../../results/merged_RJ.xlsx'
merged_RJ.to_excel(filename, index=False)
adjust_column_widths(filename)