In [6]:
import sys
import os
import pandas as pd

sys.path.append(os.path.abspath('../')) 

from src.eclipsing_binary.config import get_paths

In [7]:
paths = get_paths()

In [32]:
ecl_col_specs = [(0, 19), (20, 26), (27, 34), (35, 46), (46, 58), (58, 64), (64, 70)]
ecl_names = ['object_name', 'I_magnitude', 'V_magnitude', 'period_days', 
                'epoch_of_minimum', 'main_eclipse_dip', 'second_eclipse_dip']
ecl_data_types = [str, float, float, float, float, float, float]
ecl_dict = dict(zip(ecl_names,ecl_data_types))
ecl_df = pd.read_fwf(paths['ecl_file'], colspecs=ecl_col_specs, names=ecl_names, header=None).astype(ecl_dict, errors='ignore')


In [35]:
ecl_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40204 entries, 0 to 40203
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   object_name         40204 non-null  object 
 1   I_magnitude         40204 non-null  float64
 2   V_magnitude         40204 non-null  object 
 3   period_days         40204 non-null  float64
 4   epoch_of_minimum    40204 non-null  float64
 5   main_eclipse_dip    40204 non-null  object 
 6   second_eclipse_dip  40204 non-null  object 
dtypes: float64(3), object(4)
memory usage: 2.1+ MB


In [19]:
ident_col_specs = [(0, 19), (20, 24), (25, 36), (37, 49), (50, 66), (67, 82), (83, 99), (100, 116)]
ident_names = ['object_name', 'type', 'RA_coord', 'DEC_coord', 'OGLE-IV', 'OGLE-III', 'OGLE-II', 'other_names']
ident_df = pd.read_fwf(paths['ident_file'], colspecs=ident_col_specs, names=ident_names, header=None)

In [49]:
ext_col_specs = [(2, 14), (15, 30), (31, 41), (42, 55), (56, 63), (65, 72), (73, 82), (83, 92), (93,102), (103,112), (113,121), (122,130), (131,138), (138,149), (150,161)]
ext_names = ['RA_deg', 'Dec_deg',  'RA_h', 'Dec_h', 'E(V-I)', '-sigma1', '+sigma2', '(V-I)_RC', '(V-I)_0', 'E(V-I)peak', 'E(V-I)sfd', 'box', 'sep', 'RA_coord', 'DEC_coord']
ext_df = pd.read_fwf(paths['ext_file'], colspecs=ext_col_specs, names=ext_names, header=3)
ext_df.drop(ext_df.tail(1).index, inplace=True)

In [50]:
ext_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40204 entries, 0 to 40203
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   RA_deg      40204 non-null  object 
 1   Dec_deg     40204 non-null  object 
 2   RA_h        40204 non-null  float64
 3   Dec_h       40204 non-null  float64
 4   E(V-I)      40204 non-null  float64
 5   -sigma1     40204 non-null  float64
 6   +sigma2     40204 non-null  float64
 7   (V-I)_RC    40204 non-null  float64
 8   (V-I)_0     40204 non-null  float64
 9   E(V-I)peak  40204 non-null  float64
 10  E(V-I)sfd   40204 non-null  float64
 11  box         40204 non-null  float64
 12  sep         40204 non-null  float64
 13  RA_coord    40204 non-null  object 
 14  DEC_coord   40204 non-null  object 
dtypes: float64(11), object(4)
memory usage: 4.6+ MB


In [53]:
df_merge = ecl_df.merge(ident_df, on='object_name', how='left')

In [54]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40204 entries, 0 to 40203
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   object_name         40204 non-null  object 
 1   I_magnitude         40204 non-null  float64
 2   V_magnitude         40204 non-null  object 
 3   period_days         40204 non-null  float64
 4   epoch_of_minimum    40204 non-null  float64
 5   main_eclipse_dip    40204 non-null  object 
 6   second_eclipse_dip  40204 non-null  object 
 7   type                40204 non-null  object 
 8   RA_coord            40204 non-null  object 
 9   DEC_coord           40204 non-null  object 
 10  OGLE-IV             40204 non-null  object 
 11  OGLE-III            40204 non-null  object 
 12  OGLE-II             40204 non-null  object 
 13  other_names         166 non-null    object 
dtypes: float64(3), object(11)
memory usage: 4.3+ MB


In [55]:
df_merge_plus = df_merge.merge(ext_df, on=['RA_coord','DEC_coord'], how='left')

In [58]:
df_merge_plus.nunique()

object_name           40204
I_magnitude            5498
V_magnitude            6048
period_days           40185
epoch_of_minimum      27991
main_eclipse_dip       1956
second_eclipse_dip      986
type                      3
RA_coord              38469
DEC_coord             36478
OGLE-IV               38484
OGLE-III              31110
OGLE-II                 910
other_names             166
RA_deg                38469
Dec_deg               36478
RA_h                  38469
Dec_h                 36478
E(V-I)                  464
-sigma1                 264
+sigma2                 330
(V-I)_RC                479
(V-I)_0                  20
E(V-I)peak              475
E(V-I)sfd              3685
box                       9
sep                     453
dtype: int64

In [64]:
duplicate_rows = df_merge_plus[df_merge_plus.duplicated(subset=['RA_coord', 'DEC_coord'], keep=False)]
print(duplicate_rows)

              object_name  I_magnitude V_magnitude  period_days  \
21887  OGLE-LMC-ECL-22012       16.952      17.346     7.612962   
21888  OGLE-LMC-ECL-22012       16.952      17.346     7.612962   
36687  OGLE-LMC-ECL-36840       16.934      17.348     7.613053   
36688  OGLE-LMC-ECL-36840       16.934      17.348     7.613053   

       epoch_of_minimum main_eclipse_dip second_eclipse_dip type     RA_coord  \
21887         7001.5904            0.403              0.121   NC  05:40:08.97   
21888         7001.5904            0.403              0.121   NC  05:40:08.97   
36687         7001.6077            0.415              0.115   NC  05:40:08.97   
36688         7001.6077            0.415              0.115   NC  05:40:08.97   

         DEC_coord  ...      Dec_h E(V-I) -sigma1 +sigma2 (V-I)_RC (V-I)_0  \
21887  -70:37:17.9  ... -70.621639  0.326   0.128   0.156    1.212   0.886   
21888  -70:37:17.9  ... -70.621639  0.326   0.128   0.156    1.212   0.886   
36687  -70:37:17.9  ... 

In [65]:
df_merge_plus.drop_duplicates('object_name').info()

<class 'pandas.core.frame.DataFrame'>
Index: 40204 entries, 0 to 40205
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   object_name         40204 non-null  object 
 1   I_magnitude         40204 non-null  float64
 2   V_magnitude         40204 non-null  object 
 3   period_days         40204 non-null  float64
 4   epoch_of_minimum    40204 non-null  float64
 5   main_eclipse_dip    40204 non-null  object 
 6   second_eclipse_dip  40204 non-null  object 
 7   type                40204 non-null  object 
 8   RA_coord            40204 non-null  object 
 9   DEC_coord           40204 non-null  object 
 10  OGLE-IV             40204 non-null  object 
 11  OGLE-III            40204 non-null  object 
 12  OGLE-II             40204 non-null  object 
 13  other_names         166 non-null    object 
 14  RA_deg              40204 non-null  object 
 15  Dec_deg             40204 non-null  object 
 16  RA_h     