In [1]:
import pandas as pd
import os

In [2]:
code_path = '/home/john/code/Astro/'
support_data_path = os.path.join(code_path, 'variable_star')
in_file = os.path.join(support_data_path, 'all_star_summary.csv')
const_file = os.path.join(support_data_path, 'constellation_names.txt')
const_target_file = os.path.join(support_data_path, 'constellation_names_targets.txt')
sequence_file = os.path.join(support_data_path,'baa_sequences.txt')
sequence_database_file = os.path.join(support_data_path,'baa_sequence_byclass.csv')

In [3]:
# get the constallation name abbreviations, to limit variable stars to sensible names
def load_const_pattern(filename):
    '''
    get the contellation names from text file
    so that only stars with a valid constellation name are returned
    '''
    f_const = open(filename)
    line = ' '
    const_abbr = []
    while line != '':
        line = f_const.readline()
        const_abbr.append(line.split('\t')[-1].replace('\n',''))
    const_abbr = const_abbr[2:]
    f_const.close()
    const_abbr_low = [ const.lower() for const in const_abbr ]
    pattern = '|'.join(const_abbr_low)
    pattern = pattern[:-1]

    return pattern

In [4]:
def load_seq_pattern():
    '''
    get the sequence files available on BAA and return as regexp pattern
    so that only stars for which there is a sequence file are returned.
    '''
    seq = pd.read_csv(sequence_file,delimiter='\t')
    obj = seq['Object Id']
    obj.describe()
    obs_lower = obj.str.lower()
    pattern = '|'.join(obs_lower)
    return pattern  

In [5]:
def load_baa_sequence_db(seq_db_file):
    '''
    Load the BAA star summary data, containing sequence info, variable type, RA, dec, period etc.
    '''
    db = pd.read_csv(seq_db_file)
    # clean nans in required fields
    db['Star'] = db['Star'].str.encode('ascii','ignore').str.decode('ascii')  #deal with non-ascii chars
    db['Con'] = db['Con'].str.encode('ascii','ignore').str.decode('ascii')  #deal with non-ascii chars
    db = db[db['Star']!='']  #empty cells
    db = db[db['Star'].isna()==False]
    db = db[db['Con'].isna()==False]
    db = db[db['Max'].isna()==False]
    db = db[db['Min'].isna()==False]    
    db = db[db['Chart'].isna()==False]  
    db['Star']=db['Star'].str.lower()
    db['Con']=db['Con'].str.lower()
    db['Full Name'] = db['Star'] + ' '+ db['Con']
    return db

In [6]:
seq_db = load_baa_sequence_db(sequence_database_file)
seq_db.describe()
seq_db.head()
seq_db_pattern = '|'.join(seq_db['Full Name'].values)

In [8]:
# read in BAA variable star summary data as a Panda Data Frame
vsdb_full = pd.read_csv(in_file)
vsdb_full['Variable Star'] = vsdb_full['Variable Star'].str.lower()
# load the string matching patterns for constellations and BAA sequence files for filtering later.
const_pattern = load_const_pattern(const_file)
const_target_pattern = load_const_pattern(const_target_file)
# old seq pattern - contains stars not in BAA list
#seq_pattern = load_seq_pattern()
vsdb_full.describe()

Unnamed: 0,Number of Observations,Number of Observers,Maximum Magnitude,Minimum Magnitude,Range
count,2645.0,2645.0,2645.0,2645.0,2645.0
mean,904.272968,9.827977,9.882658,11.994442,2.111784
std,3113.952786,25.811231,2.991841,3.413102,2.315482
min,1.0,1.0,-0.6,1.8,0.0
25%,2.0,1.0,7.6,9.4,0.1
50%,22.0,1.0,10.1,11.9,1.36
75%,298.0,4.0,12.0,14.8,3.3
max,59231.0,302.0,19.0,21.0,17.2


In [9]:
# Database filtering
# VALID CONSTELLATION: where the Variable Star column has where there are two fields in the name (e.g. 'TX Dra')
#     and the VS Constellation column matches a valid constellation
vsdb_allconst = vsdb_full
vsdb_allconst['Valid_nFields'] = vsdb_allconst['Variable Star'].str.count(' ')

#remove rows which don't have 1 nFields
vsdb_allconst.drop(vsdb_allconst[vsdb_allconst.Valid_nFields != 1].index, inplace=True)

# set the second field to be a 'Constellation' description.  Needs to be lower case to match pattern, above.
vs_const = vsdb_allconst['Variable Star'].str.split(expand=True)
vsdb_allconst['VS Constellation'] = vs_const[1].str.lower()

# check whether str.contains is working...
vsdb_allconst['VS Constellation'].str.contains(const_pattern)==True

# drop all rows where the str.contains() conditional statement 
vsdb_allconst.drop(vsdb_allconst[vsdb_allconst['VS Constellation'].str.contains(const_pattern)==False].index, inplace=True)
vsdb_allconst.describe()

Unnamed: 0,Number of Observations,Number of Observers,Maximum Magnitude,Minimum Magnitude,Range,Valid_nFields
count,2157.0,2157.0,2157.0,2157.0,2157.0,2157.0
mean,1021.777932,10.982383,9.598873,11.771966,2.173092,1.0
std,3361.628004,27.893377,2.901859,3.405555,2.368961,0.0
min,1.0,1.0,-0.6,1.8,0.0,1.0
25%,2.0,1.0,7.4,9.1,0.1,1.0
50%,21.0,1.0,9.9,11.4,1.4,1.0
75%,363.0,5.0,11.5,14.5,3.48,1.0
max,59231.0,302.0,18.9,21.0,17.2,1.0


In [10]:
# Now look at TARGET CONSTELLATIONS, but selecting the constallations that I want to observe
# create a pattern to search which is an OR of each element in const_abbr (i.e. any constallation abbreviation)
vsdb_target_const = vsdb_allconst
vsdb_target_const.drop(vsdb_target_const[vsdb_target_const['VS Constellation'].str.contains(const_target_pattern)==False].index, inplace=True)
vsdb_target_const.describe()

Unnamed: 0,Number of Observations,Number of Observers,Maximum Magnitude,Minimum Magnitude,Range,Valid_nFields
count,1872.0,1872.0,1872.0,1872.0,1872.0,1872.0
mean,1152.171474,12.19391,9.613996,11.924455,2.310459,1.0
std,3528.519585,29.106162,2.860561,3.405055,2.416486,0.0
min,1.0,1.0,-0.6,2.5,0.0,1.0
25%,2.0,1.0,7.5,9.4,0.1,1.0
50%,31.0,2.0,10.0,11.5,1.6,1.0
75%,569.75,6.0,11.4,14.7,3.8,1.0
max,59231.0,302.0,18.9,21.0,17.2,1.0


In [11]:
# SEQUENCE FILE EXISTS
# next, filter on the VS which are available as a sequence file from the BAA
vsdb_target_seq = vsdb_allconst
vsdb_target_seq.drop(vsdb_target_seq[vsdb_target_seq['Variable Star'].str.contains(seq_db_pattern)==False].index, inplace=True)
vsdb_target_seq.describe()
# filtering conditions on min magnitude (always visible with binoculars) and range (for SNR)
# based on the numbers in the output below.  
# range has 75% of values > v1.3, so not too significant on the number of targets
# min magnutude has only 25% of values >9.0, so more significant 

  vsdb_target_seq.drop(vsdb_target_seq[vsdb_target_seq['Variable Star'].str.contains(seq_db_pattern)==False].index, inplace=True)


Unnamed: 0,Number of Observations,Number of Observers,Maximum Magnitude,Minimum Magnitude,Range,Valid_nFields
count,586.0,586.0,586.0,586.0,586.0,586.0
mean,3182.226962,30.825939,8.704863,12.024573,3.31971,1.0
std,5690.546601,44.58995,2.766638,3.747934,2.514736,0.0
min,1.0,1.0,0.0,3.0,0.0,1.0
25%,78.25,2.0,6.7,8.9,1.4,1.0
50%,1023.5,9.0,8.525,11.53,2.7,1.0
75%,3928.5,48.0,10.8,15.3,5.075,1.0
max,59231.0,302.0,15.8,21.0,17.2,1.0


In [12]:
vsdb_target_seq.head()

Unnamed: 0,Variable Star,Number of Observations,Number of Observers,First Observation JD / UT,Latest Observation JD / UT,Maximum Magnitude,Minimum Magnitude,Range,Valid_nFields,VS Constellation
35,ab aur,10630,71,2441013.400000 2 Mar 1971 21:36:00.000000,2459906.392000 22 Nov 2022 21:24:28.800000,6.5,8.5,2.0,1,aur
36,ab cas,79,1,2450676.425000 15 Aug 1997 22:12:00.000000,2451432.502000 11 Sep 1999 00:02:52.800000,10.3,11.4,1.1,1,cas
38,ab dra,15104,59,2440402.290000 29 Jun 1969 18:57:36.000000,2459906.305000 22 Nov 2022 19:19:12.000000,10.1,16.8,6.7,1,dra
42,ac her,11311,87,2440391.471000 18 Jun 1969 23:18:14.400000,2459908.240000 24 Nov 2022 17:45:36.000000,6.57,9.1,2.53,1,her
50,ad per,3082,55,2440689.300000 12 Apr 1970 19:12:00.000000,2459908.260000 24 Nov 2022 18:14:24.000000,7.3,9.4,2.1,1,per


In [None]:
# start filtering on conditions
# first, range > v1.0
# this reduces the number of objects from ~915 to ~750
vsdb_targets = vsdb_target_seq[vsdb_target_seq['Range']>=1.0]
vsdb_targets.describe()

In [None]:
# filter on min magnitude to find suitable binocular targets
#  reduces targets to ~200
vsdb_targets = vsdb_targets[vsdb_targets['Minimum Magnitude']<=9.0]
vsdb_targets.describe()

In [None]:
vsdb_targets.hist('Number of Observations', bins=100)

In [None]:
vsdb_targets.sort_values(by='Number of Observations', ascending=True).head(30)
vsdb_targets.sort_values(by='VS Constellation', ascending=True).head(30)


In [None]:
vsdb_allconst[['VS Constellation']].value_counts()

In [None]:
# slicing via []
vsdb_allconst[vsdb_allconst['VS Constellation']=='cyg']

In [None]:
#  this conditional statement returns a pandas Series of booleans
vsdb_allconst['VS Constellation']=='cyg'

In [None]:
type(vsdb_allconst['VS Constellation']=='cyg')

In [None]:
# check the dtype. Note the () are needed to prevent dtype operating on 'cyg'
(vsdb_allconst['VS Constellation']=='cyg').dtype

In [None]:
# slicing via loc
# this is equivalent (I think!) to 
# vsdb[vsdb['VS Constellation']=='cyg']
vsdb_allconst.loc[vsdb_allconst['VS Constellation']=='cyg']

In [None]:
vsdb_allconst[vsdb_allconst['VS Constellation']=='cyg'].sort_values(by='Number of Observations', ascending=False)

In [None]:
vsdb_target_seq.sort_values(by='VS Constellation').head(30)

In [None]:
vsdb_targets['Number of Observations'].hist()

In [None]:
# generate a target list from the 25% least observed variables
# vs_target_list = vsdb_targets[vsdb_targets['Number of Observations']<=275]
# although probably better to try filtering on EXISTING baa sequence file first
# then possibly time since last obs?

In [None]:
vs_target_list.sort_values('VS Constellation')

In [None]:
vs_target_list['VS Constellation'].value_counts()