In [1]:
import pandas as pd
import os

In [2]:
# Base directory; the support data files live in its 'variable_star' sub-folder.
code_path = '/home/john/code/Astro/'
support_data_path = os.path.join(code_path, 'variable_star')

# Every input file sits directly in support_data_path, so build all paths in one pass.
(in_file,
 const_file,
 const_target_file,
 sequence_file,
 sequence_database_file) = (
    os.path.join(support_data_path, file_name)
    for file_name in (
        'all_star_summary.csv',
        'constellation_names.txt',
        'constellation_names_targets.txt',
        'baa_sequences.txt',
        'baa_sequence_byclass.csv',
    )
)

In [3]:
# get the constellation name abbreviations, to limit variable stars to sensible names
def load_const_pattern(filename):
    '''
    Build a regular-expression alternation of constellation abbreviations
    so that only stars with a valid constellation name are returned.

    The first two lines of the file are skipped.  For every remaining line the
    last tab-separated field is taken as the abbreviation, stripped of
    surrounding whitespace and lower-cased.  Blank entries are ignored, because
    an empty alternative in the pattern would match every string.

    Parameters
    ----------
    filename : str
        Path to the tab-separated text file of constellation names.

    Returns
    -------
    str
        Abbreviations joined with '|' (e.g. 'and|cyg'); an empty string when
        the file holds no abbreviations.
    '''
    # 'with' closes the file even if reading raises
    with open(filename) as f_const:
        lines = f_const.readlines()

    const_abbr_low = []
    for line in lines[2:]:
        abbr = line.split('\t')[-1].strip().lower()
        if abbr:
            const_abbr_low.append(abbr)

    return '|'.join(const_abbr_low)

In [4]:
def load_seq_pattern(filename=None):
    '''
    Get the sequence files available on BAA and return them as a regexp pattern
    so that only stars for which there is a sequence file are returned.

    Parameters
    ----------
    filename : str, optional
        Path to the tab-delimited sequence list, which must have an
        'Object Id' column.  Defaults to the module-level ``sequence_file``.

    Returns
    -------
    str
        Lower-cased object ids joined with '|'.  Missing or blank ids are
        skipped.  The ids are not regex-escaped.
    '''
    if filename is None:
        filename = sequence_file
    seq = pd.read_csv(filename, delimiter='\t')
    # drop missing ids first: '|'.join() raises TypeError on NaN
    object_ids = seq['Object Id'].dropna().astype(str).str.strip().str.lower()
    object_ids = object_ids[object_ids != '']
    return '|'.join(object_ids)

In [5]:
def load_baa_sequence_db(seq_db_file):
    '''
    Load the BAA star summary data, containing sequence info, variable type, RA, dec, period etc.

    Cleaning applied:
      * 'Star' and 'Con' have non-ascii characters removed, surrounding
        whitespace stripped and are lower-cased;
      * rows with a missing 'Star', 'Con', 'Max', 'Min' or 'Chart' value, or
        with an empty 'Star' after cleaning, are dropped;
      * a 'Full Name' column ('<star> <con>') is added.

    Parameters
    ----------
    seq_db_file : str
        Path to the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame; the original row index labels are kept.
    '''
    db = pd.read_csv(seq_db_file)

    for col in ('Star', 'Con'):
        db[col] = (
            db[col]
            .str.encode('ascii', 'ignore').str.decode('ascii')  # deal with non-ascii chars
            .str.strip()   # stray whitespace would stop 'Full Name' matching exactly
            .str.lower()
        )

    # drop rows with NaNs in the required fields
    db = db.dropna(subset=['Star', 'Con', 'Max', 'Min', 'Chart'])
    # empty cells; copy so adding a column below does not write to a slice
    db = db[db['Star'] != ''].copy()

    db['Full Name'] = db['Star'] + ' ' + db['Con']
    return db

In [6]:
# Load the cleaned BAA sequence database and OR together every full star name.
seq_db = load_baa_sequence_db(seq_db_file=sequence_database_file)
seq_db_pattern = '|'.join(seq_db['Full Name'])
seq_db.describe()

Unnamed: 0,Star,Con,RA,Dec,Type,Max,Min,Period,Chart,Class,Eclipse Duration,Full Name
count,520,520,520,520,440,520.0,520.0,410,520,520,76,520
unique,294,49,429,496,85,197.0,295.0,395,290,3,15,520
top,r,cyg,20 06,+17 54,SRb,7.7,7.7,158,AAVSO,pulsating,EB,r and
freq,20,46,4,3,97,12.0,9.0,2,160,241,17,1


In [7]:
# Keep every row whose Class is not 'eclipsing', then OR together the remaining full names.
seq_db_noeclipsing = seq_db.loc[seq_db['Class'].ne('eclipsing')]
seq_db_noeclipsing_pattern = '|'.join(seq_db_noeclipsing['Full Name'])
seq_db_noeclipsing.describe()

Unnamed: 0,Star,Con,RA,Dec,Type,Max,Min,Period,Chart,Class,Eclipse Duration,Full Name
count,444,444,444,444,440,444,444.0,334,444,444,0.0,444
unique,262,48,376,428,85,191,278.0,324,238,2,0.0,444
top,r,cyg,19 34,+17 54,SRb,7,8.6,120,AAVSO,pulsating,,r and
freq,20,39,4,3,97,11,6.0,2,160,241,,1


In [8]:
# Read the BAA variable star summary data into a pandas DataFrame, with the star names lower-cased
vsdb_full = pd.read_csv(in_file).assign(
    **{'Variable Star': lambda frame: frame['Variable Star'].str.lower()}
)
# Load the string-matching patterns (all constellations, then target constellations) for filtering later.
const_pattern, const_target_pattern = (
    load_const_pattern(path) for path in (const_file, const_target_file)
)
# old seq pattern - contains stars not in BAA list
#seq_pattern = load_seq_pattern()
vsdb_full.describe()

Unnamed: 0,Number of Observations,Number of Observers,Maximum Magnitude,Minimum Magnitude,Range
count,2645.0,2645.0,2645.0,2645.0,2645.0
mean,904.272968,9.827977,9.882658,11.994442,2.111784
std,3113.952786,25.811231,2.991841,3.413102,2.315482
min,1.0,1.0,-0.6,1.8,0.0
25%,2.0,1.0,7.6,9.4,0.1
50%,22.0,1.0,10.1,11.9,1.36
75%,298.0,4.0,12.0,14.8,3.3
max,59231.0,302.0,19.0,21.0,17.2


In [9]:
# Database filtering
# VALID CONSTELLATION: keep rows whose Variable Star name has two fields (e.g. 'TX Dra')
#     and whose second field matches a valid constellation abbreviation.
# Build a new frame rather than aliasing vsdb_full, so vsdb_full is not modified
# and this cell gives the same result every time it is run.
vsdb_allconst = vsdb_full.assign(
    Valid_nFields=vsdb_full['Variable Star'].str.count(' ')
)

# keep only rows with exactly one space in the name (missing names compare False and are removed)
vsdb_allconst = vsdb_allconst[vsdb_allconst['Valid_nFields'] == 1].copy()

# set the second field to be a 'Constellation' description.  Needs to be lower case to match the pattern.
vs_const = vsdb_allconst['Variable Star'].str.split(expand=True)
vsdb_allconst['VS Constellation'] = vs_const[1].str.lower()

# keep rows whose constellation field matches the pattern.
# NOTE: str.contains is a regex *search*, so an abbreviation appearing anywhere in the field counts.
# na=False treats a name with no second field as not matching, so it is removed.
is_valid_const = vsdb_allconst['VS Constellation'].str.contains(const_pattern, na=False)
vsdb_allconst = vsdb_allconst[is_valid_const].copy()
vsdb_allconst.describe()

Unnamed: 0,Number of Observations,Number of Observers,Maximum Magnitude,Minimum Magnitude,Range,Valid_nFields
count,2157.0,2157.0,2157.0,2157.0,2157.0,2157.0
mean,1021.777932,10.982383,9.598873,11.771966,2.173092,1.0
std,3361.628004,27.893377,2.901859,3.405555,2.368961,0.0
min,1.0,1.0,-0.6,1.8,0.0,1.0
25%,2.0,1.0,7.4,9.1,0.1,1.0
50%,21.0,1.0,9.9,11.4,1.4,1.0
75%,363.0,5.0,11.5,14.5,3.48,1.0
max,59231.0,302.0,18.9,21.0,17.2,1.0


In [10]:
# Cross-reference against the stars that exist in the VSS sequence lists:
# flag each Variable Star in vsdb_allconst that is present in seq_db, then count the flags.
# The majority AREN'T in the sequence lists (seq_db).
in_seq_db = vsdb_allconst['Variable Star'].isin(seq_db['Full Name'])
in_seq_db.value_counts()

False    1739
True      418
Name: Variable Star, dtype: int64

In [11]:
# Apply the cross-reference as a filter: keep only stars whose name appears in seq_db['Full Name'].
has_sequence = vsdb_allconst['Variable Star'].isin(seq_db['Full Name'])
vsdb_allconst = vsdb_allconst.loc[has_sequence]
vsdb_allconst.describe()

Unnamed: 0,Number of Observations,Number of Observers,Maximum Magnitude,Minimum Magnitude,Range,Valid_nFields
count,418.0,418.0,418.0,418.0,418.0,418.0
mean,4435.294258,42.392344,8.207823,12.260646,4.052823,1.0
std,6429.820994,48.796491,2.720266,3.8613,2.463371,0.0
min,1.0,1.0,0.0,3.0,0.0,1.0
25%,718.0,6.0,6.5,8.8,2.0,1.0
50%,2210.0,20.0,7.7,12.15,3.6,1.0
75%,5522.75,65.0,10.3,15.8,5.8225,1.0
max,59231.0,302.0,14.3,20.8,17.2,1.0


In [None]:
# TARGET CONSTELLATIONS: restrict to the constellations selected for observing.
# const_target_pattern is an OR of the target constellation abbreviations.
# NOTE: vsdb_target_const is an alias of vsdb_allconst, not a copy, so the
# in-place drop below removes the same rows from vsdb_allconst as well.
vsdb_target_const = vsdb_allconst
not_target = vsdb_target_const['VS Constellation'].str.contains(const_target_pattern) == False
vsdb_target_const.drop(index=vsdb_target_const.loc[not_target].index, inplace=True)
vsdb_target_const.describe()

In [None]:
# SEQUENCE FILE EXISTS
# next, filter on the VS which are available as a sequence file from the BAA,
# filtering out all eclipsing binaries.
# NOTE: vsdb_target_seq is an alias of vsdb_allconst, not a copy, so the
# in-place drop below removes the same rows from vsdb_allconst as well.
vsdb_target_seq = vsdb_allconst
# Exact name membership (as in the earlier isin cross-reference) instead of a regex
# substring search: a pattern such as 'r and' would also match 'rr and'.
is_non_eclipsing = vsdb_target_seq['Variable Star'].isin(seq_db_noeclipsing['Full Name'])
vsdb_target_seq.drop(vsdb_target_seq[~is_non_eclipsing].index, inplace=True)
vsdb_target_seq.describe()

In [None]:
# Summary of the cleaned BAA sequence database (all classes).
seq_db.describe()

In [None]:
# Summary of the sequence database with the 'eclipsing' class removed.
seq_db_noeclipsing.describe()

In [None]:
# Peek at the first rows of the filtered star list.
vsdb_target_seq.head()

In [None]:
# Start filtering on observing conditions.
#
# Two conditions are used: minimum magnitude (always visible with binoculars)
# and range (for SNR), chosen from the numbers in the describe() output.
#   - range has 75% of values > v1.3, so it is not too significant for the number of targets
#   - min magnitude has only 25% of values > 9.0, so it is more significant
#
# First condition: range >= 1.0
# (this reduces the number of objects from ~915 to ~750)
vsdb_targets = vsdb_target_seq.loc[vsdb_target_seq['Range'].ge(1.0)]
vsdb_targets.describe()

In [None]:
# Second condition: minimum magnitude <= 9.0, to find suitable binocular targets
#  (reduces targets to ~200)
vsdb_targets = vsdb_targets.loc[vsdb_targets['Minimum Magnitude'].le(9.0)]
vsdb_targets.describe()

In [None]:
# Distribution of the number of observations per target, in 100 bins.
vsdb_targets.hist(column='Number of Observations', bins=100)

In [None]:
# The 30 targets with the fewest observations (ascending sort is the default).
vsdb_targets.sort_values('Number of Observations').head(30)

In [None]:
# Count how many rows of vsdb_allconst fall in each 'VS Constellation' value.
vsdb_allconst[['VS Constellation']].value_counts()

In [None]:
# First 30 targets when ordered alphabetically by constellation abbreviation.
vsdb_targets.sort_values('VS Constellation').head(30)

In [None]:
# Boolean-mask selection: targets whose constellation is 'gem',
# ordered by the 'Latest Observation JD / UT' column.
in_gem = vsdb_targets['VS Constellation'].eq('gem')
vsdb_targets.loc[in_gem].sort_values('Latest Observation JD / UT')

In [None]:
# Sequence database entries whose constellation is 'gem', ordered by star designation.
seq_db.loc[seq_db['Con'].eq('gem')].sort_values('Star')

In [None]:
# check whether a star name is contained within the seq_db data frame
# (max of the per-row substring counts: 0 means the name does not occur).
# Printed explicitly, because a cell only displays its last expression.
print(seq_db['Full Name'].str.count('aw gem').max())
# or, as an exact-match membership test on the targets frame
vsdb_targets['Variable Star'].isin(['tx dra']).value_counts()

In [None]:
# Flag each Variable Star in vsdb_targets that is present in seq_db, then count the flags.
targets_in_seq_db = vsdb_targets['Variable Star'].isin(seq_db['Full Name'])
targets_in_seq_db.value_counts()