In [1]:
import pandas as pd
import numpy as np

In [2]:
# Watermark is not required for this code, but is included for information:
# it prints the author, date, Python/IPython and imported-package versions,
# and machine details below the cell so the run environment is recorded.
import watermark
%load_ext watermark
%watermark -a "ELEANOR LUTZ" -d -v -iv -m

numpy     1.15.4
watermark 1.8.1
pandas    0.23.4
ELEANOR LUTZ 2019-07-11 

CPython 3.7.1
IPython 7.2.0

compiler   : MSC v.1900 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 63 Stepping 2, GenuineIntel
CPU cores  : 12
interpreter: 64bit


## Data Source
The data used in this Jupyter Notebook is from the [HYG Database version 3](http://www.astronexus.com/hyg) by David Nash. 

In [3]:
# Load the HYG v3 catalogue. low_memory=False has pandas parse the file in
# one pass rather than in chunks, which avoids mixed-type column inference.
df = pd.read_csv('./data/hygdata_v3/hygdata_v3.csv', low_memory=False)
display(df.head())

# Remove the sun because it doesn't make sense in a star chart.
# .copy() detaches the filtered frame from the one that was loaded, so the
# column assignments later in this cell operate on an independent frame and
# do not trigger SettingWithCopyWarning.
df = df[df['proper'] != 'Sol'].copy()

# Lookup table: plaintext Bayer abbreviation -> non-ASCII lowercase Greek letter
greek_dict = {
    'Alp': "α", 'Bet': "β", 'Chi': "χ",
    'Del': "δ", 'Eps': "ε", 'Eta': "η",
    'Gam': "γ", 'Iot': "ι", 'Kap': "κ",
    'Lam': "λ", 'Mu': "μ", 'Nu': "ν",
    'Ome': "ω", 'Omi': "ο", 'Phi': "φ",
    'Pi': "π", 'Psi': "ψ", 'Rho': "ρ",
    'Sig': "σ", 'Tau': "τ", 'The': "θ",
    'Ups': "υ", 'Xi': "ξ", 'Zet': "ζ",
}

# List every distinct non-missing Bayer designation found in the catalogue
print(df['bayer'].dropna().unique())
def get_greek_letter(n, letters=None):
    '''Convert a plaintext Bayer designation into its Greek-letter form.

    A designation is a letter abbreviation optionally followed by "-" and a
    component number, e.g. "Alp" or "Kap-1". The abbreviation is looked up in
    a mapping and the component number, when present, is appended unchanged
    ("Kap-1" -> "κ1").

    Parameters
    ----------
    n : str or missing value
        The designation to translate.
    letters : dict, optional
        Mapping from abbreviation to Greek letter. When omitted, the
        module-level ``greek_dict`` is used.

    Returns
    -------
    str or float
        The translated designation, or ``np.nan`` when ``n`` is missing or
        its abbreviation has no entry in the mapping.
    '''
    # pd.isnull covers both NaN and None; the literal string 'nan' falls
    # through to the lookup below and also ends up as NaN.
    if pd.isnull(n):
        return np.nan
    if letters is None:
        letters = greek_dict
    parts = str(n).split("-")
    greek = letters.get(parts[0])
    if greek is None:
        # Unknown abbreviation: report it as missing instead of returning
        # None or failing on `None + suffix`.
        return np.nan
    if len(parts) > 1:
        return greek + parts[1]
    return greek

# Store the Greek-letter form of each Bayer designation in its own column
df['greek_letters'] = df['bayer'].map(get_greek_letter)
display(df.head())

# Count the distinct raw spectral strings (missing values are not counted)
print(df['spect'].nunique(), 'unique spectral designations')
def get_first_letter(name):
    '''Reduce a spectral designation to a single upper-case class letter.

    A leading "sd" prefix is dropped, the punctuation characters
    ``?:!/;.,[]{}()`` are skipped, and the first remaining character is
    returned in upper case (e.g. "K2III" -> "K", "sdM4" -> "M",
    "(G5)" -> "G").

    Parameters
    ----------
    name : str or missing value
        The raw spectral designation.

    Returns
    -------
    str or float
        The class letter. Missing values (and the literal string 'nan') are
        returned unchanged. ``np.nan`` is returned when nothing is left
        after removing the prefix and punctuation, so inputs such as "sd",
        "??" or "?" no longer raise ``IndexError`` or yield a punctuation
        mark as a class.
    '''
    if pd.isnull(name) or str(name) == 'nan':
        return name
    text = str(name)
    if text.startswith('sd'):
        # remove MK system luminosity class to look just at
        # Morgan-Keenan designations
        text = text[2:]
    first = next((c for c in text if c not in '?:!/;.,[]{}()'), None)
    if first is None:
        return np.nan
    return first.upper()
    
# A distance of exactly 100000 is turned into NaN so it is treated as missing
# (presumably the catalogue's placeholder for an unknown distance - confirm
# against the HYG documentation).
# The result is assigned back to the column: replace(..., inplace=True) on
# df['dist'] is chained assignment, which acts on an intermediate column
# object and is not guaranteed to update df.
df['dist'] = df['dist'].replace(to_replace=100000, value=np.nan)
    
# Collapse each spectral string to its class letter, then report how many
# distinct classes remain and what they are.
df['spect_desig'] = df['spect'].map(get_first_letter)
spect_classes = df['spect_desig'].dropna().unique()
print(len(spect_classes), 'unique spectral designations')
print(spect_classes)

# Hex colour assigned to each spectral class letter
color_dict = {
    'O': '#5A90C3',
    'B': '#93C2F1',
    'A': '#f3e8d3',
    'F': '#d4bf94',
    'G': '#FFD423',
    'K': '#F99220',
    'M': '#FF2620',
    'L': '#FF2620',
    'T': '#FF6199',
    'Y': '#6B22FF',
    # these seven classes all share one colour
    **dict.fromkeys('CRWNSDP', '#979330'),
    'nan': '#000000',  # unknown
}
# Look up the fill colour for each class letter. Series.map yields NaN for
# every value without an entry in color_dict (missing classes included), and
# fillna then paints those black. Series.replace would instead leave an
# unmapped letter in place, putting a non-colour string in the 'color' column.
df['color'] = df['spect_desig'].map(color_dict).fillna('#000000')
# beige outline for black NANs
df['linecolor'] = df['color'].replace('#000000', '#f3e8d3')

display(df.head())
# Write out the complete processed catalogue
df.to_csv('./data/processed/hygdata_processed.csv', index=False)

print(df.shape[0], 'total stars available in database')
# Keep only rows whose 'mag' value is 6.5 or lower and export that subset
df = df.loc[df['mag'] <= 6.5]
print(df.shape[0], 'stars visible to the human eye')
df.to_csv('./data/processed/hygdata_processed_mag65.csv', index=False)

Unnamed: 0,id,hip,hd,hr,gl,bf,proper,ra,dec,dist,...,bayer,flam,con,comp,comp_primary,base,lum,var,var_min,var_max
0,0,,,,,,Sol,0.0,0.0,0.0,...,,,,1,0,,1.0,,,
1,1,1.0,224700.0,,,,,6e-05,1.089009,219.7802,...,,,Psc,1,1,,9.63829,,,
2,2,2.0,224690.0,,,,,0.000283,-19.49884,47.9616,...,,,Cet,1,2,,0.392283,,,
3,3,3.0,224699.0,,,,,0.000335,38.859279,442.4779,...,,,And,1,3,,386.901132,,,
4,4,4.0,224707.0,,,,,0.000569,-51.893546,134.2282,...,,,Phe,1,4,,9.366989,,,


['Tau' 'The' 'Zet' 'Alp' 'Bet' 'Kap-1' 'Eps' 'Gam-3' 'Kap-2' 'Gam' 'Chi'
 'Sig' 'Iot' 'Pi' 'Rho' 'Kap' 'Eta' 'Lam-1' 'Bet-1' 'Bet-2' 'Lam' 'Bet-3'
 'Lam-2' 'Del' 'Mu' 'Xi' 'Phi-1' 'Omi' 'Nu' 'Phi-2' 'Ups-1' 'Phi-3'
 'Ups-2' 'Phi-4' 'Ome' 'Psi-1' 'Ups' 'Psi-2' 'Phi' 'Psi-3' 'Psi' 'Tau-1'
 'Tau-2' 'Eta-1' 'Gam-2' 'Eta-2' 'Gam-1' 'Xi-1' 'Pi-1' 'Pi-2' 'Xi-2'
 'Iot-1' 'Iot-2' 'Eta-3' 'Rho-1' 'Rho-2' 'Rho-3' 'The-1' 'Tau-3' 'Zet-1'
 'Zet-2' 'Tau-4' 'Chi-1' 'Chi-2' 'Chi-3' 'Tau-5' 'Tau-6' 'Tau-7' 'Tau-8'
 'Tau-9' 'Ome-1' 'Omi-1' 'Omi-2' 'Ome-2' 'Ups-4' 'Del-1' 'Del-2' 'Del-3'
 'The-2' 'Sig-1' 'Sig-2' 'Pi-3' 'Pi-4' 'Pi-5' 'Pi-6' 'Nu-1' 'Nu-2' 'Nu-3'
 'Psi-4' 'Psi-5' 'Psi-6' 'Psi-7' 'Psi-8' 'Psi-9' 'Mu-1' 'Mu-2' 'Sig-3'
 'Alp-1' 'Alp-2' 'Zet-3' 'Zet-4' 'Eps-1' 'Eps-2']


Unnamed: 0,id,hip,hd,hr,gl,bf,proper,ra,dec,dist,...,flam,con,comp,comp_primary,base,lum,var,var_min,var_max,greek_letters
1,1,1.0,224700.0,,,,,6e-05,1.089009,219.7802,...,,Psc,1,1,,9.63829,,,,
2,2,2.0,224690.0,,,,,0.000283,-19.49884,47.9616,...,,Cet,1,2,,0.392283,,,,
3,3,3.0,224699.0,,,,,0.000335,38.859279,442.4779,...,,And,1,3,,386.901132,,,,
4,4,4.0,224707.0,,,,,0.000569,-51.893546,134.2282,...,,Phe,1,4,,9.366989,,,,
5,5,5.0,224705.0,,,,,0.000665,-40.591202,257.732,...,,Phe,1,5,,21.998851,,,,


4307 unique spectral designations
14 unique spectral designations
['F' 'K' 'B' 'G' 'M' 'A' 'C' 'R' 'O' 'W' 'N' 'S' 'D' 'P']


Unnamed: 0,id,hip,hd,hr,gl,bf,proper,ra,dec,dist,...,comp_primary,base,lum,var,var_min,var_max,greek_letters,spect_desig,color,linecolor
1,1,1.0,224700.0,,,,,6e-05,1.089009,219.7802,...,1,,9.63829,,,,,F,#d4bf94,#d4bf94
2,2,2.0,224690.0,,,,,0.000283,-19.49884,47.9616,...,2,,0.392283,,,,,K,#F99220,#F99220
3,3,3.0,224699.0,,,,,0.000335,38.859279,442.4779,...,3,,386.901132,,,,,B,#93C2F1,#93C2F1
4,4,4.0,224707.0,,,,,0.000569,-51.893546,134.2282,...,4,,9.366989,,,,,F,#d4bf94,#d4bf94
5,5,5.0,224705.0,,,,,0.000665,-40.591202,257.732,...,5,,21.998851,,,,,G,#FFD423,#FFD423


119613 total stars available in database
8912 stars visible to the human eye
