In [1]:
import numpy as np
import pandas as pd
import glob, os

In [2]:
# Watermark is not required for this code, but is included for information. 
import watermark
%load_ext watermark
%watermark -a "ELEANOR LUTZ" -d -v -iv -m

pandas    0.23.4
numpy     1.15.4
watermark 1.8.1
ELEANOR LUTZ 2019-07-26 

CPython 3.7.1
IPython 7.2.0

compiler   : MSC v.1900 64 bit (AMD64)
system     : Windows
release    : 10
machine    : AMD64
processor  : Intel64 Family 6 Model 63 Stepping 2, GenuineIntel
CPU cores  : 12
interpreter: 64bit


In [3]:
# Read in processed star data (created in 1_process_starbase_data.ipynb)
df_stars = pd.read_csv("./data/processed/hygdata_processed.csv", low_memory=False)

# Exclude "culture" folders that are replicated in other folders. 
# "chinese_medieval" and "western" are used instead of the alternates in the list below ("exclude") 
exclude = ['western_SnT', 'western_hlad', 'western_rey', 'chinese_contemporary', 'chinese']
fabfiles = glob.glob('./data/skycultures/*/')
fabfiles = [x.split("\\")[-2] for x in fabfiles]
fabfiles = [x for x in fabfiles if x not in exclude]

# Print all cultures that will be analyzed in this Jupyter notebook
print(fabfiles)

['arabic', 'arabic_moon_stations', 'armintxe', 'aztec', 'belarusian', 'boorong', 'chinese_medieval', 'dakota', 'egyptian', 'hawaiian_starlines', 'indian', 'inuit', 'japanese_moon_stations', 'kamilaroi', 'korean', 'lokono', 'macedonian', 'maori', 'maya', 'mongolian', 'mulapin', 'navajo', 'norse', 'northern_andes', 'ojibwe', 'romanian', 'sami', 'sardinian', 'seleucid', 'siberian', 'tongan', 'tukano', 'tupi', 'western']


In [4]:
'''
Hacky shortcut to process .fab files so that Pandas can open variable-length
rows as comma-delimited. 
Just puts longest row at top so Pandas doesn't encounter unexpected columns.
'''
for name in fabfiles: 
    fname = './data/skycultures/'+name+'/constellationship.fab'
    savename = './data/processed/skycultures/sorted_constellations/'+name+'_sorted.csv'
    df = pd.read_csv(fname, header=None)
    # For some reason newer files are tab delimited with spacer tabs
    if name in ['mulapin', 'seleucid']:
        df['len'] = df[0].str.strip().str.len()
        df = df.sort_values(by='len', ascending=False)
        del df['len']
    else: 
        df['len'] = df[0].str.count(' ')
        df = df.sort_values(by='len', ascending=False)
        del df['len']
    df.to_csv(savename, index=False, header=None)
    df.head()
    
print('---DONE---')

---DONE---


In [5]:
def get_loc(vals, to_find, df_stars):
    ''' 
    Use the HYG v3.0 database to convert the asterism star IDs into 
    right ascension and declination values
    '''
    vals = [x for x in vals if str(x) != 'nan']
    if ',' not in str(vals[0]):
        vallist = []
        for val in vals:
            if str(val) != 'nan':
                df_val = df_stars[df_stars['hip'] == val]
                missing = {'78727': '144069'}
                if len(df_val) >= 1:
                    vallist.append(df_val.iloc[0][to_find])
                elif val in missing.keys():
                    # Manually fill in the two stars that are missing for some reason
                    # SOURCE: Hipparcos catalog. Mapped to HD ID identifier. 
                    # http://tdc-www.harvard.edu/catalogs/hipparcos.html
                    # https://www.cosmos.esa.int/web/hipparcos/search-facility
                    val = missing.get(str(int(val)))
                    print(val)
                    df_val = df_stars[df_stars['hd'] == val]
                    vallist.append(df_val.iloc[0][to_find])
        vallist = [str(x) for x in vallist]
        str1 = ', '.join(vallist)
        return str1
    
def get_scatter_size_cultures(val, max_count = 35):
    '''
    Get scatter point size for individual stars by the number of times they 
    appear in the dataset. 
    '''
    return max_count + val ** 2

In [6]:
'''
Process star ID data into RA/DEC coordinates by asterism and star.

The final data from this cell (star_df) includes:
1. the stellar ID for each star represented in an asterism (star_ID)
2. the number of cultures that use that star (count),
3. the right ascension of the star (ra)
4. the declination of the star (dec)
5. the arbitrary size at which to plot the star in my map design (size)
'''

starlist = [] # keep track of all stars included in all cultures combined

for name in fabfiles: 
    fname = './data/processed/skycultures/sorted_constellations/'+name+'_sorted.csv'
    savename = './data/processed/skycultures/constellations_ra_dec/'+name+'_ra_dec.csv'
    df_new = pd.read_csv(fname, delim_whitespace=True, header=None, encoding='utf8', engine='python')

    asts = df_new[0].tolist()
    vals = df_new[1].tolist()
    df_new.drop(df_new.columns[[0, 1]], axis=1, inplace=True)

    starlist_temp = [item for sublist in df_new.values.tolist() for item in sublist]
    starlist += set(starlist_temp)

    dec = df_new.apply(lambda row: get_loc(row, to_find='dec', df_stars=df_stars), axis=1).tolist()
    ras = df_new.apply(lambda row: get_loc(row, to_find='ra', df_stars=df_stars), axis=1).tolist()

    df_new['ra'] = ras
    df_new['dec'] = dec
    df_new['ast_ID'] = asts
    df_new['vals'] = vals

    df_new = df_new[['ast_ID', 'vals', 'ra', 'dec']]
    df_new = df_new.sort_values(by='ast_ID')
    df_new.to_csv(savename, index=None)

starlist = [int(x) for x in starlist if str(x) != 'nan']
starlist_savename = './data/processed/starlist_cultures_design.csv'

starlist_df = pd.DataFrame({'star_ID': starlist})
star_df = pd.DataFrame(starlist_df['star_ID'].value_counts().reset_index())
star_df.columns = ['star_ID', 'count']
star_df['ra'] = star_df['star_ID'].apply(lambda x: get_loc([x], to_find='ra', df_stars=df_stars))
star_df['dec'] = star_df['star_ID'].apply(lambda x: get_loc([x], to_find='dec', df_stars=df_stars))
star_df['size'] = star_df['count'].apply(get_scatter_size_cultures)

star_df = star_df.sort_values(by='count', ascending=False)
star_df.to_csv(starlist_savename, index=False)
display(star_df.head())

# Check that all data was successfully converted
print(star_df['ra'].isnull().sum(), 'null values in ra data')

Unnamed: 0,star_ID,count,ra,dec,size
0,26727,28,5.679313,-1.942572,819
1,25930,28,5.533445,-0.299092,819
2,26311,26,5.603559,-1.20192,711
4,21421,24,4.598677,16.509301,611
3,17499,24,3.747927,24.113339,611


0 null values in ra data


In [7]:
'''
For all stars that are only included in one culture, 
go back and find the culture that contains the star. 
(This isn't the most efficient way to do this but it's good enough.) 
'''

fname = './data/processed/starlist_cultures_design.csv'
savename = './data/processed/starlist_cultures_design_single.csv'
fnames = glob.glob('./culture_star_data/*/constellationship_sorted.csv')

df = pd.read_csv(fname)
df = df[df['count'] == 1]
df['culture'] = np.nan

if not os.path.exists(savename): # Inefficient so don't run code by accident
    for index, row in df.iterrows():
        star = row['star_ID']
        for name in fabfiles: 
            fname = './data/processed/skycultures/sorted_constellations/'+name+'_sorted.csv'
            df_new = pd.read_csv(fname, delim_whitespace=True, header=None, encoding='utf8', engine='python')
            df_new.drop(df_new.columns[[0, 1]], axis=1, inplace=True)
            starlist_temp = [item for sublist in df_new.values.tolist() for item in sublist]
            starlist_temp = set(starlist_temp)
            if star in starlist_temp:
                df.loc[index, 'culture'] = name
    df.to_csv(savename, index=False)
    
display(df.head())

Unnamed: 0,star_ID,count,ra,dec,size,culture
1252,110023,1,22.285139,-5.387164,36,
1253,10623,1,2.279251,83.561414,36,
1254,100751,1,20.427459,-56.73509,36,
1255,117299,1,23.783866,57.451359,36,
1256,25142,1,5.380556,3.544452,36,


In [8]:
'''
Combine all culture data into a dataframe by star ID pair.

The final data from this cell (df_main) includes:
1. The two star IDs in that vector line, in alphabetical order from smallest to largest ID (star_ID_pair)
2. the culture responsible for that line  (culture)
    NOTE: each line is repeated for as many times as there are cultures
3. the right ascensions of the two stars (ras)
4. the declinations of the two stars (decs)
'''

savename = './data/processed/star_pairs.csv'
df_main = pd.DataFrame({'star_ID_pair':[], 'culture':[], 'ras':[], 'decs':[]})

for culture in fabfiles: 
    fname = './data/processed/skycultures/sorted_constellations/'+culture+'_sorted.csv'
    star_ID_pairs, cultures = [], []
    df = pd.read_csv(fname, delim_whitespace=True, header=None, encoding='utf8', engine='python')
    namelist = []
    for index, row in df.iterrows():
        temp_list = [x for x in row.tolist() if str(x) != 'nan'] # remove extra cells
        if len(temp_list) > 0: 
            name = temp_list[0]
            temp_list = temp_list[2:] # remove asterism number and number of stars
            # some errors in chinese medieval where last star is repeated
            if not (len(temp_list) % 2 == 0) or (len(temp_list) == 1):
                if temp_list[-1] == temp_list[-2]:
                    temp_list = temp_list[:-1]
            assert (len(temp_list) % 2 == 0) or (len(temp_list) == 1)
            temp_list = [int(x) for x in temp_list] # all strings and non-numbers should now be gone
            temp_list = [temp_list[i:i+2] for i in range(0, len(temp_list), 2)] # break into line pairs
            temp_list = [sorted(x) for x in temp_list] # order line pairs smallest to largest
            star_ID_pairs += temp_list
            namelist.append(name)

    star_ID_1_dec = [get_loc([x[0]], to_find='dec', df_stars=df_stars) for x in star_ID_pairs]
    star_ID_2_dec = [get_loc([x[1]], to_find='dec', df_stars=df_stars) for x in star_ID_pairs]
    star_ID_1_ra = [get_loc([x[0]], to_find='ra', df_stars=df_stars) for x in star_ID_pairs]
    star_ID_2_ra = [get_loc([x[1]], to_find='ra', df_stars=df_stars) for x in star_ID_pairs]

    ras = [[x, y] for x, y, in zip(star_ID_1_ra, star_ID_2_ra)]
    decs = [[x, y] for x, y, in zip(star_ID_1_dec, star_ID_2_dec)]

    star_ID_pairs = ["".join(c for c in str(x) if c not in "'[]") for x in star_ID_pairs]
    ras = ["".join(c for c in str(x) if c not in "'[]") for x in ras]
    decs = ["".join(c for c in str(x) if c not in "'[]") for x in decs]

    df = pd.DataFrame({'star_ID_pair': star_ID_pairs, 'culture': [culture]*len(star_ID_pairs),
                       'ras': ras, 'decs': decs})
    df_main = pd.concat([df, df_main])
    
df_main.to_csv(savename, index=False)
df_main.head()

Unnamed: 0,star_ID_pair,culture,ras,decs
0,"7588, 9007",western,"1.628556, 1.932564","-57.236757, -51.608896"
1,"9007, 10602",western,"1.932564, 2.275154","-51.608896, -51.512165"
2,"10602, 11407",western,"2.275154, 2.449755","-51.512165, -47.70384"
3,"11407, 12413",western,"2.449755, 2.663326","-47.70384, -42.89167"
4,"12413, 12486",western,"2.663326, 2.677781","-42.89167, -39.855375"
