### ASTR 598 - Astrostatistics - Class Project - Group 1

- Tyler Gordon
- Meredith Durbin 
- Brianna Thomas
- Joachim Moeyens
- Dino Bektesevic

In [5]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from crossmatch import xmatch

from astropy import units as u
from astropy.io import fits
from astropy.coordinates import SkyCoord

% matplotlib inline

### Data Acquisition

[Google Drive for NSC files and xmatch arrays](https://drive.google.com/drive/folders/1r0LgsJ4LjUgcxUIYwPUZl-YcwxqO0z5c?usp=sharing)

[Website for HLC files](http://das.sdss.org/va/stripe_82_variability/SDSS_82_public/)

In [6]:
# The notebook expects a directory named "data" sitting next to it.
DATA_DIR = "data/"

# glob makes no promise about ordering, so each listing is sorted in place to
# ensure the file order is the same on every operating system.
HLC_FILES = glob.glob(os.path.join(DATA_DIR, "HLC*.fits"))
HLC_FILES.sort()
NSC_FILES = glob.glob(os.path.join(DATA_DIR, "stripe82*.txt"))
NSC_FILES.sort()

In [7]:
# Show everything currently present in the data directory, in sorted order
sorted(os.listdir(DATA_DIR))

['HLC.RA_00_to_01.fits',
 'HLC.RA_01_to_02.fits',
 'HLC.RA_02_to_03.fits',
 'HLC.RA_03_to_04.fits',
 'HLC.RA_20_to_21.fits',
 'HLC.RA_21_to_22.fits',
 'HLC.RA_22_to_23.fits',
 'HLC.RA_23_to_24.fits',
 'stripe82_315_ra_45_-1_3_dec_0.txt',
 'stripe82_315_ra_45_0_dec_1_3.txt',
 'xmatch_hlc_nsc_1arcsec_dist.txt',
 'xmatch_hlc_nsc_1arcsec_idx.txt']

In [8]:
# Display the HLC FITS paths picked up by the glob above
HLC_FILES

['data/HLC.RA_00_to_01.fits',
 'data/HLC.RA_01_to_02.fits',
 'data/HLC.RA_02_to_03.fits',
 'data/HLC.RA_03_to_04.fits',
 'data/HLC.RA_20_to_21.fits',
 'data/HLC.RA_21_to_22.fits',
 'data/HLC.RA_22_to_23.fits',
 'data/HLC.RA_23_to_24.fits']

In [9]:
# Display the NSC text-file paths picked up by the glob above
NSC_FILES

['data/stripe82_315_ra_45_-1_3_dec_0.txt',
 'data/stripe82_315_ra_45_0_dec_1_3.txt']

In [36]:
# Simple functions to read the HLC files and the NSC files, each set into its own single dataframe

def buildHLCDataFrame(datafiles,
                      filters=["u", "g", "r", "i", "z"],
                      columns=["RA_MEAN_CLIP", "DEC_MEAN_CLIP",
                               "RA_MEAN_ERR_CLIP", "DEC_MEAN_ERR_CLIP",
                               "MEAN_PSFMAG", "MEAN_PSFMAG_ERR",
                               "MEAN_OBJECT_TYPE", "RA_PM_CLIP",
                               "DEC_PM_CLIP"],
                      makeColors=True,
                      magColumnName="MEAN_PSFMAG",
                      colors=["g-i"]):
    """
    Read the table in extension 1 of each FITS file into one DataFrame.

    Parameters
    ----------
    datafiles : iterable of str
        Paths of the FITS files to read.
    filters : list of str
        Filter names. A FITS column holding len(filters) values per row is
        split into one DataFrame column per filter, named "<column>_<filter>".
    columns : list of str
        FITS columns to copy into the DataFrame.
    makeColors : bool
        If true, add one column per entry in `colors`.
    magColumnName : str
        Per-filter magnitude column used to compute the colors; it has to be
        one of `columns`.
    colors : list of str
        Colors written as "<filter>-<filter>", e.g. "g-i". Each becomes a
        column holding the first filter's magnitude minus the second's.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked in the order given, with a fresh 0..N-1
        index.

    Raises
    ------
    ValueError
        If `datafiles` is empty, or if a requested column holds per-row
        values that are neither scalar nor of length len(filters).
    """
    dfs = []
    for datafile in datafiles:
        # The context manager closes the file once its columns are copied out,
        # so reading many files does not pile up open file handles.
        with fits.open(datafile) as hdul:
            data = hdul[1].data
            df = pd.DataFrame()

            for column in columns:
                values = data[column]
                # Shape of a single row's entry; taken from the column as a
                # whole so that a table with zero rows is handled as well.
                per_row_shape = values.shape[1:]
                # Assume multi-dimensional fits columns are keyed on filters
                if per_row_shape == (len(filters),):
                    for i, filt in enumerate(filters):
                        df["{}_{}".format(column, filt)] = values[:, i]
                # If not multi-dimensional just add to dataframe as normal
                elif per_row_shape == ():
                    df[column] = values
                # Neither scalar nor one value per filter: raise a hopefully
                # useful error
                else:
                    raise ValueError("Shape of multi-dimensional column data does not match number of filters!")

            if makeColors:
                for color in colors:
                    # "g-i" means magnitude in g minus magnitude in i
                    first, second = color.split("-")[:2]
                    df[color] = (df["{}_{}".format(magColumnName, first)]
                                 - df["{}_{}".format(magColumnName, second)])
        dfs.append(df)

    if not dfs:
        raise ValueError("No HLC data files were given to read.")

    final = pd.concat(dfs)
    final.reset_index(inplace=True, drop=True)
    return final

def buildNSCDataFrame(datafiles):
    """
    Read space-separated NSC text files into one DataFrame.

    Parameters
    ----------
    datafiles : iterable of str
        Paths of the text files to read; each is parsed with a single space
        as the separator.

    Returns
    -------
    pandas.DataFrame
        Rows of all files stacked in the order given, with a fresh 0..N-1
        index.

    Raises
    ------
    ValueError
        If `datafiles` is empty.
    """
    # Read the file named by the loop variable: reading a fixed global entry
    # here would load the same file once per path and ignore the argument.
    dfs = [pd.read_csv(datafile, sep=" ") for datafile in datafiles]

    if not dfs:
        raise ValueError("No NSC data files were given to read.")

    final = pd.concat(dfs)
    final.reset_index(inplace=True, drop=True)
    return final

In [37]:
# Load every HLC FITS file found in the data directory into one DataFrame
hlc = buildHLCDataFrame(HLC_FILES)

In [38]:
# Preview the first rows of the HLC table
hlc.head()

Unnamed: 0,RA_MEAN_CLIP,DEC_MEAN_CLIP,RA_MEAN_ERR_CLIP,DEC_MEAN_ERR_CLIP,MEAN_PSFMAG_u,MEAN_PSFMAG_g,MEAN_PSFMAG_r,MEAN_PSFMAG_i,MEAN_PSFMAG_z,MEAN_PSFMAG_ERR_u,MEAN_PSFMAG_ERR_g,MEAN_PSFMAG_ERR_r,MEAN_PSFMAG_ERR_i,MEAN_PSFMAG_ERR_z,MEAN_OBJECT_TYPE,RA_PM_CLIP,DEC_PM_CLIP,g-i
0,1.3e-05,-0.681208,8e-06,8e-06,0.0,22.679943,21.68856,21.280607,20.960501,0.0,0.070077,0.044086,0.0388,0.177185,3.2,0.014193,0.037836,1.399336
1,4e-05,0.475432,6e-06,6e-06,0.0,22.494812,21.485416,21.199217,20.647459,0.0,0.041831,0.025648,0.031936,0.111785,3.0,-0.016163,0.030977,1.295595
2,0.000151,1.059528,2e-06,2e-06,0.0,21.816391,22.191387,20.808842,20.021967,0.0,0.097663,0.028077,0.012231,0.020769,5.4,-0.012494,-0.006675,1.007549
3,0.000205,-0.838721,1e-06,1e-06,19.7705,18.835068,18.476278,18.329546,18.270407,0.041607,0.003259,0.002162,0.002186,0.00387,6.0,0.015038,-0.001602,0.505522
4,0.000228,0.077162,2e-06,2e-06,0.0,20.968287,19.645609,18.767252,18.273226,0.0,0.008626,0.004235,0.003535,0.006249,6.0,-0.012199,-0.009051,2.201035


In [39]:
# Load the NSC text files found in the data directory into one DataFrame
nsc = buildNSCDataFrame(NSC_FILES)

In [40]:
# Preview the first rows of the NSC table
nsc.head()

Unnamed: 0,ra,dec,pmra,pmraerr,pmdec,pmdecerr,mjd,deltamjd,gmag,grms,...,rmag,rrms,rerr,imag,irms,ierr,class_star,fwhm,ebv,nphot
0,315.012148,-1.298594,1275.324779,1314.590162,-91.929458,1398.319357,57598.697285,10.879461,99.989998,999999.0,...,20.612076,999999.0,0.027147,99.989998,999999.0,9.99,0.978889,1.380337,0.087262,2
1,315.010754,-1.29788,-714.960501,3414.340885,-272.622409,3447.444185,57598.697285,10.879461,99.989998,999999.0,...,21.986279,999999.0,0.079821,99.989998,999999.0,9.99,0.899216,1.522221,0.087286,2
2,315.013867,-1.296287,-2339.889414,6293.682515,-2186.86337,6311.702668,57598.697285,10.879461,99.989998,999999.0,...,22.269653,999999.0,0.095987,99.989998,999999.0,9.99,0.340959,2.487719,0.087237,2
3,315.003075,-1.293646,106.992177,118.045727,18.378282,119.195841,57717.185732,360.90506,22.649281,999999.0,...,21.409351,999999.0,0.052671,99.989998,999999.0,9.99,0.866193,1.79105,0.08743,3
4,315.00315,-1.291179,-1.564847,24.141297,-6.216486,29.390321,57717.185732,360.90506,18.699413,999999.0,...,18.291037,999999.0,0.006475,99.989998,999999.0,9.99,0.984123,1.374196,0.087436,3


### Crossmatching (using DataLab crossmatch.py)

In [15]:
# One arcsecond expressed in degrees (1/3600, rounded to six significant
# figures); used as the maxdist unit for the crossmatch calls below.
arcsecond = 0.000277778
# When False, the crossmatch cells below load previously saved match files
# instead of calling xmatch.
DO_CROSSMATCH = False

In [13]:
# Crossmatch HLC against NSC with a maximum separation of 1 arcsecond.
# The result is cached as two text files inside DATA_DIR. With DO_CROSSMATCH
# set the match is recomputed and the cache rewritten; otherwise the existing
# cache is reused. The arrays are read back from the cache in both cases so
# that matched_id_1arcsec and matched_dist_1arcsec are always defined and
# always have the same form.
idx_file_1arcsec = os.path.join(DATA_DIR, "xmatch_hlc_nsc_1arcsec_idx.txt")
dist_file_1arcsec = os.path.join(DATA_DIR, "xmatch_hlc_nsc_1arcsec_dist.txt")

if DO_CROSSMATCH:
    # The hlc frame carries its positions in the *_CLIP columns
    matched_1arcsec = xmatch(hlc["RA_MEAN_CLIP"].values,
                             hlc["DEC_MEAN_CLIP"].values,
                             nsc["ra"].values,
                             nsc["dec"].values,
                             maxdist=arcsecond)
    np.savetxt(idx_file_1arcsec, matched_1arcsec[0], fmt="%i")
    np.savetxt(dist_file_1arcsec, matched_1arcsec[1])

matched_id_1arcsec = np.loadtxt(idx_file_1arcsec, unpack=True)
matched_dist_1arcsec = np.loadtxt(dist_file_1arcsec, unpack=True)
print("1 arcsecond max distance crossmatch found {} matched sources".format(len(matched_id_1arcsec)))

1 arcsecond max distance crossmatch found 1481111 matched sources


In [14]:
# Crossmatch HLC against NSC with a maximum separation of 2 arcseconds.
# The result is cached as two text files inside DATA_DIR. With DO_CROSSMATCH
# set the match is recomputed and the cache rewritten; otherwise the existing
# cache is reused. The arrays are read back from the cache in both cases so
# that matched_id_2arcsec and matched_dist_2arcsec are always defined and
# always have the same form.
idx_file_2arcsec = os.path.join(DATA_DIR, "xmatch_hlc_nsc_2arcsec_idx.txt")
dist_file_2arcsec = os.path.join(DATA_DIR, "xmatch_hlc_nsc_2arcsec_dist.txt")

if DO_CROSSMATCH:
    # The hlc frame carries its positions in the *_CLIP columns
    matched_2arcsec = xmatch(hlc["RA_MEAN_CLIP"].values,
                             hlc["DEC_MEAN_CLIP"].values,
                             nsc["ra"].values,
                             nsc["dec"].values,
                             maxdist=2*arcsecond)
    np.savetxt(idx_file_2arcsec, matched_2arcsec[0], fmt="%i")
    np.savetxt(dist_file_2arcsec, matched_2arcsec[1])

matched_id_2arcsec = np.loadtxt(idx_file_2arcsec, unpack=True)
matched_dist_2arcsec = np.loadtxt(dist_file_2arcsec, unpack=True)
print("2 arcsecond max distance crossmatch found {} matched sources".format(len(matched_id_2arcsec)))


2 arcsecond max distance crossmatch found 1486785 matched sources


### Data Cleaning and Proper Motion Selection

### Chi-Squared Calculation

### Plotting

- Chi-squared distribution as a function of RA and Dec
- Proper motion as a function of RA and Dec
- Others...

In [22]:
# Mean of the HLC and NSC right ascensions.
# NOTE(review): pandas adds the two Series by index label, so row i of hlc is
# paired with row i of nsc rather than with its crossmatched counterpart --
# presumably the matched indices loaded above should be applied first; confirm.
avgra = (hlc["RA_MEAN_CLIP"]+nsc["ra"])/2.0


In [30]:
# Open one HLC file directly and list the columns available in extension 1.
# NOTE(review): astropy.io.fits is already imported as `fits` in the first
# cell, the path repeats DATA_DIR as a literal, and the handle `f` is not
# closed in this cell -- tidy up once it is clear no later cell relies on
# `apy`, `f` or `a`.
import astropy as apy
f = apy.io.fits.open("data/HLC.RA_00_to_01.fits")
a = f[1]
a.columns

ColDefs(
    name = 'LC_NAME'; format = '46A'
    name = 'IAU_NAME'; format = '24A'
    name = 'N_GOOD_EPOCHS'; format = '5I'
    name = 'MEAN_PSFMAG'; format = '5E'
    name = 'MEAN_PSFMAG_ERR'; format = '5E'
    name = 'MEAN_EXPMAG'; format = '5E'
    name = 'MEAN_EXPMAG_ERR'; format = '5E'
    name = 'RMS_PSFMAG'; format = '5E'
    name = 'RMS_EXPMAG'; format = '5E'
    name = 'CHISQ_PSFMAG'; format = '5E'
    name = 'CHISQ_EXPMAG'; format = '5E'
    name = 'N_GOOD_EPOCHS_PSF_CLIP'; format = '5I'
    name = 'N_GOOD_EPOCHS_EXP_CLIP'; format = '5I'
    name = 'MEAN_PSFMAG_CLIP'; format = '5E'
    name = 'MEAN_PSFMAG_ERR_CLIP'; format = '5E'
    name = 'MEAN_EXPMAG_CLIP'; format = '5E'
    name = 'MEAN_EXPMAG_ERR_CLIP'; format = '5E'
    name = 'RMS_PSFMAG_CLIP'; format = '5E'
    name = 'RMS_EXPMAG_CLIP'; format = '5E'
    name = 'CHISQ_PSFMAG_CLIP'; format = '5E'
    name = 'CHISQ_EXPMAG_CLIP'; format = '5E'
    name = 'MEAN_PSFMAG_ITER'; format = '5E'
    name = 'MEAN_PSFMAG_ERR_ITE