In [1]:
%matplotlib inline

import os
import healpy as hp
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import pyspark
import pyspark.sql.functions as sparkfunc

matplotlib.rcParams['figure.dpi'] = 120

from dustmaps.sfd import SFDQuery

from gaia_tools import healpix_hist, bin_column
from numpy import pi
from pyspark.sql import SparkSession


In [2]:
#spark = SparkSession.builder.config('spark.master', "local[4]") \
#                            .config('spark.memory.offHeap.enabled', 'true') \
#                            .config('spark.memory.offHeap.size', '6G') \
#                            .config('spark.driver.memory', '8G') \
#                            .config("spark.sql.execution.arrow.enabled", "true").getOrCreate()

In [3]:
def spark_start(project_path, metastore=None):
    """Create (or fetch) the local, Hive-enabled Spark session for this notebook.

    Parameters
    ----------
    project_path : str
        Directory under which the ``spark-warehouse`` (SQL warehouse) and
        ``spark-tmp`` (Spark local/scratch) sub-directories are placed.
    metastore : str, optional
        Directory handed to the driver JVM as ``derby.system.home``.  When
        left as ``None`` the option is not set, so the JVM default applies.
        (Previously ``None`` was formatted into the option string, yielding
        the literal value ``-Dderby.system.home=None``.)

    Returns
    -------
    pyspark.sql.SparkSession
        The result of ``getOrCreate()`` on the configured builder.
    """
    from pyspark.sql import SparkSession

    warehouse_location = os.path.join(project_path, 'spark-warehouse')
    local_dir = os.path.join(project_path, 'spark-tmp')

    # The trailing "# 128" / "# 256" notes come from the original cell;
    # presumably they record values used on a larger host -- TODO confirm.
    builder = (
        SparkSession.builder
        .appName("LSD2")
        .config("spark.sql.warehouse.dir", warehouse_location)
        .config('spark.master', "local[4]")
        .config('spark.driver.memory', '6G') # 128
        .config('spark.local.dir', local_dir)
        .config('spark.memory.offHeap.enabled', 'true')
        .config('spark.memory.offHeap.size', '4G') # 256
        .config("spark.sql.execution.arrow.enabled", "true")
        .config("spark.driver.maxResultSize", "6G")
    )

    # Only override Derby's home directory when the caller actually gave one.
    if metastore is not None:
        builder = builder.config("spark.driver.extraJavaOptions",
                                 f"-Dderby.system.home={metastore}")

    return builder.enableHiveSupport().getOrCreate()

In [4]:
# Project directory handed to spark_start, which places its 'spark-warehouse'
# and 'spark-tmp' sub-directories under it.
# NOTE(review): absolute, user-specific path -- adjust when running elsewhere.
root_dir = "/epyc/users/ctslater"

# No metastore argument is passed, so spark_start runs with metastore=None.
spark = spark_start(root_dir)

In [5]:
# Raw catalogue as stored on disk.
gaia_ = spark.read.load("/epyc/data/gaia_dr2_1am_dup/")

# Replace any stored hpix12 with floor(source_id / 2**35).
# NOTE(review): presumably this recovers the level-12 HEALPix index packed
# into the upper bits of the Gaia source_id -- confirm against the Gaia docs.
hpix12_from_source_id = sparkfunc.floor(gaia_['source_id'] / 2**35)
gaia = gaia_.drop("hpix12").withColumn("hpix12", hpix12_from_source_id)

In [6]:
%%time

sfd_order = 9
sfd = SFDQuery()
sfd_nside = hp.order2nside(sfd_order)
npix = hp.nside2npix(sfd_nside)

# Angles (radians) of every pixel of the nested order-9 grid.
angle_theta, angle_phi = hp.pix2ang(sfd_nside, np.arange(npix), nest=True)

# query_equ receives phi and (90 deg - theta), both converted to degrees.
pixel_ra_deg = np.rad2deg(angle_phi)
pixel_dec_deg = 90.0 - np.rad2deg(angle_theta)
reddening = sfd.query_equ(pixel_ra_deg, pixel_dec_deg)

# Regrid to order 12 so that the array index lines up with gaia's hpix12 column.
EBV_map_hpix12 = hp.ud_grade(reddening, hp.order2nside(12), order_in='NEST')
pandas_df = pd.DataFrame({
    "hpix12": np.arange(len(EBV_map_hpix12)),
    "EBV": EBV_map_hpix12,
})

# Attach the per-pixel EBV value to every catalogue row via the shared hpix12 key.
reddening_df = spark.createDataFrame(pandas_df)
gaia_w_EBV = gaia.join(reddening_df, on="hpix12")

# NOTE(review): 3.1 and 1.2919 are the per-EBV coefficients subtracted from
# phot_g_mean_mag and bp_rp respectively -- confirm their source.
ebv_col = gaia_w_EBV['EBV']
gaia_g0 = (
    gaia_w_EBV
    .withColumn("phot_g0", gaia_w_EBV['phot_g_mean_mag'] - 3.1*ebv_col)
    .withColumn("bp_rp_0", gaia_w_EBV['bp_rp'] - 1.2919*ebv_col)
)

CPU times: user 8.53 s, sys: 6.13 s, total: 14.7 s
Wall time: 11.8 s


In [None]:
%%time
# Row filter: dup flag of 0, bp_rp_0 in (0.5, 0.75), phot_g0 in (19, 20.5],
# and parallax below 0.2.
density_selection = (
    (gaia_g0['dup'] == 0)
    & (gaia_g0['bp_rp_0'] > 0.5) & (gaia_g0['bp_rp_0'] < 0.75)
    & (gaia_g0['phot_g0'] > 19) & (gaia_g0['phot_g0'] <= 20.5)
    & (gaia_g0['parallax'] < 0.2)
)

# healpix_hist comes from gaia_tools; presumably it returns a per-pixel count
# map at NSIDE=512 -- verify against that module.
dereddened_density2 = healpix_hist(gaia_g0.where(density_selection),
                                   NSIDE=512, agg={"hpix__": "count"})

In [8]:
def _galactic_rotation_terms(alpha, delta):
    """Return ``(C1, C2, cosb)`` shared by ``spark_mu_l`` and ``spark_mu_b``.

    ``alpha`` and ``delta`` are Spark Column expressions in radians (they are
    fed straight into Spark's ``sin``/``cos``).  With ``alpha_g``/``delta_g``
    the reference direction defined below:

        C1   = sin(delta_g) cos(delta) - cos(delta_g) sin(delta) cos(alpha - alpha_g)
        C2   = cos(delta_g) sin(alpha - alpha_g)
        cosb = sqrt(C1**2 + C2**2)
    """
    from pyspark.sql.functions import sin, cos, sqrt

    # Reference direction in radians; presumably the equatorial coordinates
    # of the north Galactic pole -- TODO confirm epoch/source of the values.
    alpha_g = 192.85948 * pi/180.0
    delta_g = 27.12825 * pi/180.0

    # Plain Python floats, so that ``float * Column`` goes directly through
    # Column.__rmul__ instead of relying on NumPy's scalar operator fallback.
    sin_delta_g = float(np.sin(delta_g))
    cos_delta_g = float(np.cos(delta_g))

    C1 = sin_delta_g * cos(delta) - cos_delta_g * sin(delta) * cos(alpha - alpha_g)
    C2 = cos_delta_g * sin(alpha - alpha_g)
    cosb = sqrt(C1**2 + C2**2)
    return C1, C2, cosb


def spark_mu_l(alpha, delta, mu_ra, mu_dec):
    """Return ``(mu_ra * C1 + mu_dec * C2) / cosb`` as a Spark Column.

    Parameters
    ----------
    alpha, delta : Column
        Angles in radians.
    mu_ra, mu_dec : Column
        Proper-motion components to rotate; the result is in the same units.
    """
    C1, C2, cosb = _galactic_rotation_terms(alpha, delta)
    return (mu_ra * C1 + mu_dec * C2)/cosb


def spark_mu_b(alpha, delta, mu_ra, mu_dec):
    """Return ``(-mu_ra * C2 + mu_dec * C1) / cosb`` as a Spark Column.

    Parameters
    ----------
    alpha, delta : Column
        Angles in radians.
    mu_ra, mu_dec : Column
        Proper-motion components to rotate; the result is in the same units.
    """
    C1, C2, cosb = _galactic_rotation_terms(alpha, delta)
    return (mu_ra * -C2 + mu_dec * C1)/cosb

In [None]:
%%time

def star_cuts(df):
    """Build the boolean row filter for ``df``.

    A row passes when ``dup`` equals 0, ``bp_rp_0`` lies in (0.5, 0.75),
    ``phot_g0`` lies in (19, 20.5] and ``parallax`` is below 0.2.  The return
    value is whatever ``df``'s column comparisons combined with ``&`` produce.
    """
    not_duplicate = df['dup'] == 0
    colour_window = (df['bp_rp_0'] > 0.5) & (df['bp_rp_0'] < 0.75)
    magnitude_window = (df['phot_g0'] > 19) & (df['phot_g0'] <= 20.5)
    small_parallax = df['parallax'] < 0.2
    return not_duplicate & colour_window & magnitude_window & small_parallax

def add_mu_b(df):
    """Return ``df`` with an added ``mu_b`` column computed by ``spark_mu_b``.

    ``ra`` and ``dec`` are scaled by pi/180 before being passed on; ``pmra``
    and ``pmdec`` are passed through unchanged.
    """
    ra_rad = df['ra']*pi/180.0
    dec_rad = df['dec']*pi/180.0
    mu_b_column = spark_mu_b(ra_rad, dec_rad, df['pmra'], df['pmdec'])
    return df.withColumn("mu_b", mu_b_column)

# Apply the star selection first, then attach the mu_b column to the survivors.
selected_stars = gaia_g0.where(star_cuts(gaia_g0))
gaia_g0_mu_b = add_mu_b(selected_stars)

# Keep only the two columns handed to healpix_hist, restricted to mu_b < 0.
negative_mu_b = gaia_g0_mu_b.select("hpix12", "mu_b").where(gaia_g0_mu_b['mu_b'] < 0)
map_negative_b = healpix_hist(negative_mu_b,
                              NSIDE=512, agg={"hpix__": "count"})

#map_pos_b = healpix_hist(gaia_g0_mulb.where(star_cuts(gaia_g0_mulb) & (gaia_g0_mulb['mu_b'] >= 0)),
#                              NSIDE=512, agg={"hpix__": "count"})