In [2]:
%matplotlib inline
from __future__ import print_function
import sys
import numpy as np
import matplotlib.pyplot as plt

from operator import itemgetter

This notebook runs on a local Spark instance, not on AWS.

In [4]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import Row

# Build (or reuse) a local Spark session. `getOrCreate()` keeps this cell safe
# to re-run: constructing a second SparkContext in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
spark = SparkSession.builder.master("local[*]").getOrCreate()

# Expose the underlying SparkContext under the original name `sc`.
sc = spark.sparkContext
sc.setLogLevel("WARN")  # only WARN and above are logged

In [18]:
# Load both saved tables first ...
forcePhot = spark.read.parquet("rrlyrae_test.parquet")
forcePhotSummary = spark.read.parquet("rrlyrae_summary.parquet")

# ... then register each one as a temp view so the cells below can query it
# by name through spark.sql().
forcePhot.createOrReplaceTempView("forcePhot")
forcePhotSummary.createOrReplaceTempView("forcePhotSummary")

In [14]:
%%time
# Count every row in the forcePhot view; %%time (which must stay on the first
# line of the cell) reports how long the query takes.
count = spark.sql("SELECT count(*) FROM forcePhot")
count.show()

+--------+
|count(1)|
+--------+
|  136249|
+--------+

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.38 s


In [15]:
%%time
# Same timing check with a filter: count only the rows whose mag is below 16.
# Note that this rebinds `count` from the previous cell.
count = spark.sql("SELECT count(*) FROM forcePhot WHERE mag < 16")
count.show()

+--------+
|count(1)|
+--------+
|   46409|
+--------+

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 680 ms


The command below was used to generate the summary statistics table. It does not need to be re-run, because the saved results are loaded from `rrlyrae_summary.parquet` above.

In [55]:
#forcePhotSummary = spark.sql("SELECT lcId, count(*) as n_epochs, "
#                             "min(mag) as min_mag, max(mag) as max_mag "
#                             "FROM forcePhot GROUP BY lcId")
#forcePhotSummary.createOrReplaceTempView("forcePhotSummary")

In [16]:
# Show a single row of the forcePhotSummary view to inspect its columns.
spark.sql("SELECT * from forcePhotSummary LIMIT 1").show()

+---------+--------+------------------+------------------+
|     lcId|n_epochs|           min_mag|           max_mag|
+---------+--------+------------------+------------------+
|853522439|     675|17.092892426172458|18.083258466619952|
+---------+--------+------------------+------------------+



In [19]:
# Number of rows in the summary table (presumably one per lcId, given the
# GROUP BY lcId in the commented-out query that produced it).
forcePhotSummary.count()

178

In [20]:
# Select the per-measurement columns (lcId, band, mag, mjd) from forcePhot for
# every object whose summary min_mag is below 12, by joining on lcId.
# The result is a lazy DataFrame; nothing is computed until an action runs.
targetObjects = spark.sql("SELECT forcePhot.lcId, band, mag, mjd FROM  forcePhot "
                          "JOIN forcePhotSummary ON (forcePhotSummary.lcId = forcePhot.lcId) "
                          "WHERE forcePhotSummary.min_mag < 12")

In [21]:
# Total number of forcePhot rows selected by the join above.
targetObjects.count()

12812

In [32]:
# Number of distinct lcId values among the selected rows.
targetObjects.select("lcId").distinct().count()

18

In [26]:
# Drop to the RDD API, group the selected rows by lcId, and collect every
# (lcId, rows) group back to the driver as a Python list.
x = targetObjects.rdd.groupBy(itemgetter('lcId')).collect()

In [30]:
# Number of collected groups, i.e. one per lcId key.
len(x)

18

In [44]:
def detailed_LC_analysis(input_rdd):
    """Summarize one group from a grouped ``(lcId, rows)`` pair.

    Despite its name, ``input_rdd`` is a single element of a grouped RDD
    (the kind of pair ``rdd.groupBy(key)`` yields), not an RDD itself.

    Parameters
    ----------
    input_rdd : tuple
        ``(lcId, input_rows)`` where ``input_rows`` is any iterable of
        row-like objects supporting ``row['mag']`` and ``row['lcId']``.

    Returns
    -------
    list
        A one-element list holding the tuple
        ``(lcId, min lcId, max lcId, number of rows, NaN-ignoring mean mag)``.
        The min/max lcId act as a sanity check: both equal the key when
        the grouping is consistent.

    Raises
    ------
    ValueError
        If ``input_rows`` is empty (``min`` of an empty sequence).
    """
    lcId, input_rows = input_rdd
    # Materialize once: the rows are traversed twice below, and a one-shot
    # iterator (e.g. a generator) would be exhausted after the first pass.
    rows = list(input_rows)
    mags = [row['mag'] for row in rows]
    lcIds = [row['lcId'] for row in rows]

    return [(lcId, min(lcIds), max(lcIds), len(mags), np.nanmean(mags))]

# Group the selected rows by lcId, summarize each group on the workers with
# detailed_LC_analysis, and collect only the per-group summaries.
# Note: this rebinds `x`, which previously held the raw collected groups.
x = targetObjects.rdd.groupBy(itemgetter('lcId')).map(detailed_LC_analysis).collect()
x

[[(287285255, 287285255, 287285255, 673, 11.482300352345973)],
 [(287458311, 287458311, 287458311, 675, 10.022322097977989)],
 [(287880199, 287880199, 287880199, 674, 10.001545780055375)],
 [(709769223, 709769223, 709769223, 675, 12.088193160696683)],
 [(287687687, 287687687, 287687687, 675, 8.923237627353485)],
 [(709873671, 709873671, 709873671, 674, 11.739290821059145)],
 [(709581831, 709581831, 709581831, 673, 11.799644730545701)],
 [(709583879, 709583879, 709583879, 673, 11.131700399771296)],
 [(709912583, 709912583, 709912583, 1349, 10.232110091720132)],
 [(709773319, 709773319, 709773319, 675, 11.852150355963804)],
 [(287305735, 287305735, 287305735, 674, 10.533753061649605)],
 [(287273991, 287273991, 287273991, 673, 9.9942437617196109)],
 [(709785607, 709785607, 709785607, 675, 11.506493569791211)],
 [(287548423, 287548423, 287548423, 675, 10.029157568184372)],
 [(709683207, 709683207, 709683207, 674, 10.323791706865439)],
 [(287335431, 287335431, 287335431, 675, 10.55186016967