In [None]:
# Sample Spark job (a Monte Carlo estimate of pi) to make sure the kernel is running correctly.

import random

# Number of random points to draw. More samples tighten the estimate but
# lengthen the job.
num_samples = 100000000


def inside(p):
    """Return True when a freshly drawn random point lies inside the unit circle.

    ``p`` is the RDD element passed in by ``filter``; it is ignored, because
    every call draws its own (x, y) pair with ``random.random()``.
    """
    x, y = random.random(), random.random()
    return x*x + y*y < 1


# `sc` is not defined in this notebook; presumably the Spark kernel injects it.
# Count how many of the num_samples draws land inside the quarter circle.
count = sc.parallelize(range(0, num_samples)).filter(inside).count()

# The float literal forces true division. On a Python 2 kernel the original
# `4 * count / num_samples` is integer division and would always print 3.
pi = 4.0 * count / num_samples
print(pi)

In [None]:
# Sample of how to run a Spark job using the structure that the Transformation team uses.

from enterprise_sales_process.process_box_edw_sales import ProcessBoxEDWSales

from spark_process_common.data_loaders import CompositeDataLoader, CsvDataLoader, OrcDataLoader
from spark_process_common.spark_context import JupyterSparkContext

# ORC fixtures: source name -> path of the .orc file.
orc_source_paths = {
    "objacctmapping_source": "../enterprise_sales_process/tests/fixtures/object-account-mapping.orc",
    "boxplants_source": "../enterprise_sales_process/tests/fixtures/box-plants.orc",
    "boxstyles_source": "../enterprise_sales_process/tests/fixtures/box-styles.orc",
    "statecodes_source": "../enterprise_sales_process/tests/fixtures/state-codes.orc",
    "bucurrencyconv_source": "../enterprise_sales_process/tests/fixtures/bu-currency-conversions.orc",
    "currencyconv_source": "../enterprise_sales_process/tests/fixtures/currency-conversion-rates.orc",
}

# CSV fixtures: source name -> options dict. Only the EDW sales file sets an
# explicit "delim"; the others presumably rely on the loader's default
# (TODO confirm in CsvDataLoader).
csv_source_paths = {
    "edwsales_source": {
        "path": "../enterprise_sales_process/tests/fixtures/edw_sales.csv",
        "delim": "|",
    },
    "dasamount_source": {
        "path": "../enterprise_sales_process/tests/fixtures/das_sales.csv",
    },
    "dastons_source": {
        "path": "../enterprise_sales_process/tests/fixtures/das_sales_tons.csv",
    },
}

# One loader per file format, combined into a single composite loader.
csv_data_loader = CsvDataLoader(csv_source_paths)
orc_data_loader = OrcDataLoader(orc_source_paths)
comp_data_loader = CompositeDataLoader([csv_data_loader, orc_data_loader])

# Wrap the `sc` object for use by the process class. `sc` is not defined in
# this notebook; presumably the kernel provides it.
spark_context = JupyterSparkContext(sc)

# Run the transformation and display the resulting DataFrame.
boxEDWSales = ProcessBoxEDWSales(comp_data_loader, spark_context)
df = boxEDWSales.transform()
df.show()

In [None]:
# `os` is imported here because no other cell in this notebook imports it;
# without this the cell raises NameError on a fresh kernel.
import os

# Print the two environment variables this cell inspects. `.get()` with a
# placeholder keeps one unset variable from raising KeyError and hiding the other.
print(os.environ.get("PYTHONPATH", "<not set>"))
print(os.environ.get("SPARK_HOME", "<not set>"))

In [None]:
# This is needed to read files directly from S3. It only needs to be executed once per kernel session.
# The keys can be found in the AWS Resources wiki under the Enterprise Sales tab in teams.
# Do NOT paste the keys into this notebook: export them as AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY before starting the kernel, or type them at the hidden prompt.

import os
from getpass import getpass


def _set_s3a_credentials(spark_context):
    """Set the S3A access and secret key on spark_context's Hadoop configuration.

    Each key is read from the environment (AWS_ACCESS_KEY_ID /
    AWS_SECRET_ACCESS_KEY) when set and non-empty; otherwise it is requested
    with a hidden prompt. The values stay local to this function, so they are
    not left behind as notebook variables or saved in the notebook file.
    """
    access_key = os.environ.get("AWS_ACCESS_KEY_ID") or getpass("S3 access key: ")
    secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY") or getpass("S3 secret key: ")

    hadoop_conf = spark_context._jsc.hadoopConfiguration()
    hadoop_conf.set("fs.s3a.access.key", access_key)
    hadoop_conf.set("fs.s3a.secret.key", secret_key)


# `sc` is not defined in this notebook; presumably the Spark kernel injects it.
_set_s3a_credentials(sc)

In [None]:
# This is a sample of how to load a CSV file from S3.

# Build the CSV reader as one parenthesised chain, with the "header" and
# "inferSchema" options enabled, then load the file from the S3 bucket.
df = (
    sqlContext.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("s3a://wrktdtransformationrawproddtl001/enterprise-sales/edw/ParentCustomer/EDW_parentCustomer.csv")
)

# Display up to 20 rows of the loaded DataFrame.
df.show(20)