## NOTE: THIS NOTEBOOK RUNS ONLY ON DATABRICKS
To run it on Spark without Databricks, the Spark session, Spark context, etc. (`spark`, `sc`, `dbutils`) must be set up first.

### 1. First we create a connection to an S3 bucket within the same region

In [None]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# Switch on the Databricks IO cache setting for this Spark session.
spark.conf.set("spark.databricks.io.cache.enabled", "true")

# Region that hosts the bucket; used below to build the S3 endpoint.
aws_region = "us-west-1"

# AWS credentials are pulled from the "s3-access" secret scope rather than
# being written into the notebook.
access_key = dbutils.secrets.get(scope="s3-access", key="aws-access-key")
secret_key = dbutils.secrets.get(scope="s3-access", key="aws-secret-key")

# Apply the s3a credentials and the regional endpoint to the Hadoop
# configuration of the running Spark context.
hadoop_conf = sc._jsc.hadoopConfiguration()
for option, value in (
    ("fs.s3a.access.key", access_key),
    ("fs.s3a.secret.key", secret_key),
    ("fs.s3a.endpoint", "s3.{}.amazonaws.com".format(aws_region)),
):
    hadoop_conf.set(option, value)

# Root URI (s3a scheme) under which the input data is read.
s3_bucket = "s3a://ejkquant-uswest1/"

### 2. Then we bucket the data by Datetime into 10 buckets for efficient joins in later stages
The bucketed data is stored as tables in the Databricks File System (DBFS), which utilizes SSD storage; see https://docs.databricks.com/dbfs/index.html

In [None]:
# Folder inside the bucket that holds the feature Parquet files.
features_dir = s3_bucket + "features_data/"

# Load every feature set first; no table is written until all six reads
# have been set up.
MSFT = spark.read.load(features_dir + "MSFT.parquet", format="parquet")
SP500 = spark.read.load(features_dir + "SP500.parquet", format="parquet")
BOND = spark.read.load(features_dir + "30Y_BOND.parquet", format="parquet")
NOTE = spark.read.load(features_dir + "10Y_NOTE.parquet", format="parquet")
GOLD = spark.read.load(features_dir + "GOLD.parquet", format="parquet")
econ_features = spark.read.load(
    features_dir + "econ_features.parquet", format="parquet"
)

# Save each DataFrame as a Parquet table split into 10 buckets on the
# "Datetime" column, replacing any table that already has that name.
for table_name, frame in (
    ("MSFT", MSFT),
    ("SP500", SP500),
    ("BOND", BOND),
    ("NOTE", NOTE),
    ("GOLD", GOLD),
    ("econ", econ_features),
):
    writer = frame.write.format("parquet").mode("overwrite")
    writer.bucketBy(10, "Datetime").saveAsTable(table_name)