# Bitcoin Data Ingestion Using AWS Glue

In [None]:
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.functions import explode, col, to_date
from db_utils import init_filebrowser_db
# Runs as an import-time side effect, before any Glue job arguments are read.
# NOTE(review): the purpose of this call is not visible from this file and
# nothing below references it — confirm a Glue ingestion job actually needs it.
init_filebrowser_db()

# === JOB BOOTSTRAP ===
# Resolve the arguments Glue passes on the command line; JOB_NAME is the only
# one this script requires.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Reuse an already-running SparkContext when there is one (for example when
# this cell is re-run in a notebook session). Constructing SparkContext() a
# second time in the same process raises ValueError instead.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Register the run with Glue; paired with job.commit() at the end of the script.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# === CONFIGURATION ===
# Destination: processed output is written under this bucket/prefix.
BUCKET = "data606-bitcoinbucket"
OUTPUT_PATH = f"s3://{BUCKET}/processed/bitcoin_prices/"

# Source: Glue Data Catalog database and table the job reads from.
DATABASE = "bitcoin_data"
TABLE = "raw"

# === READ THE SOURCE TABLE ===
# Gather the catalog lookup arguments first so the read itself is a single,
# short call.
catalog_source = {
    "database": DATABASE,
    "table_name": TABLE,
    "transformation_ctx": "datasource0",
}
dyf = glueContext.create_dynamic_frame.from_catalog(**catalog_source)

# Switch from the Glue DynamicFrame to a plain Spark DataFrame for the
# column-level transformations that follow.
df = dyf.toDF()

# === FLATTEN THE PRICE ENTRIES ===
# The original note on this step gives the `prices` column type as
# array<struct<double:[], long:[]>>. Each exploded entry therefore carries a
# `long` array and a `double` array, and only the first element of each is used.
# NOTE(review): the long value is divided by 1000 before the timestamp cast,
# so it is presumably epoch milliseconds — confirm against the source feed.
epoch_value = col("price_entry.long").getItem(0)
price_value = col("price_entry.double").getItem(0)

# One output row per element of `prices`.
df_exploded = df.select(explode(col("prices")).alias("price_entry"))

df_flat = df_exploded.select(
    (epoch_value / 1000).cast("timestamp").alias("timestamp"),
    price_value.alias("price_usd"),
)

# Append the calendar date of each timestamp; it is the partition key used
# when the result is written.
df_final = df_flat.select("*", to_date(col("timestamp")).alias("date"))

# === WRITE RESULT ===
# Replace only the date partitions present in this run's output. Under Spark's
# default "static" mode, mode("overwrite") first deletes everything beneath
# OUTPUT_PATH, so a run that reads only part of the source (for example an
# incremental run with job bookmarks enabled) would erase the partitions
# written by earlier runs.
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

# Partitioned Parquet output: one sub-directory per distinct `date` value.
df_final.write.mode("overwrite").partitionBy("date").parquet(OUTPUT_PATH)

# Commit job
job.commit()