### Imports & Setup

In [8]:
%pip install yfinance

StatementMeta(, 7e5c78e7-f828-446a-961e-b1011fdae1c8, 19, Finished, Available, Finished)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.



In [9]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta
from pyspark.sql.functions import col, year, month, date_format, avg
from pyspark.sql.window import Window

# Define parameters for 5 years of history
tickers = ["MSFT", "AAPL", "GOOGL", "AMZN"]
start_date = (datetime.now() - timedelta(days=5*365)).strftime('%Y-%m-%d')
end_date = datetime.now().strftime('%Y-%m-%d')

print(f"Setup complete. Fetching data from {start_date} to {end_date}...")

StatementMeta(, 7e5c78e7-f828-446a-961e-b1011fdae1c8, 21, Finished, Available, Finished)

Setup complete. Fetching data from 2020-11-07 to 2025-11-06...


# Phase I: Ingestion (Bronze Layer)

### Step 1: Download & Restructure (Pandas)

In [10]:
# 1. Download data
# auto_adjust=True helps standardize price columns, but we still need robust renaming later
raw_pd = yf.download(tickers, start=start_date, end=end_date, progress=False, auto_adjust=True)

# 2. Robust Stacking
# Try stacking Tickers (level 1) first. Fallback to level 0 if structure differs.
try:
    stacked_pd = raw_pd.stack(level=1).reset_index()
except IndexError:
    stacked_pd = raw_pd.stack(level=0).reset_index()

print("Data downloaded and stacked. Rows:", len(stacked_pd))
display(stacked_pd.head(3)) # Quick peek at raw stacked data

StatementMeta(, 7e5c78e7-f828-446a-961e-b1011fdae1c8, 22, Finished, Available, Finished)

Data downloaded and stacked. Rows: 5016


SynapseWidget(Synapse.DataFrame, 05c4618d-60fd-4143-a154-a7f121c873ac)

### Step 2: Standardize Schema (Pandas)

In [11]:
# 1. Force rename the first two index columns to 'date' and 'ticker'
stacked_pd.rename(columns={stacked_pd.columns[0]: 'date', stacked_pd.columns[1]: 'ticker'}, inplace=True)

# 2. Robust renaming for Price and Volume attributes
# We scan current column names and map them to our standard names
current_cols = list(stacked_pd.columns)
rename_map = {}
for c in current_cols:
    c_lower = c.lower()
    if 'date' in c_lower: rename_map[c] = 'date'
    elif 'ticker' in c_lower: rename_map[c] = 'ticker'
    elif 'volume' in c_lower: rename_map[c] = 'volume'
    # Catch either 'Adj Close' OR just 'Close' depending on download settings
    elif 'adj' in c_lower and 'close' in c_lower: rename_map[c] = 'close_price'
    elif 'close' in c_lower and 'adj' not in c_lower: rename_map[c] = 'close_price'

print(f"Applying column mapping: {rename_map}")
stacked_pd.rename(columns=rename_map, inplace=True)

StatementMeta(, 7e5c78e7-f828-446a-961e-b1011fdae1c8, 23, Finished, Available, Finished)

Applying column mapping: {'date': 'date', 'ticker': 'ticker', 'Close': 'close_price', 'Volume': 'volume'}


### Step 3: Write to Bronze (Spark)

In [12]:
# 1. Convert to Spark DataFrame
spark_bronze = spark.createDataFrame(stacked_pd)

# 2. Select and Cast specific columns to lock in the schema
final_cols = [c for c in spark_bronze.columns if c in ['date', 'ticker', 'close_price', 'volume']]
spark_bronze = spark_bronze.select(*[col(c) for c in final_cols]) \
                           .withColumn("close_price", col("close_price").cast("double")) \
                           .withColumn("volume", col("volume").cast("long"))

# 3. Write to Bronze Files (forcing schema overwrite to fix previous errors)
spark_bronze.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .save("Files/stock_bronze_raw/historical")

print("Phase 1 Complete: Bronze data saved with correct schema.")
spark_bronze.printSchema()

StatementMeta(, 7e5c78e7-f828-446a-961e-b1011fdae1c8, 24, Finished, Available, Finished)

Phase 1 Complete: Bronze data saved with correct schema.
root
 |-- date: timestamp (nullable = true)
 |-- ticker: string (nullable = true)
 |-- close_price: double (nullable = true)
 |-- volume: long (nullable = true)



# Phase 2: Transformation (Silver Layer)

### Step 1: Clean & Enrich (Time Attributes & SMA)

In [13]:
# 1. Read back from Bronze (sanity check that read works)
stock_df_raw = spark.read.format("delta").load("Files/stock_bronze_raw/historical")

# 2. Clean and Add Time Attributes
stock_df_clean = stock_df_raw.na.drop(subset=['close_price'])
stock_df_silver = stock_df_clean.withColumn("year", year(col("date"))) \
                                .withColumn("month", month(col("date"))) \
                                .withColumn("day_of_week", date_format(col("date"), "E"))

# 3. Calculate 7-Day SMA using Window Function
window_spec = Window.partitionBy("ticker").orderBy("date").rowsBetween(-6, 0)
stock_df_final = stock_df_silver.withColumn("sma_7_day", avg(col("close_price")).over(window_spec))

print("Data enriched with Time Attributes and SMA.")

StatementMeta(, 7e5c78e7-f828-446a-961e-b1011fdae1c8, 25, Finished, Available, Finished)

Data enriched with Time Attributes and SMA.


### Step 2: Write to Silver Table

In [14]:
# Write to Silver managed table (forcing schema overwrite again to be safe)
stock_df_final.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("StockData_Silver")

print("Phase 2 Complete: StockData_Silver table is ready.")
print("Verifying final schema below (Must see 'volume' and 'sma_7_day'):")
spark.table("StockData_Silver").printSchema()

StatementMeta(, 7e5c78e7-f828-446a-961e-b1011fdae1c8, 26, Finished, Available, Finished)

Phase 2 Complete: StockData_Silver table is ready.
Verifying final schema below (Must see 'volume' and 'sma_7_day'):
root
 |-- date: timestamp (nullable = true)
 |-- ticker: string (nullable = true)
 |-- close_price: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- sma_7_day: double (nullable = true)

