In [1]:
# --- NB_03: Create DimCompany (Metadata Dimension) ---
import yfinance as yf
from pyspark.sql.types import StructType, StructField, StringType

# 1. Define your universe of tickers
tickers = ["MSFT", "AAPL", "GOOGL", "AMZN"]
print(f"Fetching metadata for {len(tickers)} companies...")


def resolve_company_name(info, ticker):
    """Pick a display name for ``ticker`` out of a metadata mapping.

    Args:
        info: Mapping expected to carry a ``'longName'`` entry (the value of
            ``yf.Ticker(...).info``). May be ``None`` or empty.
        ticker: Symbol to fall back to when no usable name is available.

    Returns:
        The ``'longName'`` value when it is a non-blank string, otherwise
        ``ticker``.
    """
    # A plain ``info.get('longName', ticker)`` only falls back when the key is
    # absent; a key that is present but None/"" would leak into the dimension.
    # NOTE(review): assumes the provider can return such placeholder values —
    # the guard is harmless if it never does.
    long_name = info.get("longName") if info else None
    if isinstance(long_name, str) and long_name.strip():
        return long_name
    return ticker


# 2. Dynamic Metadata Extraction (The "Shift Left" Magic)
company_data = []
for t in tickers:
    try:
        # Use yf.Ticker to get rich metadata.
        # Optional: You could also grab 'sector', 'industry', etc. here!
        long_name = resolve_company_name(yf.Ticker(t).info, t)
    except Exception as e:
        # Deliberate best-effort: one failing lookup must not abort the run.
        print(f"Error fetching {t}: {e}")
        long_name = t  # Fallback
    else:
        print(f" - {t}: {long_name}")
    # Append outside the try so every ticker yields exactly one row.
    company_data.append((t, long_name))

# 3. Define Schema & Create Spark DataFrame
# The schema is spelled out column by column rather than inferred from the
# tuples: `ticker` is declared non-nullable, `company_name` nullable.
schema = (
    StructType()
    .add("ticker", StringType(), False)
    .add("company_name", StringType(), True)
)
dim_company_df = spark.createDataFrame(company_data, schema=schema)

# 4. Write to Lakehouse as a managed Delta Table
# Mode 'overwrite' replaces the existing table contents on each run, and
# 'overwriteSchema' lets the stored schema be replaced along with the data.
delta_writer = dim_company_df.write.format("delta")
delta_writer = delta_writer.mode("overwrite")
delta_writer = delta_writer.option("overwriteSchema", "true")
delta_writer.saveAsTable("DimCompany")

print("\nSUCCESS! 'DimCompany' table created in Lakehouse.")
display(dim_company_df)

StatementMeta(, 12a9513c-3f60-42ea-9f60-31a1be35e299, 5, Finished, Available, Finished)

Fetching metadata for 4 companies...
 - MSFT: Microsoft Corporation
 - AAPL: Apple Inc.
 - GOOGL: Alphabet Inc.
 - AMZN: Amazon.com, Inc.

SUCCESS! 'DimCompany' table created in Lakehouse.


SynapseWidget(Synapse.DataFrame, b4ed4e64-1e33-4864-967a-b2357a4f5290)