# The purpose of this notebook is to create an initial table for storing business plans from an Excel file.

Step 1 - Importing Python packages needed

In [1]:
import pandas as pd
import numpy as np
import uuid
from pyspark.sql.types import *

StatementMeta(, 7835ef7c-3b5a-4f78-9271-6ba3a1114330, 3, Finished, Available, Finished)

Step 2 - Importing the Excel file into a pandas DataFrame.

In [2]:
# Path to the Excel workbook that holds the targets
excel_path = "./builtin/fake_budget.xlsx"
# Name of the worksheet that holds the targets
sheet_nam = "Sheet1"
# Read the worksheet into a pandas DataFrame
df = pd.read_excel(excel_path, sheet_name=sheet_nam)

# Treat empty / whitespace-only cells as missing values. One frame-wide replace
# covers every column, so no explicit loop over df.columns is needed.
df = df.replace(r'^\s*$', np.nan, regex=True)
# Drop rows in which every value is missing
df = df.dropna(how="all")
# Unpivot: keep the three id columns and turn every remaining column into
# a ("Time Period", "value") pair, one row per original cell
df = pd.melt(df, id_vars=["Plan Version", "Model", "Metric"], value_vars=None, var_name="Time Period", value_name="value")
# Normalise the column names to snake_case: trim, join words with "_", lower-case
# (e.g. "Plan Version" -> "plan_version")
df.columns = ['_'.join(col.strip().split()).lower() for col in df.columns]
# Store the time period (taken from the worksheet's column headers) as text
df['time_period'] = df['time_period'].astype(str)
display(df)

StatementMeta(, 7835ef7c-3b5a-4f78-9271-6ba3a1114330, 4, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a5c90053-65bf-4344-921c-9257fa151452)

Step 3 - Adding the columns that support a type 2 table: Record Start, Record End, Transaction Type, Active Record Flag, and Unique ID. One piece of code in this cell deserves a closer look:

uuid.uuid5 - uuid5 takes a namespace UUID and a name string as input. It then combines them and uses the SHA-1 hashing algorithm to produce a 128-bit UUID. Crucially, the same namespace and name will always produce the same UUID. This is the key benefit of version 5 UUIDs.

Why use uuid5?

- Reproducibility: If you have a name and a namespace, you can reliably generate the same UUID every time. This is useful for creating consistent identifiers for resources, especially when you need to refer to the same resource across different systems or databases.   
- Uniqueness within a namespace: Unlike version 4 UUIDs (randomly generated), version 5 UUIDs are derived from their inputs, so they are unique per distinct name within the context of a given namespace (barring SHA-1 collisions, which are extremely unlikely). This is often sufficient when you have a well-defined namespace. Note that version 4 and version 5 UUIDs are both 128 bits, so there is no difference in storage size between them.
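- Avoiding collisions: Because the namespace is hashed together with the name, the same name used in two different namespaces produces two different UUIDs. This reduces the risk of accidental collisions between unrelated sets of identifiers compared to just hashing the name directly.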



In [3]:
# Bookkeeping columns for the type 2 table; every row of this initial load gets the same values.
df['record_valid_start'] = pd.to_datetime('1900-01-01')
# pd.Timestamp.max is the latest timestamp pandas can represent
# (presumably used here as an "open-ended" end marker - confirm with downstream logic)
df['record_valid_end'] = pd.Timestamp.max
df['transaction_typ'] = 'I'
df['active_record_flag'] = True

# Deterministic id: uuid5 of the four key columns joined with "-", so the same
# plan_version / model / metric / time_period always yields the same id.
namespace = uuid.UUID('1f925fee-159c-44e3-b9d0-aadbf58cf7ff')
key_columns = ['plan_version', 'model', 'metric', 'time_period']
# Built with a plain comprehension over the zipped key columns instead of a
# row-wise df.apply(axis=1): apply creates a Series per row and returns a
# DataFrame (not a Series) for an empty frame, which breaks the assignment.
df['id'] = [
    str(uuid.uuid5(namespace, '-'.join(str(part) for part in key_values)))
    for key_values in zip(*(df[name] for name in key_columns))
]
# Move the id column to the front of the DataFrame
df = df[['id'] + [col for col in df.columns if col != 'id']]
display(df)

StatementMeta(, 7835ef7c-3b5a-4f78-9271-6ba3a1114330, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, e0464959-d6ae-40a3-bd54-17c142b0747f)

Step 4 - creating Spark Dataframe

In [5]:
# Explicit Spark schema so that Spark does not have to infer (and possibly mis-infer) the column types
schema = StructType([
    StructField("id", StringType(), False),
    StructField("plan_version", StringType(), False),
    StructField("model", StringType(), False),
    StructField("metric", StringType(), False),
    StructField("time_period", StringType(), False),
    StructField("value", DoubleType(), True),
    StructField("record_valid_start", DateType(), False),
    StructField("record_valid_end", DateType(), False),
    StructField("transaction_typ", StringType(), False),
    StructField("active_record_flg", BooleanType(), False)
])
# An explicit schema is paired with the pandas columns by position, so the pandas
# column names are not checked. Align the pandas frame with the schema by name first:
# a missing or reordered column then fails loudly instead of landing in the wrong field.
# NOTE(review): the pandas column is "active_record_flag" but the table column is
# "active_record_flg". The table name is kept here so the written table is unchanged;
# decide which spelling is intended.
pandas_to_table_names = {"active_record_flag": "active_record_flg"}
df_for_spark = df.rename(columns=pandas_to_table_names)[schema.fieldNames()]
# Convert the pandas DataFrame to a Spark DataFrame
spark_df = spark.createDataFrame(df_for_spark, schema=schema)

StatementMeta(, 6db8d669-2561-4ce7-97e3-e6f21a97cbef, 6, Finished, Available, Finished)

Step 5 - Write the spark data frame to table named targets

In [6]:
# Save the Spark DataFrame to the lakehouse as the Delta table "targets" (overwrite mode)
(
    spark_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("targets")
)

StatementMeta(, 6db8d669-2561-4ce7-97e3-e6f21a97cbef, 7, Finished, Available, Finished)