# Lab 2: Create Direct Lake custom semantic model over billion row tables

- Use Shortcuts
- Run DAX Query Trace
- See Fallback in action

## 1. Install Semantic Link Labs Python Library

In [None]:
%pip install -q --disable-pip-version-check semantic-link-labs

## 2. Setup Parameters

In [None]:
import sempy_labs as labs
from sempy import fabric
import sempy
import pandas
import json
import time

# Lakehouse used by this lab; the semantic model name is derived from it.
LakehouseName = "BigData"
SemanticModelName = LakehouseName + "_model"

# Lakehouse and workspace that hold the source tables the shortcuts point at.
Shortcut_LakehouseName = "BigDemoDB"
Shortcut_WorkspaceName = "DL Labs - Source Data"

## 3. Create Lakehouse

In [None]:
# Reuse the lakehouse when one with this name is already listed, otherwise create it.
lakehouses = labs.list_lakehouses()["Lakehouse Name"]
lakehouse_exists = LakehouseName in lakehouses.values
lakehouseId = (
    notebookutils.lakehouse.getWithProperties(LakehouseName)["id"]
    if lakehouse_exists
    else fabric.create_lakehouse(LakehouseName)
)

# Look up the workspace that owns the lakehouse and report the resolved identifiers.
lakehouse_properties = notebookutils.lakehouse.getWithProperties(LakehouseName)
workspaceId = lakehouse_properties["workspaceId"]
workspaceName = sempy.fabric.resolve_workspace_name(workspaceId)
print(f"WorkspaceId = {workspaceId}, LakehouseID = {lakehouseId}, Workspace Name = {workspaceName}")

## 4. Remove any unwanted semantic models

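This cell is disabled by default: the `if False:` guard on its first line skips the deletion loop. To actually remove the semantic models, change the first line to the following.

```
if True:
```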

In [None]:
# Disabled by default; switch the guard to True to delete every semantic model
# in the workspace whose display name differs from LakehouseName.
if False:
    for _, item in sempy.fabric.list_items().iterrows():
        if item["Type"] != "SemanticModel":
            continue
        if item["Display Name"] == LakehouseName:
            continue
        sempy.fabric.delete_item(item_id=item["Id"], workspace=workspaceId)
        print(f"Deleted semantic model {item['Display Name']}")

## 5. Remove all tables from BigData Lakehouse

In [None]:
# Disabled by default; switch the guard to True to recursively delete every
# entry under the lakehouse's Tables folder.
if False:
    tables_root = f"abfss://{workspaceId}@onelake.dfs.fabric.microsoft.com/{lakehouseId}/Tables"
    for table_entry in notebookutils.fs.ls(tables_root):
        print(f"Deleting...{table_entry.path}")
        notebookutils.fs.rm(table_entry.path, recurse=True)

## 6. Create Lakehouse Shortcuts

In [None]:
#1. Drop every shortcut currently listed for the lakehouse
existing_shortcuts = labs.lakehouse.list_shortcuts(lakehouse=LakehouseName)
for _, shortcut in existing_shortcuts.iterrows():
    labs.lakehouse.delete_shortcut(shortcut_name=shortcut["Shortcut Name"], lakehouse=LakehouseName)
    print(f"Deleted shortcut {shortcut['Shortcut Name']}")

#2. Create one OneLake shortcut per source table, in this order
shortcut_tables = (
    "fact_myevents_1bln",
    "fact_myevents_1bln_no_vorder",
    "fact_myevents_1bln_partitioned_datekey",
    "fact_myevents_2bln",
    "dim_Date",
    "dim_Geography",
)
for shortcut_table in shortcut_tables:
    labs.lakehouse.create_shortcut_onelake(
        table_name=shortcut_table,
        source_lakehouse=Shortcut_LakehouseName,
        source_workspace=Shortcut_WorkspaceName,
        destination_lakehouse=LakehouseName,
    )

print('Adding shortcuts complete.')

## 7. Trigger background job to sync Lakehouse tables

This block makes a REST API call to trigger a `MetadataRefreshExternalCommand` on the lakehouse's SQL endpoint.

The code will loop every second checking the status. 

This block can be re-run.

In [None]:
##https://medium.com/@sqltidy/delays-in-the-automatically-generated-schema-in-the-sql-analytics-endpoint-of-the-lakehouse-b01c7633035d

def triggerMetadataRefresh(poll_interval=1, timeout=None):
    """Trigger a metadata refresh on the lakehouse SQL endpoint and wait for it.

    Reads the SQL endpoint id of the lakehouse identified by the notebook-level
    ``workspaceId`` / ``lakehouseId``, posts a ``MetadataRefreshExternalCommand``
    to it, then polls the resulting batch until it reports ``success``. Each
    polled state is printed.

    Args:
        poll_interval: Seconds to wait between status polls (default 1).
        timeout: Optional maximum number of seconds to wait. ``None`` (the
            default) waits indefinitely.

    Raises:
        RuntimeError: If the batch reports the ``failure`` state.
        TimeoutError: If ``timeout`` is set and elapses before success.
    """
    client = fabric.FabricRestClient()

    # The SQL endpoint id is part of the lakehouse properties.
    lakehouse = client.get(f"/v1/workspaces/{workspaceId}/lakehouses/{lakehouseId}").json()
    sqlendpoint = lakehouse['properties']['sqlEndpointProperties']['id']

    # Trigger the sync.
    endpoint_uri = f"/v1.0/myorg/lhdatamarts/{sqlendpoint}"
    payload = {"commands": [{"$type": "MetadataRefreshExternalCommand"}]}
    batchId = client.post(endpoint_uri, json=payload).json()['batchId']

    # Monitor progress. The status URI never changes, so build it once.
    statusuri = f"{endpoint_uri}/batches/{batchId}"
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        progressState = client.get(statusuri).json()['progressState']
        print(progressState)
        if progressState == "success":
            break
        # NOTE(review): "failure" is the terminal error state described in the
        # article linked above this function; without this check a failed
        # batch would be polled forever.
        if progressState == "failure":
            raise RuntimeError(f"Metadata refresh batch {batchId} failed")
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(f"Metadata refresh batch {batchId} did not finish within {timeout} seconds")
        time.sleep(poll_interval)

    print('done')

triggerMetadataRefresh()

## 8. Create Custom Semantic Model from Lakehouse

In [None]:
#1. Generate list of ALL table names from lakehouse to add to Semantic Model
lakehouseTables:list = labs.lakehouse.get_lakehouse_tables(lakehouse=LakehouseName)["Table Name"]

#2 Create the semantic model (check if exists first)
if sempy.fabric.list_items().query(f"`Display Name`=='{LakehouseName}_model' & Type=='SemanticModel'  ").shape[0] ==0:
    labs.directlake.generate_direct_lake_semantic_model(dataset=f"{LakehouseName}_model",lakehouse_tables=lakehouseTables,workspace=workspaceName,lakehouse=lakehouseId,refresh=False,overwrite=True)

## 9. Add model relationships

In [None]:
with labs.tom.connect_semantic_model(dataset=SemanticModelName, readonly=False) as tom:
    #1. Remove any existing relationships.
    #   Iterate over a snapshot: removing entries from the live collection while
    #   looping over it changes the collection mid-iteration.
    for r in list(tom.model.Relationships):
        tom.model.Relationships.Remove(r)

    #2. Create the correct relationships: each fact table joins many-to-one to
    #   dim_Date on DateKey and to dim_Geography on GeographyID.
    fact_tables = (
        "fact_myevents_1bln",
        "fact_myevents_2bln",
        "fact_myevents_1bln_partitioned_datekey",
    )
    dimension_keys = (
        ("DateKey", "dim_Date"),
        ("GeographyID", "dim_Geography"),
    )
    for fact_table in fact_tables:
        for key_column, dimension_table in dimension_keys:
            tom.add_relationship(
                from_table=fact_table,
                from_column=key_column,
                to_table=dimension_table,
                to_column=key_column,
                from_cardinality="many",
                to_cardinality="one",
            )


## 10. Add model measures

In [None]:
with labs.tom.connect_semantic_model(dataset=SemanticModelName, readonly=False) as tom:
    #1. Remove any existing measures.
    #   Iterate over snapshots so removing a measure does not modify the
    #   collection being looped over, and read the name before removal.
    for t in list(tom.model.Tables):
        for m in list(t.Measures):
            removed_name = m.Name
            tom.remove_object(m)
            print(removed_name)

    #2. Add one sum measure per fact table.
    tom.add_measure(table_name="fact_myevents_2bln", measure_name="Sum of Sales (2bln)", expression="SUM(fact_myevents_2bln[Quantity_ThisYear])")
    tom.add_measure(table_name="fact_myevents_1bln", measure_name="Sum of Sales (1bln)", expression="SUM(fact_myevents_1bln[Quantity_ThisYear])")
    

## 11. Mark dim_Date as Date Table

In [None]:
# Flag dim_Date as the model's date table, keyed on its DateKey column.
with labs.tom.connect_semantic_model(
    dataset=SemanticModelName,
    readonly=False,
) as tom:
    tom.mark_as_date_table(
        table_name="dim_Date",
        column_name="DateKey",
    )

## 12. Set Sort by Cols

In [None]:
# Open a writable TOM wrapper; it stays bound to `tom` after this cell.
tom = labs.tom.TOMWrapper(dataset=SemanticModelName, workspace=workspaceName, readonly=False)

# Sort the name columns of dim_Date by their numeric counterparts, then save.
for display_column, order_column in (("MonthName", "Month"), ("WeekDayName", "Weekday")):
    tom.set_sort_by_column(table_name="dim_Date", column_name=display_column, sort_by_column=order_column)
tom.model.SaveChanges()

#Show BIM data for dim_Date table
# The position of the table in tom.model.Tables is used as its index in the BIM table list.
for i, t in enumerate(tom.model.Tables):
    if t.Name != "dim_Date":
        continue
    bim = json.dumps(tom.get_bim()["model"]["tables"][i], indent=4)
    print(bim)

## 13. Hide Fact Table columns

In [None]:
# Hide every column of the three fact tables and print each table's BIM definition.
fact_table_names = {
    "fact_myevents_1bln",
    "fact_myevents_2bln",
    "fact_myevents_1bln_partitioned_datekey",
}
# The position of a table in tom.model.Tables is used as its index in the BIM table list.
for i, t in enumerate(tom.model.Tables):
    if t.Name in fact_table_names:
        for c in t.Columns:
            c.IsHidden = True

        bim = json.dumps(tom.get_bim()["model"]["tables"][i], indent=4)
        print(bim)

# Persist the visibility changes; without this they exist only on the in-memory model.
tom.model.SaveChanges()

## 14. Reframe model to update changes

In [None]:
# Refresh (reframe) the semantic model, retrying after a metadata sync when it fails.
max_reframe_attempts = 10  # bound the retries so a persistent error cannot loop forever
reframeOK: bool = False
reframe_attempt = 0
while not reframeOK:
    reframe_attempt += 1
    try:
        result: pandas.DataFrame = labs.refresh_semantic_model(dataset=SemanticModelName)
        reframeOK = True
    except Exception as reframe_error:
        # `except Exception` (not a bare except) so KeyboardInterrupt still stops the cell.
        if reframe_attempt >= max_reframe_attempts:
            raise
        print('Error with reframe... trying again.')
        print(f"  {type(reframe_error).__name__}: {reframe_error}")
        triggerMetadataRefresh()
        # Only `time` is imported in this notebook, so a bare sleep() would raise NameError.
        time.sleep(3)

print('Custom Semantic Model reframe OK')


## 15. Create function to run DAX query with a server timings trace

In [None]:
import warnings
from Microsoft.AnalysisServices.Tabular import TraceEventArgs
from typing import Dict, List, Optional, Callable

def runDMV():
    """Run the DISCOVER_STORAGE_TABLE_COLUMNS DMV against the model and display it.

    The query returns table, column, data type and the dictionary size,
    pageable, resident, temperature and last-accessed values, ordered by
    dictionary temperature descending. It runs against the notebook-level
    ``SemanticModelName``.
    """
    dmv_query = """
        
        SELECT 
            MEASURE_GROUP_NAME AS [TABLE],
            ATTRIBUTE_NAME AS [COLUMN],
            DATATYPE ,
            DICTIONARY_SIZE 		    AS SIZE ,
            DICTIONARY_ISPAGEABLE 		AS PAGEABLE ,
            DICTIONARY_ISRESIDENT		AS RESIDENT ,
            DICTIONARY_TEMPERATURE		AS TEMPERATURE,
            DICTIONARY_LAST_ACCESSED	AS LASTACCESSED 
        FROM $SYSTEM.DISCOVER_STORAGE_TABLE_COLUMNS 
        ORDER BY 
            [DICTIONARY_TEMPERATURE] DESC
        
        """
    dmv_result = sempy.fabric.evaluate_dax(dataset=SemanticModelName, dax_string=dmv_query)
    display(dmv_result)

def filter_func(e):
    """Trace filter predicate: keep every event except VertiPaqScanInternal.

    Returns False when the event's subclass name is "VertiPaqScanInternal",
    True otherwise.
    """
    is_internal_scan = e.EventSubclass.ToString() == "VertiPaqScanInternal"
    return not is_internal_scan

# define events to trace and their corresponding columns
def runQueryWithTrace(expr: str, workspaceName: str, SemanticModelName: str, Result: Optional[bool] = True, Trace: Optional[bool] = True, DMV: Optional[bool] = True):
    """Run a DAX query under a server trace and display the outcome.

    Args:
        expr: DAX query text to evaluate.
        workspaceName: Workspace used for both the trace connection and the query.
        SemanticModelName: Semantic model (dataset) to trace and query.
        Result: When truthy, display the query result.
        Trace: When truthy, display the collected trace logs.
        DMV: When truthy, call ``runDMV()`` afterwards. ``runDMV`` takes no
            arguments, so it is not driven by this function's parameters.
    """
    # Start from the default query trace schema, add ExecutionMetrics and drop
    # three event classes. pop(..., None) tolerates a schema that does not
    # contain one of them, where `del` would raise KeyError.
    event_schema = fabric.Trace.get_default_query_trace_schema()
    event_schema.update({"ExecutionMetrics": ["EventClass", "TextData"]})
    for excluded_event in ("VertiPaqSEQueryBegin", "VertiPaqSEQueryCacheMatch", "DirectQueryBegin"):
        event_schema.pop(excluded_event, None)

    # NOTE: this installs a process-wide "ignore" filter, so warnings stay
    # suppressed after the function returns.
    warnings.filterwarnings("ignore")

    with fabric.create_trace_connection(SemanticModelName, workspaceName) as trace_connection:
        # create trace on server with specified events
        with trace_connection.create_trace(
            event_schema=event_schema,
            name="Simple Query Trace",
            filter_predicate=filter_func,
            stop_event="QueryEnd"
            ) as trace:

            trace.start()

            # Pass the workspace explicitly so the query runs in the same
            # workspace as the trace connection.
            df = sempy.fabric.evaluate_dax(
                dataset=SemanticModelName,
                dax_string=expr,
                workspace=workspaceName)

            if Result:
                displayHTML(f"<H2>####### DAX QUERY RESULT #######</H2>")
                display(df)

            # Wait 5 seconds for trace data to arrive
            time.sleep(5)

            # stop Trace and collect logs
            final_trace_logs = trace.stop()

    if Trace:
        displayHTML(f"<H2>####### SERVER TIMINGS #######</H2>")
        display(final_trace_logs)

    if DMV:
        displayHTML(f"<H2>####### SHOW DMV RESULTS #######</H2>")
        runDMV()



## 16. DAX Queries

In [None]:
# Evaluate the tabletraits() DAX function against the model and show the result.
tabletraits_query = """
    
    evaluate tabletraits()
    
    """
df = sempy.fabric.evaluate_dax(dataset=SemanticModelName, dax_string=tabletraits_query)
display(df)

In [None]:
# Fetch the Direct Lake guardrails table and show it.
guardrails = labs.directlake.get_direct_lake_guardrails()
display(guardrails)

## 17. Run DMV to check current state

In [None]:
# Display the storage-table-columns DMV output for the model via the runDMV() helper.
runDMV()

## 18. Run DAX Queries Billion Row Tables

##### 18.1 Run DAX Query over 1 billion row table

In [None]:
# Clear the model's cache, then run the traced monthly query over the 1 billion row table.
labs.clear_cache(SemanticModelName)

query_1bln = """
    
    EVALUATE
        SUMMARIZECOLUMNS(
               
                dim_Date[FirstDateofMonth] ,
                "Count of Transactions" , COUNTROWS(fact_myevents_1bln) ,
                "Sum of Sales" , [Sum of Sales (1bln)] 
        )
        ORDER BY [FirstDateofMonth]

"""
runQueryWithTrace(query_1bln, workspaceName, SemanticModelName)



##### 18.2 Run DAX Query over 2 billion row table

In [None]:
# Clear the model's cache, then run the traced monthly query over the 2 billion row table.
labs.clear_cache(SemanticModelName)

query_2bln = """
    
    EVALUATE
        SUMMARIZECOLUMNS(
                dim_Date[FirstDateofMonth] ,
                "Count of Transactions" , COUNTROWS(fact_myevents_2bln) ,
                "Sum of Sales" , [Sum of Sales (2bln)]
        )
        ORDER BY [FirstDateofMonth]

"""
runQueryWithTrace(query_2bln, workspaceName, SemanticModelName, DMV=False)

##### 18.3 Run DAX Query over 1 & 2 billion row table

In [None]:
# Clear the model's cache, then run one traced query that uses measures from both fact tables.
labs.clear_cache(SemanticModelName)

query_1bln_and_2bln = """

    EVALUATE
        SUMMARIZECOLUMNS(
                dim_Date[FirstDateofMonth] ,
                "Count of Transactions" , COUNTROWS(fact_myevents_1bln) ,
                "Sum of Sales (1bln)" , [Sum of Sales (1bln)] ,
                "Sum of Sales (2bln)" , [Sum of Sales (2bln)]
        )
        ORDER BY [FirstDateofMonth]

"""
runQueryWithTrace(query_1bln_and_2bln, workspaceName, SemanticModelName, DMV=False)