#### Look into external tools
- Objective - Review read, write, and delete operations on Storage for external tables ONLY, in order to identify good candidates that can be migrated to Managed Tables.
  - Identify whether an external table is being leveraged by an external platform. If it is, it would not be a good candidate until the write operations subside.

External Table -> main.default.mytable
- Leveraged by Databricks
- Leveraged by External Platforms, Read Only for Good Candidate

External Table -> main.default.xyz
- Leveraged by Databricks
- Good Candidate would be read, write, delete



#### External Tables

In [0]:
-- One row per (table, user agent, table type) combination seen in the storage
-- logs, restricted to rows whose table type contains "external" (case-insensitive).
-- Grouping by every projected column de-duplicates the rows.
-- Optional extra filter to keep only Databricks traffic:
--   lower(storage_userAgentHeader) like '%databricks%'
SELECT
  info_tableName,
  storage_userAgentHeader,
  info_tableType
FROM slog.default.azure_storage_logs
WHERE lower(info_tableType) LIKE '%external%'
GROUP BY
  info_tableName,
  storage_userAgentHeader,
  info_tableType

IT_TableName,Storage_userAgentHeader,IT_TableType
main.default.my_table,"Azure Blob FS/3.3 (AzulSystems,Inc. JavaJRE 17.0.13; Linux 5.15.0-1075-azure/amd64; SunJSSE-17.0; UNKNOWN/UNKNOWN) APN/1.0 unknown",EXTERNAL
main.default.my_table,APN/1.0 Databricks/1.0 DBR/null,EXTERNAL


#### Unique Agent Headers

In [0]:
-- Number of log rows for each (user agent, storage category) pair.
-- The grouping columns are listed explicitly; they are exactly the
-- non-aggregated columns of the select list.
select
  storage_userAgentHeader,
  storage_category,
  count(*)
from slog.default.azure_storage_logs
group by
  storage_userAgentHeader,
  storage_category

storage_userAgentHeader,storage_Action,count(1)
"Azure Blob FS/3.3 (AzulSystems,Inc. JavaJRE 17.0.13; Linux 5.15.0-1075-azure/amd64; SunJSSE-17.0; UNKNOWN/UNKNOWN) APN/1.0 unknown",delete,2579
"Azure Blob FS/3.3 (AzulSystems,Inc. JavaJRE 1.8.0_412; Linux 5.15.0-1075-azure/amd64; SunJSSE-1.8; UNKNOWN/UNKNOWN) APN/1.0 unknown",read,945
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Obsidian/yzTryDIXGEB22dl7dxMu8nhvAPX5f9XLAi5ywxURO_7DAQJY5QrDJoEJDkKojQ",read,1
azsdk-js-data-tables/12.1.2 core-rest-pipeline/1.19.0 Node/18.20.4 OS/(arm64-Linux-5.10.235-227.919.amzn2.aarch64),read,1
SRP/1.0,,15
Azure Blob FS/3.3 (OracleCorporation JavaJRE 17.0.12; Linux 5.4.0-1147-azure-fips/amd64; SunJSSE-17.0; UNKNOWN/UNKNOWN) APN/1.0 Databricks/1.0 DBR/UNKNOWN,write,10
APN/1.0 Databricks/1.0 DBR/null,read,2683
"Azure Blob FS/3.3 (AzulSystems,Inc. JavaJRE 17.0.13; Linux 5.15.0-1075-azure/amd64; SunJSSE-17.0; UNKNOWN/UNKNOWN) APN/1.0 unknown",write,21498
azsdk-java-azure-storage-blob/12.29.0 (17.0.12; Linux; 5.4.0-1147-azure-fips),,2
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36 Obsidian/yzTryDIXGEB22dl7dxMu8nhvAPX5f9XLAi5ywxURO_7DAQJY5QrDJoEJDkKojQ",,1


#### Checking to see if there are any non-managed tables with a Databricks Agent Header

In [0]:
-- Distinct storage log events whose user agent mentions "databricks" and whose
-- table type is anything other than "managed" (both compared case-insensitively).
-- Rows with a NULL table type do not satisfy the <> predicate and are excluded.
select distinct
  storage_time,
  info_parsedPath,
  info_tableName,
  info_tableType,
  storage_userAgentHeader,
  storage_category
from slog.default.azure_storage_logs
where lower(info_tableType) <> 'managed'
  and lower(storage_userAgentHeader) like '%databricks%'

storage_time,IT_ParsedPath,IT_TableName,IT_TableType,Storage_userAgentHeader,Storage_Category
2025-04-12T18:18:32.431Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-16T20:41:58.932Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-16T20:41:59.052Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-16T20:41:58.962Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-12T18:18:30.860Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-16T20:41:58.720Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-12T18:18:30.906Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-16T20:41:58.937Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-12T18:18:30.947Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead
2025-04-16T20:41:58.601Z,/stsezsandbox07/pos-dev/d1,main.default.my_table,EXTERNAL,APN/1.0 Databricks/1.0 DBR/null,StorageRead


#### Getting the list of candidates to upgrade to managed tables

In [0]:
-- Get the latest timestamp for the "eventhub_storage_log_setup" job
with last_run_timestamp as (
  select
    max(period_start_time) as last_run_timestamp
  from
    system.lakeflow.job_run_timeline
  where
    job_id = '1072068050595527'
),

-- Operation counts for EXTERNAL tables, one row per
-- (storage account, lower-cased user agent, table, storage category).
-- storage_category is projected here because the final select buckets the
-- counts on it; the grouping columns match the projected columns exactly.
external_table_operations as (
  select
    Storage_AccountName,
    lower(Storage_userAgentHeader) as Storage_userAgentHeader,
    info_tableName,
    storage_category,
    count(*) as operation_count
  from
    slog.default.azure_storage_logs
  where
    info_tableType = 'EXTERNAL'
    and storage_category in ('StorageRead', 'StorageWrite', 'StorageDelete')
  group by
    Storage_AccountName,
    lower(Storage_userAgentHeader),
    info_tableName,
    storage_category
)

-- Identify list of tables that are good candidates to move to managed tables.
-- Output: one row per (storage account, table) with
--   total_read_count       - number of StorageRead operations
--   write_delete_count     - number of StorageWrite + StorageDelete operations
--   good_candidate         - 1 when write_delete_count is 0, else 0
--   from_external_platform - 0 when any access came from a user agent
--                            containing "databricks", else 1
select
  src.Storage_AccountName,
  last_run_timestamp.last_run_timestamp as Job_Run_TS,
  src.info_tableName,
  sum(case when src.storage_category = 'StorageRead' then src.operation_count else 0 end) as total_read_count,
  sum(case when src.storage_category in ('StorageWrite', 'StorageDelete') then src.operation_count else 0 end) as write_delete_count,
  case
    when sum(case when src.storage_category in ('StorageWrite', 'StorageDelete') then src.operation_count else 0 end) = 0 then 1
    else 0
  end as good_candidate,
  case
    -- the user agent was already lower-cased in external_table_operations
    when sum(case when src.Storage_userAgentHeader like '%databricks%' then 1 else 0 end) > 0 then 0
    else 1
  end as from_external_platform
from
  external_table_operations as src
  -- last_run_timestamp is a single-row CTE; the cross join stamps that value on every row
  cross join last_run_timestamp
group by
  src.Storage_AccountName,
  src.info_tableName,
  last_run_timestamp.last_run_timestamp
order by
  total_read_count desc,
  write_delete_count desc

Storage_AccountName,Job_Run_TS,IT_TableName,total_read_count,write_delete_count,good_candidate,from_external_platform
stsezsandbox07,2025-04-18T11:27:55.123Z,main.default.my_table,66,26,0,0
stsezsandbox07,2025-04-18T11:27:55.123Z,pos_dev.retailer_na.pos_snapshots,13,0,1,1
stsezsandbox07,2025-04-18T11:27:55.123Z,pos_dev.retailer_na.pos_generator,4,0,1,1
stsezsandbox07,2025-04-18T11:27:55.123Z,pos_dev.retailer_na.pos_static,3,0,1,1


# View of Delta Tables not registered in Unity Catalog

Review all Delta table storage paths that are not registered in Unity Catalog and see what operations are being performed on them.

In [0]:
-- DELTA_LOG INTERROGATION OF PATHS
-- Storage log rows that have no table type, with the cleaned path computed
-- once so the filter and the grouping below share a single definition.
-- NOTE(review): the regex is anchored at both ends (^ ... $), so it only
-- rewrites values that consist solely of "<scheme>://<host>/" - confirm
-- whether stripping that prefix from longer paths was the intent.
with unregistered_path_events as (
  select
    storage_accountName,
    REGEXP_REPLACE(Storage_RelativePath, '^[^/]+://[^/]+/$', '') as Storage_RelativePath,
    storage_category
  from
    slog.default.azure_storage_logs
  where
    info_tableType is null
),

-- Operation counts per (storage account, path, category) for paths that end
-- in "/_delta_log" and do not contain "unity" (both compared case-insensitively).
external_tables_not_registered as (
  select
    storage_accountName,
    Storage_RelativePath,
    storage_category,
    count(*) as operation_count
  from
    unregistered_path_events
  where
    lower(Storage_RelativePath) like '%/_delta_log'
    and lower(Storage_RelativePath) not like '%unity%'
  group by
    storage_accountName,
    Storage_RelativePath,
    storage_category
)

-- One row per (storage account, path) with a column per operation category.
-- A category with no logged operations for a path comes back as NULL.
select
  *
from
  external_tables_not_registered
pivot (
  max(operation_count) for storage_category in ('StorageRead' as read_count, 'StorageWrite' as write_count, 'StorageDelete' as delete_count)
)
-- Sort on the final result: an ORDER BY inside the CTE is not guaranteed to
-- carry through the pivot.
order by
  coalesce(read_count, 0) + coalesce(write_count, 0) + coalesce(delete_count, 0) desc

storage_accountName,Storage_RelativePath,read_count,write_count,delete_count
