In [None]:
import requests

from pyspark.sql.types import StructType, StructField, StringType
from bs4 import BeautifulSoup
from pyspark.sql.functions import *

# Root URL of the Microsoft Fabric REST API documentation. "toc.json" and each
# TOC item's href are appended to this value to build the request URLs.
baseurl = "https://learn.microsoft.com/en-us/rest/api/fabric/"

In [None]:
# Function to extract all articles and return the data_list
def extract_all_articles(data):
    """Return the article rows for a parsed table-of-contents document.

    Args:
        data: Mapping parsed from the TOC JSON. Only its "items" entry is read.

    Returns:
        The list produced by extract_articles for data["items"], or an empty
        list when the document has no "items" key.
    """
    if "items" not in data:
        return []

    toc_items = data["items"]
    return extract_articles(toc_items)

### Function for recursively extracting articles and their supported identities from TOC items
def _fetch_identity_support(url, timeout=30):
    """Download one article and read its supported-identities table.

    Args:
        url: Absolute URL of the article page.
        timeout: Seconds to wait for the server before requests gives up, so a
            single stalled connection cannot hang the whole crawl.

    Returns:
        A (user_support, spn_and_mi_support) tuple. Both are "N/A" when the
        page has no 'microsoft-entra-supported-identities' section or no table
        body follows it. An entry is None when the table exists but lacks the
        corresponding row.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    article = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(article.content, 'html.parser')

    identity_header = soup.find(id='microsoft-entra-supported-identities')
    if identity_header is None:
        return "N/A", "N/A"

    # Guard every step of the lookup: a heading without a following table (or
    # a table without <tbody>) is treated like a page without the section.
    table = identity_header.find_next('table')
    tbody = table.find('tbody') if table is not None else None
    if tbody is None:
        return "N/A", "N/A"

    key_value_pairs = {}
    for row in tbody.find_all('tr'):
        columns = row.find_all('td')
        if len(columns) < 2:
            # Rows without two <td> cells (e.g. header rows) carry no key/value pair.
            continue
        key_value_pairs[columns[0].get_text(strip=True)] = columns[1].get_text(strip=True)

    # get_text(strip=True) joins the stripped text fragments of a cell with no
    # separator, which is presumably why the second key has no spaces around
    # "and" -- verify against the live page if the lookup starts returning None.
    return (
        key_value_pairs.get("User"),
        key_value_pairs.get("Service principalandManaged identities"),
    )


def extract_articles(items, level=0, path=""):
    """Recursively walk TOC items and collect one row per linked article.

    Items titled "Overview" are skipped together with their children. Every
    other item that has both a title and an href is fetched and scanned for
    its supported-identities table; items without an href are only printed.
    Children are visited with the parent's title appended to ``path``.

    Args:
        items: Iterable of TOC item dicts; the keys read are "toc_title",
            "href" and "children".
        level: Current nesting depth, used only to indent the progress output.
        path: "|"-separated titles of the ancestors of ``items``.

    Returns:
        A list of tuples (path, title, article_url, href, user_support,
        spn_and_mi_support).

    Raises:
        requests.RequestException: If fetching an article fails or times out.
    """
    indent = '    ' * level  # Indentation for the progress output
    local_data_list = []

    for item in items:
        # Read optional keys with .get so an untitled item cannot raise KeyError.
        title = item.get("toc_title")
        if title == "Overview":
            continue

        href = item.get("href")
        if href is not None and title is not None:
            print(f"{indent}{title} : {href}")
            article_url = baseurl + href
            user_support, spn_support = _fetch_identity_support(article_url)
            local_data_list.append((path, title, article_url, href, user_support, spn_support))
        elif title is not None:
            print(f"{indent}{title}")

        children = item.get("children")
        if children:
            # An untitled parent contributes an empty segment so the depth of
            # the path (and therefore the later split by position) is preserved.
            child_path = path + "|" + (title or "")
            local_data_list.extend(extract_articles(children, level + 1, child_path))

    return local_data_list

In [None]:
### Extract Fabric API documentation
# Download the table of contents. The timeout keeps a stalled connection from
# hanging the notebook indefinitely.
response = requests.get(baseurl+"toc.json", timeout=30)
# Fail here with a clear HTTP error rather than a confusing JSON decode error
# when the server answers with an error page.
response.raise_for_status()
data = response.json()

# Walk the TOC and collect one tuple per article; data_list feeds the Spark DataFrame.
data_list = extract_all_articles(data)

In [None]:
### Convert the data_list to Spark DataFrame, split path and clean up
# All columns are nullable strings, listed in the same order as the tuples in data_list.
api_docs_columns = [
    "Path",
    "Title",
    "ArticleUrl",
    "Href",
    "SupportUserIdentity",
    "SupportSpnAndMi",
]
schema = StructType([StructField(column_name, StringType(), True) for column_name in api_docs_columns])

api_docs_raw = spark.createDataFrame(data_list, schema)

# Path starts with "|", so element 0 of the split is empty; elements 1 and 2
# are taken as the Service and API columns.
path_parts = split(api_docs_raw["Path"], "\\|")

df = (
    api_docs_raw
    .withColumn("Service", path_parts.getItem(1))
    .withColumn("API", path_parts.getItem(2))
    .drop("Path", "Href")
)

In [None]:
### Get lakehouse if it exists, otherwise create it
lakehouse_name = "FabricDocs"

lakehouse = None
try:
    lakehouse = mssparkutils.lakehouse.get(lakehouse_name)
except Exception as get_error:
    # Catch Exception rather than using a bare except so KeyboardInterrupt and
    # SystemExit still stop the cell.
    # NOTE(review): any failure of get() is treated as "lakehouse not found";
    # narrow this if the runtime exposes a specific not-found exception.
    print(f"Lakehouse '{lakehouse_name}' could not be retrieved ({get_error}); creating it.")
    lakehouse = mssparkutils.lakehouse.create(name = lakehouse_name, description = "Lakehouse for storing Microsoft Fabric documentation")

In [None]:
### Write delta table
# abfsPath is reused by later cells as the root for the Tables and Files locations.
abfsPath = lakehouse.properties["abfsPath"]

api_docs_table_path = f"{abfsPath}/Tables/FabricRestApiDocs"
api_docs_writer = df.write.format("delta").mode("overwrite")
api_docs_writer.save(api_docs_table_path)

In [None]:
### Create manual table holding Identity options
# Two hand-written rows, one per identity type, combined with UNION.
identities_query = """
    SELECT 
        1 AS IdentityNo,
        'User' AS Identity,
        'Supports user identity' AS Identity_desc,
        '' as IdentitySupportArticle
    UNION
    SELECT 
        2 AS IdentityNo,
        'Service principal and Managed identities' AS Identity,
        'Supports Service principal and Managed identities' as Identity_desc,
        'https://learn.microsoft.com/en-us/entra/identity-platform/app-objects-and-service-principals#service-principal-object' as IdentitySupportArticle
"""
df_identities = spark.sql(identities_query)

# Overwrite the Delta table under the lakehouse Tables area.
identities_table_path = f"{abfsPath}/Tables/SupportedIdentities"
identities_writer = df_identities.write.format("delta").mode("overwrite")
identities_writer.save(identities_table_path)

In [None]:
### Download report definition file from Github
# Copy the report definition into the Files area of the lakehouse.
report_source_url = 'https://raw.githubusercontent.com/gronnerup/Fabric/refs/heads/main/FabricRestApiDocs/report.json'
report_destination = f'{abfsPath}/Files'
mssparkutils.fs.cp(report_source_url, report_destination)