# Form Recognizer tutorial

__Setup__

In [None]:
from synapse.ml.cognitive import *

from notebookutils import mssparkutils

# Both secrets below are read from the same key vault, so name it once.
key_vault_name = "ezzat-keyvault"

# General Cognitive Services key for Text Analytics and Computer Vision
# (separate per-service keys can be used instead).
cognitive_service_key = mssparkutils.credentials.getSecret(key_vault_name, "ez-cog-lang")
# Key handed to the Form Recognizer transformers in the cells below.
cognitive_service_form_key = mssparkutils.credentials.getSecret(key_vault_name, "ez-cog-form")

# Optional: a Bing Search v7 subscription key
#bingsearch_service_key = mssparkutils.credentials.getSecret("ADD_YOUR_KEY_VAULT_NAME", "ADD_YOUR_BING_SEARCH_KEY","ADD_YOUR_KEY_VAULT_LINKED_SERVICE_NAME")
# Optional: an Anomaly Detector subscription key
#anomalydetector_key = mssparkutils.credentials.getSecret("ADD_YOUR_KEY_VAULT_NAME", "ADD_YOUR_ANOMALY_KEY","ADD_YOUR_KEY_VAULT_LINKED_SERVICE_NAME")

# Reference: https://microsoft.github.io/SynapseML/docs/documentation/transformers/transformers_cognitive/#analyzedocument


## AnalyzeLayout

In [38]:
# Show the raw AnalyzeLayout output, before any columns are derived from it.
# NOTE(review): `analyzeLayout` and `imageDf` are first assigned in a later cell of
# this notebook, so on a fresh kernel that cell must be run before this one.
display(analyzeLayout.transform(imageDf))

StatementMeta(testspark2, 6, 38, Finished, Available)

SynapseWidget(Synapse.DataFrame, b7a54317-97ca-4c2a-aef3-c0fa9076ecbd)

In [6]:
import pyspark.sql.functions as f

# Sample images; each URL becomes one row of the single-column input frame.
layout_image_urls = [
    "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg",
    "https://raw.githubusercontent.com/MicrosoftDocs/azure-docs/master/articles/cognitive-services/Computer-vision/Images/readsample.jpg",
    "https://www.nist.gov/sites/default/files/images/2019/04/27/sd19.jpg",
    "https://formrecognizer.appliedai.azure.com/documents/samples/layout/layout-checklist.jpg",
]
imageDf = spark.createDataFrame([(url,) for url in layout_image_urls], ["source"])

# Transformer that reads image URLs from "source" and writes its response to "layout".
analyzeLayout = (
    AnalyzeLayout()
    .setSubscriptionKey(cognitive_service_form_key)
    .setLocation("eastus")
    .setImageUrlCol("source")
    .setOutputCol("layout")
    .setConcurrency(5)
)

layout_raw = analyzeLayout.transform(imageDf)

# Collapse the nested readResults.lines arrays into one array, then keep only the text.
layout_with_lines = (
    layout_raw
    .withColumn("lines", f.flatten(f.col("layout.analyzeResult.readResults.lines")))
    .withColumn("readLayout", f.col("lines.text"))
)

# Same idea for tables: flatten pageResults.tables, then the cells of those tables,
# and keep the cell text.
layout_with_cells = (
    layout_with_lines
    .withColumn("tables", f.flatten(f.col("layout.analyzeResult.pageResults.tables")))
    .withColumn("cells", f.flatten(f.col("tables.cells")))
    .withColumn("pageLayout", f.col("cells.text"))
)

# Lead with the three summary columns; the trailing "*" keeps every column after them.
df_analyse_layout = layout_with_cells.select("source", "readLayout", "pageLayout", "*")

display(df_analyse_layout)

StatementMeta(testspark2, 6, 6, Finished, Available)

SynapseWidget(Synapse.DataFrame, d16353e4-55cd-49b2-9835-a072e00ba4e7)

In [14]:
# Analyse table: drill into the tables found in the checklist sample image.

# Keep only the row for the checklist image.
checklist_layout = df_analyse_layout.filter(
    "source='https://formrecognizer.appliedai.azure.com/documents/samples/layout/layout-checklist.jpg'"
)

# One row per table, with the fields of the table struct promoted to columns.
checklist_layout_tables = (
    checklist_layout
    .withColumn("tables", f.explode_outer(f.col("tables")))
    .select("tables.*")
)

# One row per cell, alongside the "rows" and "columns" fields of its table.
checklist_layout_cells = (
    checklist_layout_tables
    .withColumn("cells", f.explode_outer(f.col("cells")))
    .select("rows", "columns", "cells.*")
)

display(checklist_layout_cells)

StatementMeta(testspark2, 6, 14, Finished, Available)

SynapseWidget(Synapse.DataFrame, ba7de12d-15b5-4e9b-b911-13bea94fb353)

## Analyze document

In [37]:
# Show the raw AnalyzeDocument output, before any columns are derived from it.
# NOTE(review): `analyzeDocument` is first assigned in a later cell of this notebook,
# so on a fresh kernel that cell must be run before this one.
display(analyzeDocument.transform(imageDf))

StatementMeta(testspark2, 6, 37, Finished, Available)

SynapseWidget(Synapse.DataFrame, f35b591c-f796-4dd6-b4bc-67e002929635)

In [15]:
from synapse.ml.cognitive import *
import pyspark.sql.functions as f

# Sample images; each URL becomes one row of the single-column input frame.
document_image_urls = [
    "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg",
    "https://raw.githubusercontent.com/MicrosoftDocs/azure-docs/master/articles/cognitive-services/Computer-vision/Images/readsample.jpg",
    "https://www.nist.gov/sites/default/files/images/2019/04/27/sd19.jpg",
    "https://formrecognizer.appliedai.azure.com/documents/samples/layout/layout-checklist.jpg",
]
imageDf = spark.createDataFrame([(url,) for url in document_image_urls], ["source"])

# Transformer that reads image URLs from "source" and writes its response to "result".
analyzeDocument = (
    AnalyzeDocument()
    # See the service documentation for the list of supported prebuilt models.
    .setPrebuiltModelId("prebuilt-layout")
    .setSubscriptionKey(cognitive_service_form_key)
    .setLocation("eastus")
    .setImageUrlCol("source")
    .setOutputCol("result")
    .setConcurrency(5)
)

document_raw = analyzeDocument.transform(imageDf)

# "content" is taken straight from the response; "cells" is first the flattened
# tables.cells array and is then overwritten with just the content of each cell.
document_enriched = (
    document_raw
    .withColumn("content", f.col("result.analyzeResult.content"))
    .withColumn("cells", f.flatten(f.col("result.analyzeResult.tables.cells")))
    .withColumn("cells", f.col("cells.content"))
)

# Lead with the summary columns; the trailing "*" keeps every column after them.
df_analyzeDocument = document_enriched.select("source", "result", "content", "cells", "*")

# Mark the frame for caching (presumably so later cells reuse this result — TODO confirm).
df_analyzeDocument.cache()
display(df_analyzeDocument)

StatementMeta(testspark2, 6, 15, Finished, Available)

SynapseWidget(Synapse.DataFrame, 5939bdb6-9003-434e-9a0a-2e71926fcbf8)

In [36]:
# Analyse table: drill into the tables found in the checklist sample image.

# Keep only the row for the checklist image and promote the fields of analyzeResult.
checklist_document = (
    df_analyzeDocument
    .filter("source='https://formrecognizer.appliedai.azure.com/documents/samples/layout/layout-checklist.jpg'")
    .select("result.analyzeResult.*")
)

# One row per table, with the fields of the table struct promoted to columns.
checklist_document_tables = (
    checklist_document
    .withColumn("tables", f.explode_outer(f.col("tables")))
    .select("tables.*")
)

# One row per cell, alongside the "rowCount" and "columnCount" fields of its table.
checklist_document_cells = (
    checklist_document_tables
    .withColumn("cells", f.explode_outer(f.col("cells")))
    .select("rowCount", "columnCount", "cells.*")
)

display(checklist_document_cells)

StatementMeta(testspark2, 6, 36, Finished, Available)

SynapseWidget(Synapse.DataFrame, 5343f4c0-d512-4268-b09f-9dab41234e85)

## Analyze receipt

In [11]:
from synapse.ml.cognitive import *

# The same sample receipt is listed twice, giving a two-row input frame.
receipt_url = "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png"
imageDf = spark.createDataFrame([(receipt_url,), (receipt_url,)], ["image"])

# Transformer that reads image URLs from "image" and writes its response to "receipts".
analyzeReceipts = (
    AnalyzeReceipts()
    .setSubscriptionKey(cognitive_service_form_key)
    .setLocation("eastus")
    .setImageUrlCol("image")
    .setOutputCol("receipts")
    .setConcurrency(5)
)

receipts_df = analyzeReceipts.transform(imageDf)
display(receipts_df)

StatementMeta(testspark2, 2, 10, Finished, Available)

SynapseWidget(Synapse.DataFrame, 7fe66c8e-2df9-4ad1-a525-1e5391e2e286)

## Analyze ID document

In [29]:
from synapse.ml.cognitive import *

# Single sample ID image, held in the "source" column.
id_image_url = "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/id1.jpg"
imageDf = spark.createDataFrame([(id_image_url,)], ["source"])

# Transformer that reads image URLs from "source" and writes its response to "ids".
analyzeIDDocuments = (
    AnalyzeIDDocuments()
    .setSubscriptionKey(cognitive_service_form_key)
    .setLocation("eastus")
    .setImageUrlCol("source")
    .setOutputCol("ids")
    .setConcurrency(5)
)

ids_raw = analyzeIDDocuments.transform(imageDf)

# One row per entry of documentResults.fields.
# NOTE: `f` (pyspark.sql.functions) is not imported in this cell; it must already be
# in scope from an earlier cell.
ids_exploded = ids_raw.withColumn(
    "documents",
    f.explode_outer(f.col("ids.analyzeResult.documentResults.fields")),
)

# Lead with the summary columns; the trailing "*" keeps every column after them.
display(ids_exploded.select("source", "documents", "*"))

StatementMeta(testspark2, 2, 28, Finished, Available)

SynapseWidget(Synapse.DataFrame, eafb9d80-f8d6-4cb7-8942-c8c4dc878ec5)

## AnalyzeCustomModel

In [None]:
from synapse.ml.cognitive import *
# Imported here so the cell does not depend on bare `col`/`flatten`/`create_map`/`lit`
# names, which are not imported anywhere in the visible notebook.
import pyspark.sql.functions as f

modelId = "02bc2f58-2beb-4ae3-84fb-08f011b2f7b8" # put your own modelId here
imageDf = spark.createDataFrame([
  ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/invoice2.png",)
], ["source",])

# Transformer that sends each URL in "source" to the custom model identified by
# `modelId` and writes its response to "output".
analyzeCustomModel = (AnalyzeCustomModel()
                 .setSubscriptionKey(cognitive_service_form_key)
                 .setLocation("eastus")
                 .setModelId(modelId)
                 .setImageUrlCol("source")
                 .setOutputCol("output")
                 .setConcurrency(5))

# Flatten the nested pageResults.keyValuePairs arrays, pull out the key texts and the
# value texts, and then replace "keyValuePairs" with a two-entry map:
# "key" -> array of key texts, "value" -> array of value texts.
(analyzeCustomModel
        .transform(imageDf)
        .withColumn("keyValuePairs", f.flatten(f.col("output.analyzeResult.pageResults.keyValuePairs")))
        .withColumn("keys", f.col("keyValuePairs.key.text"))
        .withColumn("values", f.col("keyValuePairs.value.text"))
        .withColumn("keyValuePairs", f.create_map(f.lit("key"), f.col("keys"), f.lit("value"), f.col("values")))
        .select("source", "keyValuePairs")).show()

## GetCustomModel

In [None]:
from synapse.ml.cognitive import *
# Imported here so the cell does not depend on a bare `col` name, which is not
# imported anywhere in the visible notebook.
import pyspark.sql.functions as f


modelId = "02bc2f58-2beb-4ae3-84fb-08f011b2f7b8" # put your own modelId here
# The transformer needs an input frame to run against; a single row holding an
# empty string is used.
emptyDf = spark.createDataFrame([("",)])

# Transformer that requests the model identified by `modelId` (with keys included)
# and writes its response to "model".
getCustomModel = (GetCustomModel()
                  .setSubscriptionKey(cognitive_service_form_key)
                  .setLocation("eastus")
                  .setModelId(modelId)
                  .setIncludeKeys(True)
                  .setOutputCol("model")
                  .setConcurrency(5))

# Surface the ModelInfo and TrainResult fields of the response as their own columns.
(getCustomModel
        .transform(emptyDf)
        .withColumn("modelInfo", f.col("model.ModelInfo"))
        .withColumn("trainResult", f.col("model.TrainResult"))
        .select("modelInfo", "trainResult")).show()

## List custom models

In [19]:
from synapse.ml.cognitive import *

# The transformer needs an input frame to run against; a single row holding an
# empty string is used.
emptyDf = spark.createDataFrame([("",)])

# Transformer that runs the "full" list operation and writes its response to "models".
listCustomModels = (
    ListCustomModels()
    .setSubscriptionKey(cognitive_service_form_key)
    .setLocation("eastus")
    .setOp("full")
    .setOutputCol("models")
    .setConcurrency(5)
)

models_df = listCustomModels.transform(emptyDf)

# Keep only the model ids found under models.modelList.
# NOTE: `f` (pyspark.sql.functions) is not imported in this cell; it must already be
# in scope from an earlier cell.
model_ids_df = (
    models_df
    .withColumn("modelIds", f.col("models.modelList.modelId"))
    .select("modelIds")
)
model_ids_df.show()

StatementMeta(testspark2, 2, 18, Finished, Available)

+--------+
|modelIds|
+--------+
|      []|
+--------+

