# Form Recognizer tutorial

__Setup__

- Create Form Recognizer service instance
- Create secret scope to store access key
- Create Azure Synapse Spark pool

__Load libraries and access keys__

In [None]:
#from synapse.ml.cognitive import *
from synapse.ml.cognitive import *

from notebookutils import mssparkutils

# Access keys are fetched from a single Key Vault via mssparkutils.
# - cognitive_service_key: a general Cognitive Services key (separate per-service keys work too).
#   NOTE(review): the secret name "ez-cog-lang" suggests a Language resource - confirm.
# - cognitive_service_form_key: the key handed to the Form Recognizer transformers in this notebook.
key_vault_name = "ezzat-keyvault"
cognitive_service_key = mssparkutils.credentials.getSecret(key_vault_name, "ez-cog-lang")
cognitive_service_form_key = mssparkutils.credentials.getSecret(key_vault_name, "ez-cog-form")

# A Bing Search v7 subscription key
#bingsearch_service_key = mssparkutils.credentials.getSecret("ADD_YOUR_KEY_VAULT_NAME", "ADD_YOUR_BING_SEARCH_KEY","ADD_YOUR_KEY_VAULT_LINKED_SERVICE_NAME")
# An Anomaly Detector subscription key
#anomalydetector_key = mssparkutils.credentials.getSecret("ADD_YOUR_KEY_VAULT_NAME", "ADD_YOUR_ANOMALY_KEY","ADD_YOUR_KEY_VAULT_LINKED_SERVICE_NAME")

#Ref: https://microsoft.github.io/SynapseML/docs/documentation/transformers/transformers_cognitive/#analyzedocument


## AnalyzeLayout

In [None]:
import pyspark.sql.functions as f

# Sample images to run through AnalyzeLayout; one URL per row in the "source" column.
layout_image_urls = [
    "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg",
    "https://raw.githubusercontent.com/MicrosoftDocs/azure-docs/master/articles/cognitive-services/Computer-vision/Images/readsample.jpg",
    "https://www.nist.gov/sites/default/files/images/2019/04/27/sd19.jpg",
    "https://formrecognizer.appliedai.azure.com/documents/samples/layout/layout-checklist.jpg",
]
imageDf = spark.createDataFrame([(url,) for url in layout_image_urls], ["source"])

# Configure the transformer: it reads the image URL from "source" and writes
# the service response into the "layout" column.
analyzeLayout = AnalyzeLayout()
analyzeLayout = analyzeLayout.setSubscriptionKey(cognitive_service_form_key)
analyzeLayout = analyzeLayout.setLocation("eastus")
analyzeLayout = analyzeLayout.setImageUrlCol("source")
analyzeLayout = analyzeLayout.setOutputCol("layout")
analyzeLayout = analyzeLayout.setConcurrency(5)

# Call the service, then derive convenience columns from the nested response.
df_analyse_layout = analyzeLayout.transform(imageDf)
# "lines": the nested readResults.lines arrays flattened into a single array.
df_analyse_layout = df_analyse_layout.withColumn(
    "lines", f.flatten(f.col("layout.analyzeResult.readResults.lines"))
)
# "readLayout": just the text of every recognised line.
df_analyse_layout = df_analyse_layout.withColumn("readLayout", f.col("lines.text"))
# "tables" / "cells": the nested pageResults.tables arrays flattened, then their cells flattened.
df_analyse_layout = df_analyse_layout.withColumn(
    "tables", f.flatten(f.col("layout.analyzeResult.pageResults.tables"))
)
df_analyse_layout = df_analyse_layout.withColumn("cells", f.flatten(f.col("tables.cells")))
# "pageLayout": just the text of every table cell.
df_analyse_layout = df_analyse_layout.withColumn("pageLayout", f.col("cells.text"))
# Put the key columns first; "*" then appends every column (those three included again).
df_analyse_layout = df_analyse_layout.select("source", "readLayout", "pageLayout", "*")

# Cache because the next cell queries this DataFrame again.
df_analyse_layout.cache()

display(df_analyse_layout)

In [None]:
# Analyse the tables of the checklist sample: end up with one row per table cell.

# Keep only the checklist image.
checklist_layout = df_analyse_layout.filter(
    "source='https://formrecognizer.appliedai.azure.com/documents/samples/layout/layout-checklist.jpg'"
)

# One row per table, with the table struct's fields promoted to top-level columns.
checklist_layout_tables = checklist_layout.withColumn(
    "tables", f.explode_outer(f.col("tables"))
).select("tables.*")

# One row per cell, keeping the table dimensions next to the cell's own fields.
checklist_layout_cells = checklist_layout_tables.withColumn(
    "cells", f.explode_outer(f.col("cells"))
).select("rows", "columns", "cells.*")

display(checklist_layout_cells)

## Analyze document

In [None]:
from synapse.ml.cognitive import *
import pyspark.sql.functions as f

# Sample images to run through AnalyzeDocument; one URL per row in the "source" column.
document_image_urls = [
    "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/layout1.jpg",
    "https://raw.githubusercontent.com/MicrosoftDocs/azure-docs/master/articles/cognitive-services/Computer-vision/Images/readsample.jpg",
    "https://www.nist.gov/sites/default/files/images/2019/04/27/sd19.jpg",
    "https://formrecognizer.appliedai.azure.com/documents/samples/layout/layout-checklist.jpg",
]
imageDf = spark.createDataFrame([(url,) for url in document_image_urls], ["source"])

# Configure the transformer: it reads the image URL from "source" and writes
# the service response into the "result" column.
analyzeDocument = AnalyzeDocument()
# For supported prebuilt models, please go to documentation page for details
analyzeDocument = analyzeDocument.setPrebuiltModelId("prebuilt-layout")
analyzeDocument = analyzeDocument.setSubscriptionKey(cognitive_service_form_key)
analyzeDocument = analyzeDocument.setLocation("eastus")
analyzeDocument = analyzeDocument.setImageUrlCol("source")
analyzeDocument = analyzeDocument.setOutputCol("result")
analyzeDocument = analyzeDocument.setConcurrency(5)

# Call the service, then derive convenience columns from the nested response.
df_analyzeDocument = analyzeDocument.transform(imageDf)
# "content": the content field of the analyze result.
df_analyzeDocument = df_analyzeDocument.withColumn(
    "content", f.col("result.analyzeResult.content")
)
# "cells": first the cells of all tables flattened into one array ...
df_analyzeDocument = df_analyzeDocument.withColumn(
    "cells", f.flatten(f.col("result.analyzeResult.tables.cells"))
)
# ... then replaced by just the content of each cell.
df_analyzeDocument = df_analyzeDocument.withColumn("cells", f.col("cells.content"))
# Put the key columns first; "*" then appends every column (those four included again).
df_analyzeDocument = df_analyzeDocument.select("source", "result", "content", "cells", "*")

# Cache because the next cell queries this DataFrame again.
df_analyzeDocument.cache()
display(df_analyzeDocument)

In [None]:
# Analyse the tables of the checklist sample: end up with one row per table cell.

# Keep only the checklist image and promote the analyzeResult fields to columns.
checklist_document = df_analyzeDocument.filter(
    "source='https://formrecognizer.appliedai.azure.com/documents/samples/layout/layout-checklist.jpg'"
).select("result.analyzeResult.*")

# One row per table, with the table struct's fields promoted to top-level columns.
checklist_document_tables = checklist_document.withColumn(
    "tables", f.explode_outer(f.col("tables"))
).select("tables.*")

# One row per cell, keeping the table dimensions next to the cell's own fields.
checklist_document_cells = checklist_document_tables.withColumn(
    "cells", f.explode_outer(f.col("cells"))
).select("rowCount", "columnCount", "cells.*")

display(checklist_document_cells)

## Analyze receipt

In [None]:
from synapse.ml.cognitive import *


# Receipt images to analyse; the same sample receipt is listed twice,
# giving a two-row DataFrame with the URL in the "image" column.
receipt_image_urls = [
    "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png",
    "https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/receipt1.png",
]
imageDf = spark.createDataFrame([(url,) for url in receipt_image_urls], ["image"])

# Configure the transformer: it reads the image URL from "image" and writes
# the service response into the "receipts" column.
analyzeReceipts = AnalyzeReceipts()
analyzeReceipts = analyzeReceipts.setSubscriptionKey(cognitive_service_form_key)
analyzeReceipts = analyzeReceipts.setLocation("eastus")
analyzeReceipts = analyzeReceipts.setImageUrlCol("image")
analyzeReceipts = analyzeReceipts.setOutputCol("receipts")
analyzeReceipts = analyzeReceipts.setConcurrency(5)

# Show the raw service output without any post-processing.
display(analyzeReceipts.transform(imageDf))

## Analyze ID document

In [None]:
from synapse.ml.cognitive import *

# This cell uses f.explode_outer / f.col below, so import the functions module
# here instead of relying on an earlier cell having been run.
import pyspark.sql.functions as f

# Single sample ID image; the URL goes into the "source" column.
imageDf = spark.createDataFrame([
  ("https://mmlspark.blob.core.windows.net/datasets/FormRecognizer/id1.jpg",),
], ["source",])

# Configure the transformer: it reads the image URL from "source" and writes
# the service response into the "ids" column.
analyzeIDDocuments = (AnalyzeIDDocuments()
                  .setSubscriptionKey(cognitive_service_form_key)
                  .setLocation("eastus")
                  .setImageUrlCol("source")
                  .setOutputCol("ids")
                  .setConcurrency(5))

# "documents": one row per entry of documentResults.fields (explode_outer keeps
# rows whose array is null or empty). "*" then appends all original columns,
# so "source" and "documents" appear a second time in the output.
display(analyzeIDDocuments
        .transform(imageDf)
        .withColumn("documents", f.explode_outer(f.col("ids.analyzeResult.documentResults.fields")))
        .select("source", "documents","*"))