# Celebrity Quote Analysis with The Cognitive Services on Spark

###  
<img src="https://mmlspark.blob.core.windows.net/graphics/SparkSummit2/cog_services.png" width="800" style="float: center;"/>

In [3]:
# Declare one empty text widget per Cognitive Services key; the values are
# read back in the next cell.
for _widget_name in ("BING_IMAGE_SEARCH_KEY", "VISION_API_KEY", "TEXT_API_KEY"):
    dbutils.widgets.text(_widget_name, "", "")

In [4]:
# Read the Cognitive Services keys from the notebook widgets.
# dbutils.widgets.get already returns the widget's value, so it is used
# directly instead of discarding it and re-reading the same widget through
# the deprecated getArgument helper.
BING_IMAGE_SEARCH_KEY = dbutils.widgets.get("BING_IMAGE_SEARCH_KEY")
VISION_API_KEY = dbutils.widgets.get("VISION_API_KEY")
TEXT_API_KEY = dbutils.widgets.get("TEXT_API_KEY")

# Azure region handed to the recognition and sentiment steps further down.
region = "eastus"

In [5]:
import azureml.core
from azureml.core import Workspace
from pyspark.sql import SparkSession

# Report which azureml SDK build this notebook is running against.
print("SDK version:", azureml.core.VERSION)

# Inputs used to derive the workspace and resource-group names.
# NOTE(review): prefix already ends in "_" and another "_" is inserted when the
# names are built, so both names contain "__" — confirm this is intended.
prefix = "dcib_igor_"
data = "news"
subscription_id = '03909a66-bef8-4d52-8e9a-a346604e0902'
workspace_region = "westus2"
resource_group = "{}_{}".format(prefix, data)
workspace_name = "{}_{}_aml".format(prefix, data)

# exist_ok=True lets this cell be re-run when the workspace already exists.
ws = Workspace.create(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
    location=workspace_region,
    exist_ok=True,
)

# Save the subscription id, resource group name, and workspace name to the
# local workspace config file so later sessions can reload them.
ws.write_config()

from azureml.core.datastore import Datastore

# Look up the workspace datastore named "workspaceblobstore"; every
# DataReference created later in this notebook points at it.
def_blob_store = Datastore(ws, "workspaceblobstore")
print('Datastore {} will be used'.format(def_blob_store.name))

# Hand the datastore's storage-account key to the active Spark session under
# the per-account "fs.azure.account.key.<account>.blob.core.windows.net" setting.
_account_key_setting = (
    "fs.azure.account.key." + def_blob_store.account_name + ".blob.core.windows.net"
)
SparkSession.builder.getOrCreate().conf.set(
    _account_key_setting, def_blob_store.account_key
)

# Directory used as the source location for pipeline scripts.
source_directory = "."

from azureml.data.data_reference import DataReference

In [6]:
%sh
# Remove every file matching *.py* from the working directory before the
# step scripts are downloaded again.
# -f : stay silent when nothing matches (e.g. first run on a fresh cluster)
# -- : treat everything after it as a file name, never as an option
rm -f -- *.py*

In [7]:
%sh
# Fetch the four Cognitive Services step scripts from the DB-Recs repository
# into the working directory, one wget call per file.
# NOTE(review): later cells import from `wrappers.wrappers`, which is not one
# of the files downloaded here — confirm where that module comes from.
base_url="https://raw.githubusercontent.com/dciborow/DB-Recs/master/CogServices"
for script in BingSearchStep.py RecognizeDomainSpecificContentStep.py RecognizeTextStep.py TextSentimentStep.py; do
  wget "${base_url}/${script}"
done

In [8]:
def DatabrickCompute4Step(num_workers, compute_target):
  """Bundle the compute settings for a pipeline step into a plain dict.

  Args:
    num_workers: value stored under the "num_workers" key.
    compute_target: value stored under the "compute_target" key.

  Returns:
    dict with exactly the keys "num_workers" and "compute_target".
  """
  return dict(num_workers=num_workers, compute_target=compute_target)

In [9]:
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
import os
# Replace with your account info before running.
 
# Databricks attachment settings; each one can be overridden through the
# environment variable named in the call.
db_compute_name = os.getenv("DATABRICKS_COMPUTE_NAME", "Igor")  # Databricks compute name
db_resource_group = os.getenv("DATABRICKS_RESOURCE_GROUP", "dcibdb")  # Databricks resource group
db_workspace_name = os.getenv("DATABRICKS_WORKSPACE_NAME", "dcibwesttest")  # Databricks workspace name
# The access token is a credential, so it is read from the environment only
# and never given a hard-coded fallback in the notebook source.
db_access_token = os.getenv("DATABRICKS_ACCESS_TOKEN")  # Databricks access token

try:
    # Reuse the compute target if it is already attached to the workspace.
    databricks_compute = ComputeTarget(workspace=ws, name=db_compute_name)
    print('Compute target {} already exists'.format(db_compute_name))
except ComputeTargetException:
    print('Compute not found, will use below parameters to attach new one')
    print('db_compute_name {}'.format(db_compute_name))
    print('db_resource_group {}'.format(db_resource_group))
    print('db_workspace_name {}'.format(db_workspace_name))

    # Attaching needs a token; fail with a clear message instead of sending
    # an empty credential to the service.
    if not db_access_token:
        raise ValueError(
            "DATABRICKS_ACCESS_TOKEN must be set to attach a new Databricks compute target"
        ) from None

    config = DatabricksCompute.attach_configuration(
        resource_group=db_resource_group,
        workspace_name=db_workspace_name,
        access_token=db_access_token)
    databricks_compute = ComputeTarget.attach(ws, db_compute_name, config)
    databricks_compute.wait_for_completion(True)

# Compute settings shared by every pipeline step: one worker on this target.
databricks_compute_4_step = DatabrickCompute4Step(1, databricks_compute)

### Extracting celebrity quote images using Bing Image Search on Spark

Here we define a pipeline step that uses Bing Image Search to extract celebrity quote images.

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/step%201.png" width="600" style="float: center;"/>

In [11]:
from wrappers.wrappers import BingImageSearchStep

# Search phrase handed to the Bing image search step.
input_query = "celebrity quotes"

# Datastore location for this step's data; it is also the first argument of
# the next step.
step_1_output = DataReference(
    datastore=def_blob_store,
    data_reference_name="step_1_output",
    path_on_datastore="raw_data/cog_services/celebs/urls/",
)

bing_image_search = BingImageSearchStep(step_1_output, BING_IMAGE_SEARCH_KEY, input_query)

### Recognizing Images of Celebrities
This block identifies the names of the celebrities in each of the images returned by the Bing Image Search.

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/step%202.png" width="600" style="float: center;"/>

In [13]:
from wrappers.wrappers import RecognizeDomainSpecificContentStep

# Domain model name and output column name passed to the recognition step.
model = "celebrities"
output_col = "celebs"

# Datastore location for this step's data.
step_2_output = DataReference(
    datastore=def_blob_store,
    data_reference_name="step_2_output",
    path_on_datastore="raw_data/cog_services/celebs/celeb/",
)

# The step is given the step 1 reference first and its own reference second.
recognize_domain_content = RecognizeDomainSpecificContentStep(
    step_1_output,
    step_2_output,
    VISION_API_KEY,
    model,
    region,
    output_col,
)

### Reading the quote from the image.
This stage performs OCR on the images to recognize the quotes.

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/step%203.png" width="600" style="float: center;"/>

In [15]:
from wrappers.wrappers import RecognizeTextStep

# Datastore location for this step's data.
step_3_output = DataReference(datastore=def_blob_store,
                              path_on_datastore="raw_data/cog_services/celebs/text/",
                              data_reference_name="step_3_output")

# OCR on images is a Computer Vision operation, so this step must be given the
# vision key (as the celebrity-recognition step is), not the text-analytics
# key that the sentiment step uses.
recognize_text_step = RecognizeTextStep(
  step_2_output, step_3_output, VISION_API_KEY, region)

### Understanding the Sentiment of the Quote

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/step%204.png" width="600" style="float: center;"/>

In [17]:
from wrappers.wrappers import RecognizeSentimentStep

# Datastore location for this step's data.
step_4_output = DataReference(
    datastore=def_blob_store,
    data_reference_name="step_4_output",
    path_on_datastore="raw_data/cog_services/celebs/sentiment/",
)

# The step is given the step 3 reference first and its own reference second.
recognize_sentiment_step = RecognizeSentimentStep(
    step_3_output,
    step_4_output,
    TEXT_API_KEY,
    region,
)

### Create Custom Clean Up Step

In [19]:
%%writefile clean_up.py
from mmlspark import *
from pyspark.ml import PipelineModel
from pyspark.sql import SparkSession

# Run parameters. dbutils.widgets.get returns the value directly, so there is
# no need to discard it and re-read it through the deprecated getArgument.
# NOTE(review): this relies on `dbutils` being predefined in the environment
# that executes clean_up.py — confirm for the way the step launches it.
account_name = dbutils.widgets.get("account_name")
input_path = dbutils.widgets.get("input_path")
output_path = dbutils.widgets.get("output_path")

# Root of the "azureml" container in the given storage account.
wasb_path = "wasbs://azureml@" + account_name + ".blob.core.windows.net/"

input_df = SparkSession.builder.getOrCreate().read.parquet(wasb_path + input_path)

# Select the final columns and overwrite any previous output at the target path.
output = SelectColumns().setCols(["url", "firstCeleb", "text", "sentimentScore"]).transform(input_df)
output.write.parquet(wasb_path + output_path, mode='overwrite')

In [20]:
from wrappers.wrappers import dbStep

class CleanUpStep(dbStep):
  """Step definition that runs clean_up.py.

  The constructor only fills in `self.step`, a dict describing the step.
  NOTE(review): the base-class initializer is not called and the dict has no
  "outputs" entry — confirm that dbStep expects neither.
  """

  def __init__(self, input, output):
    """Build the step description.

    Args:
      input: data reference read by the script; its `path_on_datastore` is
        passed as "input_path" and the reference itself is listed in "inputs".
      output: data reference written by the script; supplies "output_path" and,
        through `output.datastore.account_name`, the "account_name" parameter.
    """
    script_params = {
      "account_name": output.datastore.account_name,
      "input_path": input.path_on_datastore,
      "output_path": output.path_on_datastore,
    }
    self.step = dict(
      name="Clean Up Step",
      notebook_params=script_params,
      inputs=[input],
      python_script_name="clean_up.py",
      source_directory=".",
      run_name="Clean Up",
      allow_reuse=True,
    )

# Datastore location for the cleaned-up result.
step_5_output = DataReference(
    datastore=def_blob_store,
    data_reference_name="step_5_output",
    path_on_datastore="raw_data/cog_services/celebs/output/",
)

# The clean-up step reads the step 4 reference and writes to step 5.
clean_up_step = CleanUpStep(step_4_output, step_5_output)

### Tying it all together

Now that we have built the stages of our pipeline, it's time to chain them together into a single pipeline that can be used to process batches of incoming data.

<img src="https://mmlspark.blob.core.windows.net/graphics/Cog%20Service%20NB/full%20pipe.png" width="800" style="float: center;"/>

In [22]:
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline as mlPipeline, PipelineData
from azureml.pipeline.steps import DatabricksStep
from azureml.core.experiment import Experiment

# Stages in the order they were defined above; each one is attached to the
# shared Databricks compute settings.
_pipeline_stages = (
    bing_image_search,
    recognize_domain_content,
    recognize_text_step,
    recognize_sentiment_step,
    clean_up_step,
)
steps = [stage.attach(databricks_compute_4_step) for stage in _pipeline_stages]

# Build the pipeline and validate it before anything is submitted.
pipeline = mlPipeline(workspace=ws, steps=steps)
pipeline.validate()
print("Simple validation complete")

In [23]:
# Submit the pipeline under a named experiment and wait until the run finishes.
experiment = Experiment(ws, 'CognitiveServices-Celebrity_Quote_Analysis')
pipeline_run = experiment.submit(pipeline)
pipeline_run.wait_for_completion()

# Print one status line per child step run; the job log is printed only for
# failed steps.
step_runs = pipeline_run.get_children()
for step_run in step_runs:
    status = step_run.get_status()
    print('Script:', step_run.name, 'status:', status)

    # Drop this guard if you want to see details even for successful steps.
    if status != "Failed":
        continue
    joblog = step_run.get_job_log()
    print('job log:', joblog)