# Language Model

This notebook does the same thing as the `luis.ipynb` notebook, but with the Azure Language Service. It works, but it is not used in the rest of this project, because the Azure Language Service is still very new and its integration with the Bot Service is not yet well documented.


In [1]:
!cd .. && make dataset && cd notebooks

>>> Downloading and saving data files...
Data files already downloaded.
>>> OK.



In [3]:
import json
import os
import random
import warnings
from pathlib import Path
import uuid
import modin.pandas as pd
import pandas
import plotly.io as pio
import requests
from dotenv import load_dotenv
from pandas_profiling import ProfileReport
from tqdm.notebook import tqdm_notebook as tqdm
import time

# --- Warnings -------------------------------------------------------------
# Hide UserWarning / FutureWarning noise so the cell outputs stay readable.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Plotting -------------------------------------------------------------
# Render plotly figures inline and make plotly the DataFrame `.plot` backend.
pio.renderers.default = "notebook"
pd.options.plotting.backend = "plotly"

# --- Azure Language Service -----------------------------------------------
# Credentials are read from the environment (optionally populated from a
# `.env` file); both values are None when the variable is not set.
load_dotenv()
AZURE_LANGUAGE_ENDPOINT = os.environ.get("AZURE_LANGUAGE_ENDPOINT")
AZURE_LANGUAGE_KEY = os.environ.get("AZURE_LANGUAGE_KEY")

# Project name and REST API version used in every request URL below.
AZURE_LANGUAGE_PROJECT_NAME = "BookFlight"
AZURE_LANGUAGE_API_VERSION = "2022-03-01-preview"

# --- Data -----------------------------------------------------------------
DATA_PATH = Path("../data")
FRAMES_JSON_PATH = DATA_PATH / "raw/frames.json"

# Probability that a labelled utterance is assigned to the "Train" dataset.
TRAIN_TEST_RATIO = 0.8

In [4]:
# Set to False to keep an existing project instead of deleting it first.
purge = True

if purge:
    url = f"{AZURE_LANGUAGE_ENDPOINT}/language/authoring/analyze-conversations/projects/{AZURE_LANGUAGE_PROJECT_NAME}?api-version={AZURE_LANGUAGE_API_VERSION}"
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_LANGUAGE_KEY,
    }

    # A timeout keeps the cell from hanging forever on an unreachable endpoint.
    response = requests.delete(url=url, headers=headers, timeout=30)
    print(response)

    # When the service answers with an `operation-location` header, the deletion
    # runs as a background job. Wait for it to leave the in-flight states so the
    # import further down does not race with a deletion that is still running.
    delete_job_url = response.headers.get("operation-location")
    if delete_job_url is not None:
        delete_job = requests.get(url=delete_job_url, headers=headers, timeout=30).json()
        while delete_job.get("status") in ("notStarted", "running"):
            time.sleep(2)
            delete_job = requests.get(
                url=delete_job_url, headers=headers, timeout=30
            ).json()
        print(f"> Deletion status: {delete_job.get('status')}")

<Response [404]>


In [5]:
raw_data = pd.read_json(FRAMES_JSON_PATH)

# Project assets sent with the import request: the intent and entity categories
# are fixed here; the labelled utterances are filled in by the loop below.
assets = {
    "intents": [
        {"category": "Book"},
        {"category": "Info"},
    ],
    "entities": [
        {"category": "or_city"},
        {"category": "dst_city"},
        {"category": "str_date"},
        {"category": "end_date"},
        {"category": "budget"},
    ],
    "utterances": [],
}

# Dedicated, seeded generator so that re-running the cell yields the same
# Train/Test assignment (the module-level `random` state is left untouched).
SPLIT_SEED = 42
split_rng = random.Random(SPLIT_SEED)

# Computed once instead of being rebuilt for every argument of every act.
entity_categories = {entity["category"] for entity in assets["entities"]}

unique_utterances = []  # distinct non-wizard utterances, in first-seen order
seen_utterances = set()  # same texts, kept as a set for O(1) duplicate checks


def _utf16_len(text):
    """Return the length of `text` in UTF-16 code units.

    The import request in this notebook declares `stringIndexType` as
    `Utf16CodeUnit`, whereas Python indexes strings by code point. The two only
    differ for characters outside the Basic Multilingual Plane (e.g. emoji),
    which take two UTF-16 code units each. `surrogatepass` keeps lone
    surrogates from raising during the encode.
    """
    return len(text.encode("utf-16-le", "surrogatepass")) // 2


for turn in tqdm(raw_data["turns"]):
    for frame in turn:
        text = frame["text"]

        # Keep only the first occurrence of each non-wizard utterance.
        if frame["author"] == "wizard" or text in seen_utterances:
            continue

        seen_utterances.add(text)
        unique_utterances.append(text)

        is_book = False
        entities = []

        for act in frame["labels"]["acts_without_refs"]:
            for arg in act["args"]:
                key = arg["key"]
                # `.get` so that an argument carrying no value is treated like a
                # None value instead of raising KeyError.
                value = arg.get("val")

                if key == "intent" and value == "book":
                    is_book = True

                # Only non-empty string values of a known entity category can be
                # located in the text; an empty value would otherwise yield a
                # zero-length entity at offset 0.
                if key not in entity_categories or not isinstance(value, str) or not value:
                    continue

                # First occurrence of the value in the utterance, if any.
                start = text.find(value)
                if start == -1:
                    continue

                entities.append(
                    {
                        "category": key,
                        "offset": _utf16_len(text[:start]),
                        "length": _utf16_len(value),
                    }
                )

        # Utterances without any located entity are not added to the project.
        if len(entities) > 0:
            assets["utterances"].append(
                {
                    "text": text,
                    "language": "en-us",
                    "intent": "Book" if is_book else "Info",
                    "entities": entities,
                    "dataset": "Train"
                    if split_rng.random() < TRAIN_TEST_RATIO
                    else "Test",
                }
            )

  0%|          | 0/1369 [00:00<?, ?it/s]

In [6]:
## Import
# Submit the project definition (metadata + labelled assets) as an import job.

url = f"{AZURE_LANGUAGE_ENDPOINT}/language/authoring/analyze-conversations/projects/{AZURE_LANGUAGE_PROJECT_NAME}/:import?api-version={AZURE_LANGUAGE_API_VERSION}"
headers = {
    "Ocp-Apim-Subscription-Key": AZURE_LANGUAGE_KEY,
}
data = {
    "api-version": AZURE_LANGUAGE_API_VERSION,
    "stringIndexType": "Utf16CodeUnit",
    "metadata": {
        "projectName": AZURE_LANGUAGE_PROJECT_NAME,
        "projectKind": "conversation",
        "multilingual": False,
        "language": "en-us",
    },
    "assets": assets,
}

# `json=` serialises the payload and declares it as JSON; the timeout keeps the
# cell from hanging forever on an unreachable endpoint.
response = requests.post(url=url, headers=headers, json=data, timeout=60)

# Report the outcome *before* touching the response headers, so that a rejected
# request shows its status and body instead of a bare KeyError.
print(f"> Status code: {response.status_code}")
print(f"> Reason: {response.reason}")
if not response.ok:
    print(f"> Body: {response.text}")
response.raise_for_status()

# URL of the import job, polled in the next cell.
location = response.headers["operation-location"]

> Status code: 202
> Reason: Accepted


In [7]:
# Poll the import job until it leaves the in-flight states. A job that is still
# queued ("notStarted") must be waited on as well, not only a "running" one.
response = requests.get(url=location, headers=headers, timeout=30)
status = response.json()["status"]

while status in ("notStarted", "running", "cancelling"):
    print(f"> Status: {status}")
    print("> Waiting...\n")
    time.sleep(5)
    response = requests.get(url=location, headers=headers, timeout=30)
    status = response.json()["status"]

print(json.dumps(response.json(), indent=4))

# Stop here on failure so that the following cells do not run against a
# project that was never imported.
if status in ("failed", "cancelled"):
    raise RuntimeError(f"Import job ended with status '{status}'.")

> Status: running
> Waiting...

{
    "jobId": "28903b37-7c86-4371-8a2c-9e7bf67868f9_637893792000000000",
    "createdDateTime": "2022-05-29T15:45:06Z",
    "lastUpdatedDateTime": "2022-05-29T15:45:08Z",
    "expirationDateTime": "2022-06-05T15:45:06Z",
    "status": "succeeded"
}


In [9]:
## Train
# Start a training job for the imported project.


url = f"{AZURE_LANGUAGE_ENDPOINT}/language/authoring/analyze-conversations/projects/{AZURE_LANGUAGE_PROJECT_NAME}/:train?api-version={AZURE_LANGUAGE_API_VERSION}"
headers = {
    "Ocp-Apim-Subscription-Key": AZURE_LANGUAGE_KEY,
    "Content-Type": "application/json",
}
data = {
    # The trained model is labelled with the project name; the deployment cell
    # refers to it by that same label.
    "modelLabel": AZURE_LANGUAGE_PROJECT_NAME,
    "trainingMode": "standard",
    "evaluationOptions": {
        "kind": "manual",
    },
}

# The timeout keeps the cell from hanging forever on an unreachable endpoint.
response = requests.post(url=url, headers=headers, json=data, timeout=60)

# Report the outcome *before* touching the response headers, so that a rejected
# request shows its status and body instead of a bare KeyError.
print(f"> Status code: {response.status_code}")
print(f"> Reason: {response.reason}")
if not response.ok:
    print(f"> Body: {response.text}")
response.raise_for_status()

# URL of the training job, polled in the next cell.
location = response.headers["operation-location"]

> Status code: 202
> Reason: Accepted


In [10]:
# Poll the training job and mirror its reported progress in a progress bar.
response = requests.get(url=location, headers=headers, timeout=30)
job = response.json()

with tqdm(total=100) as progress:
    done = 0  # percentage already shown on the bar

    # Stop on any final status, not only on success: otherwise a failed or
    # cancelled job would keep this loop spinning forever.
    while job["status"] not in ("succeeded", "failed", "cancelled", "partiallyCompleted"):
        time.sleep(1)
        response = requests.get(url=location, headers=headers, timeout=30)
        job = response.json()

        # The progress details may be missing from a response; in that case
        # keep the last known percentage instead of raising KeyError.
        percent = (
            job.get("result", {}).get("trainingStatus", {}).get("percentComplete", done)
        )
        progress.update(max(percent - done, 0))
        done = max(percent, done)

    # Only fill the bar when the job actually succeeded.
    if job["status"] == "succeeded":
        progress.update(100 - done)

print(json.dumps(job, indent=4))

# Stop here on failure so that the deployment cell does not run without a model.
if job["status"] != "succeeded":
    raise RuntimeError(f"Training job ended with status '{job['status']}'.")

  0%|          | 0/100 [00:00<?, ?it/s]

{
    "result": {
        "modelLabel": "BookFlight",
        "trainingConfigVersion": "2022-05-01",
        "trainingMode": "standard",
        "trainingStatus": {
            "percentComplete": 100,
            "startDateTime": "2022-05-29T15:45:16.73844Z",
            "endDateTime": "2022-05-29T15:49:01.9239945Z",
            "status": "succeeded"
        },
        "evaluationStatus": {
            "percentComplete": 100,
            "startDateTime": "2022-05-29T15:49:01.9482545Z",
            "endDateTime": "2022-05-29T15:50:47.0671822Z",
            "status": "succeeded"
        }
    },
    "jobId": "b2f5ba89-6e20-419a-bbca-8da973c2ffd8_637893792000000000",
    "createdDateTime": "2022-05-29T15:45:14Z",
    "lastUpdatedDateTime": "2022-05-29T15:50:48Z",
    "expirationDateTime": "2022-06-05T15:45:14Z",
    "status": "succeeded"
}


In [11]:
## Deploy
# Deploy the trained model under the deployment name "production".


url = f"{AZURE_LANGUAGE_ENDPOINT}/language/authoring/analyze-conversations/projects/{AZURE_LANGUAGE_PROJECT_NAME}/deployments/production?api-version={AZURE_LANGUAGE_API_VERSION}"
headers = {
    "Ocp-Apim-Subscription-Key": AZURE_LANGUAGE_KEY,
}
data = {
    # Same label as the `modelLabel` used by the training request.
    "trainedModelLabel": AZURE_LANGUAGE_PROJECT_NAME,
}

# `json=` serialises the payload and declares it as JSON; the timeout keeps the
# cell from hanging forever on an unreachable endpoint.
response = requests.put(url=url, headers=headers, json=data, timeout=60)

# Report the outcome *before* touching the response headers, so that a rejected
# request shows its status and body instead of a bare KeyError.
print(f"> Status code: {response.status_code}")
print(f"> Reason: {response.reason}")
if not response.ok:
    print(f"> Body: {response.text}")
response.raise_for_status()

# URL of the deployment job, polled in the next cell.
location = response.headers["operation-location"]

> Status code: 202
> Reason: Accepted


In [12]:
# Poll the deployment job until it leaves the in-flight states. A job that is
# still queued ("notStarted") must be waited on as well, not only a "running" one.
response = requests.get(url=location, headers=headers, timeout=30)
status = response.json()["status"]

while status in ("notStarted", "running", "cancelling"):
    print(f"> Status: {status}")
    print("> Waiting...\n")
    time.sleep(5)
    response = requests.get(url=location, headers=headers, timeout=30)
    status = response.json()["status"]

print(json.dumps(response.json(), indent=4))

# Stop here on failure so that the test query does not hit a missing deployment.
if status in ("failed", "cancelled"):
    raise RuntimeError(f"Deployment job ended with status '{status}'.")

> Status: running
> Waiting...

> Status: running
> Waiting...

{
    "jobId": "3f967ffe-5bfb-4454-9450-d2d684fbef8c_637893792000000000",
    "createdDateTime": "2022-05-29T15:50:50Z",
    "lastUpdatedDateTime": "2022-05-29T15:50:58Z",
    "expirationDateTime": "2022-06-05T15:50:50Z",
    "status": "succeeded"
}


In [13]:
## Test
# Send one sample utterance to the "production" deployment and show the prediction.


url = f"{AZURE_LANGUAGE_ENDPOINT}/language/:analyze-conversations?api-version={AZURE_LANGUAGE_API_VERSION}"
headers = {
    "Ocp-Apim-Subscription-Key": AZURE_LANGUAGE_KEY,
}
data = {
    "kind": "CustomConversation",
    "analysisInput": {
        "conversationItem": {
            "participantId": "test",
            "id": "test",
            "modality": "text",
            "text": "I want to book a flight from Paris to London next week for less than $100.",
            "language": "en-us",
        }
    },
    "parameters": {
        "projectName": AZURE_LANGUAGE_PROJECT_NAME,
        "deploymentName": "production",
    },
}

# `json=` serialises the payload and declares it as JSON; the timeout keeps the
# cell from hanging forever on an unreachable endpoint.
response = requests.post(url=url, headers=headers, json=data, timeout=30)

print(json.dumps(response.json(), indent=4))

# The body is printed first so that the service's error details stay visible;
# an HTTP error then fails the cell instead of passing silently.
response.raise_for_status()

{
    "kind": "CustomConversationResult",
    "results": {
        "query": "I want to book a flight from Paris to London next week for less than $100.",
        "prediction": {
            "topIntent": "Info",
            "projectKind": "conversation",
            "intents": [
                {
                    "category": "Info",
                    "confidenceScore": 0.95340455
                },
                {
                    "category": "Book",
                    "confidenceScore": 0.9430551
                },
                {
                    "category": "None",
                    "confidenceScore": 0
                }
            ],
            "entities": [
                {
                    "category": "or_city",
                    "text": "Paris",
                    "offset": 29,
                    "length": 5,
                    "confidenceScore": 1
                },
                {
                    "category": "dst_city",
                    "te