In [1]:
!cd .. && make dataset && cd notebooks

>>> Downloading and saving data files...
Data files already downloaded.
>>> OK.



In [2]:
# Optional codecarbon emissions tracking (currently disabled).
# To enable it, uncomment the lines below together with the `tracker.stop()`
# cell at the end of the notebook.
# from codecarbon import OfflineEmissionsTracker

# tracker = OfflineEmissionsTracker(save_to_file=False, country_iso_code="FRA")
# tracker.start()

In [3]:
import json
import os
import random
import warnings
from pathlib import Path
import uuid
import modin.pandas as pd
import pandas
import plotly.io as pio
import requests
from dotenv import load_dotenv
from pandas_profiling import ProfileReport
from tqdm.notebook import tqdm_notebook as tqdm
import time

# Silence noisy library warnings so cell outputs stay readable.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


# Render plotly figures inline and use plotly as the DataFrame plotting backend.
pio.renderers.default = "notebook"
pd.options.plotting.backend = "plotly"


# Azure credentials are read from the environment (optionally from a .env file).
load_dotenv()

AZURE_LANGUAGE_ENDPOINT = os.getenv("AZURE_LANGUAGE_ENDPOINT")
AZURE_LANGUAGE_KEY = os.getenv("AZURE_LANGUAGE_KEY")

# Fail fast with a clear message: without this check a missing variable would
# silently produce a base URL starting with "None/..." and only fail later,
# inside the first HTTP call.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_LANGUAGE_ENDPOINT", AZURE_LANGUAGE_ENDPOINT),
        ("AZURE_LANGUAGE_KEY", AZURE_LANGUAGE_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in a .env file."
    )

AZURE_LANGUAGE_PROJECT_NAME = "BookFlight"
# Strip a trailing slash from the endpoint so the URL never contains "//language".
AZURE_LANGUAGE_BASE_URL = (
    f"{AZURE_LANGUAGE_ENDPOINT.rstrip('/')}"
    f"/language/analyze-conversations/projects/{AZURE_LANGUAGE_PROJECT_NAME}"
)
AZURE_LANGUAGE_API_VERSION = "2021-11-01-preview"

UNIQUE_ID = "-" + str(uuid.uuid4())

# Probability that a generated example is assigned to the "Train" split.
TRAIN_TEST_RATIO = 0.8

DATA_PATH = Path("../data")
FRAMES_JSON_PATH = Path(DATA_PATH, "raw/frames.json")

In [4]:
# Set to True to delete the existing Azure project and its trained model
# before re-importing. Left False so a plain "Run All" is non-destructive.
purge = False

if purge:
    headers = {
        "Ocp-Apim-Subscription-Key": AZURE_LANGUAGE_KEY,
    }

    # Same two DELETE calls as before, in the same order: the project itself,
    # then the model labelled with the project name.
    purge_targets = (
        AZURE_LANGUAGE_BASE_URL,
        f"{AZURE_LANGUAGE_BASE_URL}/models/{AZURE_LANGUAGE_PROJECT_NAME}",
    )

    for target in purge_targets:
        # Use the shared API-version constant rather than a hard-coded copy.
        url = f"{target}?api-version={AZURE_LANGUAGE_API_VERSION}"
        response = requests.delete(url=url, headers=headers)
        print(response)


<Response [202]>
<Response [404]>


In [5]:
%%time

raw_data = pd.read_json(FRAMES_JSON_PATH)

assets = {
    "intents": [
        {"name": "Book"},
        {"name": "Info"},
    ],
    "entities": [
        {"name": "or_city"},
        {"name": "dst_city"},
        {"name": "str_date"},
        {"name": "end_date"},
        {"name": "budget"},
    ],
    "examples": [],
}

unique_entities = {e["name"]: [] for e in assets["entities"]}

for turn in tqdm(raw_data["turns"]):
    for frame in turn:
        if frame["author"] == "wizard":
            continue

        is_book = False
        entities = []

        for act in frame["labels"]["acts_without_refs"]:
            for arg in act["args"]:
                if arg["key"] == "intent" and arg["val"] == "book":
                    is_book = True

                if (
                    arg["key"] in [e["name"] for e in assets["entities"]]
                    and arg["val"] is not None
                    and frame["text"].find(arg["val"]) != -1
                    and arg["val"] not in unique_entities[arg["key"]]
                ):
                    unique_entities[arg["key"]].append(arg["val"])
                    entity = {
                        "entityName": arg["key"],
                        "offset": frame["text"].index(arg["val"]),
                        "length": len(arg["val"]),
                    }
                    entities.append(entity)

        if len(entities) > 0:
            assets["examples"].append(
                {
                    "text": frame["text"],
                    "language": "en-us",
                    "intent": "Book" if is_book else "Info",
                    "entities": entities,
                    "dataset": "Train"
                    if random.random() < TRAIN_TEST_RATIO
                    else "Test",
                }
            )

  0%|          | 0/1369 [00:00<?, ?it/s]

CPU times: user 14.5 s, sys: 1.55 s, total: 16 s
Wall time: 27.4 s


In [6]:
## Import

# Submit the assets to the project's import endpoint. The service answers
# with a "Location" header pointing at the import job, which the next cell polls.

url = f"{AZURE_LANGUAGE_BASE_URL}/:import?api-version={AZURE_LANGUAGE_API_VERSION}"
headers = {
    "Ocp-Apim-Subscription-Key": AZURE_LANGUAGE_KEY,
    "format": "clu",
    "Content-Type": "application/json",
}
data = {
    "api-version": AZURE_LANGUAGE_API_VERSION,
    "metadata": {
        "name": AZURE_LANGUAGE_PROJECT_NAME,
        "type": "Conversation",
        "multilingual": False,
        "language": "en-us",
        "settings": {},
    },
    "assets": assets,
}

response = requests.post(url=url, headers=headers, data=json.dumps(data))

print(f"> Status code: {response.status_code}\n")
print(f"> Reason: {response.reason}\n")

# A rejected request carries no "Location" header; show the service's error
# body and raise an HTTP error instead of failing with an opaque KeyError.
if not response.ok:
    print(response.text)
response.raise_for_status()

location = response.headers["Location"]
print(f"> Location: {location}\n")

> Status code: 202

> Reason: Accepted

> Location: https://westeurope.api.cognitive.microsoft.com/language/analyze-conversations/projects/BookFlight/import/jobs/f893b3e3-79aa-4e5e-9c39-40f4c704c783_637889472000000000?api-version=2021-11-01-preview



In [7]:
# Poll the import job once per second until it succeeds.
# Job states from which "succeeded" can no longer be reached; without this
# check a failed job would keep the loop spinning forever.
# NOTE(review): state names taken from the service's job-status values - confirm.
failed_job_states = {"failed", "cancelled"}

response = requests.get(url=location, headers=headers)
response.raise_for_status()
job = response.json()

while job["status"] != "succeeded":
    if job["status"] in failed_job_states:
        raise RuntimeError(
            f"Import job ended with status {job['status']!r}:\n"
            f"{json.dumps(job, indent=4)}"
        )

    print("Waiting...")
    time.sleep(1)
    response = requests.get(url=location, headers=headers)
    response.raise_for_status()
    job = response.json()
    print(f"Status: {job['status']}\n")

print(json.dumps(job, indent=4))

{
    "jobId": "f893b3e3-79aa-4e5e-9c39-40f4c704c783_637889472000000000",
    "createdDateTime": "2022-05-24T19:19:46Z",
    "lastUpdatedDateTime": "2022-05-24T19:19:47Z",
    "expirationDateTime": "2022-05-31T19:19:46Z",
    "status": "succeeded"
}


In [8]:
## Train

# Start a training job for the imported project. The service answers with a
# "Location" header pointing at the training job, which the next cell polls.

url = f"{AZURE_LANGUAGE_BASE_URL}/:train?api-version={AZURE_LANGUAGE_API_VERSION}"
headers = {
    "Ocp-Apim-Subscription-Key": AZURE_LANGUAGE_KEY,
    "format": "clu",
    "Content-Type": "application/json",
}
data = {
    "modelLabel": AZURE_LANGUAGE_PROJECT_NAME,
    "RunVerification": True,
    "evaluationOptions": {
        "type": "set",
    },
}

response = requests.post(url=url, headers=headers, data=json.dumps(data))

print(f"> Status code: {response.status_code}\n")
print(f"> Reason: {response.reason}\n")

# A rejected request carries no "Location" header; show the service's error
# body and raise an HTTP error instead of failing with an opaque KeyError.
if not response.ok:
    print(response.text)
response.raise_for_status()

location = response.headers["Location"]
print(f"> Location: {location}\n")

> Status code: 202

> Reason: Accepted

> Location: https://westeurope.api.cognitive.microsoft.com/language/analyze-conversations/projects/BookFlight/train/jobs/0472dea2-de02-4f8a-9fc5-803e98472f1f_637889472000000000?api-version=2021-11-01-preview



In [9]:
# Poll the training job once per second, mirroring its reported completion
# percentage on a progress bar, until it succeeds.
# Job states from which "succeeded" can no longer be reached; without this
# check a failed job would keep the loop spinning forever.
# NOTE(review): state names taken from the service's job-status values - confirm.
failed_job_states = {"failed", "cancelled"}

response = requests.get(url=location, headers=headers)
response.raise_for_status()
job = response.json()

with tqdm(total=100) as progress:
    percent_done = 0
    while job["status"] != "succeeded":
        if job["status"] in failed_job_states:
            raise RuntimeError(
                f"Training job ended with status {job['status']!r}:\n"
                f"{json.dumps(job, indent=4)}"
            )

        time.sleep(1)
        response = requests.get(url=location, headers=headers)
        response.raise_for_status()
        job = response.json()

        # The progress section may be absent from a payload; in that case
        # keep the last known percentage instead of raising a KeyError.
        train_status = (job.get("result") or {}).get("trainStatus") or {}
        percent_now = train_status.get("percentComplete", percent_done)
        progress.update(percent_now - percent_done)
        percent_done = percent_now

    # Fill the bar once the job has succeeded.
    progress.update(100 - percent_done)

print(json.dumps(job, indent=4))

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Counterpart of the commented-out codecarbon tracker setup near the top of
# the notebook; uncomment both together to enable tracking.
# tracker.stop()

[codecarbon INFO @ 19:53:47] Energy consumed for RAM : 0.001009 kWh. RAM Power : 5.758007526397705 W
[codecarbon INFO @ 19:53:47] Energy consumed for all CPUs : 0.000000 kWh. All CPUs Power : 0.0 W
[codecarbon INFO @ 19:53:47] 0.001009 kWh of electricity used since the begining.


5.5489611622562685e-05