# Data Convertion

Here's the code for converting different datasets to a my format.

In [21]:
import json

In [2]:
# ! mkdir ../data/intent_records

## Dream

source:
- english https://github.com/deeppavlov/dream/blob/new_intents/annotators/IntentCatcherTransformers/intent_phrases.json
- russian https://github.com/deeppavlov/dream/blob/new_intents/annotators/IntentCatcherTransformers/intent_phrases_RU.json

In [3]:
dream = json.load(open("../data/dream.json"))
ru_dream = json.load(open("../data/ru_dream.json"))

In [4]:
def convert_dream(dream_dict):
    res = []
    for i, (intent_name, all_phrases) in enumerate(dream_dict["intent_phrases"].items()):
        intent_record = {
            "intent_id": i,
            "intent_name": intent_name,
            "sample_utterances": [],
            "regexp_full_match": all_phrases["phrases"],
            "regexp_partial_match": all_phrases.get("reg_phrases", []),
        }
        res.append(intent_record)
    return res

In [5]:
dream_records = convert_dream(dream)

In [6]:
import pandas as pd

pd.DataFrame.from_records(dream_records)

Unnamed: 0,intent_id,intent_name,sample_utterances,regexp_full_match,regexp_partial_match
0,0,what_are_you_talking_about,[],"[(alexa ){0,1}what are ((you)|(we)) ((talking ...","[(alexa ){0,1}are we having a communication pr..."
1,1,topic_switching,[],"[(that's ){0,1}enough( talking ){0,1} about ((...","[tell me something else, don't tell me about ...."
2,2,lets_chat_about,[],"[(stop ){0,1}(((let's )|(i want to )|(wanna )|...",[.*let(('s)|(s)) ((chat)|(talk)) ((to|with) (m...
3,3,exit,[],"[be quiet, (see you ){0,1}later, leave me alon...","[(leave|end) (the|this) conversation, alexa do..."
4,4,tell_me_a_story,[],"[tell me ((another)|(other)) story, ((can you ...","[(can you ){0,1}tell me a .* story]"
5,5,repeat,[],"[i did not hear you, what come again, what did...","[one second, what( (book|movie))?, say it agai..."
6,6,yes,[],"[yes yes yes, ((sure)|(fine)|(okay)|(ok)|(yes)...","[you bet, kind of, sort of, oh yeah, maybe, it..."
7,7,no,[],"[(alexa ){0,1}((no)|(nope)|(no way)|(don't)|(d...","[.* no no, .* not today]"
8,8,dont_understand,[],"[(because ){0,1}you are being confusing(, alex...",[]
9,9,stupid,[],"[(alexa ){0,1}why are you this ((stupid)|(dump...",[]


In [7]:
json.dump(dream_records, open("../data/intent_records/dream.json", "w"), indent=4, ensure_ascii=False)

In [8]:
dream_ru_records = convert_dream(ru_dream)
pd.DataFrame.from_records(dream_ru_records)

Unnamed: 0,intent_id,intent_name,sample_utterances,regexp_full_match,regexp_partial_match
0,0,what_are_you_talking_about,[],"[о ((чем)|(чём)) ты( говоришь){0,1}( вообще){0...","[о ((чем)|(чём)) ты( говоришь){0,1}( вообще){0..."
1,1,topic_switching,[],[((хватит)|(прекрати)|(не хочу)|(не хочу больш...,[((хватит)|(прекрати)|(не хочу)|(не хочу больш...
2,2,lets_chat_about,[],[((можем)|(можешь)|(давай))(( мы)|( ты)|( я)){...,[((можем)|(можешь)|(давай))(( мы)|( ты)|( я)){...
3,3,exit,[],"[пока, хватит, закончим разговор, мне пора]","[пока(((-)|( ))пока){0,1}, хватит ((болтать)|(..."
4,4,repeat,[],"[повтори( еще раз){0,1}( пожалуйста){0,1}, ((м...","[повтори( еще раз){0,1}( пожалуйста){0,1}, ((м..."
5,5,yes,[],[((да)|(конечно)|(разумеется)|(точно)|(согласе...,[((да)|(конечно)|(разумеется)|(точно)|(согласе...
6,6,no,[],[((нет)|(нее)|(неа)|(ни за что)|(ни в коем слу...,[((нет)|(нее)|(неа)|(ни за что)|(ни в коем слу...
7,7,what_is_your_name,[],"[((представься)|(представь себя)), у тебя есть...","[((представься)|(представь себя)), у тебя есть..."
8,8,where_are_you_from,[],"[откуда ((ты)|(вы))( родом){0,1}, ((какая)|(ка...","[откуда ((ты)|(вы))( родом){0,1}, ((какая)|(ка..."
9,9,what_can_you_do,[],[что ты ((умеешь)|(можешь)|(способна)|(способе...,[.*что ты ((умеешь)|(можешь)|(способна)|(спосо...


In [9]:
json.dump(dream_ru_records, open("../data/intent_records/ru_dream.json", "w"), indent=4, ensure_ascii=False)

## banking77

source: https://huggingface.co/datasets/PolyAI/banking77

In [10]:
from datasets import load_dataset

banking77 = load_dataset("PolyAI/banking77")

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
banking77

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [12]:
banking77["train"][0]

{'text': 'I am still waiting on my card?', 'label': 11}

In [13]:
# ! wget https://huggingface.co/datasets/PolyAI/banking77/resolve/main/dataset_infos.json -O ../data/banking77_info.json

### intent records

In [14]:
banking77_info = json.load(open("../data/banking77_info.json"))
intent_names = banking77_info["default"]["features"]["label"]["names"]

In [15]:
def convert_banking77(banking77_train, shots_per_intent, intent_names):
    all_labels = sorted(banking77_train.unique("label"))
    assert all_labels == list(range(len(intent_names)))

    res = [
        {
            "intent_id": i,
            "intent_name": name,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i, name in enumerate(intent_names)
    ]

    for b77_batch in banking77_train.iter(batch_size=16, drop_last_batch=False):
        for txt, intent_id in zip(b77_batch["text"], b77_batch["label"], strict=False):
            target_list = res[intent_id]["sample_utterances"]
            if len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    return res

In [16]:
banking77_records = convert_banking77(banking77["train"], shots_per_intent=5, intent_names=intent_names)

In [17]:
banking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ["Please help me with my card.  It won't activate.",
  'I tired but an unable to activate my card.',
  'I want to start using my card.',
  'How do I verify my new card?',
  "I tried activating my plug-in and it didn't piece of work"],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [18]:
json.dump(banking77_records, open("../data/intent_records/banking77.json", "w"), indent=4, ensure_ascii=False)

### utterance records

In [5]:
import json

banking77_records = json.load(open("../data/intent_records/banking77.json"))
ru_banking77_records = json.load(open("../data/intent_records/ru_banking77.json"))
banking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ["Please help me with my card.  It won't activate.",
  'I tired but an unable to activate my card.',
  'I want to start using my card.',
  'How do I verify my new card?',
  "I tried activating my plug-in and it didn't piece of work"],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [6]:
def get_utterance_records(intent_records):
    res = []
    for rec in intent_records:
        for utt in rec["sample_utterances"]:
            res.append({"intent_id": rec["intent_id"], "intent_name": rec["intent_name"], "utterance": utt})
    return res

In [7]:
banking77_utterance_records = get_utterance_records(banking77_records)
ru_banking77_utterance_records = get_utterance_records(ru_banking77_records)

In [8]:
json.dump(
    banking77_utterance_records, open("../data/utterance_records/banking77.json", "w"), indent=4, ensure_ascii=False
)
json.dump(
    ru_banking77_utterance_records,
    open("../data/utterance_records/ru_banking77.json", "w"),
    indent=4,
    ensure_ascii=False,
)

## russian banking77

source: https://github.com/LadaNikitina/RuBanking77

In [20]:
# ! git clone https://github.com/LadaNikitina/RuBanking77 ../data/RuBanking77
# ! rm -rf ../data/RuBanking77/.git

Cloning into '../data/RuBanking77'...
Username for 'https://github.com': ^C


In [21]:
from datasets import load_from_disk

rubanking77 = load_from_disk("../data/RuBanking77")
rubanking77

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [22]:
rubanking77["train"][0]

{'text': 'Я все еще жду свою карту?', 'label': 11}

In [23]:
rubanking77_records = convert_banking77(rubanking77["train"], shots_per_intent=5)

In [24]:
rubanking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ['Пожалуйста, помогите мне с моей картой. Она не активируется.',
  'Я устал, но не могу активировать свою карту.',
  'Я хочу начать пользоваться своей картой.',
  'Как мне проверить мою новую карту?',
  'Я попытался активировать свой плагин, и это не сработало.'],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [25]:
json.dump(rubanking77_records, open("../data/intent_records/ru_banking77.json", "w"), indent=4, ensure_ascii=False)

## clinc150

In [39]:
from datasets import load_dataset

clinc150 = load_dataset("cmaldona/All-Generalization-OOD-CLINC150")
clinc150

DatasetDict({
    train: Dataset({
        features: ['data', 'labels', 'domain', 'generalisation'],
        num_rows: 15200
    })
    validation: Dataset({
        features: ['data', 'labels', 'domain', 'generalisation'],
        num_rows: 3200
    })
    test: Dataset({
        features: ['data', 'labels', 'domain', 'generalisation'],
        num_rows: 7900
    })
})

In [40]:
intent_names = sorted(clinc150["train"].unique("labels"))
intent_names[:5]

151


['accept_reservations',
 'account_blocked',
 'alarm',
 'application_status',
 'apr']

In [41]:
oos_samples = clinc150.filter(lambda x: x["labels"] == "ood")

In [42]:
oos_samples["train"][2]

{'data': 'what size wipers does this car take',
 'labels': 'ood',
 'domain': 'Unknown',
 'generalisation': 'near-OOD'}

In [44]:
for name in intent_names:
    samples = clinc150["train"].filter(lambda x: x["labels"] == name)
    if len(samples) > 100:
        pass

ood


In [29]:
def convert_clinc150(clinc150_train, shots_per_intent, oos_intent_name="ood"):
    intent_names = sorted(clinc150_train.unique("labels"))
    oos_intent_id = intent_names.index(oos_intent_name)
    intent_names = intent_names[:oos_intent_id] + intent_names[oos_intent_id + 1 :] + [intent_names[oos_intent_id]]
    name_to_id = dict(zip(intent_names, range(len(intent_names)), strict=False))
    name_to_id[oos_intent_name] = -1

    res = [
        {
            "intent_id": i,
            "intent_name": name,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for name, i in name_to_id.items()
    ]

    for batch in clinc150_train.iter(batch_size=16, drop_last_batch=False):
        for txt, name in zip(batch["data"], batch["labels"], strict=False):
            intent_id = name_to_id[name]
            target_list = res[intent_id]["sample_utterances"]
            if name != oos_intent_name and len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    return res

In [30]:
clinc150_records = convert_clinc150(clinc150["train"], shots_per_intent=5)

In [31]:
clinc150_records[0]

{'intent_id': 0,
 'intent_name': 'accept_reservations',
 'sample_utterances': ['can i make a reservation for redrobin',
  'is it possible to make a reservation at redrobin',
  'does redrobin take reservations',
  'are reservations taken at redrobin',
  'does redrobin do reservations'],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [32]:
clinc150_records[-1]

{'intent_id': -1,
 'intent_name': 'ood',
 'sample_utterances': ['how much is an overdraft fee for bank',
  'why are exponents preformed before multiplication in the order of operations',
  'what size wipers does this car take',
  'where is the dipstick',
  'how much is 1 share of aapl',
  'how is glue made',
  'any headlines from my area',
  'what is the largest state in the us',
  'what is the current market trend',
  'what is the most popular airline',
  'what is the formula for the circumference of a circle',
  'what are some ways to reduce spending',
  'what time does the louvre open',
  'are there any local stations covering the moral march in raleigh in februaryu',
  'how many planets have we discovered',
  'how do i change my billing address',
  'how do i open an online line of credit',
  'how do i remove a coffee blemish',
  'how expensive is an apple share',
  'where can i find the cruise control on my kia sportage 2010',
  'how many sides are in a hexagon',
  'how fast does a

In [33]:
import json

json.dump(clinc150_records, open("../data/intent_records/clinc150.json", "w"), indent=4, ensure_ascii=False)

## russian clinc150

In [None]:
# ! git clone https://github.com/LadaNikitina/clinc150 ../data/RuClinc150
# ! rm -rf ../data/RuClinc150/.git

In [47]:
from datasets import load_from_disk

ruclinc150 = load_from_disk("../data/RuClinc150")
ruclinc150

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 15250
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
})

In [48]:
ruclinc150["train"][0]

{'text': 'Какое выражение я бы использовал, чтобы сказать, что я люблю тебя, если бы я был итальянцем?',
 'intent': 61}

In [49]:
intent_labels = sorted(ruclinc150["train"].unique("intent"))

151


find index of ood class:

In [50]:
for i in intent_labels:
    samples = ruclinc150["train"].filter(lambda x: x["intent"] == i)
    if len(samples) > 100:
        pass

42


In [51]:
def convert_ruclinc150(clinc150_train, shots_per_intent, ood_index=42):
    all_labels = sorted(clinc150_train.unique("intent"))
    assert all_labels == list(range(151))

    in_domain_samples = clinc150_train.filter(lambda x: x["intent"] != ood_index)
    oos_samples = clinc150_train.filter(lambda x: x["intent"] == ood_index)

    res = [
        {
            "intent_id": i,
            "intent_name": None,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i in range(150)
    ]

    for batch in in_domain_samples.iter(batch_size=16, drop_last_batch=False):
        for txt, intent_id in zip(batch["text"], batch["intent"], strict=False):
            intent_id -= int(intent_id > ood_index)
            target_list = res[intent_id]["sample_utterances"]
            if len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    res.append(
        {
            "intent_id": -1,
            "intent_name": "ood",
            "sample_utterances": oos_samples["text"],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
    )

    return res

In [52]:
ruclinc150_records = convert_ruclinc150(ruclinc150["train"], shots_per_intent=5)

Filter:   0%|          | 0/15250 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15250 [00:00<?, ? examples/s]

In [53]:
ruclinc150_records[0]

{'intent_id': 0,
 'intent_name': None,
 'sample_utterances': ['как делают пончики в tgi',
  'Пончики в TGI хорошо оценены',
  'как выглядят пончики отзывы в tgi',
  'какие отзывы о пончиках в тги',
  'Люди рекомендуют пончики в TGI'],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [54]:
ruclinc150_records[-1]

{'intent_id': -1,
 'intent_name': 'ood',
 'sample_utterances': ['Сколько стоит овердрафт для банка',
  'почему экспоненты образуются перед умножением в порядке операций',
  'Какой размер стеклоочистителей принимает этот автомобиль',
  'Где находится Дипстик',
  'Сколько стоит 1 акция AAPL',
  'Как производится клей',
  'любые заголовки из моей области',
  'какой самый большой штат в сша',
  'Какова текущая тенденция на рынке',
  'какая самая популярная авиакомпания',
  'что такое формула для окружности круга',
  'Какие есть способы сократить расходы',
  'Во сколько открывается Лувр',
  'Есть ли какие-либо местные станции, освещающие моральный марш в Роли в феврале?',
  'Сколько планет мы обнаружили',
  'Как я могу изменить свой платежный адрес',
  'Как открыть кредитную линию онлайн',
  'Как удалить пятно от кофе',
  'Сколько стоит акция Apple',
  'Где я могу найти круиз-контроль на моем Kia Sportage 2010',
  'Сколько сторон в шестиугольнике',
  'С какой скоростью летит самолет',
  'Ст

In [55]:
json.dump(ruclinc150_records, open("../data/intent_records/ru_clinc150.json", "w"), indent=4, ensure_ascii=False)

## Snips

In [42]:
from datasets import load_dataset

snips = load_dataset("benayas/snips")
snips

Downloading readme:   0%|          | 0.00/426 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/370k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/45.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13084 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'category'],
        num_rows: 13084
    })
    test: Dataset({
        features: ['text', 'category'],
        num_rows: 1400
    })
})

In [44]:
intent_names = sorted(snips["train"].unique("category"))
intent_names

7


['AddToPlaylist',
 'BookRestaurant',
 'GetWeather',
 'PlayMusic',
 'RateBook',
 'SearchCreativeWork',
 'SearchScreeningEvent']

In [47]:
def convert_snips(snips_train, shots_per_intent):
    intent_names = sorted(snips_train.unique("category"))
    name_to_id = dict(zip(intent_names, range(len(intent_names)), strict=False))

    res = [
        {
            "intent_id": i,
            "intent_name": name,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i, name in enumerate(intent_names)
    ]

    for batch in snips_train.iter(batch_size=16, drop_last_batch=False):
        for txt, name in zip(batch["text"], batch["category"], strict=False):
            intent_id = name_to_id[name]
            target_list = res[intent_id]["sample_utterances"]
            if len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    return res

In [48]:
snips_records = convert_snips(snips["train"], shots_per_intent=5)

In [49]:
json.dump(snips_records, open("../data/intent_records/snips.json", "w"), indent=4, ensure_ascii=False)

## russian Snips

In [None]:
# ! git clone https://github.com/LadaNikitina/Snips ../data/RuSnips
# ! rm -rf ../data/RuSnips/.git

In [30]:
from datasets import load_from_disk

rusnips = load_from_disk("../data/RuSnips")
rusnips

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 13784
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 700
    })
})

In [31]:
rusnips["train"][0]

{'text': 'Добавьте еще одну песню в плейлист Cita Romántica.',
 'intent': 'add_to_playlist'}

In [33]:
rusnips["train"].unique("intent")

['add_to_playlist',
 'book_restaurant',
 'get_weather',
 'play_music',
 'rate_book',
 'search_creative_work',
 'search_screening_event']

In [39]:
def convert_rusnips(snips_train, shots_per_intent):
    intent_names = sorted(snips_train.unique("intent"))
    name_to_id = dict(zip(intent_names, range(len(intent_names)), strict=False))

    res = [
        {
            "intent_id": i,
            "intent_name": name,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i, name in enumerate(intent_names)
    ]

    for batch in snips_train.iter(batch_size=16, drop_last_batch=False):
        for txt, name in zip(batch["text"], batch["intent"], strict=False):
            intent_id = name_to_id[name]
            target_list = res[intent_id]["sample_utterances"]
            if len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    return res

In [40]:
rusnips_records = convert_rusnips(rusnips["train"], shots_per_intent=5)

In [41]:
json.dump(rusnips_records, open("../data/intent_records/ru_snips.json", "w"), indent=4, ensure_ascii=False)

## hwu64

source: https://github.com/jianguoz/Few-Shot-Intent-Detection/tree/main/Datasets/HWU64/train

In [1]:
hwu64_labels = open("../data/hwu_assets/label.txt").read().split("\n")[:-1]
hwu64_labels[:5]

8954


['alarm_query', 'alarm_query', 'alarm_query', 'alarm_query', 'alarm_query']

In [2]:
hwu64_utterances = open("../data/hwu_assets/seq.in").read().split("\n")[:-1]
hwu64_utterances[:5]

8954


['what alarms do i have set right now',
 'checkout today alarm of meeting',
 'report alarm settings',
 'see see for me the alarms that you have set tomorrow morning',
 'is there an alarm for ten am']

In [3]:
len(set(hwu64_labels))

64

In [4]:
def convert_hwu64(hwu_utterances, hwu_labels, shots_per_intent):
    intent_names = sorted(set(hwu_labels))
    name_to_id = dict(zip(intent_names, range(len(intent_names)), strict=False))

    res = [
        {
            "intent_id": i,
            "intent_name": name,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i, name in enumerate(intent_names)
    ]

    for txt, name in zip(hwu_utterances, hwu_labels, strict=False):
        intent_id = name_to_id[name]
        target_list = res[intent_id]["sample_utterances"]
        if len(target_list) >= shots_per_intent:
            continue
        target_list.append(txt)

    return res

In [5]:
hwu64_records = convert_hwu64(hwu64_utterances, hwu64_labels, shots_per_intent=5)

In [6]:
import json

json.dump(hwu64_records, open("../data/intent_records/hwu64.json", "w"), indent=4, ensure_ascii=False)

## russian hwu64

In [None]:
# ! git clone https://github.com/LadaNikitina/HWU64 ../data/RuHWU64
# ! rm -rf ../data/RuHWU64/.git

In [74]:
from datasets import load_from_disk

ruhwu64 = load_from_disk("../data/RuHWU64")
ruhwu64

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 25606
    })
})

In [75]:
ruhwu64["train"][0]

{'text': 'Разбуди меня в 5 утра на этой неделе', 'intent': 'set'}

In [80]:
intent_names = sorted(set(ruhwu64["train"].unique("intent")))
intent_names[:5]

54


['addcontact', 'affirm', 'audiobook', 'cleaning', 'coffee']

In [81]:
def convert_ruhwu64(hwu64_train, shots_per_intent):
    intent_names = sorted(hwu64_train.unique("intent"))
    name_to_id = dict(zip(intent_names, range(len(intent_names)), strict=False))

    res = [
        {
            "intent_id": i,
            "intent_name": name,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i, name in enumerate(intent_names)
    ]

    for batch in hwu64_train.iter(batch_size=16, drop_last_batch=False):
        for txt, name in zip(batch["text"], batch["intent"], strict=False):
            intent_id = name_to_id[name]
            target_list = res[intent_id]["sample_utterances"]
            if len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    return res

In [82]:
ruhwu64_records = convert_ruhwu64(ruhwu64["train"], shots_per_intent=5)

In [83]:
json.dump(ruhwu64_records, open("../data/intent_records/ru_hwu64.json", "w"), indent=4, ensure_ascii=False)

## russian Minds14

In [86]:
from datasets import load_dataset

ruminds14 = load_dataset("PolyAI/minds14", "ru-RU")
ruminds14

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 539
    })
})

In [87]:
ruminds14["train"][0]

{'path': '/home/voorhs/.cache/huggingface/datasets/downloads/extracted/f8075d4661a5d714b98ee4adbe7d239eb1a69d19d3cd7fa9cd42aeb27cab93c3/ru-RU~LATEST_TRANSACTIONS/6030093cbb1e6d0fbce93a74.wav',
 'audio': {'path': '/home/voorhs/.cache/huggingface/datasets/downloads/extracted/f8075d4661a5d714b98ee4adbe7d239eb1a69d19d3cd7fa9cd42aeb27cab93c3/ru-RU~LATEST_TRANSACTIONS/6030093cbb1e6d0fbce93a74.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00048828,
         -0.00073242, -0.00073242]),
  'sampling_rate': 8000},
 'transcription': 'Здравствуйте я бы хотела пересмотреть свои предыдущие последние операции которые проходили по моей карте прямым помимо ему счёту Покажите пожалуйста операции последних трёх месяцев',
 'english_transcription': 'Hello, I would like to review my previous last transactions that took place on my card directly in addition to his account. Please show the transactions of the last three months',
 'intent_class': 12,
 'lang_id': 12}

In [89]:
sorted(ruminds14["train"].unique("intent_class"))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [90]:
def convert_ruminds14(minds14_train, shots_per_intent):
    res = [
        {
            "intent_id": i,
            "intent_name": None,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i in range(14)
    ]

    for batch in minds14_train.iter(batch_size=16, drop_last_batch=False):
        for txt, intent_id in zip(batch["transcription"], batch["intent_class"], strict=False):
            target_list = res[intent_id]["sample_utterances"]
            if len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    return res

In [91]:
ruminds14_records = convert_ruminds14(ruminds14["train"], shots_per_intent=5)

In [92]:
ruminds14_records[0]

{'intent_id': 0,
 'intent_name': None,
 'sample_utterances': ['Здравствуйте я хотел бы узнать Могу ли я использовать свою карту за границей и нужно мне для этого предупредить Мой Банк Спасибо',
  'Здравствуйте какая комиссия будет если я буду использовать карту свою заграницы',
  'Здравствуйте Через несколько дней я уезжаю в швецию на целый месяц я хотел бы узнать Могу ли я платить моей карты Спасибо',
  'Здравствуйте я уезжаю на несколько недель заграницу и хотела бы узнать Могу ли я пользоваться твоей картой там Я бы хотела оплачивать покупки в Америке Нужно ли мне что-нибудь для этого делать',
  'Здравствуйте мне хотелось бы задать вам один вопрос я уезжаю в отпуск и мне нужно уточнить работать будет ли моя кредитная карточка За границей'],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [93]:
json.dump(ruminds14_records, open("../data/intent_records/ru_minds14.json", "w"), indent=4, ensure_ascii=False)

## Minds14

In [94]:
def convert_minds14(minds14_train, shots_per_intent):
    res = [
        {
            "intent_id": i,
            "intent_name": None,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i in range(14)
    ]

    for batch in minds14_train.iter(batch_size=16, drop_last_batch=False):
        for txt, intent_id in zip(batch["english_transcription"], batch["intent_class"], strict=False):
            target_list = res[intent_id]["sample_utterances"]
            if len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    return res

In [95]:
minds14_records = convert_minds14(ruminds14["train"], shots_per_intent=5)

In [96]:
minds14_records[0]

{'intent_id': 0,
 'intent_name': None,
 'sample_utterances': ['Hello, I would like to know if I can use my card abroad and I need to warn My Bank for this. Thank you',
  'Hello, what will be the commission if I use my card abroad',
  'Hello In a few days I am leaving for sweden for a whole month I would like to know if I can pay with my card Thank you',
  'Hello, I am going abroad for a few weeks and would like to know Can I use your card there I would like to pay for purchases in America Do I need to do anything for this',
  'Hello, I would like to ask you one question, I am going on vacation and I need to clarify whether my credit card will work abroad'],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [97]:
json.dump(minds14_records, open("../data/intent_records/minds14.json", "w"), indent=4, ensure_ascii=False)

## Massive

In [98]:
from datasets import load_dataset

massive = load_dataset("mteb/amazon_massive_intent", "en")
massive

Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/187k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/54.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11514 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2974 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2033 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 11514
    })
    test: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 2974
    })
    validation: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 2033
    })
})

In [101]:
massive["train"][0]

{'id': '1',
 'label': 'alarm_set',
 'label_text': 'alarm_set',
 'text': 'wake me up at nine am on friday',
 'lang': 'en'}

In [104]:
intent_names = sorted(massive["train"].unique("label"))
intent_names[:5]

60


['alarm_query',
 'alarm_remove',
 'alarm_set',
 'audio_volume_down',
 'audio_volume_mute']

In [108]:
def convert_massive(massive_train, shots_per_intent):
    intent_names = sorted(massive_train.unique("label"))
    name_to_id = dict(zip(intent_names, range(len(intent_names)), strict=False))

    res = [
        {
            "intent_id": i,
            "intent_name": name,
            "sample_utterances": [],
            "regexp_full_match": [],
            "regexp_partial_match": [],
        }
        for i, name in enumerate(intent_names)
    ]

    for batch in massive_train.iter(batch_size=16, drop_last_batch=False):
        for txt, name in zip(batch["text"], batch["label"], strict=False):
            intent_id = name_to_id[name]
            target_list = res[intent_id]["sample_utterances"]
            if len(target_list) >= shots_per_intent:
                continue
            target_list.append(txt)

    return res

In [109]:
massive_records = convert_massive(massive["train"], shots_per_intent=5)

In [110]:
massive_records[0]

{'intent_id': 0,
 'intent_name': 'alarm_query',
 'sample_utterances': ['please list active alarms',
  'show me the alarms i set',
  'do i have any alarms',
  'show alarms',
  'do i have an alarm set for morning flight'],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [111]:
json.dump(massive_records, open("../data/intent_records/massive.json", "w"), indent=4, ensure_ascii=False)

## russian Massive

In [112]:
from datasets import load_dataset

rumassive = load_dataset("mteb/amazon_massive_intent", "ru")
rumassive

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/262k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/73.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11514 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2974 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2033 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 11514
    })
    test: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 2974
    })
    validation: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 2033
    })
})

In [113]:
rumassive["train"][0]

{'id': '1',
 'label': 'alarm_set',
 'label_text': 'alarm_set',
 'text': 'разбуди меня в девять утра в пятницу',
 'lang': 'ru'}

In [114]:
rumassive_records = convert_massive(rumassive["train"], shots_per_intent=5)

In [115]:
rumassive_records[0]

{'intent_id': 0,
 'intent_name': 'alarm_query',
 'sample_utterances': ['пожалуйста список активных будильников',
  'покажи мне будильники которые я установил',
  'у меня есть какие-то будильники',
  'покажи будильники',
  'имею ли я будильник на утренний рейс'],
 'regexp_full_match': [],
 'regexp_partial_match': []}

In [116]:
json.dump(rumassive_records, open("../data/intent_records/ru_massive.json", "w"), indent=4, ensure_ascii=False)

## DialogStudio

бесполезная разметка у этих датасетов :(

In [1]:
from datasets import load_dataset


def get_dataset(name):
    return load_dataset("Salesforce/dialogstudio", name, split="train", trust_remote_code=True, token=True)


sgd = get_dataset("SGD")

❤️Attention❤️: Dataset download may take some time. We appreciate your patience!


In [7]:
multiwoz = get_dataset("MULTIWOZ2_2")

❤️Attention❤️: Dataset download may take some time. We appreciate your patience!


In [9]:
multiwoz[0]

{'original dialog id': 'SSNG0007.json',
 'new dialog id': 'MULTIWOZ2_2--train--1',
 'dialog index': 1,
 'original dialog info': '{"services": ["restaurant"]}',
 'log': [{'turn id': 1,
   'user utterance': 'I am looking for a Chinese place in the centre.',
   'system response': 'I have 10 restaurants matching your request. Did you have a price range you would like?',
   'dialog history': '',
   'original user side information': '{"frames": [{"actions": [], "service": "restaurant", "slots": [{"exclusive_end": 26, "slot": "restaurant-food", "start": 19, "value": "Chinese"}], "state": {"active_intent": "find_restaurant", "requested_slots": [], "slot_values": {"restaurant-area": ["centre"], "restaurant-food": ["chinese"]}}}, {"actions": [], "service": "taxi", "slots": [], "state": {"active_intent": "NONE", "requested_slots": [], "slot_values": {}}}, {"actions": [], "service": "train", "slots": [], "state": {"active_intent": "NONE", "requested_slots": [], "slot_values": {}}}, {"actions": [],

## AC Robotic

In [78]:
import json

robotic_intents = json.load(open("../data/ac_robotic_intents.json"))
len(robotic_intents)

18

In [79]:
robotic_intents[0]

{'name': 'move_forward',
 'phrases': ['(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)|(отъедь)|(отъезжай)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((2)|(три)|(17)|(десять)){0,1}( метров){0,1}',
  '(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)|(отъедь)|(отъезжай)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((10)|(десять)){0,1}( метров){0,1}(, если можешь){0,1}',
  '((тебе ((надо)|(следует)|(стоит))|ты (можешь|мог бы|не мог бы)) ){0,1}((проехать)|(отъехать)|(доехать)|(продвинуться)|(подвинуться)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((6)|(пять)){0,1}( метров){0,1}',
  '(робот(,){0,1} ){0,1}(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)) (на ){0,1}((7)|(четыре)|(11)|(двенадцать)){0,1}( метров){0,1}',
  'было бы хорошо, если бы ты ((проехал)|(продвинулся)|(подвинулся)|(отъехал)) ((вперёд)|(вперед)|(по прямой)

In [80]:
intent_names = sorted({record["name"] for record in robotic_intents})
intent_names

['GO',
 'disable_autopilot',
 'drop',
 'enable_autopilot',
 'go_to',
 'move_backward',
 'move_forward',
 'pick_up',
 'place',
 'say',
 'set_point',
 'sit_down',
 'stand_up',
 'status',
 'stop',
 'turn_left',
 'turn_right',
 'world_state']

In [81]:
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'GO': 0,
 'disable_autopilot': 1,
 'drop': 2,
 'enable_autopilot': 3,
 'go_to': 4,
 'move_backward': 5,
 'move_forward': 6,
 'pick_up': 7,
 'place': 8,
 'say': 9,
 'set_point': 10,
 'sit_down': 11,
 'stand_up': 12,
 'status': 13,
 'stop': 14,
 'turn_left': 15,
 'turn_right': 16,
 'world_state': 17}

In [82]:
for intent in robotic_intents:
    intent["intent_id"] = name_to_id[intent["name"]]
    intent["intent_name"] = intent.pop("name")
    intent["sample_utterances"] = []
    intent["regexp_full_match"] = intent.pop("phrases")
    intent["regexp_partial_match"] = intent.pop("reg_phrases")
    intent.pop("punctuation")
    intent.pop("min_precision")

In [83]:
robotic_intents[0]

{'intent_id': 6,
 'intent_name': 'move_forward',
 'sample_utterances': [],
 'regexp_full_match': ['(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)|(отъедь)|(отъезжай)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((2)|(три)|(17)|(десять)){0,1}( метров){0,1}',
  '(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)|(отъедь)|(отъезжай)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((10)|(десять)){0,1}( метров){0,1}(, если можешь){0,1}',
  '((тебе ((надо)|(следует)|(стоит))|ты (можешь|мог бы|не мог бы)) ){0,1}((проехать)|(отъехать)|(доехать)|(продвинуться)|(подвинуться)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((6)|(пять)){0,1}( метров){0,1}',
  '(робот(,){0,1} ){0,1}(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)) (на ){0,1}((7)|(четыре)|(11)|(двенадцать)){0,1}( метров){0,1}',
  'было бы хорошо, если бы ты ((проехал)|(продвин

In [88]:
json.dump(robotic_intents, open("../data/intent_records/ac_robotic.json", "w"), indent=4, ensure_ascii=False)

## AC Robotic (new version)

In [2]:
import json

robotic_data = json.load(open("../data/ac_robotic_2_raw.json"))
robotic_intents = robotic_data["intents"]
robotic_tags = robotic_data["tags"]
len(robotic_intents), len(robotic_tags)

(18, 3)

In [3]:
intent_names = sorted({record["name"] for record in robotic_intents})
intent_names

['GO',
 'disable_autopilot',
 'drop',
 'enable_autopilot',
 'go_to',
 'move_backward',
 'move_forward',
 'pick_up',
 'place',
 'say',
 'set_point',
 'sit_down',
 'stand_up',
 'status',
 'stop',
 'turn_left',
 'turn_right',
 'world_state']

In [4]:
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'GO': 0,
 'disable_autopilot': 1,
 'drop': 2,
 'enable_autopilot': 3,
 'go_to': 4,
 'move_backward': 5,
 'move_forward': 6,
 'pick_up': 7,
 'place': 8,
 'say': 9,
 'set_point': 10,
 'sit_down': 11,
 'stand_up': 12,
 'status': 13,
 'stop': 14,
 'turn_left': 15,
 'turn_right': 16,
 'world_state': 17}

In [5]:
for intent in robotic_intents:
    intent["intent_id"] = name_to_id[intent["name"]]
    intent["intent_name"] = intent.pop("name")
    intent["sample_utterances"] = []
    intent["regexp_full_match"] = intent.pop("phrases")
    intent["regexp_partial_match"] = intent.pop("reg_phrases")
    intent.pop("punctuation")
    intent.pop("min_precision")

In [6]:
robotic_intents[0]

{'tags': ['movement_direction'],
 'intent_id': 6,
 'intent_name': 'move_forward',
 'sample_utterances': [],
 'regexp_full_match': ['(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)|(отъедь)|(отъезжай)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((2)|(три)|(17)|(десять)){0,1}( метров){0,1}',
  '(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)|(отъедь)|(отъезжай)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((10)|(десять)){0,1}( метров){0,1}(, если можешь){0,1}',
  '((тебе ((надо)|(следует)|(стоит))|ты (можешь|мог бы|не мог бы)) ){0,1}((проехать)|(отъехать)|(доехать)|(продвинуться)|(подвинуться)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((6)|(пять)){0,1}( метров){0,1}',
  '(робот(,){0,1} ){0,1}(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)) (на ){0,1}((7)|(четыре)|(11)|(двенадцать)){0,1}( метров){0,1}',
  'было бы хорош

In [7]:
json.dump(robotic_intents, open("../data/intent_records/ac_robotic_new.json", "w"), indent=4, ensure_ascii=False)

## dstc3

In [28]:
from datasets import load_dataset

dstc3 = load_dataset("marcel-gohsen/dstc3")
dstc3

DatasetDict({
    test: Dataset({
        features: ['session', 'caller', 'turn', 'transcript', 'audio', 'intent', 'slots', 'cam'],
        num_rows: 18715
    })
    seed: Dataset({
        features: ['session', 'caller', 'turn', 'transcript', 'audio', 'intent', 'slots', 'cam'],
        num_rows: 109
    })
})

In [29]:
dstc3["test"] = dstc3["test"].filter(lambda example: example["transcript"] != "")

Filter:   0%|          | 0/18715 [00:00<?, ? examples/s]

In [30]:
dstc3["test"]

Dataset({
    features: ['session', 'caller', 'turn', 'transcript', 'audio', 'intent', 'slots', 'cam'],
    num_rows: 18714
})

In [31]:
import itertools as it

intent_names = sorted(set(it.chain.from_iterable(intents for intents in dstc3["test"]["intent"])))
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'ack': 0,
 'affirm': 1,
 'bye': 2,
 'confirm': 3,
 'deny': 4,
 'hello': 5,
 'inform': 6,
 'negate': 7,
 'repeat': 8,
 'reqalts': 9,
 'reqmore': 10,
 'request': 11,
 'restart': 12,
 'thankyou': 13}

In [32]:
dstc3["test"].filter(lambda example: "reqmore" in example["intent"])

Filter:   0%|          | 0/18714 [00:00<?, ? examples/s]

Dataset({
    features: ['session', 'caller', 'turn', 'transcript', 'audio', 'intent', 'slots', 'cam'],
    num_rows: 1
})

In [33]:
intent_names.remove("reqmore")
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'ack': 0,
 'affirm': 1,
 'bye': 2,
 'confirm': 3,
 'deny': 4,
 'hello': 5,
 'inform': 6,
 'negate': 7,
 'repeat': 8,
 'reqalts': 9,
 'request': 10,
 'restart': 11,
 'thankyou': 12}

In [34]:
def transform(example: dict):
    return {
        "utterance": example["transcript"],
        "labels": [name_to_id[intent_name] for intent_name in example["intent"] if intent_name != "reqmore"],
    }


multilabel_dstc = dstc3["test"].map(transform, remove_columns=dstc3["test"].features.keys())

Map:   0%|          | 0/18714 [00:00<?, ? examples/s]

In [35]:
multilabel_dstc[0]

{'utterance': 'sil', 'labels': []}

In [36]:
multilabel_dstc

Dataset({
    features: ['utterance', 'labels'],
    num_rows: 18714
})

In [37]:
multilabel_dstc.to_json("../data/multi_label_data/dstc3.json", lines=False, indent=4, orient="records")

Creating json from Arrow format:   0%|          | 0/19 [00:00<?, ?ba/s]

1865921

## events text classification

In [2]:
from datasets import load_dataset

events_dataset = load_dataset("knowledgator/events_classification_biotech", trust_remote_code=True)
events_dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'target organization', 'all_labels', 'all_labels_concat', 'label 1', 'label 2', 'label 3', 'label 4', 'label 5'],
        num_rows: 2759
    })
    test: Dataset({
        features: ['title', 'content', 'target organization', 'all_labels', 'all_labels_concat', 'label 1', 'label 2', 'label 3', 'label 4', 'label 5'],
        num_rows: 381
    })
})

In [3]:
events_dataset["train"][-2]

{'title': 'Barnet, Enfield and Haringey Mental Health Trust on track to meet carbon emissions target',
 'content': "Barnet, Enfield and Haringey Mental Health Trust on track to meet carbon emissions target\nSt Mark's Hospital in Harrow. Photo: Google Street View\ncomment\nBarnet's mental health trust is putting the planet first as records show it is on track to meet ambitious targets to cut carbon emissions by a third.\nIt is likely Barnet, Enfield and Haringey Mental Health Trust will continue to achieve the Governments goal for the NHS to slash carbon dioxide emissions by 2020.\nThe mental health trust is amongst 90 others which were on course to meet the target in 2017/18 which leaves more than 60 per cent failing to take their responsibilities seriously, the shadow health secretary claimed.\nJohn Mills, BEHs director of estates and facilities, is pleased and reassured by their performance.\nHe explained the trust has reduced its CO2 emissions by 23 per cent since 2013.\nHe said: We

In [7]:
from collections import defaultdict

counter = defaultdict(int)
for batch in events_dataset["train"].iter(batch_size=16):
    for labels in batch["all_labels"]:
        for lab in labels:
            counter[lab] += 1
sorted(counter.items(), key=lambda x: x[1])

[('partnerships & alliances', 2),
 ('patent publication', 6),
 ('subsidiary establishment', 8),
 ('department establishment', 9),
 ('foundation', 11),
 ('closing', 19),
 ('ipo exit', 20),
 ('clinical trial sponsorship', 23),
 ('expanding industry', 36),
 ('new initiatives & programs', 53),
 ('investment in public company', 60),
 ('regulatory approval', 63),
 ('hiring', 73),
 ('event organization', 75),
 ('participation in an event', 77),
 ('product updates', 87),
 ('service & product providing', 103),
 ('funding round', 125),
 ('executive appointment', 134),
 ('article publication', 142),
 ('expanding geography', 146),
 ('support & philanthropy', 158),
 ('m&a', 185),
 ('new initiatives or programs', 203),
 ('product launching & presentation', 303),
 ('alliance & partnership', 365),
 ('other', 400),
 ('company description', 767),
 ('executive statement', 1435)]

In [10]:
import itertools as it

intent_names = sorted(set(it.chain.from_iterable(intents for intents in events_dataset["train"]["all_labels"])))
names_to_remove = [
    "partnerships & alliances",
    "patent publication",
    "subsidiary establishment",
    "department establishment",
]
for n in names_to_remove:
    intent_names.remove(n)
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'alliance & partnership': 0,
 'article publication': 1,
 'clinical trial sponsorship': 2,
 'closing': 3,
 'company description': 4,
 'event organization': 5,
 'executive appointment': 6,
 'executive statement': 7,
 'expanding geography': 8,
 'expanding industry': 9,
 'foundation': 10,
 'funding round': 11,
 'hiring': 12,
 'investment in public company': 13,
 'ipo exit': 14,
 'm&a': 15,
 'new initiatives & programs': 16,
 'new initiatives or programs': 17,
 'other': 18,
 'participation in an event': 19,
 'product launching & presentation': 20,
 'product updates': 21,
 'regulatory approval': 22,
 'service & product providing': 23,
 'support & philanthropy': 24}

In [12]:
def transform(example: dict):
    return {
        "utterance": example["content"],
        "labels": [name_to_id[intent_name] for intent_name in example["all_labels"] if intent_name not in names_to_remove],
    }


multilabel_events_dataset = events_dataset["train"].map(
    transform, remove_columns=events_dataset["train"].features.keys()
)
multilabel_events_dataset

Map:   0%|          | 0/2759 [00:00<?, ? examples/s]

Dataset({
    features: ['utterance', 'labels'],
    num_rows: 2759
})

In [13]:
multilabel_events_dataset.to_json("../data/multi_label_data/events.json", lines=False, indent=4, orient="records")

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

8865566

## reiters

In [15]:
reuters = load_dataset("ucirvine/reuters21578", "ModHayes", trust_remote_code=True)
reuters

DatasetDict({
    test: Dataset({
        features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],
        num_rows: 722
    })
    train: Dataset({
        features: ['text', 'text_type', 'topics', 'lewis_split', 'cgis_split', 'old_id', 'new_id', 'places', 'people', 'orgs', 'exchanges', 'date', 'title'],
        num_rows: 20856
    })
})

In [16]:
reuters["train"][0]

{'text': 'Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are doubts as to how much of this coc

In [18]:
from collections import defaultdict

counter = defaultdict(int)
for batch in reuters["train"].iter(batch_size=16):
    for labels in batch["topics"]:
        for lab in labels:
            counter[lab] += 1
sorted(counter.items(), key=lambda x: x[1])

[('red-bean', 1),
 ('citruspulp', 1),
 ('rape-meal', 1),
 ('corn-oil', 1),
 ('peseta', 1),
 ('ringgit', 1),
 ('castorseed', 1),
 ('lit', 1),
 ('rupiah', 1),
 ('skr', 1),
 ('dkr', 1),
 ('lin-meal', 1),
 ('cruzado', 1),
 ('cottonseed', 1),
 ('f-cattle', 1),
 ('sfr', 1),
 ('linseed', 2),
 ('lin-oil', 2),
 ('rye', 2),
 ('groundnut-oil', 2),
 ('cornglutenfeed', 2),
 ('wool', 2),
 ('fishmeal', 2),
 ('castor-oil', 2),
 ('sun-meal', 2),
 ('copra-cake', 3),
 ('palmkernel', 3),
 ('tapioca', 3),
 ('saudriyal', 3),
 ('can', 3),
 ('palladium', 3),
 ('dfl', 3),
 ('cotton-oil', 3),
 ('rand', 3),
 ('pork-belly', 3),
 ('nkr', 3),
 ('plywood', 4),
 ('cpu', 4),
 ('austdlr', 4),
 ('nzdlr', 4),
 ('jet', 5),
 ('inventories', 5),
 ('propane', 6),
 ('potato', 6),
 ('instal-debt', 6),
 ('naphtha', 6),
 ('coconut', 6),
 ('sun-oil', 7),
 ('coconut-oil', 7),
 ('rape-oil', 8),
 ('l-cattle', 8),
 ('groundnut', 9),
 ('nickel', 9),
 ('platinum', 12),
 ('tea', 13),
 ('oat', 14),
 ('dmk', 14),
 ('lei', 15),
 ('sunseed'

In [19]:
names_to_remove = [name for name, cnt in counter.items() if cnt < 10]

In [20]:
import itertools as it

intent_names = sorted(set(it.chain.from_iterable(intents for intents in reuters["train"]["topics"])))
for n in names_to_remove:
    intent_names.remove(n)
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'acq': 0,
 'alum': 1,
 'barley': 2,
 'bop': 3,
 'carcass': 4,
 'cocoa': 5,
 'coffee': 6,
 'copper': 7,
 'corn': 8,
 'cotton': 9,
 'cpi': 10,
 'crude': 11,
 'dlr': 12,
 'dmk': 13,
 'earn': 14,
 'fuel': 15,
 'gas': 16,
 'gnp': 17,
 'gold': 18,
 'grain': 19,
 'heat': 20,
 'hog': 21,
 'housing': 22,
 'income': 23,
 'interest': 24,
 'ipi': 25,
 'iron-steel': 26,
 'jobs': 27,
 'lead': 28,
 'lei': 29,
 'livestock': 30,
 'lumber': 31,
 'meal-feed': 32,
 'money-fx': 33,
 'money-supply': 34,
 'nat-gas': 35,
 'oat': 36,
 'oilseed': 37,
 'orange': 38,
 'palm-oil': 39,
 'pet-chem': 40,
 'platinum': 41,
 'rapeseed': 42,
 'reserves': 43,
 'retail': 44,
 'rice': 45,
 'rubber': 46,
 'ship': 47,
 'silver': 48,
 'sorghum': 49,
 'soy-meal': 50,
 'soy-oil': 51,
 'soybean': 52,
 'stg': 53,
 'strategic-metal': 54,
 'sugar': 55,
 'sunseed': 56,
 'tea': 57,
 'tin': 58,
 'trade': 59,
 'veg-oil': 60,
 'wheat': 61,
 'wpi': 62,
 'yen': 63,
 'zinc': 64}

In [21]:
def transform(example: dict):
    return {
        "utterance": example["text"],
        "labels": [name_to_id[intent_name] for intent_name in example["topics"] if intent_name not in names_to_remove],
    }


multilabel_reuters = reuters["train"].map(transform, remove_columns=reuters["train"].features.keys())
multilabel_reuters

Map:   0%|          | 0/20856 [00:00<?, ? examples/s]

Dataset({
    features: ['utterance', 'labels'],
    num_rows: 20856
})

In [22]:
multilabel_reuters[0]

{'utterance': 'Showers continued throughout the week in\nthe Bahia cocoa zone, alleviating the drought since early\nJanuary and improving prospects for the coming temporao,\nalthough normal humidity levels have not been restored,\nComissaria Smith said in its weekly review.\n    The dry period means the temporao will be late this year.\n    Arrivals for the week ended February 22 were 155,221 bags\nof 60 kilos making a cumulative total for the season of 5.93\nmln against 5.81 at the same stage last year. Again it seems\nthat cocoa delivered earlier on consignment was included in the\narrivals figures.\n    Comissaria Smith said there is still some doubt as to how\nmuch old crop cocoa is still available as harvesting has\npractically come to an end. With total Bahia crop estimates\naround 6.4 mln bags and sales standing at almost 6.2 mln there\nare a few hundred thousand bags still in the hands of farmers,\nmiddlemen, exporters and processors.\n    There are doubts as to how much of thi

In [23]:
multilabel_reuters.to_json("../data/multi_label_data/reuters.json", lines=False, indent=4, orient="records")

Creating json from Arrow format:   0%|          | 0/21 [00:00<?, ?ba/s]

17345885

## eurlex

In [24]:
eurlex = load_dataset("coastalcph/multi_eurlex", "en", trust_remote_code=True)
eurlex

DatasetDict({
    train: Dataset({
        features: ['celex_id', 'text', 'labels'],
        num_rows: 55000
    })
    test: Dataset({
        features: ['celex_id', 'text', 'labels'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['celex_id', 'text', 'labels'],
        num_rows: 5000
    })
})

In [25]:
eurlex["train"][0]

{'celex_id': '32006D0213',
 'text': 'COMMISSION DECISION\nof 6 March 2006\nestablishing the classes of reaction-to-fire performance for certain construction products as regards wood flooring and solid wood panelling and cladding\n(notified under document number C(2006) 655)\n(Text with EEA relevance)\n(2006/213/EC)\nTHE COMMISSION OF THE EUROPEAN COMMUNITIES,\nHaving regard to the Treaty establishing the European Community,\nHaving regard to Directive 89/106/EEC of 21 December 1988, on the approximation of laws, regulations and administrative provisions of the Member States relating to construction products (1), and in particular Article 20(2) thereof,\nWhereas:\n(1)\nDirective 89/106/EEC envisages that in order to take account of different levels of protection for construction works at national, regional or local level, it may be necessary to establish in the interpretative documents classes corresponding to the performance of products in respect of each essential requirement. Those d

In [27]:
from collections import defaultdict

counter = defaultdict(int)
for batch in eurlex["train"].iter(batch_size=16):
    for labels in batch["labels"]:
        for lab in labels:
            counter[lab] += 1
sorted(counter.items(), key=lambda x: x[1])

[(13, 541),
 (16, 877),
 (14, 1122),
 (9, 1926),
 (10, 2151),
 (11, 2233),
 (20, 3096),
 (8, 3317),
 (12, 3640),
 (7, 4175),
 (4, 5065),
 (0, 6056),
 (1, 6150),
 (19, 7065),
 (5, 8444),
 (2, 8803),
 (15, 13519),
 (6, 19431),
 (18, 22975),
 (17, 26931),
 (3, 30222)]

In [157]:
def transform(example: dict):
    return {"utterance": example["text"], "labels": example["labels"]}


multilabel_eurlex = eurlex["train"].map(transform, remove_columns=eurlex["train"].features.keys())
multilabel_eurlex

Dataset({
    features: ['labels', 'utterance'],
    num_rows: 55000
})

In [158]:
multilabel_eurlex.to_json("../data/multi_label_data/eurlex.json", lines=False, indent=4, orient="records")

397409661

In [2]:
eurlex = open("../data/multi_label_data/eurlex.json").read().replace("]\n[", ",")

In [3]:
open("../data/multi_label_data/eurlex.json", "w").write(eurlex)

397409553

In [4]:
import json

eurlex = json.load(open("../data/multi_label_data/eurlex.json"))

In [5]:
len(eurlex)

55000

In [6]:
eurlex[0]

{'labels': [1, 20, 7, 3, 0],
 'utterance': 'COMMISSION DECISION\nof 6 March 2006\nestablishing the classes of reaction-to-fire performance for certain construction products as regards wood flooring and solid wood panelling and cladding\n(notified under document number C(2006) 655)\n(Text with EEA relevance)\n(2006/213/EC)\nTHE COMMISSION OF THE EUROPEAN COMMUNITIES,\nHaving regard to the Treaty establishing the European Community,\nHaving regard to Directive 89/106/EEC of 21 December 1988, on the approximation of laws, regulations and administrative provisions of the Member States relating to construction products (1), and in particular Article 20(2) thereof,\nWhereas:\n(1)\nDirective 89/106/EEC envisages that in order to take account of different levels of protection for construction works at national, regional or local level, it may be necessary to establish in the interpretative documents classes corresponding to the performance of products in respect of each essential requirement. 

## robotics validation

In [13]:
import csv

with open("../data/robotics_val.csv", newline="") as file:
    csv_reader = csv.DictReader(file)
    robotics_val = list(csv_reader)

In [14]:
len(robotics_val)

80

In [15]:
robotics_val[:10]

[{'text': 'пожалуйста, проедь вперед на 20 метров', 'intents': 'move_forward'},
 {'text': 'тебе стоит отъехать назад 5 метров', 'intents': 'move_backward'},
 {'text': 'иди на расстояние', 'intents': 'GO'},
 {'text': 'подними этот стул, пожалуйста', 'intents': 'pick_up'},
 {'text': 'поставь эту коробку в угол', 'intents': 'place drop'},
 {'text': 'повтори за мной, пожалуйста', 'intents': 'say'},
 {'text': 'можешь сесть рядом', 'intents': 'sit_down'},
 {'text': 'встань, пожалуйста', 'intents': 'stand_up'},
 {'text': 'прекрати шуметь', 'intents': 'stop'},
 {'text': 'можешь завернуть направо', 'intents': 'turn_right'}]

In [16]:
def split_intent_field(record: dict):
    record["intents_splitted"] = record["intents"].split()


for rec in robotics_val:
    split_intent_field(rec)

In [17]:
robotics_val[:10]

[{'text': 'пожалуйста, проедь вперед на 20 метров',
  'intents': 'move_forward',
  'intents_splitted': ['move_forward']},
 {'text': 'тебе стоит отъехать назад 5 метров',
  'intents': 'move_backward',
  'intents_splitted': ['move_backward']},
 {'text': 'иди на расстояние', 'intents': 'GO', 'intents_splitted': ['GO']},
 {'text': 'подними этот стул, пожалуйста',
  'intents': 'pick_up',
  'intents_splitted': ['pick_up']},
 {'text': 'поставь эту коробку в угол',
  'intents': 'place drop',
  'intents_splitted': ['place', 'drop']},
 {'text': 'повтори за мной, пожалуйста',
  'intents': 'say',
  'intents_splitted': ['say']},
 {'text': 'можешь сесть рядом',
  'intents': 'sit_down',
  'intents_splitted': ['sit_down']},
 {'text': 'встань, пожалуйста',
  'intents': 'stand_up',
  'intents_splitted': ['stand_up']},
 {'text': 'прекрати шуметь', 'intents': 'stop', 'intents_splitted': ['stop']},
 {'text': 'можешь завернуть направо',
  'intents': 'turn_right',
  'intents_splitted': ['turn_right']}]

In [18]:
import itertools as it

intent_names = sorted(set(it.chain.from_iterable(rec["intents_splitted"] for rec in robotics_val)))
intent_names

['GO',
 'disable_autopilot',
 'drop',
 'enable_autopilot',
 'go_to',
 'move_backward',
 'move_forward',
 'pick_up',
 'place',
 'say',
 'set_point',
 'sit_down',
 'stand_up',
 'status',
 'stop',
 'turn_left',
 'turn_right',
 'undefined',
 'world_state']

в тестовой выборке есть класс `undefined`, видимо это OOS

In [19]:
for rec in robotics_val:
    if "undefined" in rec["intents_splitted"]:
        pass

{'text': 'Посчитай количество объектов в комнате', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Измерь расстояние до ближайшей стены', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Воспроизведи аудиофайл из моей библиотеки', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Запиши видео своего движения', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Активируй режим экономии энергии', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Синхронизируй данные с облачным хранилищем', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Проведи калибровку сенсоров', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Опусти левую руку и подними правую', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Подними руку и помаши', 'intents': 'undefined', 'intents_splitted': ['undefined']}
{'text': 'Выключи свет в комнате', 'intents': 'undefi

надо сделать метки для интентов, консистентные с метками в тренировочной выборке

In [20]:
intent_names.remove("undefined")

In [21]:
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'GO': 0,
 'disable_autopilot': 1,
 'drop': 2,
 'enable_autopilot': 3,
 'go_to': 4,
 'move_backward': 5,
 'move_forward': 6,
 'pick_up': 7,
 'place': 8,
 'say': 9,
 'set_point': 10,
 'sit_down': 11,
 'stand_up': 12,
 'status': 13,
 'stop': 14,
 'turn_left': 15,
 'turn_right': 16,
 'world_state': 17}

In [22]:
robotics_val_converted = []
for rec in robotics_val:
    robotics_val_converted.append(
        {
            "utterance": rec["text"],
            "labels": [name_to_id[name] for name in rec["intents_splitted"] if name != "undefined"],
        }
    )

In [23]:
robotics_val_converted[:10]

[{'utterance': 'пожалуйста, проедь вперед на 20 метров', 'labels': [6]},
 {'utterance': 'тебе стоит отъехать назад 5 метров', 'labels': [5]},
 {'utterance': 'иди на расстояние', 'labels': [0]},
 {'utterance': 'подними этот стул, пожалуйста', 'labels': [7]},
 {'utterance': 'поставь эту коробку в угол', 'labels': [8, 2]},
 {'utterance': 'повтори за мной, пожалуйста', 'labels': [9]},
 {'utterance': 'можешь сесть рядом', 'labels': [11]},
 {'utterance': 'встань, пожалуйста', 'labels': [12]},
 {'utterance': 'прекрати шуметь', 'labels': [14]},
 {'utterance': 'можешь завернуть направо', 'labels': [16]}]

In [24]:
import json

json.dump(robotics_val_converted, open("../data/intent_records/ac_robotic_val.json", "w"), indent=4, ensure_ascii=False)

## ac robotics ordering

In [26]:
import difflib
import json


def read_json_file(file_path):
    with open(file_path) as file:
        return json.load(file)


def json_to_string(json_obj):
    return json.dumps(json_obj, indent=4, sort_keys=True)


def compare_json_files(file1_path, file2_path):
    # Read JSON files
    json1 = read_json_file(file1_path)
    json2 = read_json_file(file2_path)

    # Convert JSON objects to strings
    json1_str = json_to_string(json1)
    json2_str = json_to_string(json2)

    # Split strings into lines for difflib
    json1_lines = json1_str.splitlines()
    json2_lines = json2_str.splitlines()

    # Compare the lines
    diff = difflib.unified_diff(json1_lines, json2_lines, fromfile=file1_path, tofile=file2_path)

    # Print the differences
    for _line in diff:
        pass

In [29]:
compare_json_files(
    file1_path="../data/ac_robotic_2_raw.json",
    file2_path="../data/autopilot_intents.json",
)

окей, автопилот интенты не изменились

In [30]:
import json

ordering_data = json.load(open("../data/ordering_intents.json"))
# ordering_tags = ordering_data["tags"]
ordering_intents = ordering_data["intents"]
len(ordering_intents)

13

In [31]:
ordering_intents[0]

{'name': 'get_order',
 'phrases': ['(пожалуйста(,){0,1} ){0,1}((получить)|(забрать)|(оформить)|(выдать)) ((заказ)|(посылку)|(покупку))',
  '(пожалуйста(,){0,1} ){0,1}((получить)|(забрать)|(оформить)|(выдать)) ((мой)|(свой)) ((заказ)|(посылку)|(покупку))',
  '((тебе ((надо)|(следует)|(стоит))|ты (можешь|мог бы|не мог бы)) ){0,1}((получить)|(забрать)|(оформить)|(выдать)) ((заказ)|(посылку)|(покупку))',
  '(сотрудник(,){0,1} ){0,1}(пожалуйста(,){0,1} ){0,1}((получить)|(забрать)|(оформить)|(выдать)) ((мой)|(свой)) ((заказ)|(посылку)|(покупку))',
  'было бы хорошо, если бы вы ((выдали)|(оформили)|(подготовили)) ((мой)|(свой)) ((заказ)|(посылку)|(покупку))',
  '((готов)|(хочу)|(когда можно)|(как)) ((получить)|(забрать)|(оформить)) ((мой)|(свой)) ((заказ)|(посылку)|(покупку))',
  '((заказ)|(посылка)|(покупка)) на выдачу',
  '((оформить)|(сделать)) выдачу ((заказа)|(посылки)|(покупки))',
  '((забираю)|(получаю)) ((заказ)|(посылку)|(покупку))'],
 'reg_phrases': ['(?i)(?:пожалуйста,?\\s*)?(?:сот

In [33]:
intent_names = sorted({rec["name"] for rec in ordering_intents})
intent_names

['about_pickup_point',
 'get_order',
 'have_more_questions',
 'have_no_more_questions',
 'not_ready_to_get_order',
 'off_topic',
 'order_is_accepted',
 'order_is_not_accepted',
 'query_delay_order',
 'query_order_conditions',
 'query_return_item',
 'query_working_hours',
 'ready_to_get_order']

In [34]:
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'about_pickup_point': 0,
 'get_order': 1,
 'have_more_questions': 2,
 'have_no_more_questions': 3,
 'not_ready_to_get_order': 4,
 'off_topic': 5,
 'order_is_accepted': 6,
 'order_is_not_accepted': 7,
 'query_delay_order': 8,
 'query_order_conditions': 9,
 'query_return_item': 10,
 'query_working_hours': 11,
 'ready_to_get_order': 12}

In [35]:
for intent in ordering_intents:
    intent["intent_id"] = name_to_id[intent["name"]]
    intent["intent_name"] = intent.pop("name")
    intent["sample_utterances"] = []
    intent["regexp_full_match"] = intent.pop("phrases")
    intent["regexp_partial_match"] = intent.pop("reg_phrases")
    intent.pop("punctuation")
    intent.pop("min_precision")

In [36]:
ordering_intents[0]

{'intent_id': 1,
 'intent_name': 'get_order',
 'sample_utterances': [],
 'regexp_full_match': ['(пожалуйста(,){0,1} ){0,1}((получить)|(забрать)|(оформить)|(выдать)) ((заказ)|(посылку)|(покупку))',
  '(пожалуйста(,){0,1} ){0,1}((получить)|(забрать)|(оформить)|(выдать)) ((мой)|(свой)) ((заказ)|(посылку)|(покупку))',
  '((тебе ((надо)|(следует)|(стоит))|ты (можешь|мог бы|не мог бы)) ){0,1}((получить)|(забрать)|(оформить)|(выдать)) ((заказ)|(посылку)|(покупку))',
  '(сотрудник(,){0,1} ){0,1}(пожалуйста(,){0,1} ){0,1}((получить)|(забрать)|(оформить)|(выдать)) ((мой)|(свой)) ((заказ)|(посылку)|(покупку))',
  'было бы хорошо, если бы вы ((выдали)|(оформили)|(подготовили)) ((мой)|(свой)) ((заказ)|(посылку)|(покупку))',
  '((готов)|(хочу)|(когда можно)|(как)) ((получить)|(забрать)|(оформить)) ((мой)|(свой)) ((заказ)|(посылку)|(покупку))',
  '((заказ)|(посылка)|(покупка)) на выдачу',
  '((оформить)|(сделать)) выдачу ((заказа)|(посылки)|(покупки))',
  '((забираю)|(получаю)) ((заказ)|(посылку)|(по

In [37]:
json.dump(ordering_intents, open("../data/intent_records/ac_ordering.json", "w"), indent=4, ensure_ascii=False)

теперь надо сделать мердж двух датасетов

In [40]:
autopilot_intents = json.load(open("../data/intent_records/ac_robotic_new.json"))
ordering_intents = json.load(open("../data/intent_records/ac_ordering.json"))

merged_intents = autopilot_intents + ordering_intents
len(merged_intents)

31

In [41]:
intent_names = sorted({rec["intent_name"] for rec in merged_intents})
intent_names

['GO',
 'about_pickup_point',
 'disable_autopilot',
 'drop',
 'enable_autopilot',
 'get_order',
 'go_to',
 'have_more_questions',
 'have_no_more_questions',
 'move_backward',
 'move_forward',
 'not_ready_to_get_order',
 'off_topic',
 'order_is_accepted',
 'order_is_not_accepted',
 'pick_up',
 'place',
 'query_delay_order',
 'query_order_conditions',
 'query_return_item',
 'query_working_hours',
 'ready_to_get_order',
 'say',
 'set_point',
 'sit_down',
 'stand_up',
 'status',
 'stop',
 'turn_left',
 'turn_right',
 'world_state']

In [42]:
name_to_id = {name: i for i, name in enumerate(intent_names)}
name_to_id

{'GO': 0,
 'about_pickup_point': 1,
 'disable_autopilot': 2,
 'drop': 3,
 'enable_autopilot': 4,
 'get_order': 5,
 'go_to': 6,
 'have_more_questions': 7,
 'have_no_more_questions': 8,
 'move_backward': 9,
 'move_forward': 10,
 'not_ready_to_get_order': 11,
 'off_topic': 12,
 'order_is_accepted': 13,
 'order_is_not_accepted': 14,
 'pick_up': 15,
 'place': 16,
 'query_delay_order': 17,
 'query_order_conditions': 18,
 'query_return_item': 19,
 'query_working_hours': 20,
 'ready_to_get_order': 21,
 'say': 22,
 'set_point': 23,
 'sit_down': 24,
 'stand_up': 25,
 'status': 26,
 'stop': 27,
 'turn_left': 28,
 'turn_right': 29,
 'world_state': 30}

In [43]:
for intent in merged_intents:
    intent["intent_id"] = name_to_id[intent["intent_name"]]

In [44]:
merged_intents[0]

{'tags': ['movement_direction'],
 'intent_id': 10,
 'intent_name': 'move_forward',
 'sample_utterances': [],
 'regexp_full_match': ['(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)|(отъедь)|(отъезжай)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((2)|(три)|(17)|(десять)){0,1}( метров){0,1}',
  '(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)|(отъедь)|(отъезжай)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((10)|(десять)){0,1}( метров){0,1}(, если можешь){0,1}',
  '((тебе ((надо)|(следует)|(стоит))|ты (можешь|мог бы|не мог бы)) ){0,1}((проехать)|(отъехать)|(доехать)|(продвинуться)|(подвинуться)) ((вперёд)|(вперед)|(по прямой)|(прямо)) (на ){0,1}((6)|(пять)){0,1}( метров){0,1}',
  '(робот(,){0,1} ){0,1}(пожалуйста(,){0,1} ){0,1}((проедь)|(проезжай)|(едь)|(езжай)|(двигайся)|(подвинься)|(продвигайся)) (на ){0,1}((7)|(четыре)|(11)|(двенадцать)){0,1}( метров){0,1}',
  'было бы хоро

In [45]:
json.dump(merged_intents, open("../data/intent_records/ac_merged.json", "w"), indent=4, ensure_ascii=False)