# Data Conversion

Here's the code for converting different datasets to my own intent-record format.

In [21]:
import json

In [2]:
# ! mkdir ../data/intent_records

## Dream

source:
- english https://github.com/deeppavlov/dream/blob/new_intents/annotators/IntentCatcherTransformers/intent_phrases.json
- russian https://github.com/deeppavlov/dream/blob/new_intents/annotators/IntentCatcherTransformers/intent_phrases_RU.json

In [3]:
import json

# Load the raw Dream intent-phrase files (English and Russian).
# The context managers close the handles right away, and the explicit UTF-8
# encoding keeps the read independent of the platform's default encoding.
with open('../data/dream.json', encoding='utf-8') as fin:
    dream = json.load(fin)
with open('../data/ru_dream.json', encoding='utf-8') as fin:
    ru_dream = json.load(fin)

In [4]:
def convert_dream(dream_dict):
    """Turn a Dream intent-phrases dict into a list of intent records.

    Reads ``dream_dict['intent_phrases']``, a mapping from intent name to a
    dict holding a ``'phrases'`` entry and, optionally, a ``'reg_phrases'``
    entry. Intent ids follow the mapping's iteration order, starting at 0.

    Returns:
        A list of dicts with the keys ``intent_id``, ``intent_name``,
        ``sample_utterances`` (always an empty list here),
        ``regexp_for_sampling`` (the ``'phrases'`` value) and
        ``regexp_as_rules`` (the ``'reg_phrases'`` value, or an empty list
        when that key is absent).
    """
    phrases_by_intent = dream_dict['intent_phrases']
    return [
        {
            'intent_id': idx,
            'intent_name': name,
            'sample_utterances': [],
            'regexp_for_sampling': phrases['phrases'],
            'regexp_as_rules': phrases.get('reg_phrases', []),
        }
        for idx, (name, phrases) in enumerate(phrases_by_intent.items())
    ]

In [5]:
dream_records = convert_dream(dream)

In [6]:
import pandas as pd

pd.DataFrame.from_records(dream_records)

Unnamed: 0,intent_id,intent_name,sample_utterances,regexp_for_sampling,regexp_as_rules
0,0,what_are_you_talking_about,[],"[(alexa ){0,1}what are ((you)|(we)) ((talking ...","[(alexa ){0,1}are we having a communication pr..."
1,1,topic_switching,[],"[(that's ){0,1}enough( talking ){0,1} about ((...","[tell me something else, don't tell me about ...."
2,2,lets_chat_about,[],"[(stop ){0,1}(((let's )|(i want to )|(wanna )|...",[.*let(('s)|(s)) ((chat)|(talk)) ((to|with) (m...
3,3,exit,[],"[be quiet, (see you ){0,1}later, leave me alon...","[(leave|end) (the|this) conversation, alexa do..."
4,4,tell_me_a_story,[],"[tell me ((another)|(other)) story, ((can you ...","[(can you ){0,1}tell me a .* story]"
5,5,repeat,[],"[i did not hear you, what come again, what did...","[one second, what( (book|movie))?, say it agai..."
6,6,yes,[],"[yes yes yes, ((sure)|(fine)|(okay)|(ok)|(yes)...","[you bet, kind of, sort of, oh yeah, maybe, it..."
7,7,no,[],"[(alexa ){0,1}((no)|(nope)|(no way)|(don't)|(d...","[.* no no, .* not today]"
8,8,dont_understand,[],"[(because ){0,1}you are being confusing(, alex...",[]
9,9,stupid,[],"[(alexa ){0,1}why are you this ((stupid)|(dump...",[]


In [7]:
# Persist the English Dream intent records. The context manager flushes and
# closes the file; UTF-8 is set explicitly because ensure_ascii=False emits
# non-ASCII characters verbatim.
with open('../data/intent_records/dream.json', 'w', encoding='utf-8') as fout:
    json.dump(dream_records, fout, indent=4, ensure_ascii=False)

In [8]:
dream_ru_records = convert_dream(ru_dream)
pd.DataFrame.from_records(dream_ru_records)

Unnamed: 0,intent_id,intent_name,sample_utterances,regexp_for_sampling,regexp_as_rules
0,0,what_are_you_talking_about,[],"[о ((чем)|(чём)) ты( говоришь){0,1}( вообще){0...","[о ((чем)|(чём)) ты( говоришь){0,1}( вообще){0..."
1,1,topic_switching,[],[((хватит)|(прекрати)|(не хочу)|(не хочу больш...,[((хватит)|(прекрати)|(не хочу)|(не хочу больш...
2,2,lets_chat_about,[],[((можем)|(можешь)|(давай))(( мы)|( ты)|( я)){...,[((можем)|(можешь)|(давай))(( мы)|( ты)|( я)){...
3,3,exit,[],"[пока, хватит, закончим разговор, мне пора]","[пока(((-)|( ))пока){0,1}, хватит ((болтать)|(..."
4,4,repeat,[],"[повтори( еще раз){0,1}( пожалуйста){0,1}, ((м...","[повтори( еще раз){0,1}( пожалуйста){0,1}, ((м..."
5,5,yes,[],[((да)|(конечно)|(разумеется)|(точно)|(согласе...,[((да)|(конечно)|(разумеется)|(точно)|(согласе...
6,6,no,[],[((нет)|(нее)|(неа)|(ни за что)|(ни в коем слу...,[((нет)|(нее)|(неа)|(ни за что)|(ни в коем слу...
7,7,what_is_your_name,[],"[((представься)|(представь себя)), у тебя есть...","[((представься)|(представь себя)), у тебя есть..."
8,8,where_are_you_from,[],"[откуда ((ты)|(вы))( родом){0,1}, ((какая)|(ка...","[откуда ((ты)|(вы))( родом){0,1}, ((какая)|(ка..."
9,9,what_can_you_do,[],[что ты ((умеешь)|(можешь)|(способна)|(способе...,[.*что ты ((умеешь)|(можешь)|(способна)|(спосо...


In [9]:
# Persist the Russian Dream intent records. ensure_ascii=False writes the
# Cyrillic patterns verbatim, so the file is opened as UTF-8 explicitly and
# closed by the context manager.
with open('../data/intent_records/ru_dream.json', 'w', encoding='utf-8') as fout:
    json.dump(dream_ru_records, fout, indent=4, ensure_ascii=False)

## banking77

source: https://huggingface.co/datasets/PolyAI/banking77

In [10]:
from datasets import load_dataset

banking77 = load_dataset('PolyAI/banking77')

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
banking77

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [12]:
banking77['train'][0]

{'text': 'I am still waiting on my card?', 'label': 11}

In [13]:
# ! wget https://huggingface.co/datasets/PolyAI/banking77/resolve/main/dataset_infos.json -O ../data/banking77_info.json

### intent records

In [14]:
# Read the dataset-info file with a context manager so the handle is closed.
with open('../data/banking77_info.json', encoding='utf-8') as fin:
    banking77_info = json.load(fin)
# List of label names; convert_banking77 treats position i as the name of label i.
intent_names = banking77_info['default']['features']['label']['names']

In [15]:
def convert_banking77(banking77_train, shots_per_intent, intent_names):
    """Build few-shot intent records from a banking77-style train split.

    Args:
        banking77_train: object exposing ``unique(column)`` and
            ``iter(batch_size=..., drop_last_batch=...)``, with a ``'text'``
            column and an integer ``'label'`` column.
        shots_per_intent: maximum number of utterances kept per intent.
        intent_names: intent names; position ``i`` names label ``i``.

    Returns:
        One record per entry of ``intent_names`` with keys ``intent_id``,
        ``intent_name``, ``sample_utterances``, ``regexp_for_sampling`` and
        ``regexp_as_rules``. ``sample_utterances`` holds the first
        ``shots_per_intent`` texts seen for that label, in dataset order.

    Raises:
        AssertionError: if the sorted distinct labels are not exactly
            ``0 .. len(intent_names) - 1``.
    """
    observed_labels = sorted(banking77_train.unique('label'))
    expected_labels = list(range(len(intent_names)))
    assert observed_labels == expected_labels

    records = []
    for idx, intent_name in enumerate(intent_names):
        records.append({
            'intent_id': idx,
            'intent_name': intent_name,
            'sample_utterances': [],
            'regexp_for_sampling': [],
            'regexp_as_rules': [],
        })

    for chunk in banking77_train.iter(batch_size=16, drop_last_batch=False):
        for utterance, label in zip(chunk['text'], chunk['label']):
            shots = records[label]['sample_utterances']
            # Only the first `shots_per_intent` utterances per intent are kept.
            if len(shots) < shots_per_intent:
                shots.append(utterance)

    return records

In [16]:
banking77_records = convert_banking77(banking77['train'], shots_per_intent=5, intent_names=intent_names)

In [17]:
banking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ["Please help me with my card.  It won't activate.",
  'I tired but an unable to activate my card.',
  'I want to start using my card.',
  'How do I verify my new card?',
  "I tried activating my plug-in and it didn't piece of work"],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [18]:
# Persist the banking77 intent records; the context manager closes the file
# and the encoding is pinned to UTF-8 to match ensure_ascii=False.
with open('../data/intent_records/banking77.json', 'w', encoding='utf-8') as fout:
    json.dump(banking77_records, fout, indent=4, ensure_ascii=False)

### utterance records

In [5]:
import json

# NOTE: ru_banking77.json is written by the "russian banking77" section further
# down in this notebook, so unless the file already exists that section has to
# be run before this cell.
with open('../data/intent_records/banking77.json', encoding='utf-8') as fin:
    banking77_records = json.load(fin)
with open('../data/intent_records/ru_banking77.json', encoding='utf-8') as fin:
    ru_banking77_records = json.load(fin)
banking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ["Please help me with my card.  It won't activate.",
  'I tired but an unable to activate my card.',
  'I want to start using my card.',
  'How do I verify my new card?',
  "I tried activating my plug-in and it didn't piece of work"],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [6]:
def get_utterance_records(intent_records):
    """Flatten intent records into one record per sample utterance.

    Args:
        intent_records: iterable of dicts with the keys ``intent_id``,
            ``intent_name`` and ``sample_utterances``.

    Returns:
        A list of dicts with the keys ``intent_id``, ``intent_name`` and
        ``utterance``, ordered by intent record and then by utterance.
        Intents without sample utterances contribute nothing.
    """
    return [
        {
            'intent_id': intent['intent_id'],
            'intent_name': intent['intent_name'],
            'utterance': text,
        }
        for intent in intent_records
        for text in intent['sample_utterances']
    ]

In [7]:
banking77_utterance_records = get_utterance_records(banking77_records)
ru_banking77_utterance_records = get_utterance_records(ru_banking77_records)

In [8]:
# Persist the flattened utterance records for both languages. Each file is
# closed by its context manager and written as UTF-8, since ensure_ascii=False
# leaves non-ASCII text unescaped.
with open('../data/utterance_records/banking77.json', 'w', encoding='utf-8') as fout:
    json.dump(banking77_utterance_records, fout, indent=4, ensure_ascii=False)
with open('../data/utterance_records/ru_banking77.json', 'w', encoding='utf-8') as fout:
    json.dump(ru_banking77_utterance_records, fout, indent=4, ensure_ascii=False)

## russian banking77

source: https://github.com/LadaNikitina/RuBanking77

In [20]:
# ! git clone https://github.com/LadaNikitina/RuBanking77 ../data/RuBanking77
# ! rm -rf ../data/RuBanking77/.git

Cloning into '../data/RuBanking77'...
Username for 'https://github.com': ^C


In [21]:
from datasets import load_from_disk

rubanking77 = load_from_disk('../data/RuBanking77')
rubanking77

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [22]:
rubanking77['train'][0]

{'text': 'Я все еще жду свою карту?', 'label': 11}

In [23]:
# convert_banking77 requires the label names explicitly; RuBanking77 is converted
# with the `intent_names` list read from banking77_info.json in the banking77 section.
rubanking77_records = convert_banking77(rubanking77['train'], shots_per_intent=5, intent_names=intent_names)

In [24]:
rubanking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ['Пожалуйста, помогите мне с моей картой. Она не активируется.',
  'Я устал, но не могу активировать свою карту.',
  'Я хочу начать пользоваться своей картой.',
  'Как мне проверить мою новую карту?',
  'Я попытался активировать свой плагин, и это не сработало.'],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [25]:
# Persist the Russian banking77 intent records. The Cyrillic utterances are
# written verbatim (ensure_ascii=False), so UTF-8 is requested explicitly and
# the context manager closes the file.
with open('../data/intent_records/ru_banking77.json', 'w', encoding='utf-8') as fout:
    json.dump(rubanking77_records, fout, indent=4, ensure_ascii=False)

## clinc150

In [13]:
from datasets import load_dataset

clinc150 = load_dataset("cmaldona/All-Generalization-OOD-CLINC150")
clinc150

Downloading readme:   0%|          | 0.00/497 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/212k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/610k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15200 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3200 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7900 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['data', 'labels', 'domain', 'generalisation'],
        num_rows: 15200
    })
    validation: Dataset({
        features: ['data', 'labels', 'domain', 'generalisation'],
        num_rows: 3200
    })
    test: Dataset({
        features: ['data', 'labels', 'domain', 'generalisation'],
        num_rows: 7900
    })
})

In [16]:
intent_names = sorted(clinc150['train'].unique('labels'))
print(len(intent_names))
intent_names[:5]

151


['accept_reservations',
 'account_blocked',
 'alarm',
 'application_status',
 'apr']

In [17]:
def convert_clinc150(clinc150_train, shots_per_intent):
    """Build few-shot intent records from the CLINC150 train split.

    Intent names are the distinct values of the ``'labels'`` column; an
    intent's id is its position in the sorted list of names. Utterances are
    taken from the ``'data'`` column.

    Args:
        clinc150_train: object exposing ``unique(column)`` and
            ``iter(batch_size=..., drop_last_batch=...)``.
        shots_per_intent: maximum number of utterances kept per intent.

    Returns:
        One record per intent (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        utterances seen for that intent, in dataset order.
    """
    labels = list(clinc150_train.unique('labels'))
    labels.sort()

    index_of = {}
    records = []
    for position, label in enumerate(labels):
        index_of[label] = position
        records.append({
            'intent_id': position,
            'intent_name': label,
            'sample_utterances': [],
            'regexp_for_sampling': [],
            'regexp_as_rules': [],
        })

    for chunk in clinc150_train.iter(batch_size=16, drop_last_batch=False):
        for utterance, label in zip(chunk['data'], chunk['labels']):
            shots = records[index_of[label]]['sample_utterances']
            if len(shots) < shots_per_intent:
                shots.append(utterance)

    return records

In [18]:
clinc150_records = convert_clinc150(clinc150['train'], shots_per_intent=5)

In [19]:
clinc150_records[0]

{'intent_id': 0,
 'intent_name': 'accept_reservations',
 'sample_utterances': ['can i make a reservation for redrobin',
  'is it possible to make a reservation at redrobin',
  'does redrobin take reservations',
  'are reservations taken at redrobin',
  'does redrobin do reservations'],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [22]:
# Persist the CLINC150 intent records; the context manager closes the file
# and the encoding is pinned to UTF-8 to match ensure_ascii=False.
with open('../data/intent_records/clinc150.json', 'w', encoding='utf-8') as fout:
    json.dump(clinc150_records, fout, indent=4, ensure_ascii=False)

## russian clinc150

In [None]:
# ! git clone https://github.com/LadaNikitina/clinc150 ../data/RuClinc150
# ! rm -rf ../data/RuClinc150/.git

In [23]:
from datasets import load_from_disk

ruclinc150 = load_from_disk('../data/RuClinc150')
ruclinc150

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 15250
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
})

In [24]:
ruclinc150['train'][0]

{'text': 'Какое выражение я бы использовал, чтобы сказать, что я люблю тебя, если бы я был итальянцем?',
 'intent': 61}

In [25]:
def convert_ruclinc150(clinc150_train, shots_per_intent, n_intents=151):
    """Build few-shot intent records from the RuClinc150 train split.

    Labels are read from the integer ``'intent'`` column and utterances from
    the ``'text'`` column. The split carries no intent names here, so
    ``intent_name`` is ``None`` in every record.

    Args:
        clinc150_train: object exposing ``unique(column)`` and
            ``iter(batch_size=..., drop_last_batch=...)``.
        shots_per_intent: maximum number of utterances kept per intent.
        n_intents: number of distinct intent ids expected in the split.
            Defaults to 151, the value this notebook uses for RuClinc150.

    Returns:
        ``n_intents`` records (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        utterances seen for that intent id, in dataset order.

    Raises:
        AssertionError: if the sorted distinct labels are not exactly
            ``0 .. n_intents - 1``.
    """
    all_labels = sorted(clinc150_train.unique('intent'))
    assert all_labels == list(range(n_intents)), (
        'expected intent ids 0..%d, found %d distinct labels'
        % (n_intents - 1, len(all_labels))
    )

    res = [{
        'intent_id': i,
        'intent_name': None,
        'sample_utterances': [],
        'regexp_for_sampling': [],
        'regexp_as_rules': []
    } for i in range(n_intents)]

    for batch in clinc150_train.iter(batch_size=16, drop_last_batch=False):
        for txt, intent_id in zip(batch['text'], batch['intent']):
            target_list = res[intent_id]['sample_utterances']
            # Keep only the first `shots_per_intent` utterances per intent.
            if len(target_list) < shots_per_intent:
                target_list.append(txt)

    return res

In [27]:
ruclinc150_records = convert_ruclinc150(ruclinc150['train'], shots_per_intent=5)

In [28]:
# Persist the Russian CLINC150 intent records. The Cyrillic utterances are
# written verbatim (ensure_ascii=False), so UTF-8 is requested explicitly and
# the context manager closes the file.
with open('../data/intent_records/ru_clinc150.json', 'w', encoding='utf-8') as fout:
    json.dump(ruclinc150_records, fout, indent=4, ensure_ascii=False)

## Snips

In [42]:
from datasets import load_dataset

snips = load_dataset("benayas/snips")
snips

Downloading readme:   0%|          | 0.00/426 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/370k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/45.0k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/13084 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1400 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'category'],
        num_rows: 13084
    })
    test: Dataset({
        features: ['text', 'category'],
        num_rows: 1400
    })
})

In [44]:
intent_names = sorted(snips['train'].unique('category'))
print(len(intent_names))
intent_names

7


['AddToPlaylist',
 'BookRestaurant',
 'GetWeather',
 'PlayMusic',
 'RateBook',
 'SearchCreativeWork',
 'SearchScreeningEvent']

In [47]:
def convert_snips(snips_train, shots_per_intent):
    """Build few-shot intent records from the Snips train split.

    Intent names are the distinct values of the ``'category'`` column; an
    intent's id is its position in the sorted list of names. Utterances are
    taken from the ``'text'`` column.

    Args:
        snips_train: object exposing ``unique(column)`` and
            ``iter(batch_size=..., drop_last_batch=...)``.
        shots_per_intent: maximum number of utterances kept per intent.

    Returns:
        One record per intent (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        utterances seen for that intent, in dataset order.
    """
    categories = sorted(snips_train.unique('category'))
    category_to_id = {category: idx for idx, category in enumerate(categories)}

    records = []
    for idx, category in enumerate(categories):
        records.append({
            'intent_id': idx,
            'intent_name': category,
            'sample_utterances': [],
            'regexp_for_sampling': [],
            'regexp_as_rules': [],
        })

    for chunk in snips_train.iter(batch_size=16, drop_last_batch=False):
        texts, categories_in_chunk = chunk['text'], chunk['category']
        for utterance, category in zip(texts, categories_in_chunk):
            shots = records[category_to_id[category]]['sample_utterances']
            if len(shots) < shots_per_intent:
                shots.append(utterance)

    return records

In [48]:
snips_records = convert_snips(snips['train'], shots_per_intent=5)

In [49]:
# Persist the Snips intent records; the context manager closes the file
# and the encoding is pinned to UTF-8 to match ensure_ascii=False.
with open('../data/intent_records/snips.json', 'w', encoding='utf-8') as fout:
    json.dump(snips_records, fout, indent=4, ensure_ascii=False)

## russian Snips

In [None]:
# ! git clone https://github.com/LadaNikitina/Snips ../data/RuSnips
# ! rm -rf ../data/RuSnips/.git

In [30]:
from datasets import load_from_disk

rusnips = load_from_disk('../data/RuSnips')
rusnips

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 13784
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 700
    })
})

In [31]:
rusnips['train'][0]

{'text': 'Добавьте еще одну песню в плейлист Cita Romántica.',
 'intent': 'add_to_playlist'}

In [33]:
rusnips['train'].unique('intent')

['add_to_playlist',
 'book_restaurant',
 'get_weather',
 'play_music',
 'rate_book',
 'search_creative_work',
 'search_screening_event']

In [39]:
def convert_rusnips(snips_train, shots_per_intent):
    """Build few-shot intent records from the RuSnips train split.

    Intent names are the distinct values of the ``'intent'`` column; an
    intent's id is its position in the sorted list of names. Utterances are
    taken from the ``'text'`` column.

    Args:
        snips_train: object exposing ``unique(column)`` and
            ``iter(batch_size=..., drop_last_batch=...)``.
        shots_per_intent: maximum number of utterances kept per intent.

    Returns:
        One record per intent (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        utterances seen for that intent, in dataset order.
    """
    ordered_names = sorted(snips_train.unique('intent'))

    def _blank_record(idx, name):
        # Fresh record with its own (unshared) lists.
        return {
            'intent_id': idx,
            'intent_name': name,
            'sample_utterances': [],
            'regexp_for_sampling': [],
            'regexp_as_rules': [],
        }

    records = [_blank_record(idx, name) for idx, name in enumerate(ordered_names)]
    shots_by_name = {rec['intent_name']: rec['sample_utterances'] for rec in records}

    for chunk in snips_train.iter(batch_size=16, drop_last_batch=False):
        for utterance, name in zip(chunk['text'], chunk['intent']):
            shots = shots_by_name[name]
            if len(shots) < shots_per_intent:
                shots.append(utterance)

    return records

In [40]:
rusnips_records = convert_rusnips(rusnips['train'], shots_per_intent=5)

In [41]:
# Persist the Russian Snips intent records. The Cyrillic utterances are
# written verbatim (ensure_ascii=False), so UTF-8 is requested explicitly and
# the context manager closes the file.
with open('../data/intent_records/ru_snips.json', 'w', encoding='utf-8') as fout:
    json.dump(rusnips_records, fout, indent=4, ensure_ascii=False)

## hwu64

source: https://github.com/jianguoz/Few-Shot-Intent-Detection/tree/main/Datasets/HWU64/train

In [1]:
# One intent label per line. Iterating over the file keeps the final line even
# when the file has no trailing newline, and the context manager closes the handle.
with open("../data/hwu_assets/label.txt", encoding='utf-8') as fin:
    hwu64_labels = [line.rstrip('\n') for line in fin]
print(len(hwu64_labels))
hwu64_labels[:5]

8954


['alarm_query', 'alarm_query', 'alarm_query', 'alarm_query', 'alarm_query']

In [2]:
# One utterance per line, in the same order as the labels file. Iterating over
# the file keeps the final line even when the file has no trailing newline,
# and the context manager closes the handle.
with open("../data/hwu_assets/seq.in", encoding='utf-8') as fin:
    hwu64_utterances = [line.rstrip('\n') for line in fin]
print(len(hwu64_utterances))
hwu64_utterances[:5]

8954


['what alarms do i have set right now',
 'checkout today alarm of meeting',
 'report alarm settings',
 'see see for me the alarms that you have set tomorrow morning',
 'is there an alarm for ten am']

In [3]:
len(set(hwu64_labels))

64

In [4]:
def convert_hwu64(hwu_utterances, hwu_labels, shots_per_intent):
    """Build few-shot intent records from parallel HWU64 utterance/label lists.

    Intent names are the distinct values of ``hwu_labels``; an intent's id is
    its position in the sorted list of names.

    Args:
        hwu_utterances: sequence of utterance strings.
        hwu_labels: sequence of intent names, paired positionally with
            ``hwu_utterances`` (pairing stops at the shorter sequence).
        shots_per_intent: maximum number of utterances kept per intent.

    Returns:
        One record per intent (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        utterances seen for that intent, in input order.
    """
    ordered_names = sorted(set(hwu_labels))
    position_of = {name: idx for idx, name in enumerate(ordered_names)}

    records = []
    for idx, name in enumerate(ordered_names):
        records.append({
            'intent_id': idx,
            'intent_name': name,
            'sample_utterances': [],
            'regexp_for_sampling': [],
            'regexp_as_rules': [],
        })

    for utterance, name in zip(hwu_utterances, hwu_labels):
        shots = records[position_of[name]]['sample_utterances']
        if len(shots) < shots_per_intent:
            shots.append(utterance)

    return records

In [5]:
hwu64_records = convert_hwu64(hwu64_utterances, hwu64_labels, shots_per_intent=5)

In [6]:
import json
# Persist the HWU64 intent records; the context manager closes the file
# and the encoding is pinned to UTF-8 to match ensure_ascii=False.
with open('../data/intent_records/hwu64.json', 'w', encoding='utf-8') as fout:
    json.dump(hwu64_records, fout, indent=4, ensure_ascii=False)

## russian hwu64

In [None]:
# ! git clone https://github.com/LadaNikitina/HWU64 ../data/RuHWU64
# ! rm -rf ../data/RuHWU64/.git

In [74]:
from datasets import load_from_disk

ruhwu64 = load_from_disk('../data/RuHWU64')
ruhwu64

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 25606
    })
})

In [75]:
ruhwu64['train'][0]

{'text': 'Разбуди меня в 5 утра на этой неделе', 'intent': 'set'}

In [80]:
intent_names = sorted(set(ruhwu64['train'].unique('intent')))
print(len(intent_names))
intent_names[:5]

54


['addcontact', 'affirm', 'audiobook', 'cleaning', 'coffee']

In [81]:
def convert_ruhwu64(hwu64_train, shots_per_intent):
    """Build few-shot intent records from the RuHWU64 train split.

    Intent names are the distinct values of the ``'intent'`` column; an
    intent's id is its position in the sorted list of names. Utterances are
    taken from the ``'text'`` column.

    Args:
        hwu64_train: object exposing ``unique(column)`` and
            ``iter(batch_size=..., drop_last_batch=...)``.
        shots_per_intent: maximum number of utterances kept per intent.

    Returns:
        One record per intent (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        utterances seen for that intent, in dataset order.
    """
    names = list(hwu64_train.unique('intent'))
    names.sort()
    id_of = dict((name, idx) for idx, name in enumerate(names))

    records = [
        dict(
            intent_id=idx,
            intent_name=name,
            sample_utterances=[],
            regexp_for_sampling=[],
            regexp_as_rules=[],
        )
        for idx, name in enumerate(names)
    ]

    for chunk in hwu64_train.iter(batch_size=16, drop_last_batch=False):
        for utterance, name in zip(chunk['text'], chunk['intent']):
            shots = records[id_of[name]]['sample_utterances']
            if not len(shots) >= shots_per_intent:
                shots.append(utterance)

    return records

In [82]:
ruhwu64_records = convert_ruhwu64(ruhwu64['train'], shots_per_intent=5)

In [83]:
# Persist the Russian HWU64 intent records. The Cyrillic utterances are
# written verbatim (ensure_ascii=False), so UTF-8 is requested explicitly and
# the context manager closes the file.
with open('../data/intent_records/ru_hwu64.json', 'w', encoding='utf-8') as fout:
    json.dump(ruhwu64_records, fout, indent=4, ensure_ascii=False)

## russian Minds14

In [86]:
from datasets import load_dataset

ruminds14 = load_dataset("PolyAI/minds14", 'ru-RU')
ruminds14

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
        num_rows: 539
    })
})

In [87]:
ruminds14["train"][0]

{'path': '/home/voorhs/.cache/huggingface/datasets/downloads/extracted/f8075d4661a5d714b98ee4adbe7d239eb1a69d19d3cd7fa9cd42aeb27cab93c3/ru-RU~LATEST_TRANSACTIONS/6030093cbb1e6d0fbce93a74.wav',
 'audio': {'path': '/home/voorhs/.cache/huggingface/datasets/downloads/extracted/f8075d4661a5d714b98ee4adbe7d239eb1a69d19d3cd7fa9cd42aeb27cab93c3/ru-RU~LATEST_TRANSACTIONS/6030093cbb1e6d0fbce93a74.wav',
  'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00048828,
         -0.00073242, -0.00073242]),
  'sampling_rate': 8000},
 'transcription': 'Здравствуйте я бы хотела пересмотреть свои предыдущие последние операции которые проходили по моей карте прямым помимо ему счёту Покажите пожалуйста операции последних трёх месяцев',
 'english_transcription': 'Hello, I would like to review my previous last transactions that took place on my card directly in addition to his account. Please show the transactions of the last three months',
 'intent_class': 12,
 'lang_id': 12}

In [89]:
sorted(ruminds14["train"].unique('intent_class'))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

In [90]:
def convert_ruminds14(minds14_train, shots_per_intent, n_intents=14):
    """Build few-shot intent records from a MINDS-14 split, using the native transcription.

    Utterances come from the ``'transcription'`` column and labels from the
    integer ``'intent_class'`` column. No intent names are attached, so
    ``intent_name`` is ``None`` in every record.

    Args:
        minds14_train: object exposing
            ``iter(batch_size=..., drop_last_batch=...)``.
        shots_per_intent: maximum number of utterances kept per intent.
        n_intents: number of records to create; every ``intent_class`` value
            is used as an index into them. Defaults to 14, the number of
            classes this notebook observes in the ru-RU split.

    Returns:
        ``n_intents`` records (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        transcriptions seen for that class, in dataset order.

    Raises:
        IndexError: if an ``intent_class`` value is ``>= n_intents``.
    """
    res = [{
        'intent_id': i,
        'intent_name': None,
        'sample_utterances': [],
        'regexp_for_sampling': [],
        'regexp_as_rules': []
    } for i in range(n_intents)]

    for batch in minds14_train.iter(batch_size=16, drop_last_batch=False):
        for txt, intent_id in zip(batch['transcription'], batch['intent_class']):
            target_list = res[intent_id]['sample_utterances']
            # Keep only the first `shots_per_intent` utterances per class.
            if len(target_list) < shots_per_intent:
                target_list.append(txt)

    return res

In [91]:
ruminds14_records = convert_ruminds14(ruminds14['train'], shots_per_intent=5)

In [92]:
ruminds14_records[0]

{'intent_id': 0,
 'intent_name': None,
 'sample_utterances': ['Здравствуйте я хотел бы узнать Могу ли я использовать свою карту за границей и нужно мне для этого предупредить Мой Банк Спасибо',
  'Здравствуйте какая комиссия будет если я буду использовать карту свою заграницы',
  'Здравствуйте Через несколько дней я уезжаю в швецию на целый месяц я хотел бы узнать Могу ли я платить моей карты Спасибо',
  'Здравствуйте я уезжаю на несколько недель заграницу и хотела бы узнать Могу ли я пользоваться твоей картой там Я бы хотела оплачивать покупки в Америке Нужно ли мне что-нибудь для этого делать',
  'Здравствуйте мне хотелось бы задать вам один вопрос я уезжаю в отпуск и мне нужно уточнить работать будет ли моя кредитная карточка За границей'],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [93]:
# Persist the Russian MINDS-14 intent records. The Cyrillic transcriptions are
# written verbatim (ensure_ascii=False), so UTF-8 is requested explicitly and
# the context manager closes the file.
with open('../data/intent_records/ru_minds14.json', 'w', encoding='utf-8') as fout:
    json.dump(ruminds14_records, fout, indent=4, ensure_ascii=False)

## Minds14

In [94]:
def convert_minds14(minds14_train, shots_per_intent, n_intents=14):
    """Build few-shot intent records from a MINDS-14 split, using the English transcription.

    Utterances come from the ``'english_transcription'`` column and labels
    from the integer ``'intent_class'`` column. No intent names are attached,
    so ``intent_name`` is ``None`` in every record.

    Args:
        minds14_train: object exposing
            ``iter(batch_size=..., drop_last_batch=...)``.
        shots_per_intent: maximum number of utterances kept per intent.
        n_intents: number of records to create; every ``intent_class`` value
            is used as an index into them. Defaults to 14.

    Returns:
        ``n_intents`` records (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        English transcriptions seen for that class, in dataset order.

    Raises:
        IndexError: if an ``intent_class`` value is ``>= n_intents``.
    """
    res = [{
        'intent_id': i,
        'intent_name': None,
        'sample_utterances': [],
        'regexp_for_sampling': [],
        'regexp_as_rules': []
    } for i in range(n_intents)]

    for batch in minds14_train.iter(batch_size=16, drop_last_batch=False):
        for txt, intent_id in zip(batch['english_transcription'], batch['intent_class']):
            target_list = res[intent_id]['sample_utterances']
            # Keep only the first `shots_per_intent` utterances per class.
            if len(target_list) < shots_per_intent:
                target_list.append(txt)

    return res

In [95]:
# NOTE(review): the input is the ru-RU split loaded in the "russian Minds14" section;
# convert_minds14 reads its 'english_transcription' column, so the records saved as
# minds14.json are English renderings of the Russian recordings rather than a
# separately loaded English subset — confirm this is intended.
minds14_records = convert_minds14(ruminds14['train'], shots_per_intent=5)

In [96]:
minds14_records[0]

{'intent_id': 0,
 'intent_name': None,
 'sample_utterances': ['Hello, I would like to know if I can use my card abroad and I need to warn My Bank for this. Thank you',
  'Hello, what will be the commission if I use my card abroad',
  'Hello In a few days I am leaving for sweden for a whole month I would like to know if I can pay with my card Thank you',
  'Hello, I am going abroad for a few weeks and would like to know Can I use your card there I would like to pay for purchases in America Do I need to do anything for this',
  'Hello, I would like to ask you one question, I am going on vacation and I need to clarify whether my credit card will work abroad'],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [97]:
# Persist the English MINDS-14 intent records; the context manager closes the
# file and the encoding is pinned to UTF-8 to match ensure_ascii=False.
with open('../data/intent_records/minds14.json', 'w', encoding='utf-8') as fout:
    json.dump(minds14_records, fout, indent=4, ensure_ascii=False)

## Massive

In [98]:
from datasets import load_dataset

massive = load_dataset("mteb/amazon_massive_intent", 'en')
massive

Downloading readme:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/187k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/54.1k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/38.3k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11514 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2974 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2033 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 11514
    })
    test: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 2974
    })
    validation: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 2033
    })
})

In [101]:
massive['train'][0]

{'id': '1',
 'label': 'alarm_set',
 'label_text': 'alarm_set',
 'text': 'wake me up at nine am on friday',
 'lang': 'en'}

In [104]:
intent_names = sorted(massive['train'].unique('label'))
print(len(intent_names))
intent_names[:5]

60


['alarm_query',
 'alarm_remove',
 'alarm_set',
 'audio_volume_down',
 'audio_volume_mute']

In [108]:
def convert_massive(massive_train, shots_per_intent):
    """Build few-shot intent records from an Amazon MASSIVE intent train split.

    Intent names are the distinct values of the ``'label'`` column; an
    intent's id is its position in the sorted list of names. Utterances are
    taken from the ``'text'`` column.

    Args:
        massive_train: object exposing ``unique(column)`` and
            ``iter(batch_size=..., drop_last_batch=...)``.
        shots_per_intent: maximum number of utterances kept per intent.

    Returns:
        One record per intent (keys ``intent_id``, ``intent_name``,
        ``sample_utterances``, ``regexp_for_sampling``, ``regexp_as_rules``),
        where ``sample_utterances`` holds the first ``shots_per_intent``
        utterances seen for that intent, in dataset order.
    """
    sorted_labels = sorted(massive_train.unique('label'))

    records = []
    label_to_index = {}
    for index, label in enumerate(sorted_labels):
        label_to_index[label] = index
        records.append({
            'intent_id': index,
            'intent_name': label,
            'sample_utterances': [],
            'regexp_for_sampling': [],
            'regexp_as_rules': [],
        })

    for chunk in massive_train.iter(batch_size=16, drop_last_batch=False):
        pairs = zip(chunk['text'], chunk['label'])
        for utterance, label in pairs:
            shots = records[label_to_index[label]]['sample_utterances']
            if len(shots) < shots_per_intent:
                shots.append(utterance)

    return records

In [109]:
massive_records = convert_massive(massive['train'], shots_per_intent=5)

In [110]:
massive_records[0]

{'intent_id': 0,
 'intent_name': 'alarm_query',
 'sample_utterances': ['please list active alarms',
  'show me the alarms i set',
  'do i have any alarms',
  'show alarms',
  'do i have an alarm set for morning flight'],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [111]:
# Persist the English MASSIVE intent records; the context manager closes the
# file and the encoding is pinned to UTF-8 to match ensure_ascii=False.
with open('../data/intent_records/massive.json', 'w', encoding='utf-8') as fout:
    json.dump(massive_records, fout, indent=4, ensure_ascii=False)

## russian Massive

In [112]:
from datasets import load_dataset

rumassive = load_dataset("mteb/amazon_massive_intent", 'ru')
rumassive

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/51 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/262k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/73.4k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/51.8k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11514 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2974 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2033 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 11514
    })
    test: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 2974
    })
    validation: Dataset({
        features: ['id', 'label', 'label_text', 'text', 'lang'],
        num_rows: 2033
    })
})

In [113]:
rumassive['train'][0]

{'id': '1',
 'label': 'alarm_set',
 'label_text': 'alarm_set',
 'text': 'разбуди меня в девять утра в пятницу',
 'lang': 'ru'}

In [114]:
rumassive_records = convert_massive(rumassive['train'], shots_per_intent=5)

In [115]:
rumassive_records[0]

{'intent_id': 0,
 'intent_name': 'alarm_query',
 'sample_utterances': ['пожалуйста список активных будильников',
  'покажи мне будильники которые я установил',
  'у меня есть какие-то будильники',
  'покажи будильники',
  'имею ли я будильник на утренний рейс'],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [116]:
# Persist the Russian MASSIVE intent records. The Cyrillic utterances are
# written verbatim (ensure_ascii=False), so UTF-8 is requested explicitly and
# the context manager closes the file.
with open('../data/intent_records/ru_massive.json', 'w', encoding='utf-8') as fout:
    json.dump(rumassive_records, fout, indent=4, ensure_ascii=False)