# Data Conversion

Here's the code for converting different datasets to my own intent-record format.

In [2]:
# ! mkdir ../data/intent_records

## Dream

source:
- english https://github.com/deeppavlov/dream/blob/new_intents/annotators/IntentCatcherTransformers/intent_phrases.json
- russian https://github.com/deeppavlov/dream/blob/new_intents/annotators/IntentCatcherTransformers/intent_phrases_RU.json

In [3]:
import json

# Read the raw Dream intent-phrase files. The context managers close the
# handles deterministically, and the explicit UTF-8 encoding keeps the
# Cyrillic patterns in the Russian file readable regardless of the OS locale.
with open('../data/dream.json', encoding='utf-8') as dream_file:
    dream = json.load(dream_file)
with open('../data/ru_dream.json', encoding='utf-8') as ru_dream_file:
    ru_dream = json.load(ru_dream_file)

In [4]:
def convert_dream(dream_dict):
    """Turn a Dream intent-phrases dict into a list of intent records.

    Args:
        dream_dict: mapping with an 'intent_phrases' key whose value maps each
            intent name to a dict holding 'phrases' and, optionally,
            'reg_phrases'.

    Returns:
        A list with one dict per intent, in the iteration order of
        ``dream_dict['intent_phrases']``. ``intent_id`` is the position in that
        order, ``regexp_for_sampling`` is the intent's 'phrases' value,
        ``regexp_as_rules`` is its 'reg_phrases' value (an empty list when the
        key is absent), and ``sample_utterances`` is always an empty list.
    """
    def build_record(position, name, spec):
        # Key order matters for the serialized JSON, so keep it fixed here.
        return {
            'intent_id': position,
            'intent_name': name,
            'sample_utterances': [],
            'regexp_for_sampling': spec['phrases'],
            'regexp_as_rules': spec.get('reg_phrases', []),
        }

    intents = dream_dict['intent_phrases']
    return [
        build_record(position, name, spec)
        for position, (name, spec) in enumerate(intents.items())
    ]

In [5]:
dream_records = convert_dream(dream)

In [6]:
import pandas as pd

pd.DataFrame.from_records(dream_records)

Unnamed: 0,intent_id,intent_name,sample_utterances,regexp_for_sampling,regexp_as_rules
0,0,what_are_you_talking_about,[],"[(alexa ){0,1}what are ((you)|(we)) ((talking ...","[(alexa ){0,1}are we having a communication pr..."
1,1,topic_switching,[],"[(that's ){0,1}enough( talking ){0,1} about ((...","[tell me something else, don't tell me about ...."
2,2,lets_chat_about,[],"[(stop ){0,1}(((let's )|(i want to )|(wanna )|...",[.*let(('s)|(s)) ((chat)|(talk)) ((to|with) (m...
3,3,exit,[],"[be quiet, (see you ){0,1}later, leave me alon...","[(leave|end) (the|this) conversation, alexa do..."
4,4,tell_me_a_story,[],"[tell me ((another)|(other)) story, ((can you ...","[(can you ){0,1}tell me a .* story]"
5,5,repeat,[],"[i did not hear you, what come again, what did...","[one second, what( (book|movie))?, say it agai..."
6,6,yes,[],"[yes yes yes, ((sure)|(fine)|(okay)|(ok)|(yes)...","[you bet, kind of, sort of, oh yeah, maybe, it..."
7,7,no,[],"[(alexa ){0,1}((no)|(nope)|(no way)|(don't)|(d...","[.* no no, .* not today]"
8,8,dont_understand,[],"[(because ){0,1}you are being confusing(, alex...",[]
9,9,stupid,[],"[(alexa ){0,1}why are you this ((stupid)|(dump...",[]


In [7]:
# Persist the English Dream intent records. The `with` block guarantees the
# file is flushed and closed; UTF-8 is set explicitly because
# ensure_ascii=False writes non-ASCII characters verbatim.
with open('../data/intent_records/dream.json', 'w', encoding='utf-8') as out_file:
    json.dump(dream_records, out_file, indent=4, ensure_ascii=False)

In [8]:
dream_ru_records = convert_dream(ru_dream)
pd.DataFrame.from_records(dream_ru_records)

Unnamed: 0,intent_id,intent_name,sample_utterances,regexp_for_sampling,regexp_as_rules
0,0,what_are_you_talking_about,[],"[о ((чем)|(чём)) ты( говоришь){0,1}( вообще){0...","[о ((чем)|(чём)) ты( говоришь){0,1}( вообще){0..."
1,1,topic_switching,[],[((хватит)|(прекрати)|(не хочу)|(не хочу больш...,[((хватит)|(прекрати)|(не хочу)|(не хочу больш...
2,2,lets_chat_about,[],[((можем)|(можешь)|(давай))(( мы)|( ты)|( я)){...,[((можем)|(можешь)|(давай))(( мы)|( ты)|( я)){...
3,3,exit,[],"[пока, хватит, закончим разговор, мне пора]","[пока(((-)|( ))пока){0,1}, хватит ((болтать)|(..."
4,4,repeat,[],"[повтори( еще раз){0,1}( пожалуйста){0,1}, ((м...","[повтори( еще раз){0,1}( пожалуйста){0,1}, ((м..."
5,5,yes,[],[((да)|(конечно)|(разумеется)|(точно)|(согласе...,[((да)|(конечно)|(разумеется)|(точно)|(согласе...
6,6,no,[],[((нет)|(нее)|(неа)|(ни за что)|(ни в коем слу...,[((нет)|(нее)|(неа)|(ни за что)|(ни в коем слу...
7,7,what_is_your_name,[],"[((представься)|(представь себя)), у тебя есть...","[((представься)|(представь себя)), у тебя есть..."
8,8,where_are_you_from,[],"[откуда ((ты)|(вы))( родом){0,1}, ((какая)|(ка...","[откуда ((ты)|(вы))( родом){0,1}, ((какая)|(ка..."
9,9,what_can_you_do,[],[что ты ((умеешь)|(можешь)|(способна)|(способе...,[.*что ты ((умеешь)|(можешь)|(способна)|(спосо...


In [9]:
# Persist the Russian Dream intent records. ensure_ascii=False emits Cyrillic
# as-is, so the file must be opened as UTF-8 rather than with the locale
# default; the `with` block closes the handle once the dump is done.
with open('../data/intent_records/ru_dream.json', 'w', encoding='utf-8') as out_file:
    json.dump(dream_ru_records, out_file, indent=4, ensure_ascii=False)

## banking77

source: https://huggingface.co/datasets/PolyAI/banking77

In [10]:
from datasets import load_dataset

banking77 = load_dataset('PolyAI/banking77')

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
banking77

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [12]:
banking77['train'][0]

{'text': 'I am still waiting on my card?', 'label': 11}

In [13]:
# ! wget https://huggingface.co/datasets/PolyAI/banking77/resolve/main/dataset_infos.json -O ../data/banking77_info.json

### intent records

In [14]:
# Load the dataset metadata and pull out the list of label names;
# convert_banking77 reads `intent_names` as a notebook-level variable.
with open('../data/banking77_info.json', encoding='utf-8') as info_file:
    banking77_info = json.load(info_file)
intent_names = banking77_info['default']['features']['label']['names']

In [15]:
def convert_banking77(banking77_train, shots_per_intent):
    """Build intent records with a few sample utterances per banking77 intent.

    Relies on the notebook-level ``intent_names`` list (defined in an earlier
    cell) for the intent names; it must be defined before this is called.

    Args:
        banking77_train: dataset split exposing ``unique('label')`` and
            ``iter(batch_size=..., drop_last_batch=...)`` with 'text' and
            'label' columns.
        shots_per_intent: maximum number of utterances kept per intent; the
            first ones encountered in dataset order are kept.

    Returns:
        A list of intent-record dicts, one per entry of ``intent_names``, with
        ``sample_utterances`` filled and both regexp lists left empty.

    Raises:
        AssertionError: if the labels present in the split are not exactly
            0..76.
    """
    observed_labels = list(banking77_train.unique('label'))
    observed_labels.sort()
    assert observed_labels == list(range(77))

    records = []
    for idx, label_name in enumerate(intent_names):
        records.append(dict(
            intent_id=idx,
            intent_name=label_name,
            sample_utterances=[],
            regexp_for_sampling=[],
            regexp_as_rules=[],
        ))

    for batch in banking77_train.iter(batch_size=16, drop_last_batch=False):
        for utterance, label in zip(batch['text'], batch['label']):
            shots = records[label]['sample_utterances']
            # Only keep collecting while this intent is below its quota.
            if len(shots) < shots_per_intent:
                shots.append(utterance)

    return records

In [16]:
banking77_records = convert_banking77(banking77['train'], shots_per_intent=5)

In [17]:
banking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ["Please help me with my card.  It won't activate.",
  'I tired but an unable to activate my card.',
  'I want to start using my card.',
  'How do I verify my new card?',
  "I tried activating my plug-in and it didn't piece of work"],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [18]:
# Persist the banking77 intent records; the `with` block closes the handle
# and UTF-8 is explicit because ensure_ascii=False writes characters verbatim.
with open('../data/intent_records/banking77.json', 'w', encoding='utf-8') as out_file:
    json.dump(banking77_records, out_file, indent=4, ensure_ascii=False)

### utterance records

In [5]:
import json

# Reload the intent records written to disk by the conversion cells.
# NOTE: ru_banking77.json is produced by the "russian banking77" section
# further down in this notebook, so that section must have been run at least
# once before this cell can succeed.
with open('../data/intent_records/banking77.json', encoding='utf-8') as en_file:
    banking77_records = json.load(en_file)
with open('../data/intent_records/ru_banking77.json', encoding='utf-8') as ru_file:
    ru_banking77_records = json.load(ru_file)
banking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ["Please help me with my card.  It won't activate.",
  'I tired but an unable to activate my card.',
  'I want to start using my card.',
  'How do I verify my new card?',
  "I tried activating my plug-in and it didn't piece of work"],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [6]:
def get_utterance_records(intent_records):
    """Flatten intent records into one record per sample utterance.

    Args:
        intent_records: iterable of dicts carrying 'intent_id', 'intent_name'
            and a 'sample_utterances' list.

    Returns:
        A list of dicts with keys 'intent_id', 'intent_name' and 'utterance',
        ordered by intent record and then by utterance position. Intents with
        no sample utterances contribute nothing.
    """
    return [
        {
            'intent_id': record['intent_id'],
            'intent_name': record['intent_name'],
            'utterance': text,
        }
        for record in intent_records
        for text in record['sample_utterances']
    ]

In [7]:
banking77_utterance_records = get_utterance_records(banking77_records)
ru_banking77_utterance_records = get_utterance_records(ru_banking77_records)

In [8]:
# Persist the flattened utterance records for both languages. Each file is
# written inside a `with` block so it is closed right away, and opened as
# UTF-8 because ensure_ascii=False writes the Russian text verbatim.
with open('../data/utterance_records/banking77.json', 'w', encoding='utf-8') as en_file:
    json.dump(banking77_utterance_records, en_file, indent=4, ensure_ascii=False)
with open('../data/utterance_records/ru_banking77.json', 'w', encoding='utf-8') as ru_file:
    json.dump(ru_banking77_utterance_records, ru_file, indent=4, ensure_ascii=False)

## russian banking77

source: https://github.com/LadaNikitina/RuBanking77

In [20]:
# ! git clone https://github.com/LadaNikitina/RuBanking77 ../data/RuBanking77
# ! rm -rf ../data/RuBanking77/.git

Cloning into '../data/RuBanking77'...
Username for 'https://github.com': ^C


In [21]:
from datasets import load_from_disk

rubanking77 = load_from_disk('../data/RuBanking77')
rubanking77

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 10003
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 3080
    })
})

In [22]:
rubanking77['train'][0]

{'text': 'Я все еще жду свою карту?', 'label': 11}

In [23]:
rubanking77_records = convert_banking77(rubanking77['train'], shots_per_intent=5)

In [24]:
rubanking77_records[0]

{'intent_id': 0,
 'intent_name': 'activate_my_card',
 'sample_utterances': ['Пожалуйста, помогите мне с моей картой. Она не активируется.',
  'Я устал, но не могу активировать свою карту.',
  'Я хочу начать пользоваться своей картой.',
  'Как мне проверить мою новую карту?',
  'Я попытался активировать свой плагин, и это не сработало.'],
 'regexp_for_sampling': [],
 'regexp_as_rules': []}

In [25]:
# Persist the Russian banking77 intent records. The utterances are Cyrillic
# and ensure_ascii=False writes them verbatim, so the file is opened as UTF-8
# explicitly; the `with` block closes the handle after the dump.
with open('../data/intent_records/ru_banking77.json', 'w', encoding='utf-8') as out_file:
    json.dump(rubanking77_records, out_file, indent=4, ensure_ascii=False)