In [1]:
import os
import json
import pandas as pd

In [2]:
dialogue_dir = "dialogue_data"
train_data_path = os.path.join(dialogue_dir, "train_data.json")

In [None]:
# Creating a single json file for all the training dialogues

train_data = []
train_dir = os.path.join(dialogue_dir, "train")
for dialogue in os.listdir(train_dir):
    dialogue_path = os.path.join(train_dir, dialogue)
    with open(dialogue_path, "r") as train_json:
        dialogues = json.load(train_json)
        train_data.extend(dialogues)
    train_json.close()

with open(train_data_path, "w") as train_json:
    json.dump(train_data, train_json, indent=2)
train_json.close()
train_data = []

In [None]:
# Creating a csv file for the dialogues train dataset

with open(train_data_path, "r") as train_json:
    train_data = json.load(train_json)
train_json.close()

train_csv = []

for dialogue in train_data:
    for turn in dialogue["turns"]:
        row = {}
        row["Speaker"] = turn["speaker"]
        row["Turn ID"] = turn["turn_id"]
        row["Utterence"] = turn["utterance"]
        row["Dialogue ID"] = dialogue["dialogue_id"]
        train_csv.append(row)
train_csv = pd.DataFrame(train_csv, columns=["Dialogue ID", "Speaker", "Turn ID", "Utterence"])
train_csv.to_csv(f"{train_data_path[:-5]}.csv", index=False)

# Bulk Labelling

In [3]:
import pathlib 
import numpy as np
import pandas as pd
import ipywidgets as widgets

from whatlies import EmbeddingSet 
from whatlies.transformers import Pca, Umap
from hulearn.preprocessing import InteractivePreprocessor
from hulearn.experimental.interactive import InteractiveCharts
from whatlies.language import UniversalSentenceLanguage, LaBSELanguage

In [4]:
txt = pathlib.Path("./data/nlu.yml").read_text()
texts = list(set([t.replace(" - ", "") for t in txt.split("\n") if len(t) > 0 and t[0] != "#"]))
print(f"We're going to label {len(texts)} texts.")

We're going to label 78 texts.


In [5]:
# The language agnostic bert model works is a good starting option, 
# especially for Non-English use-cases but it is a fair bit slower.
# You can swap this out with another embedding source if you feel like though. 
# lang = LaBSELanguage()
lang = UniversalSentenceLanguage(variant="large")

In [6]:
# This is where we prepare all of the state
embset = lang[texts]
df = embset.transform(Umap(2)).to_dataframe().reset_index()
df.columns = ['text', 'd1', 'd2']
df['label'] = ''

In [7]:
# Here's the global state object
state = {}
state['df'] = df.copy()
state['chart'] = InteractiveCharts(df.loc[lambda d: d['label'] == ''], labels=['group'])

In [8]:
pd.set_option('display.max_colwidth', -1)

def show_draw_chart(b=None):
    with out_table:
        out_table.clear_output()
    with out_chart:
        out_chart.clear_output()
        state['chart'].dataf = state['df'].loc[lambda d: d['label'] == '']
        state['chart'].charts = []
        state['chart'].add_chart(x='d1', y='d2', legend=False)

def show_examples(b=None):
    with out_table:
        out_table.clear_output()
        tfm = InteractivePreprocessor(json_desc=state['chart'].data())
        subset = state['df'].pipe(tfm.pandas_pipe).loc[lambda d: d['group'] != 0]
        display(subset.sample(min(15, subset.shape[0]))[['text']])

def assign_label(b=None):
    tfm = InteractivePreprocessor(json_desc=state['chart'].data())
    idx = state['df'].pipe(tfm.pandas_pipe).loc[lambda d: d['group'] != 0].index
    state['df'].iloc[idx, 3] = label_name.value
    with out_counter:
        out_counter.clear_output()
        n_lab = state['df'].loc[lambda d: d['label'] != ''].shape[0]
        print(f"{n_lab}/{state['df'].shape[0]} labelled")

def retrain_state(b=None):
    keep = list(state['df'].loc[lambda d: d['label'] == '']['text'])
    umap = Umap(2)
    new_df = EmbeddingSet(*[e for e in embset if e.name in keep]).transform(umap).to_dataframe().reset_index()
    new_df.columns = ['text', 'd1', 'd2']
    new_df['label'] = ''
    state['df'] = pd.concat([new_df, state['df'].loc[lambda d: d['label'] != '']])
    show_draw_chart(b)

out_table = widgets.Output()
out_chart = widgets.Output()
out_counter = widgets.Output()

label_name = widgets.Text("label name")

btn_examples = widgets.Button(
    description='Show Examples',
    icon='eye'
)

btn_label = widgets.Button(
    description='Add label',
    icon='check'
)

btn_retrain = widgets.Button(
    description='Retrain',
    icon='coffee'
)

btn_redraw = widgets.Button(
    description='Redraw',
    icon='check'
)

btn_examples.on_click(show_examples)
btn_label.on_click(assign_label)
btn_redraw.on_click(show_draw_chart)
btn_retrain.on_click(retrain_state)

show_draw_chart()
display(widgets.VBox([widgets.HBox([btn_retrain, btn_examples, btn_redraw]), 
                      widgets.HBox([out_chart, out_table])]), 
        label_name, 
        widgets.HBox([btn_label, out_counter]))

VBox(children=(HBox(children=(Button(description='Retrain', icon='coffee', style=ButtonStyle()), Button(descri…

Text(value='label name')

HBox(children=(Button(description='Add label', icon='check', style=ButtonStyle()), Output()))

In [10]:
# This is the dataframe with the labels attached
# you can inspect it here or save it to disk.
state['df']

Unnamed: 0,text,d1,d2,label
0,extremly sad,9.411379,6.181701,
1,have a nice day,6.601752,9.824533,
2,I am feeling very good,9.304821,7.789327,
3,goodevening,11.361326,9.273573,
4,"version: ""3.1""",11.954450,8.162363,
...,...,...,...,...
73,yes,12.421310,8.724361,
74,y,12.066045,9.497900,
75,feeling like a king,9.369506,8.003271,
76,- intent: mood_unhappy,4.960393,8.796636,
