From 2a309e521423c3c71e68166264d3aab41896588b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3=B6tter?=
Date: Mon, 11 Jul 2022 19:14:47 +0200
Subject: [PATCH 1/6] adds basic rasa adapter

---
 .gitignore               |  1 +
 kern/adapter/__init__.py |  0
 kern/adapter/rasa.py     | 45 ++++++++++++++++++++++++++++++++++++++++
 requirements.txt         |  6 +++++-
 4 files changed, 51 insertions(+), 1 deletion(-)
 create mode 100644 kern/adapter/__init__.py
 create mode 100644 kern/adapter/rasa.py

diff --git a/.gitignore b/.gitignore
index 86f5eb5..1990771 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+data/
 .vscode/
 secrets.json

diff --git a/kern/adapter/__init__.py b/kern/adapter/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/kern/adapter/rasa.py b/kern/adapter/rasa.py
new file mode 100644
index 0000000..a319060
--- /dev/null
+++ b/kern/adapter/rasa.py
@@ -0,0 +1,45 @@
+import yaml
+import os
+from collections import OrderedDict
+
+
+class literal(str):
+    pass
+
+
+def literal_presenter(dumper, data):
+    return dumper.represent_scalar("tag:yaml.org,2002:str", data, style="|")
+
+
+yaml.add_representer(literal, literal_presenter)
+
+
+def ordered_dict_presenter(dumper, data):
+    return dumper.represent_dict(data.items())
+
+
+yaml.add_representer(OrderedDict, ordered_dict_presenter)
+
+
+def build_literal_from_iterable(iterable):
+    return "\n".join([f"- {value}" for value in iterable]) + "\n"
+
+
+def build_intent_yaml(
+    client, input_name, label_name, dir_name="data", file_name="nlu.yml"
+):
+    df = client.get_record_export(tokenize=False)
+
+    nlu_list = []
+    for label, df_label in df.groupby(label_name):
+        literal_string = build_literal_from_iterable(df_label[input_name].tolist())
+        nlu_list.append(OrderedDict(intent=label, examples=literal(literal_string)))
+    nlu_dict = OrderedDict(nlu=nlu_list)
+
+    if dir_name is not None and not os.path.isdir(dir_name):
+        os.mkdir(dir_name)
+
+    file_path = os.path.join(dir_name, file_name)
+
+    with open(file_path, "w") as f:
+        yaml.dump(nlu_dict, f, allow_unicode=True)

diff --git a/requirements.txt b/requirements.txt
index a0c2849..78af298 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,8 @@ backcall==0.2.0
 beautifulsoup4==4.11.1
 black==22.3.0
 bleach==5.0.0
+boto3==1.23.1
+botocore==1.26.1
 certifi==2021.10.8
 cffi==1.15.0
 charset-normalizer==2.0.12
@@ -24,6 +26,7 @@ ipython-genutils==0.2.0
 ipywidgets==7.7.0
 jedi==0.18.1
 Jinja2==3.1.2
+jmespath==1.0.0
 jsonschema==4.5.1
 jupyter==1.0.0
 jupyter-client==7.3.1
@@ -31,7 +34,6 @@ jupyter-console==6.4.3
 jupyter-core==4.10.0
 jupyterlab-pygments==0.2.2
 jupyterlab-widgets==1.1.0
-kern-python-client @ file:///Users/jhoetter/repos/kern-python
 MarkupSafe==2.1.1
 matplotlib-inline==0.1.3
 minio==7.1.8
@@ -62,10 +64,12 @@ pyparsing==3.0.9
 pyrsistent==0.18.1
 python-dateutil==2.8.2
 pytz==2022.1
+PyYAML==6.0
 pyzmq==22.3.0
 qtconsole==5.3.0
 QtPy==2.1.0
 requests==2.27.1
+s3transfer==0.5.2
 Send2Trash==1.8.0
 six==1.16.0
 soupsieve==2.3.2.post1
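As a quick illustration of this first version, the adapter can be driven as in the following sketch. It is not part of the patch; the credentials and the `headline`/`label__WEAK_SUPERVISION` names are made up for the example.

```python
from kern import Client
from kern.adapter import rasa

# hypothetical credentials and attribute/task names, purely for illustration
client = Client("your-username", "your-password", "your-project-id")

# groups the record export by the label column and writes data/nlu.yml,
# with one intent block per label and its texts as a yaml literal block
rasa.build_intent_yaml(client, "headline", "label__WEAK_SUPERVISION")
```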
file_name="nlu.yml" + client, + text_name, + intent_label_task, + metadata_label_task=None, + dir_name="data", + file_name="nlu.yml", ): df = client.get_record_export(tokenize=False) nlu_list = [] - for label, df_label in df.groupby(label_name): - literal_string = build_literal_from_iterable(df_label[input_name].tolist()) - nlu_list.append(OrderedDict(intent=label, examples=literal(literal_string))) + for label, df_sub_label in df.groupby(intent_label_task): + + if metadata_label_task is not None: + metadata_label_name = metadata_label_task.split("__")[1] + for metadata_label, df_sub_label_sub_metadata_label in df_sub_label.groupby( + metadata_label_task + ): + literal_string = build_literal_from_iterable( + df_sub_label_sub_metadata_label[text_name].tolist() + ) + nlu_list.append( + OrderedDict( + intent=label, + metadata=OrderedDict(**{metadata_label_name: metadata_label}), + examples=literal(literal_string), + ) + ) + else: + literal_string = build_literal_from_iterable( + df_sub_label[text_name].tolist() + ) + nlu_list.append(OrderedDict(intent=label, examples=literal(literal_string))) nlu_dict = OrderedDict(nlu=nlu_list) if dir_name is not None and not os.path.isdir(dir_name): From e9b03313f0f4199ed87a48bff59e34f97c2afb06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Tue, 12 Jul 2022 11:12:46 +0200 Subject: [PATCH 3/6] adds label injection --- kern/adapter/rasa.py | 58 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/kern/adapter/rasa.py b/kern/adapter/rasa.py index 5fc18c1..b2c02d1 100644 --- a/kern/adapter/rasa.py +++ b/kern/adapter/rasa.py @@ -2,6 +2,10 @@ import os from collections import OrderedDict +CONSTANT_OUTSIDE = "OUTSIDE" +CONSTANT_LABEL_BEGIN = "B-" +CONSTANT_LABEL_INTERMEDIATE = "I-" + class literal(str): pass @@ -25,15 +29,67 @@ def build_literal_from_iterable(iterable): return "\n".join([f"- {value}" for value in iterable]) + "\n" +def inject_label_in_text(row, text_name, tokenized_label_task, constant_outside): + string = "" + token_list = row[f"{text_name}__tokenized"] + + close_multitoken_label = False + multitoken_label = False + for idx, token in enumerate(token_list): + + if idx < len(token_list) - 1: + token_next = token_list[idx + 1] + label_next = row[tokenized_label_task][idx + 1] + if label_next.startswith(CONSTANT_LABEL_INTERMEDIATE): + multitoken_label = True + else: + if multitoken_label: + close_multitoken_label = True + multitoken_label = False + num_whitespaces = token_next.idx - (token.idx + len(token)) + else: + num_whitespaces = 0 + whitespaces = " " * num_whitespaces + + label = row[tokenized_label_task][idx] + if label != constant_outside: + if multitoken_label: + if label.startswith(CONSTANT_LABEL_BEGIN): + string += f"[{token.text}{whitespaces}" + else: + string += f"{token.text}{whitespaces}" + else: + if close_multitoken_label: + string += f"{token.text}]({label[2:]}){whitespaces}" + close_multitoken_label = False + else: + string += f"[{token.text}]({label[2:]}){whitespaces}" + else: + string += f"{token.text}{whitespaces}" + return string + + def build_intent_yaml( client, text_name, intent_label_task, metadata_label_task=None, + tokenized_label_task=None, dir_name="data", file_name="nlu.yml", + constant_outside=CONSTANT_OUTSIDE, ): - df = client.get_record_export(tokenize=False) + df = client.get_record_export(tokenize=(tokenized_label_task is not None)) + + if tokenized_label_task is not None: + text_name_injected = f"{text_name}__injected" + 
From 7468cc3517d501e2a70c4712905c4f1dbba9913c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H=C3%B6tter?=
Date: Tue, 12 Jul 2022 11:35:12 +0200
Subject: [PATCH 4/6] adds lookups

---
 kern/adapter/rasa.py | 28 ++++++++++++++++++++++++++--
 1 file changed, 26 insertions(+), 2 deletions(-)

diff --git a/kern/adapter/rasa.py b/kern/adapter/rasa.py
index b2c02d1..2083761 100644
--- a/kern/adapter/rasa.py
+++ b/kern/adapter/rasa.py
@@ -59,11 +59,12 @@ def inject_label_in_text(row, text_name, tokenized_label_task, constant_outside)
                 else:
                     string += f"{token.text}{whitespaces}"
             else:
+                label_trimmed = label[2:]  # remove B- and I-
                 if close_multitoken_label:
-                    string += f"{token.text}]({label[2:]}){whitespaces}"
+                    string += f"{token.text}]({label_trimmed}){whitespaces}"
                     close_multitoken_label = False
                 else:
-                    string += f"[{token.text}]({label[2:]}){whitespaces}"
+                    string += f"[{token.text}]({label_trimmed}){whitespaces}"
         else:
             string += f"{token.text}{whitespaces}"
     return string
@@ -114,6 +115,29 @@ def build_intent_yaml(
                 df_sub_label[text_name].tolist()
             )
             nlu_list.append(OrderedDict(intent=label, examples=literal(literal_string)))
+
+    if tokenized_label_task is not None:
+
+        def flatten(xss):
+            return [x for xs in xss for x in xs]
+
+        labels = set(flatten(df[tokenized_label_task].tolist()))
+        lookup_list_names = []
+        for label in labels:
+            if label.startswith(CONSTANT_LABEL_BEGIN):
+                label_trimmed = label[2:]  # remove B-
+                lookup_list_names.append(label_trimmed)
+
+        for lookup_list in client.get_lookup_lists():
+            if lookup_list["name"] in lookup_list_names:
+                values = [entry["value"] for entry in lookup_list["terms"]]
+                literal_string = build_literal_from_iterable(values)
+                nlu_list.append(
+                    OrderedDict(
+                        lookup=lookup_list["name"], examples=literal(literal_string)
+                    )
+                )
+
     nlu_dict = OrderedDict(nlu=nlu_list)

     if dir_name is not None and not os.path.isdir(dir_name):
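The lookup export above only relies on each lookup list exposing a `name` and its `terms`, each with a `value`. A shape like the following would satisfy the code — a sketch of the assumed return value of `client.get_lookup_lists()`, not documented API:

```python
# assumed shape of client.get_lookup_lists(); the adapter only reads
# "name" and each term's "value", and exports a list only if some token
# carries a matching "B-<name>" label
lookup_lists = [
    {
        "name": "account",
        "terms": [
            {"value": "savings"},
            {"value": "checking"},
            {"value": "credit card account"},
        ],
    }
]
```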
From 680e04f7820a29687a4b0822dec38643855412a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20H%C3%B6tter?=
Date: Tue, 12 Jul 2022 12:05:44 +0200
Subject: [PATCH 5/6] adds documentation and output log

---
 kern/adapter/rasa.py | 93 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 74 insertions(+), 19 deletions(-)

diff --git a/kern/adapter/rasa.py b/kern/adapter/rasa.py
index 2083761..cb68e83 100644
--- a/kern/adapter/rasa.py
+++ b/kern/adapter/rasa.py
@@ -1,12 +1,16 @@
+import os
+from collections import OrderedDict
+from typing import Any, List, Optional
+
+import pandas as pd
 import yaml
-import os
-from collections import OrderedDict
+from wasabi import msg
 
-CONSTANT_OUTSIDE = "OUTSIDE"
-CONSTANT_LABEL_BEGIN = "B-"
-CONSTANT_LABEL_INTERMEDIATE = "I-"
+from kern import Client
 
 
+# https://stackoverflow.com/questions/8640959/how-can-i-control-what-scalar-form-pyyaml-uses-for-my-data
 class literal(str):
     pass
@@ -24,10 +28,20 @@ def ordered_dict_presenter(dumper, data):
 yaml.add_representer(OrderedDict, ordered_dict_presenter)
 
+CONSTANT_OUTSIDE = "OUTSIDE"
+CONSTANT_LABEL_BEGIN = "B-"
+CONSTANT_LABEL_INTERMEDIATE = "I-"
 
-def build_literal_from_iterable(iterable):
+
+def build_literal_from_iterable(iterable: List[Any]) -> str:
+    """Builds a Rasa-conformant yaml string from an iterable.
+
+    Args:
+        iterable (List[Any]): list with values to be converted to a literal block
+
+    Returns:
+        str: literal block
+    """
     return "\n".join([f"- {value}" for value in iterable]) + "\n"
 
 
-def inject_label_in_text(row, text_name, tokenized_label_task, constant_outside):
+def inject_label_in_text(
+    row: pd.Series, text_name: str, tokenized_label_task: str, constant_outside: str
+) -> str:
+    """Inserts token labels into the text.
+    E.g. "Hello, my name is Johannes Hötter" -> "Hello, my name is [Johannes Hötter](person)"
+
+    Args:
+        row (pd.Series): row of the record export dataframe
+        text_name (str): name of the text/chat field
+        tokenized_label_task (str): name of the label task containing token-level labels
+        constant_outside (str): constant to be used for outside labels
+
+    Returns:
+        str: injected text
+    """
     string = ""
     token_list = row[f"{text_name}__tokenized"]
 
@@ -99,14 +123,39 @@ def inject_label_in_text(row, text_name, tokenized_label_task, constant_outside)
 def build_intent_yaml(
-    client,
-    text_name,
-    intent_label_task,
-    metadata_label_task=None,
-    tokenized_label_task=None,
-    dir_name="data",
-    file_name="nlu.yml",
-    constant_outside=CONSTANT_OUTSIDE,
-):
+    client: Client,
+    text_name: str,
+    intent_label_task: str,
+    metadata_label_task: Optional[str] = None,
+    tokenized_label_task: Optional[str] = None,
+    dir_name: str = "data",
+    file_name: str = "nlu.yml",
+    constant_outside: str = CONSTANT_OUTSIDE,
+    version: str = "3.1",
+) -> None:
+    """Builds a Rasa NLU yaml file from your project data via the client object.
+
+    Args:
+        client (Client): connected Client object for your project
+        text_name (str): name of the text/chat field
+        intent_label_task (str): name of the classification label task with the intents
+        metadata_label_task (Optional[str], optional): if you have a metadata task (e.g. sentiment), you can list it here. Currently, only one can be provided. Defaults to None.
+        tokenized_label_task (Optional[str], optional): if you have a token-level task (e.g. for entities), you can list it here. Currently, only one can be provided. Defaults to None.
+        dir_name (str, optional): name of your Rasa data directory. Defaults to "data".
+        file_name (str, optional): name of the file you want to store the data to. Defaults to "nlu.yml".
+        constant_outside (str, optional): constant to be used for outside labels in token-level tasks. Defaults to CONSTANT_OUTSIDE.
+        version (str, optional): Rasa version. Defaults to "3.1".
+    """
+    msg.info("Building training data for Rasa")
+    msg.warn("If you haven't done so yet, please install rasa and run `rasa init`")
     df = client.get_record_export(tokenize=(tokenized_label_task is not None))
 
     if tokenized_label_task is not None:
@@ -182,7 +231,7 @@ def build_intent_yaml(
                 )
 
-    nlu_dict = OrderedDict(nlu=nlu_list)
+    nlu_dict = OrderedDict(version=version, nlu=nlu_list)
 
     if dir_name is not None and not os.path.isdir(dir_name):
         os.mkdir(dir_name)
@@ -191,3 +240,14 @@ def build_intent_yaml(
 
     with open(file_path, "w") as f:
         yaml.dump(nlu_dict, f, allow_unicode=True)
+    msg.good(f"Saved training data to {file_path}! 🚀")
+    msg.warn(
+        f"Please make sure to add the project-specific files domain.yml, {os.path.join(dir_name, 'rules.yml')} and {os.path.join(dir_name, 'stories.yml')}."
+    )
+    msg.info("More information about these files can be found here:")
+    msg.info(" - Domain: https://rasa.com/docs/rasa/domain")
+    msg.info(" - Rules: https://rasa.com/docs/rasa/rules")
+    msg.info(" - Stories: https://rasa.com/docs/rasa/stories")
+    msg.good(
+        "You're all set, and can now start building your conversational AI via `rasa train`! 🎉"
+    )
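With the new `version` key, the generated `data/nlu.yml` now starts with Rasa's format version, roughly like this (contents illustrative):

```yml
version: "3.1"
nlu:
- intent: check_balance
  examples: |
    - how much do I have on my savings account
```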
🎉" + ) From 98bd2c3bdf4b6c4149b5ea9194beb0639feca73d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Tue, 12 Jul 2022 14:04:16 +0200 Subject: [PATCH 6/6] integrate rasa adapter description --- README.md | 147 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 132 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 5ab2367..561f8c3 100644 --- a/README.md +++ b/README.md @@ -4,27 +4,31 @@ # Kern AI API for Python -This is the official Python SDK for Kern AI, your IDE for programmatic data enrichment and management. +This is the official Python SDK for [*refinery*](https://github.com/code-kern-ai/refinery), your **open-source** data-centric IDE for NLP. ## Installation -You can set up this library via either running `$ pip install kern-sdk`, or via cloning this repository and running `$ pip install -r requirements.txt` in this repository. +You can set up this SDK either via running `$ pip install kern-sdk`, or by cloning this repository and running `$ pip install -r requirements.txt`. ## Usage -Once you installed the package, you can access the application from any Python terminal as follows: + +### Creating a `Client` object +Once you installed the package, you can create a `Client` object from any Python terminal as follows: ```python from kern import Client -username = "your-username" +user_name = "your-username" password = "your-password" project_id = "your-project-id" # can be found in the URL of the web application -client = Client(username, password, project_id) +client = Client(user_name, password, project_id) # if you run the application locally, please use the following instead: # client = Client(username, password, project_id, uri="http://localhost:4455") ``` +The `project_id` can be found in your browser, e.g. if you run the app on your localhost: `http://localhost:4455/app/projects/{project_id}/overview` + Alternatively, you can provide a `secrets.json` file in your directory where you want to run the SDK, looking as follows: ```json { @@ -33,26 +37,140 @@ Alternatively, you can provide a `secrets.json` file in your directory where you "project_id": "your-project-id" } ``` -Again, if you run on your local machine, you should also provide `"uri": "http://localhost:4455"`. Afterwards, you can access the client like this: + +Again, if you run on your localhost, you should also provide `"uri": "http://localhost:4455"`. Afterwards, you can access the client like this: + ```python client = Client.from_secrets_file("secrets.json") ``` +With the `Client`, you easily integrate your data into any kind of system; may it be a custom implementation, an AutoML system or a plain data analytics framework 🚀 + +### Fetching labeled data + Now, you can easily fetch the data from your project: ```python -df = client.get_record_export() +df = client.get_record_export(tokenize=False) +# if you set tokenize=True (default), the project-specific +# spaCy tokenizer will process your textual data ``` Alternatively, you can also just run `kern pull` in your CLI given that you have provided the `secrets.json` file in the same directory. -The `df` contains data of the following scheme: -- all your record attributes are stored as columns, e.g. 
### Fetch lookup lists
- [ ] Todo
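Until this section is written up, the Rasa adapter below already hints at the interface; based on its usage, fetching lookup lists could look roughly like this (a sketch, not final documentation):

```python
for lookup_list in client.get_lookup_lists():
    name = lookup_list["name"]  # e.g. "account"
    values = [entry["value"] for entry in lookup_list["terms"]]
    print(name, values)
```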
### Upload files
- [ ] Todo

### Adapters

#### Rasa
*refinery* is a perfect fit for building chatbots with [Rasa](https://github.com/RasaHQ/rasa). We've built an adapter with which you can easily create the required Rasa training data directly from *refinery*.

To do so, run the following:

```python
from kern.adapter import rasa

rasa.build_intent_yaml(
    client,
    "text",
    "__intent__WEAK_SUPERVISION"
)
```

This will create a `.yml` file looking as follows:

```yml
nlu:
- intent: check_balance
  examples: |
    - how much do I have on my savings account
    - how much money is in my checking account
    - What's the balance on my credit card account
```

If you want to provide a metadata-level label (such as sentiment), you can provide the optional argument `metadata_label_task`:

```python
from kern.adapter import rasa

rasa.build_intent_yaml(
    client,
    "text",
    "__intent__WEAK_SUPERVISION",
    metadata_label_task="__sentiment__WEAK_SUPERVISION"
)
```

This will create a file like this:
```yml
nlu:
- intent: check_balance
  metadata:
    sentiment: neutral
  examples: |
    - how much do I have on my savings account
    - how much money is in my checking account
    - What's the balance on my credit card account
```

And if you have entities in your texts which you'd like to recognize, simply add the `tokenized_label_task` argument:

```python
from kern.adapter import rasa

rasa.build_intent_yaml(
    client,
    "text",
    "__intent__WEAK_SUPERVISION",
    metadata_label_task="__sentiment__WEAK_SUPERVISION",
    tokenized_label_task="text__entities__WEAK_SUPERVISION"
)
```

This will not only inject the label names on token-level, but also create lookup lists for your chatbot:

```yml
nlu:
- intent: check_balance
  metadata:
    sentiment: neutral
  examples: |
    - how much do I have on my [savings](account) account
    - how much money is in my [checking](account) account
    - What's the balance on my [credit card account](account)
- lookup: account
  examples: |
    - savings
    - checking
    - credit card account
```

Please make sure to also create the other required files (`domain.yml`, `data/stories.yml` and `data/rules.yml`) if you want to train your Rasa chatbot. For further reference, see their [documentation](https://rasa.com/docs/rasa).

### What's missing?
Let us know which open-source or closed-source NLP framework you are using and would like to see supported by an adapter in the SDK. To do so, simply create an issue in this repository with the tag "enhancement".

## Roadmap
- [ ] Register heuristics via wrappers

If you want to have something added, feel free to open an [issue](https://github.

## Contributing
Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**.

If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement".
Don't forget to give the project a star! Thanks again!

1. Fork the Project
2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`)