From 302d64a030748d5efe647d94ce5cace1636695ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 21 Jul 2021 01:26:25 +0200 Subject: [PATCH 01/35] update README --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e909af6..cce8070 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,14 @@ [![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) -# onetask API for Python - WiP +# onetask API for Python Official Python SDK for onetask. ## [](https://github.com/onetask-ai/onetask-python#installation)Installation +You can use pip to install the library: + +`$ pip install onetask` You can clone the repository and run the setup.py script: @@ -13,14 +16,16 @@ You can clone the repository and run the setup.py script: ## [](https://github.com/onetask-ai/onetask-python#usage)Usage -Before making requests to the API, you need to create an instance of the onetask client. At the moment, you will have to use the org id and the project id: +Before making requests to the API, you need to create an instance of the onetask client. 
+To do so, you will have to login like you do in the system while providing the project id you work in: ```python from onetask import Client # Instantiate the client using your org_id and project_id -org_id = '' +user_name = '' +password = '' project_id = '' -client = Client(org_id=org_id, project_id=project_id) +client = Client(user_name=user_name, password=password, project_id=project_id) ``` You can now register your custom Python function From cc48dc687e039bc889bd1e47cf8a5473b9b0591b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 21 Jul 2021 01:31:53 +0200 Subject: [PATCH 02/35] increment version --- setup.py | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/setup.py b/setup.py index cedb35e..8d1639d 100644 --- a/setup.py +++ b/setup.py @@ -5,33 +5,33 @@ from setuptools import setup, find_packages this_directory = os.path.abspath(os.path.dirname(__file__)) -with open(os.path.join(this_directory, 'README.md')) as file: +with open(os.path.join(this_directory, "README.md")) as file: long_description = file.read() setup( - name='onetask', - version='0.0.4', - author='onetask', - author_email='info@onetask.ai', - description='Official Python SDK for the onetask API', + name="onetask", + version="0.1.0", + author="onetask", + author_email="info@onetask.ai", + description="Official Python SDK for the onetask API", long_description=long_description, long_description_content_type="text/markdown", - url='https://github.com/onetask-ai/onetask-python', - keywords=['onetask', 'machine learning', 'supervised learning', 'python'], + url="https://github.com/onetask-ai/onetask-python", + keywords=["onetask", "machine learning", "supervised learning", "python"], classifiers=[ - 'Development Status :: 3 - Alpha', - 'Programming Language :: Python :: 3', - 'License :: OSI Approved :: MIT License', + "Development Status :: 3 - Alpha", + "Programming Language :: Python :: 3", + "License :: OSI 
Approved :: MIT License", ], - package_dir={'': '.'}, - packages=find_packages('.'), + package_dir={"": "."}, + packages=find_packages("."), install_requires=[ - 'better-abc==0.0.3', - 'certifi==2021.5.30', - 'chardet==4.0.0', - 'idna==2.10', - 'requests==2.25.1', - 'urllib3==1.26.5', - 'wasabi==0.8.2' + "better-abc==0.0.3", + "certifi==2021.5.30", + "chardet==4.0.0", + "idna==2.10", + "requests==2.25.1", + "urllib3==1.26.5", + "wasabi==0.8.2", ], ) From 537bb5c02365346d0ff997235fd3272df89fcbf6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 21 Jul 2021 01:33:32 +0200 Subject: [PATCH 03/35] adds pypi publish script --- onetask/publish.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 onetask/publish.sh diff --git a/onetask/publish.sh b/onetask/publish.sh new file mode 100644 index 0000000..5c00099 --- /dev/null +++ b/onetask/publish.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +rm -rf dist/* +python3 setup.py bdist_wheel --universal +twine upload dist/* \ No newline at end of file From a6fedf7fda9150db5bf30608d2a9fd214233b937 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 21 Jul 2021 01:34:06 +0200 Subject: [PATCH 04/35] moves publish --- onetask/publish.sh => publish.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename onetask/publish.sh => publish.sh (100%) diff --git a/onetask/publish.sh b/publish.sh similarity index 100% rename from onetask/publish.sh rename to publish.sh From 295b4e71e48cfd0602dcef7b29ea812d195ffcc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 28 Jul 2021 18:52:08 +0200 Subject: [PATCH 05/35] switch between stages --- onetask/__init__.py | 5 +++-- onetask/settings.py | 18 +++++++++++++----- setup.py | 2 +- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 1a0e261..7ba6650 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -10,9 +10,10 @@ class Client: def __init__( - 
self, user_name: str, password: str, project_id: str, debug: bool = False + self, user_name: str, password: str, project_id: str, stage: str = "prod" ): - if not debug: + settings.set_stage(stage) + if stage in ["prod", "test", "dev"]: self.session_token = api_calls.create_session_token( user_name=user_name, password=password ) diff --git a/onetask/settings.py b/onetask/settings.py index ebaccd0..19b8fb1 100644 --- a/onetask/settings.py +++ b/onetask/settings.py @@ -1,14 +1,22 @@ # -*- coding: utf-8 -*- -LOCALHOST_SWITCH = False +STAGE: str -def set_to_localhost(): - global LOCALHOST_SWITCH - LOCALHOST_SWITCH = True +def set_stage(stage): + global STAGE + STAGE = stage def get_base_url(): - return "http://localhost:8000" if LOCALHOST_SWITCH else "https://app.dev.onetask.ai" + global STAGE + if STAGE == "prod": + return "https://app.onetask.ai" + elif STAGE == "test": + return "https://app.test.onetask.ai" + elif STAGE == "dev": + return "https://app.dev.onetask.ai" + else: + return "http://localhost:8000" def get_authentication_url(): diff --git a/setup.py b/setup.py index 8d1639d..e9b83ea 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.0", + version="0.1.1", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 86f1eac138019634d65de660cde0c48766c30994 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 28 Jul 2021 19:02:42 +0200 Subject: [PATCH 06/35] solves localhost bug --- onetask/__init__.py | 1 - setup.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 7ba6650..f7a311e 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -23,7 +23,6 @@ def __init__( msg.fail("Could not log in. 
Please check your username and password.") else: self.session_token = None - settings.set_to_localhost() msg.info("Sending requests to localhost") self.project_id = project_id diff --git a/setup.py b/setup.py index e9b83ea..d97a92b 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.1", + version="0.1.2", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 2497c03091635303642536d7704e7298b2bfa1b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Tue, 10 Aug 2021 19:20:36 +0200 Subject: [PATCH 07/35] postgres change --- onetask/__init__.py | 16 ++++++---------- onetask/api_calls.py | 3 +-- onetask/settings.py | 2 +- setup.py | 2 +- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index f7a311e..79e67e9 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -13,17 +13,13 @@ def __init__( self, user_name: str, password: str, project_id: str, stage: str = "prod" ): settings.set_stage(stage) - if stage in ["prod", "test", "dev"]: - self.session_token = api_calls.create_session_token( - user_name=user_name, password=password - ) - if self.session_token is not None: - msg.good("Logged in to system.") - else: - msg.fail("Could not log in. Please check your username and password.") + self.session_token = api_calls.create_session_token( + user_name=user_name, password=password + ) + if self.session_token is not None: + msg.good("Logged in to system.") else: - self.session_token = None - msg.info("Sending requests to localhost") + msg.fail("Could not log in. 
Please check your username and password.") self.project_id = project_id def register_custom_lf(self, lf: Callable) -> None: diff --git a/onetask/api_calls.py b/onetask/api_calls.py index 67249b1..d0a0e36 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -47,9 +47,8 @@ def __init__( headers = { "Content-Type": "application/json", "User-Agent": f"python-sdk-{version}", + "Authorization": f"Bearer {session_token}", } - if session_token: - headers["Authorization"] = f"Bearer {session_token}" if data is None: self.response = requests.request(self.method, url, headers=headers) diff --git a/onetask/settings.py b/onetask/settings.py index 19b8fb1..e30892d 100644 --- a/onetask/settings.py +++ b/onetask/settings.py @@ -16,7 +16,7 @@ def get_base_url(): elif STAGE == "dev": return "https://app.dev.onetask.ai" else: - return "http://localhost:8000" + return STAGE def get_authentication_url(): diff --git a/setup.py b/setup.py index d97a92b..407eb08 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.2", + version="0.1.6", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 9eccb7463d0cd5aaa045ea1c3f407bc8b90c77d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Sun, 3 Oct 2021 22:44:25 +0200 Subject: [PATCH 08/35] change to graphql and add autolf v1 --- onetask/__init__.py | 64 +++++++------- onetask/api_calls.py | 156 +++++++++++++++++++++++------------ onetask/auto_lf.py | 117 ++++++++++++++++++++++++++ onetask/exceptions.py | 9 +- onetask/labeling_function.py | 56 ------------- onetask/settings.py | 10 ++- onetask/util.py | 69 ++++++++++++++++ setup.py | 3 +- 8 files changed, 333 insertions(+), 151 deletions(-) create mode 100644 onetask/auto_lf.py delete mode 100644 onetask/labeling_function.py create mode 100644 onetask/util.py diff --git a/onetask/__init__.py b/onetask/__init__.py index 79e67e9..aeed62a 100644 --- a/onetask/__init__.py 
+++ b/onetask/__init__.py @@ -1,11 +1,8 @@ # -*- coding: utf-8 -*- -from typing import Callable, List - -from onetask import api_calls, settings +from typing import Callable from wasabi import msg - -from onetask.labeling_function import build_keywords_lf, unpack_python_function +from onetask import api_calls, settings, util, auto_lf class Client: @@ -16,38 +13,43 @@ def __init__( self.session_token = api_calls.create_session_token( user_name=user_name, password=password ) + self.project_id = project_id if self.session_token is not None: msg.good("Logged in to system.") + if not api_calls.ProjectByProjectId( + self.project_id, self.session_token + ).exists: + msg.fail(f"Project with ID {self.project_id} does not exist.") else: msg.fail("Could not log in. Please check your username and password.") - self.project_id = project_id - def register_custom_lf(self, lf: Callable) -> None: - fn_name, source_code, description = unpack_python_function(lf) - _ = api_calls.RegisterLabelingFunctionCall( - fn_name=fn_name, - source_code=source_code, - description=description, - project_id=self.project_id, - session_token=self.session_token, - ) - msg.good(f"Registered labeling function '{fn_name}'.") + def manually_labeled_records(self, as_df: bool = True): + fetched_records = api_calls.ManuallyLabeledRecords( + self.project_id, self.session_token + ).data + records = util.unpack_records(fetched_records) + if as_df and len(records) > 0: + return util.records_to_df(records) + else: + return records - def register_keywords_lf( - self, - label: str, - keywords: List[str], - attributes: List[str], - lowercase: bool = True, + def autogenerate_regex_labeling_functions( + self, nlp, attribute, num_functions: int = 10 ): - fn_name, source_code, description = build_keywords_lf( - label, keywords, attributes, lowercase + records = self.manually_labeled_records(as_df=True) + if len(records) > 0: + candidates = auto_lf.derive_regex_candidates( + nlp, records, attribute, 
most_common=num_functions + ) + auto_lf.create_regex_fns(records, candidates, attribute) + else: + msg.fail("No manually labeled records available!") + + def register_lf(self, lf: Callable) -> None: + project_id, name, source_code, docs = util.unpack_python_function( + lf, self.project_id ) - _ = api_calls.RegisterLabelingFunctionCall( - fn_name=fn_name, - source_code=source_code, - description=description, - project_id=self.project_id, - session_token=self.session_token, + api_calls.CreateLabelingFunction( + project_id, name, source_code, docs, self.session_token ) - msg.good(f"Registered labeling function '{fn_name}'.") + msg.good(f"Registered labeling function '{name}'.") diff --git a/onetask/api_calls.py b/onetask/api_calls.py index d0a0e36..efe3656 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -1,18 +1,13 @@ # -*- coding: utf-8 -*- import pkg_resources - from onetask import exceptions, settings +import requests try: version = pkg_resources.get_distribution("onetask").version except pkg_resources.DistributionNotFound: version = "noversion" -import requests -from typing import Dict, Any, Optional, Union - -from better_abc import ABCMeta, abstract_attribute - # no call to the onetask system, therefore include it here def create_session_token(user_name: str, password: str): @@ -40,32 +35,29 @@ def create_session_token(user_name: str, password: str): return session_token -class OneTaskCall(metaclass=ABCMeta): - def __init__( - self, url: str, session_token: str, data: Optional[Dict[str, Any]] = None - ): +class GraphQLRequest: + def __init__(self, query, variables, session_token): + self.query = query + self.variables = variables + self.session_token = session_token + + def execute(self): + body = { + "query": self.query, + "variables": self.variables, + } + headers = { "Content-Type": "application/json", "User-Agent": f"python-sdk-{version}", - "Authorization": f"Bearer {session_token}", + "Authorization": f"Bearer {self.session_token}", } - if 
data is None: - self.response = requests.request(self.method, url, headers=headers) - else: - self.response = requests.request( - self.method, url, json=data, headers=headers - ) - - @abstract_attribute - def method(self): - pass + response = requests.post(url=settings.graphql(), json=body, headers=headers) - @property - def content(self) -> Union[Dict[str, Any], exceptions.APIError]: - status_code = self.response.status_code + status_code = response.status_code - json_data = self.response.json() + json_data = response.json() if status_code == 200: return json_data @@ -80,35 +72,95 @@ def content(self) -> Union[Dict[str, Any], exceptions.APIError]: raise exception -class PostCall(OneTaskCall): - def __init__( - self, - url: str, - session_token: str, - data: Dict[str, Any], - ): - self.method = "POST" +class ProjectByProjectId(GraphQLRequest): + def __init__(self, project_id, session_token): + QUERY = """ + query ($projectId: ID!) { + projectByProjectId(projectId: $projectId) { + id + labels { + edges { + node { + name + } + } + } + } + } + """ + + variables = { + "projectId": project_id, + } - super().__init__(url=url, session_token=session_token, data=data) + super().__init__(QUERY, variables, session_token) + try: + self.data = self.execute() + self.exists = self.data.get("data").get("projectByProjectId") is not None + except exceptions.APIError: + self.exists = False + + +class ManuallyLabeledRecords(GraphQLRequest): + def __init__(self, project_id, session_token): + data = ProjectByProjectId(project_id, session_token).data + edges = data["data"]["projectByProjectId"]["labels"]["edges"] + manual = [edge["node"]["name"] for edge in edges] + + QUERY = """ + query ($projectId: ID!, $manual: [String!]) { + searchRecords(projectId: $projectId, manual: $manual) { + data + labelAssociations { + edges { + node { + label { + name + } + source + } + } + } + } + } + """ + + variables = {"projectId": project_id, "manual": manual} + + super().__init__(QUERY, variables, 
session_token) + self.data = self.execute() + + +class CreateLabelingFunction(GraphQLRequest): + def __init__(self, project_id, name, function, description, session_token): + QUERY = """ + mutation ( + $projectId: ID!, + $name: String!, + $function: String!, + $description: String! + ) { + createLabelingFunction( + projectId: $projectId, + name: $name, + function: + $function, + description: $description + ) { + labelingFunction { + id + } + } + } + """ -class RegisterLabelingFunctionCall(PostCall): - def __init__( - self, - fn_name: str, - source_code: str, - description: str, - project_id: str, - session_token: str, - ): - body = { - "project_id": project_id, - "name": fn_name, - "function": source_code, + variables = { + "projectId": project_id, + "name": name, + "function": function, "description": description, } - super().__init__( - url=settings.get_labeling_function_url(), - session_token=session_token, - data=body, - ) + + super().__init__(QUERY, variables, session_token) + _ = self.execute() diff --git a/onetask/auto_lf.py b/onetask/auto_lf.py new file mode 100644 index 0000000..c06fd31 --- /dev/null +++ b/onetask/auto_lf.py @@ -0,0 +1,117 @@ +from collections import defaultdict +from tqdm import tqdm +from collections import Counter +import re +from collections import defaultdict +import numpy as np +from wasabi import msg + + +def derive_regex_candidates(nlp, df, attribute, most_common=10): + if len(df) < 100: + msg.warn( + "Only very few records to analyze; it's best to continue labeling further records before analysis" + ) + + def normalize_token(token): + if "d" in token.shape_ and not "x" in token.shape_: + return token.shape_.replace("d", "[0-9]") + else: + return token.text + + def is_relevant_token(token): + return not (token.is_punct or token.is_stop or token.is_bracket) + + candidates = [] + for text in tqdm(df[attribute], total=len(df)): + doc = nlp(text.lower()) + for token in doc: + if is_relevant_token(token): + has_children = False + for 
token_left in token.lefts: + if is_relevant_token(token_left): + prefix = "^" if token_left.idx == 0 else " " + suffix = "$" if token.idx == len(doc) - 1 else " " + candidate = f"{prefix}{normalize_token(token_left)}.*?{normalize_token(token)}{suffix}" + candidates.append(candidate) + has_children = True + for token_right in token.rights: + if is_relevant_token(token_right): + prefix = "^" if token.idx == 0 else " " + suffix = "$" if token_right.idx == len(doc) - 1 else " " + candidate = f"{prefix}{normalize_token(token)}.*?{normalize_token(token_right)}{suffix}" + candidates.append(candidate) + has_children = True + if not has_children: + prefix = "^" if token.idx == 0 else " " + suffix = "$" if token.idx == len(doc) - 1 else " " + candidate = f"{prefix}{normalize_token(token)}{suffix}" + candidates.append(candidate) + return [regex for regex, _ in Counter(candidates).most_common(most_common)] + + +def create_regex_fns(df, candidates, regex_col, label_col="label"): + def regex_explainer(regex, attribute): + description = "" + terms = regex.replace("^", "").replace("$", "").split(".*?") + if "^" in regex: + description += f"attribute '{attribute}' starts with term '{terms[0]}'" + if len(terms) > 1: + for term in terms[1:]: + description += f" (in-)directly followed by term '{term}'" + if "$" in regex: + description += " and then ends" + elif "$" in regex: + description += ( + f"attribute '{attribute}' somewhere contains term '{terms[0]}'" + ) + if len(terms) > 1: + for term in terms[1:]: + description += f" (in-)directly followed by term '{term}'" + description += " and then ends" + else: + description += ( + f"attribute '{attribute}' somewhere contains term '{terms[0]}'" + ) + if len(terms) > 1: + for term in terms[1:]: + description += f" followed by term '{term}'" + if "[0-9]" in regex: + description += ", where [0-9] is an arbitrary number" + description += "." 
+ return description + + def build_regex_lf(regex, attribute, prediction, iteration): + source_code = f""" +def regex_{iteration}(record): + '''{regex_explainer(regex, attribute)}''' + import re + if re.search(r'{regex}', record['{attribute}'].lower()): + return '{prediction}' + +client.register_lf(regex_{iteration}) + """ + + return source_code.strip() + + regex_nr = 1 + for regex in candidates: + labels = defaultdict(int) + for text, label in zip(df[regex_col], df[label_col]): + if re.search(regex, text.lower()): + labels[label] += 1 + coverage = sum(labels.values()) + if coverage > 0: + regex_prediction, max_count = None, 0 + for prediction, count in labels.items(): + if count > max_count: + max_count = count + regex_prediction = prediction + precision = np.round(labels[regex_prediction] / coverage, 2) + coverage = np.round(coverage / len(df), 2) + if precision > 0.75 and coverage >= 0.01: + lf = build_regex_lf(regex, regex_col, regex_prediction, regex_nr) + regex_nr += 1 + print(f"# Cov:\t{coverage}\tPrec:{precision}") + print(lf) + print() diff --git a/onetask/exceptions.py b/onetask/exceptions.py index 6a5ce34..d7c9126 100644 --- a/onetask/exceptions.py +++ b/onetask/exceptions.py @@ -26,15 +26,12 @@ class UnauthorizedError(APIError): pass -# 404 Not Found -class UnknownIDException(APIError): +# 500 Server Error +class InternalServerError(APIError): pass -RESPONSE_CODES_API_EXCEPTION_MAP = { - 401: UnauthorizedError, - 404: UnknownIDException, -} +RESPONSE_CODES_API_EXCEPTION_MAP = {401: UnauthorizedError, 500: InternalServerError} def get_api_exception_class( diff --git a/onetask/labeling_function.py b/onetask/labeling_function.py deleted file mode 100644 index 8bdc272..0000000 --- a/onetask/labeling_function.py +++ /dev/null @@ -1,56 +0,0 @@ -import inspect -from typing import Callable, List - -from onetask import exceptions - - -def unpack_python_function(fn: Callable): - name = fn.__name__ - replace_operations = { - f"def {name}(": "def lf(", - f' 
"""{fn.__doc__}"""\n': "", - " ": "\t", - } - source_code = inspect.getsource(fn) - for key, value in replace_operations.items(): - source_code = source_code.replace(key, value) - docs = inspect.getdoc(fn) - - check_signature(source_code) - - return name, source_code, docs - - -def check_signature(source_code: str) -> None: - # validate that only one parameter is given - import re - - parameters = re.search(r"\((.*?)\):", source_code).group(1).split(",") - if parameters == [""]: - number_parameters = 0 - else: - number_parameters = len(parameters) - if number_parameters != 1: - raise exceptions.ParameterError( - f"{number_parameters} parameters provided. Please use exactly one." - ) - - -def build_keywords_lf( - label, keywords: List[str], attributes: List[str], lowercase: bool -): - fn_name = f"lookup_kw_{'_'.join(keywords)}_in_{'_'.join(attributes)}" - source_code = "def lf(record):\n" - source_code += f"\tkeywords = {keywords}\n" - source_code += "\tattributes = [" - for attribute in attributes: - if lowercase: - source_code += f"record['{attribute}'].lower()," - else: - source_code += f"record['{attribute}']," - source_code += "]\n\tfor keyword in keywords:\n" - source_code += "\t\tfor attribute in attributes:\n" - source_code += "\t\t\tif keyword in attribute:\n" - source_code += f"\t\t\t\treturn '{label}'\n" - description = f"Lookup keywords {keywords} in attributes {attributes}" - return fn_name, source_code, description diff --git a/onetask/settings.py b/onetask/settings.py index e30892d..c58ef23 100644 --- a/onetask/settings.py +++ b/onetask/settings.py @@ -9,12 +9,14 @@ def set_stage(stage): def get_base_url(): global STAGE - if STAGE == "prod": - return "https://app.onetask.ai" + if STAGE == "beta": + return "https://app.beta.onetask.ai" elif STAGE == "test": return "https://app.test.onetask.ai" elif STAGE == "dev": return "https://app.dev.onetask.ai" + elif STAGE == "local": + return "http://localhost:4455" else: return STAGE @@ -23,5 +25,5 @@ def 
get_authentication_url(): return f"{get_base_url()}/.ory/kratos/public/self-service/login/api" -def get_labeling_function_url(): - return f"{get_base_url()}/labelfunction" +def graphql(): + return f"{get_base_url()}/graphql/" diff --git a/onetask/util.py b/onetask/util.py new file mode 100644 index 0000000..e7ff755 --- /dev/null +++ b/onetask/util.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +import inspect +from typing import Callable +import re +import json +from onetask import exceptions +import pandas as pd + + +def unpack_records(fetched_records): + records = [] + fetched_records = fetched_records["data"]["searchRecords"] + if len(fetched_records) > 0: + for record in fetched_records: + record_data = json.loads(record["data"]) + record_manual_labels = [] + edges = record["labelAssociations"]["edges"] + for edge in edges: + node = edge["node"] + if node["source"] == "manual": + record_manual_labels.append(node["label"]["name"]) + records.append( + { + "data": record_data, + "manual_labels": record_manual_labels[ + 0 + ], # remove [0] for multilabel support + } + ) + return records + else: + return [] + + +def records_to_df(records): + raw_df = pd.DataFrame(records) + df = raw_df["data"].apply(pd.Series) + df["label"] = raw_df["manual_labels"] + return df + + +def unpack_python_function(fn: Callable, project_id: str): + def check_signature(source_code: str) -> None: + # validate that only one parameter is given + + parameters = re.search(r"\((.*?)\):", source_code).group(1).split(",") + if parameters == [""]: + number_parameters = 0 + else: + number_parameters = len(parameters) + if number_parameters != 1: + raise exceptions.ParameterError( + f"{number_parameters} parameters provided. Please use exactly one." 
+ ) + + name = fn.__name__ + replace_operations = { + f"def {name}(": "def lf(", + f' """{fn.__doc__}"""\n': "", + " ": "\t", + } + source_code = inspect.getsource(fn) + for key, value in replace_operations.items(): + source_code = source_code.replace(key, value) + docs = inspect.getdoc(fn) or "" # default + + check_signature(source_code) + + return project_id, name, source_code, docs diff --git a/setup.py b/setup.py index 407eb08..d5291b0 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.6", + version="0.1.7w", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", @@ -26,7 +26,6 @@ package_dir={"": "."}, packages=find_packages("."), install_requires=[ - "better-abc==0.0.3", "certifi==2021.5.30", "chardet==4.0.0", "idna==2.10", From e0932df558829f0afbfea5f1a5f93ca4cfeb0419 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Sun, 3 Oct 2021 22:47:12 +0200 Subject: [PATCH 09/35] update setup --- publish.sh | 0 setup.py | 3 ++- 2 files changed, 2 insertions(+), 1 deletion(-) mode change 100644 => 100755 publish.sh diff --git a/publish.sh b/publish.sh old mode 100644 new mode 100755 diff --git a/setup.py b/setup.py index d5291b0..62aa353 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.7w", + version="0.1.7", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", @@ -27,6 +27,7 @@ packages=find_packages("."), install_requires=[ "certifi==2021.5.30", + "spacy==3.1.3", "chardet==4.0.0", "idna==2.10", "requests==2.25.1", From 5f106677f917818bfa07a54fe714d30fce401cd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <62521561+jhoetter@users.noreply.github.com> Date: Mon, 4 Oct 2021 00:02:03 +0200 Subject: [PATCH 10/35] Update README.md --- README.md | 49 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 37 insertions(+), 12 deletions(-) 
diff --git a/README.md b/README.md index cce8070..6468317 100644 --- a/README.md +++ b/README.md @@ -16,22 +16,47 @@ You can clone the repository and run the setup.py script: ## [](https://github.com/onetask-ai/onetask-python#usage)Usage -Before making requests to the API, you need to create an instance of the onetask client. -To do so, you will have to login like you do in the system while providing the project id you work in: +The SDK is currently focused solely on labeling functions. You can register your own functions or let our system generate suggestions on which you can build. In the near future, we'll extend the Python SDK to include programmatic imports and exports, data access, and many more. + +You begin by creating a `Client` object. The `Client` will generate and store a session token for you based on your user name, password, and project id. The project id can be found in the URL, e.g. https://app.dev.onetask.ai/app/projects/**03f7d82c-f14c-4f0f-a1ff-59533bab30cc**/overview. Simply copy and paste this into the following pattern: ```python from onetask import Client -# Instantiate the client using your org_id and project_id -user_name = '' -password = '' -project_id = '' -client = Client(user_name=user_name, password=password, project_id=project_id) + +username = "your-username" +project_id = "your-project-id" +password = "your-password" +stage="beta" # if you have onetask on local, you can also set stage to "local" +client = Client(username, password, project_id, stage) +``` + +Once you correctly instantiated your Client, you can start accessing our GraphQL endpoints. Please always ensure that your labeling functions: + +return label names that also exist in your project definition +have exactly one parameter; we execute labeling functions on a record-basis +If you need an import statement in your labeling functions, please check if it is given in the [whitelisted libraries](https://onetask.readme.io/reference/whitelisted-libraries). 
If you need a library that we have not yet whitelisted, feel free to reach out to us. + +The most straightforward way to create and register a labeling function is as follows: + +```python +def my_labeling_function(record): + """ + This is my first labeling function. Yay! + Its purpose is to detect a list of values in the records that tend to + occur in urgent messages. + """ + keywords = ["asap", "as soon as possible", "urgent"] + + message_lower = record["message"].lower() + for keyword in keywords: + if keyword in message_lower: + return "Urgent" ``` -You can now register your custom Python function +You can then enter them using the client as follows: + ```python -def my_first_lf(record): - if "you" in record["headline"].lower(): - return "Clickbait" -client.register_lf(my_first_lf) +client.register_lf(my_labeling_function) ``` + +And that's it. You should now be able to see your labeling function in the web application. For further steps, please refer to our [readme.io](https://onetask.readme.io/reference/setting-up-the-python-sdk) documentation From e091fbc42d73f5fd8cee99b047bd1ed71a9bb860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Tue, 5 Oct 2021 16:35:09 +0200 Subject: [PATCH 11/35] graphql to rest --- onetask/__init__.py | 14 ++--- onetask/api_calls.py | 128 ++++++++++++------------------------------- onetask/auto_lf.py | 2 +- onetask/settings.py | 12 +++- onetask/util.py | 34 ------------ setup.py | 2 +- 6 files changed, 54 insertions(+), 138 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index aeed62a..77950cc 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -2,6 +2,7 @@ from typing import Callable from wasabi import msg +import pandas as pd from onetask import api_calls, settings, util, auto_lf @@ -16,20 +17,17 @@ def __init__( self.project_id = project_id if self.session_token is not None: msg.good("Logged in to system.") - if not api_calls.ProjectByProjectId( - self.project_id, 
self.session_token - ).exists: + if not api_calls.PostProjectExists(project_id, self.session_token).exists: msg.fail(f"Project with ID {self.project_id} does not exist.") else: msg.fail("Could not log in. Please check your username and password.") def manually_labeled_records(self, as_df: bool = True): - fetched_records = api_calls.ManuallyLabeledRecords( + records = api_calls.PostManuallyLabeledRecords( self.project_id, self.session_token - ).data - records = util.unpack_records(fetched_records) + ).records if as_df and len(records) > 0: - return util.records_to_df(records) + return pd.DataFrame(records) else: return records @@ -49,7 +47,7 @@ def register_lf(self, lf: Callable) -> None: project_id, name, source_code, docs = util.unpack_python_function( lf, self.project_id ) - api_calls.CreateLabelingFunction( + api_calls.PostLabelingFunction( project_id, name, source_code, docs, self.session_token ) msg.good(f"Registered labeling function '{name}'.") diff --git a/onetask/api_calls.py b/onetask/api_calls.py index efe3656..d992f80 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- +from json.decoder import JSONDecodeError import pkg_resources from onetask import exceptions, settings import requests @@ -35,17 +36,13 @@ def create_session_token(user_name: str, password: str): return session_token -class GraphQLRequest: - def __init__(self, query, variables, session_token): - self.query = query - self.variables = variables +class PostRequest: + def __init__(self, url, body, session_token): + self.url = url + self.body = body self.session_token = session_token def execute(self): - body = { - "query": self.query, - "variables": self.variables, - } headers = { "Content-Type": "application/json", @@ -53,17 +50,22 @@ def execute(self): "Authorization": f"Bearer {self.session_token}", } - response = requests.post(url=settings.graphql(), json=body, headers=headers) + response = requests.post(url=self.url, json=self.body, 
headers=headers) status_code = response.status_code - json_data = response.json() - if status_code == 200: + json_data = response.json() return json_data else: - error_code = json_data.get("error_code") - error_message = json_data.get("error_message") + try: + json_data = response.json() + error_code = json_data.get("error_code") + error_message = json_data.get("error_message") + except JSONDecodeError: + error_code = 500 + error_message = "The server was unable to process the provided data." + exception = exceptions.get_api_exception_class( status_code=status_code, error_code=error_code, @@ -72,95 +74,37 @@ def execute(self): raise exception -class ProjectByProjectId(GraphQLRequest): - def __init__(self, project_id, session_token): - QUERY = """ - query ($projectId: ID!) { - projectByProjectId(projectId: $projectId) { - id - labels { - edges { - node { - name - } - } - } - } - } - """ +class PostLabelingFunction(PostRequest): + def __init__(self, project_id, name, function, description, session_token): - variables = { - "projectId": project_id, + body = { + "project_id": project_id, + "name": name, + "function": function, + "description": description, } - super().__init__(QUERY, variables, session_token) - try: - self.data = self.execute() - self.exists = self.data.get("data").get("projectByProjectId") is not None - except exceptions.APIError: - self.exists = False + super().__init__(settings.get_post_lf_url(), body, session_token) + _ = self.execute() -class ManuallyLabeledRecords(GraphQLRequest): +class PostProjectExists(PostRequest): def __init__(self, project_id, session_token): - data = ProjectByProjectId(project_id, session_token).data - edges = data["data"]["projectByProjectId"]["labels"]["edges"] - manual = [edge["node"]["name"] for edge in edges] - - QUERY = """ - query ($projectId: ID!, $manual: [String!]) { - searchRecords(projectId: $projectId, manual: $manual) { - data - labelAssociations { - edges { - node { - label { - name - } - source - } - } - } 
- } - } - - """ - variables = {"projectId": project_id, "manual": manual} + body = { + "project_id": project_id, + } - super().__init__(QUERY, variables, session_token) - self.data = self.execute() + super().__init__(settings.get_project_exists_url(), body, session_token) + self.exists = self.execute()["exists"] -class CreateLabelingFunction(GraphQLRequest): - def __init__(self, project_id, name, function, description, session_token): - QUERY = """ - mutation ( - $projectId: ID!, - $name: String!, - $function: String!, - $description: String! - ) { - createLabelingFunction( - projectId: $projectId, - name: $name, - function: - $function, - description: $description - ) { - labelingFunction { - id - } - } - } - """ +class PostManuallyLabeledRecords(PostRequest): + def __init__(self, project_id, session_token): - variables = { - "projectId": project_id, - "name": name, - "function": function, - "description": description, + body = { + "project_id": project_id, } - super().__init__(QUERY, variables, session_token) - _ = self.execute() + super().__init__(settings.get_manually_labeled_data_url(), body, session_token) + self.records = self.execute()["records"] diff --git a/onetask/auto_lf.py b/onetask/auto_lf.py index c06fd31..75b96e7 100644 --- a/onetask/auto_lf.py +++ b/onetask/auto_lf.py @@ -50,7 +50,7 @@ def is_relevant_token(token): return [regex for regex, _ in Counter(candidates).most_common(most_common)] -def create_regex_fns(df, candidates, regex_col, label_col="label"): +def create_regex_fns(df, candidates, regex_col, label_col="manual_label"): def regex_explainer(regex, attribute): description = "" terms = regex.replace("^", "").replace("$", "").split(".*?") diff --git a/onetask/settings.py b/onetask/settings.py index c58ef23..fceecfd 100644 --- a/onetask/settings.py +++ b/onetask/settings.py @@ -25,5 +25,13 @@ def get_authentication_url(): return f"{get_base_url()}/.ory/kratos/public/self-service/login/api" -def graphql(): - return 
f"{get_base_url()}/graphql/" +def get_project_exists_url(): + return f"{get_base_url()}/project_exists" + + +def get_post_lf_url(): + return f"{get_base_url()}/labelfunction" + + +def get_manually_labeled_data_url(): + return f"{get_base_url()}/manually_labeled_data" diff --git a/onetask/util.py b/onetask/util.py index e7ff755..68c733a 100644 --- a/onetask/util.py +++ b/onetask/util.py @@ -2,41 +2,7 @@ import inspect from typing import Callable import re -import json from onetask import exceptions -import pandas as pd - - -def unpack_records(fetched_records): - records = [] - fetched_records = fetched_records["data"]["searchRecords"] - if len(fetched_records) > 0: - for record in fetched_records: - record_data = json.loads(record["data"]) - record_manual_labels = [] - edges = record["labelAssociations"]["edges"] - for edge in edges: - node = edge["node"] - if node["source"] == "manual": - record_manual_labels.append(node["label"]["name"]) - records.append( - { - "data": record_data, - "manual_labels": record_manual_labels[ - 0 - ], # remove [0] for multilabel support - } - ) - return records - else: - return [] - - -def records_to_df(records): - raw_df = pd.DataFrame(records) - df = raw_df["data"].apply(pd.Series) - df["label"] = raw_df["manual_labels"] - return df def unpack_python_function(fn: Callable, project_id: str): diff --git a/setup.py b/setup.py index 62aa353..811aa12 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.7", + version="0.1.8", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 120b476820fc4fe2fc2c6116826ec5c9aaedad4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Tue, 5 Oct 2021 16:40:07 +0200 Subject: [PATCH 12/35] add pandas to setup --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 811aa12..509cc62 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( 
name="onetask", - version="0.1.8", + version="0.1.9", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", @@ -28,6 +28,7 @@ install_requires=[ "certifi==2021.5.30", "spacy==3.1.3", + "pandas==1.3.3", "chardet==4.0.0", "idna==2.10", "requests==2.25.1", From fee7aba63c1213fae696009d1a712b4586b3d4a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Tue, 5 Oct 2021 17:25:18 +0200 Subject: [PATCH 13/35] adds autoexecution --- onetask/__init__.py | 18 +++++++++--------- onetask/api_calls.py | 5 ++++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 77950cc..b2f3aef 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -22,6 +22,15 @@ def __init__( else: msg.fail("Could not log in. Please check your username and password.") + def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: + project_id, name, source_code, docs = util.unpack_python_function( + lf, self.project_id + ) + api_calls.PostLabelingFunction( + project_id, name, source_code, docs, autoexecute, self.session_token + ) + msg.good(f"Registered labeling function '{name}'.") + def manually_labeled_records(self, as_df: bool = True): records = api_calls.PostManuallyLabeledRecords( self.project_id, self.session_token @@ -42,12 +51,3 @@ def autogenerate_regex_labeling_functions( auto_lf.create_regex_fns(records, candidates, attribute) else: msg.fail("No manually labeled records available!") - - def register_lf(self, lf: Callable) -> None: - project_id, name, source_code, docs = util.unpack_python_function( - lf, self.project_id - ) - api_calls.PostLabelingFunction( - project_id, name, source_code, docs, self.session_token - ) - msg.good(f"Registered labeling function '{name}'.") diff --git a/onetask/api_calls.py b/onetask/api_calls.py index d992f80..0ef2161 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -75,13 +75,16 @@ def execute(self): 
class PostLabelingFunction(PostRequest): - def __init__(self, project_id, name, function, description, session_token): + def __init__( + self, project_id, name, function, description, autoexecute, session_token + ): body = { "project_id": project_id, "name": name, "function": function, "description": description, + "autoexecute": autoexecute, } super().__init__(settings.get_post_lf_url(), body, session_token) From 2d89a2e75459c3363c44361c31c8a904fa288ef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Tue, 5 Oct 2021 17:38:12 +0200 Subject: [PATCH 14/35] adds checks for existing lfs --- onetask/__init__.py | 10 +++++++--- onetask/api_calls.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index b2f3aef..85d31f4 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -26,10 +26,14 @@ def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: project_id, name, source_code, docs = util.unpack_python_function( lf, self.project_id ) - api_calls.PostLabelingFunction( + if api_calls.PostLabelingFunction( project_id, name, source_code, docs, autoexecute, self.session_token - ) - msg.good(f"Registered labeling function '{name}'.") + ).already_exists: + msg.warn( + f"Labeling function '{name}' already exists. It has not been entered again!" 
+ ) + else: + msg.good(f"Registered labeling function '{name}'.") def manually_labeled_records(self, as_df: bool = True): records = api_calls.PostManuallyLabeledRecords( diff --git a/onetask/api_calls.py b/onetask/api_calls.py index 0ef2161..bf5c570 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -88,7 +88,7 @@ def __init__( } super().__init__(settings.get_post_lf_url(), body, session_token) - _ = self.execute() + self.already_exists = self.execute()["already_exists"] class PostProjectExists(PostRequest): From 9b93bbcca44c8aa0ec2bcf59bce9edcf8d5ed98c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Tue, 5 Oct 2021 17:38:53 +0200 Subject: [PATCH 15/35] 0.1.10 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 509cc62..7c8f120 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.9", + version="0.1.10", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 9da9b3762f1972993000e90e5923f7903a0aa369 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 6 Oct 2021 09:13:49 +0200 Subject: [PATCH 16/35] improves auto_lf --- onetask/__init__.py | 25 +++++++++++++++++++++---- onetask/auto_lf.py | 41 ++++++++++++++++++++++++++++++++--------- 2 files changed, 53 insertions(+), 13 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 85d31f4..3c3ae1c 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Callable +from typing import Callable, Optional from wasabi import msg import pandas as pd from onetask import api_calls, settings, util, auto_lf @@ -45,13 +45,30 @@ def manually_labeled_records(self, as_df: bool = True): return records def autogenerate_regex_labeling_functions( - self, nlp, attribute, num_functions: int = 10 + self, nlp, attribute, min_precision=0.8, 
filter_stopwords=False ): records = self.manually_labeled_records(as_df=True) if len(records) > 0: candidates = auto_lf.derive_regex_candidates( - nlp, records, attribute, most_common=num_functions + nlp, records, attribute, filter_stopwords + ) + return auto_lf.create_regex_fns( + records, candidates, attribute, min_precision ) - auto_lf.create_regex_fns(records, candidates, attribute) else: msg.fail("No manually labeled records available!") + + def display_autogenerated_labeling_functions( + self, lf_df: pd.DataFrame, label: Optional[str] = None + ): + if label is not None: + lf_df = lf_df.loc[lf_df["label"] == label] + for _, row in lf_df.iterrows(): + est_coverage = row["est_coverage"] + est_precision = row["est_precision"] + code = row["code"] + msg.info( + f"Coverage: {est_coverage * 100}% | Precision: {est_precision * 100}%" + ) + print(code) + print() diff --git a/onetask/auto_lf.py b/onetask/auto_lf.py index 75b96e7..4c3aa61 100644 --- a/onetask/auto_lf.py +++ b/onetask/auto_lf.py @@ -4,10 +4,11 @@ import re from collections import defaultdict import numpy as np +import pandas as pd from wasabi import msg -def derive_regex_candidates(nlp, df, attribute, most_common=10): +def derive_regex_candidates(nlp, df, attribute, filter_stopwords): if len(df) < 100: msg.warn( "Only very few records to analyze; it's best to continue labeling further records before analysis" @@ -20,7 +21,10 @@ def normalize_token(token): return token.text def is_relevant_token(token): - return not (token.is_punct or token.is_stop or token.is_bracket) + conditions = [token.is_punct, token.is_bracket, len(token.text) == 1] + if filter_stopwords: + conditions.append(token.is_stop) + return not any(conditions) candidates = [] for text in tqdm(df[attribute], total=len(df)): @@ -47,10 +51,19 @@ def is_relevant_token(token): suffix = "$" if token.idx == len(doc) - 1 else " " candidate = f"{prefix}{normalize_token(token)}{suffix}" candidates.append(candidate) - return [regex for regex, _ in 
Counter(candidates).most_common(most_common)] + return [regex for regex, _ in Counter(candidates).most_common(100)] -def create_regex_fns(df, candidates, regex_col, label_col="manual_label"): +def create_regex_fns( + df, candidates, regex_col, min_precision, label_col="manual_label" +): + n = len(df) + + def calc_min_cov(x): + return 0.5 / (x ** 0.5) + + min_coverage = calc_min_cov(n) + def regex_explainer(regex, attribute): description = "" terms = regex.replace("^", "").replace("$", "").split(".*?") @@ -75,7 +88,7 @@ def regex_explainer(regex, attribute): ) if len(terms) > 1: for term in terms[1:]: - description += f" followed by term '{term}'" + description += f" (in-)directly followed by term '{term}'" if "[0-9]" in regex: description += ", where [0-9] is an arbitrary number" description += "." @@ -95,6 +108,7 @@ def regex_{iteration}(record): return source_code.strip() regex_nr = 1 + rows = [] for regex in candidates: labels = defaultdict(int) for text, label in zip(df[regex_col], df[label_col]): @@ -109,9 +123,18 @@ def regex_{iteration}(record): regex_prediction = prediction precision = np.round(labels[regex_prediction] / coverage, 2) coverage = np.round(coverage / len(df), 2) - if precision > 0.75 and coverage >= 0.01: + if precision >= min_precision and coverage >= min_coverage: lf = build_regex_lf(regex, regex_col, regex_prediction, regex_nr) regex_nr += 1 - print(f"# Cov:\t{coverage}\tPrec:{precision}") - print(lf) - print() + rows.append( + { + "est_coverage": coverage, + "est_precision": precision, + "label": regex_prediction, + "code": lf, + } + ) + lf_df = pd.DataFrame(rows) + lf_df["priority"] = (lf_df["est_coverage"] ** 2) * lf_df["est_precision"] + lf_df = lf_df.sort_values(by="priority", ascending=False) + return lf_df From 9ac941f6fe2761f81b94ae2085d745a89a80ea1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 6 Oct 2021 09:14:17 +0200 Subject: [PATCH 17/35] improves auto_lf --- setup.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7c8f120..8d6e4ea 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.10", + version="0.1.11", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 3e7b789f65293192a6f02d41e6ff1d7dc73144af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Thu, 7 Oct 2021 14:16:51 +0200 Subject: [PATCH 18/35] bugfix in auto_lf regex with unmatched parenthesis --- onetask/auto_lf.py | 25 +++++++++++++++++++------ setup.py | 2 +- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/onetask/auto_lf.py b/onetask/auto_lf.py index 4c3aa61..0fbcd1d 100644 --- a/onetask/auto_lf.py +++ b/onetask/auto_lf.py @@ -60,7 +60,7 @@ def create_regex_fns( n = len(df) def calc_min_cov(x): - return 0.5 / (x ** 0.5) + return 0.3 / (x ** 0.5) min_coverage = calc_min_cov(n) @@ -94,12 +94,17 @@ def regex_explainer(regex, attribute): description += "." 
return description - def build_regex_lf(regex, attribute, prediction, iteration): + def build_regex_lf(regex, attribute, prediction, iteration, escape_regex): + + if escape_regex: + _regex = f"re.escape('{regex}')" + else: + _regex = f"r'{regex}'" source_code = f""" def regex_{iteration}(record): '''{regex_explainer(regex, attribute)}''' import re - if re.search(r'{regex}', record['{attribute}'].lower()): + if re.search({_regex}, record['{attribute}'].lower()): return '{prediction}' client.register_lf(regex_{iteration}) @@ -111,9 +116,15 @@ def regex_{iteration}(record): rows = [] for regex in candidates: labels = defaultdict(int) + escape_regex = False for text, label in zip(df[regex_col], df[label_col]): - if re.search(regex, text.lower()): - labels[label] += 1 + try: + if re.search(regex, text.lower()): + labels[label] += 1 + except: # there is sadly no better way (I know of) to handle this other than using a plain except + escape_regex = True + if re.search(re.escape(regex), text.lower()): + labels[label] += 1 coverage = sum(labels.values()) if coverage > 0: regex_prediction, max_count = None, 0 @@ -124,7 +135,9 @@ def regex_{iteration}(record): precision = np.round(labels[regex_prediction] / coverage, 2) coverage = np.round(coverage / len(df), 2) if precision >= min_precision and coverage >= min_coverage: - lf = build_regex_lf(regex, regex_col, regex_prediction, regex_nr) + lf = build_regex_lf( + regex, regex_col, regex_prediction, regex_nr, escape_regex + ) regex_nr += 1 rows.append( { diff --git a/setup.py b/setup.py index 8d6e4ea..3ce838b 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.11", + version="0.1.12", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 624f693021e95fdbcc598753baa1aefbc7949b27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 13 Oct 2021 16:26:06 +0200 Subject: [PATCH 19/35] address rest-api --- 
onetask/__init__.py | 17 +++++---- onetask/api_calls.py | 91 ++++++++++++++++++++++++-------------------- onetask/auto_lf.py | 4 +- onetask/settings.py | 8 ++-- 4 files changed, 64 insertions(+), 56 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 3c3ae1c..712af94 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -17,7 +17,7 @@ def __init__( self.project_id = project_id if self.session_token is not None: msg.good("Logged in to system.") - if not api_calls.PostProjectExists(project_id, self.session_token).exists: + if not api_calls.GetProjectExists(project_id, self.session_token).exists: msg.fail(f"Project with ID {self.project_id} does not exist.") else: msg.fail("Could not log in. Please check your username and password.") @@ -35,19 +35,22 @@ def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: else: msg.good(f"Registered labeling function '{name}'.") - def manually_labeled_records(self, as_df: bool = True): - records = api_calls.PostManuallyLabeledRecords( + def get_manually_labeled_records(self): + records = api_calls.GetManuallyLabeledRecords( self.project_id, self.session_token ).records - if as_df and len(records) > 0: - return pd.DataFrame(records) + fetched_df = pd.DataFrame(records) + if len(fetched_df) > 0: + df = fetched_df["data"].apply(pd.Series) + df["label"] = fetched_df["label"] + return df else: - return records + return fetched_df # empty df def autogenerate_regex_labeling_functions( self, nlp, attribute, min_precision=0.8, filter_stopwords=False ): - records = self.manually_labeled_records(as_df=True) + records = self.get_manually_labeled_records() if len(records) > 0: candidates = auto_lf.derive_regex_candidates( nlp, records, attribute, filter_stopwords diff --git a/onetask/api_calls.py b/onetask/api_calls.py index bf5c570..9806cbf 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -36,6 +36,36 @@ def create_session_token(user_name: str, password: str): return session_token 
+def build_headers(session_token): + return { + "Content-Type": "application/json", + "User-Agent": f"python-sdk-{version}", + "Authorization": f"Bearer {session_token}", + } + + +def handle_response(response): + status_code = response.status_code + if status_code == 200: + json_data = response.json() + return json_data + else: + try: + json_data = response.json() + error_code = json_data.get("error_code") + error_message = json_data.get("error_message") + except JSONDecodeError: + error_code = 500 + error_message = "The server was unable to process the provided data." + + exception = exceptions.get_api_exception_class( + status_code=status_code, + error_code=error_code, + error_message=error_message, + ) + raise exception + + class PostRequest: def __init__(self, url, body, session_token): self.url = url @@ -43,35 +73,20 @@ def __init__(self, url, body, session_token): self.session_token = session_token def execute(self): + response = requests.post( + url=self.url, json=self.body, headers=build_headers(self.session_token) + ) + return handle_response(response) - headers = { - "Content-Type": "application/json", - "User-Agent": f"python-sdk-{version}", - "Authorization": f"Bearer {self.session_token}", - } - - response = requests.post(url=self.url, json=self.body, headers=headers) - status_code = response.status_code +class GetRequest: + def __init__(self, url, session_token): + self.url = url + self.session_token = session_token - if status_code == 200: - json_data = response.json() - return json_data - else: - try: - json_data = response.json() - error_code = json_data.get("error_code") - error_message = json_data.get("error_message") - except JSONDecodeError: - error_code = 500 - error_message = "The server was unable to process the provided data." 
- - exception = exceptions.get_api_exception_class( - status_code=status_code, - error_code=error_code, - error_message=error_message, - ) - raise exception + def execute(self): + response = requests.get(url=self.url, headers=build_headers(self.session_token)) + return handle_response(response) class PostLabelingFunction(PostRequest): @@ -91,23 +106,15 @@ def __init__( self.already_exists = self.execute()["already_exists"] -class PostProjectExists(PostRequest): +class GetProjectExists(GetRequest): def __init__(self, project_id, session_token): + super().__init__(settings.get_project_url(project_id), session_token) + self.exists = self.execute() - body = { - "project_id": project_id, - } - super().__init__(settings.get_project_exists_url(), body, session_token) - self.exists = self.execute()["exists"] - - -class PostManuallyLabeledRecords(PostRequest): +class GetManuallyLabeledRecords(GetRequest): def __init__(self, project_id, session_token): - - body = { - "project_id": project_id, - } - - super().__init__(settings.get_manually_labeled_data_url(), body, session_token) - self.records = self.execute()["records"] + super().__init__( + settings.get_manually_labeled_data_url(project_id), session_token + ) + self.records = self.execute() diff --git a/onetask/auto_lf.py b/onetask/auto_lf.py index 0fbcd1d..5065b07 100644 --- a/onetask/auto_lf.py +++ b/onetask/auto_lf.py @@ -54,9 +54,7 @@ def is_relevant_token(token): return [regex for regex, _ in Counter(candidates).most_common(100)] -def create_regex_fns( - df, candidates, regex_col, min_precision, label_col="manual_label" -): +def create_regex_fns(df, candidates, regex_col, min_precision, label_col="label"): n = len(df) def calc_min_cov(x): diff --git a/onetask/settings.py b/onetask/settings.py index fceecfd..069ec7f 100644 --- a/onetask/settings.py +++ b/onetask/settings.py @@ -25,13 +25,13 @@ def get_authentication_url(): return f"{get_base_url()}/.ory/kratos/public/self-service/login/api" -def 
get_project_exists_url(): - return f"{get_base_url()}/project_exists" +def get_project_url(project_id): + return f"{get_base_url()}/api/project/{project_id}" def get_post_lf_url(): return f"{get_base_url()}/labelfunction" -def get_manually_labeled_data_url(): - return f"{get_base_url()}/manually_labeled_data" +def get_manually_labeled_data_url(project_id): + return f"{get_project_url(project_id)}/data?labelled=manual" From 91f8383acf3c0a240ad2770f6555702ee7fd0088 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 13 Oct 2021 16:26:58 +0200 Subject: [PATCH 20/35] version 0.1.13 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3ce838b..e28dec4 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.12", + version="0.1.13", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 42445265d322e712b9bb6b9f96744c1eb9da6566 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Mon, 1 Nov 2021 16:36:40 +0100 Subject: [PATCH 21/35] adds embedding generatioN --- onetask/__init__.py | 64 ++++++++++++++++++++++++--- onetask/api_calls.py | 11 ++++- onetask/embedding.py | 101 +++++++++++++++++++++++++++++++++++++++++++ onetask/settings.py | 11 ++++- setup.py | 8 +++- 5 files changed, 183 insertions(+), 12 deletions(-) create mode 100644 onetask/embedding.py diff --git a/onetask/__init__.py b/onetask/__init__.py index 712af94..8c590c2 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -3,7 +3,10 @@ from typing import Callable, Optional from wasabi import msg import pandas as pd -from onetask import api_calls, settings, util, auto_lf +import numpy as np +from tqdm import tqdm +import json +from onetask import api_calls, settings, util, auto_lf, embedding class Client: @@ -35,22 +38,30 @@ def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: else: 
msg.good(f"Registered labeling function '{name}'.") - def get_manually_labeled_records(self): - records = api_calls.GetManuallyLabeledRecords( + def get_attributes(self): + attributes = api_calls.GetUniqueAttributes( self.project_id, self.session_token + ).attributes + return attributes + + def get_records(self, manually_labeled=True): + records = api_calls.GetRecords( + self.project_id, self.session_token, manually_labeled=manually_labeled ).records + fetched_df = pd.DataFrame(records) if len(fetched_df) > 0: df = fetched_df["data"].apply(pd.Series) - df["label"] = fetched_df["label"] + if manually_labeled: + df["label"] = fetched_df["label"] return df else: return fetched_df # empty df - def autogenerate_regex_labeling_functions( + def generate_regex_labeling_functions( self, nlp, attribute, min_precision=0.8, filter_stopwords=False ): - records = self.get_manually_labeled_records() + records = self.get_records() if len(records) > 0: candidates = auto_lf.derive_regex_candidates( nlp, records, attribute, filter_stopwords @@ -61,7 +72,46 @@ def autogenerate_regex_labeling_functions( else: msg.fail("No manually labeled records available!") - def display_autogenerated_labeling_functions( + def generate_embeddings(self, attribute, config_string): + msg.info("Loading schema") + attributes = self.get_attributes() + unique_attribute = None + data_type = None + for attr in attributes: + if attr["unique"]: + unique_attribute = attr["name"] + if attr["name"] == attribute: + data_type = attr["data_type"] + + if unique_attribute: + msg.info("Loading records") + records = self.get_records(manually_labeled=False) + docs = np.stack(records[attribute]) + export = records[[unique_attribute, attribute]].to_dict(orient="records") + msg.info("Loading embedding model") + model = embedding.get_fitted_model_by_config_string( + data_type, config_string, docs + ) + if model: + msg.info("Starting embedding procedure") + for idx, row in tqdm( + enumerate(export), total=len(export), 
desc="Embedding records..." + ): + row[config_string] = model.encode(row[attribute]).tolist() + del row[attribute] + export[idx] = row + msg.good("Finished embedding procedure") + file_path = ( + f"embeddings_{attribute}_{config_string}_{self.project_id}.json" + ) + with open(file_path, "w") as file: + json.dump(export, file) + else: + msg.fail( + "Currently, you must have exactly one unique attribute for embedding generation. Please validate this in the web app under 'Settings'" + ) + + def display_generated_labeling_functions( self, lf_df: pd.DataFrame, label: Optional[str] = None ): if label is not None: diff --git a/onetask/api_calls.py b/onetask/api_calls.py index 9806cbf..c880e18 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -112,9 +112,16 @@ def __init__(self, project_id, session_token): self.exists = self.execute() -class GetManuallyLabeledRecords(GetRequest): +class GetUniqueAttributes(GetRequest): def __init__(self, project_id, session_token): + super().__init__(settings.get_schema_url(project_id), session_token) + self.attributes = self.execute() + + +class GetRecords(GetRequest): + def __init__(self, project_id, session_token, manually_labeled): super().__init__( - settings.get_manually_labeled_data_url(project_id), session_token + settings.get_data_url(project_id, manually_labeled), + session_token, ) self.records = self.execute() diff --git a/onetask/embedding.py b/onetask/embedding.py new file mode 100644 index 0000000..6a70867 --- /dev/null +++ b/onetask/embedding.py @@ -0,0 +1,101 @@ +from abc import ABC, abstractmethod +from transformers import AutoTokenizer, AutoModel +import torch +import nltk +from nltk.tokenize import sent_tokenize +from wasabi import msg +from sklearn.feature_extraction.text import CountVectorizer + + +def get_fitted_model_by_config_string(data_type, config_string, records): + if data_type == "str": + if config_string == "boc": + return BoCEmbedder(records) + elif config_string == "bow": + return 
BoWEmbedder(records) + elif config_string == "tfidf": + raise NotImplementedError("TFIDF is not implemented yet") + else: + try: + return BERTEmbedder(records, config_string) + except: + msg.fail( + f"Embedding '{config_string}' is unknown. Please check https://onetask.readme.io/ for more information" + ) + else: + msg.fail(f"Currently unsupported data type {data_type} of attribute.") + + +class Embedder(ABC): + def __init__(self, records): + self.fit(records) + + @abstractmethod + def encode(self, document): + pass + + @abstractmethod + def fit(self, records): + pass + + +class BoCEmbedder(Embedder): + def __init__(self, records): + self.model = CountVectorizer(analyzer="char") + super().__init__(records) + + def fit(self, records): + self.model.fit(records) + + def encode(self, document): + return self.model.transform([document]).toarray()[0] + + +class BoWEmbedder(Embedder): + def __init__(self, records): + self.model = CountVectorizer(min_df=0.1) + super().__init__(records) + + def fit(self, records): + self.model.fit(records) + + def encode(self, document): + return self.model.transform([document]).toarray()[0] + + +class BERTEmbedder(Embedder): + def __init__(self, records, configuration_string: str = "bert-base-uncased"): + self.tokenizer = AutoTokenizer.from_pretrained(configuration_string) + self.model = AutoModel.from_pretrained( + configuration_string, output_hidden_states=True + ) + self.model.eval() + super().__init__(records) + nltk.download("punkt") + + def fit(self, records): + pass + + def encode(self, document: str): + embeddings = [] + for sentence in sent_tokenize(document): + encoded_dict = self.tokenizer.encode_plus( + sentence, return_tensors="pt", max_length=512, truncation=True + ) + with torch.no_grad(): + outputs = self.model(**encoded_dict) + embedding = self.mean_pooling(outputs, encoded_dict["attention_mask"]) + embeddings.append(embedding) + embedding_output = torch.mean(torch.stack(embeddings), axis=0) + return 
embedding_output.cpu().numpy()[0] + + def mean_pooling(self, model_output, attention_mask): + token_embeddings = model_output[ + 0 + ] # First element of model_output contains all token embeddings + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) + sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) + sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) + return sum_embeddings / sum_mask diff --git a/onetask/settings.py b/onetask/settings.py index 069ec7f..658ab36 100644 --- a/onetask/settings.py +++ b/onetask/settings.py @@ -33,5 +33,12 @@ def get_post_lf_url(): return f"{get_base_url()}/labelfunction" -def get_manually_labeled_data_url(project_id): - return f"{get_project_url(project_id)}/data?labelled=manual" +def get_schema_url(project_id): + return f"{get_project_url(project_id)}/schema" + + +def get_data_url(project_id, manually_labeled): + url = f"{get_project_url(project_id)}/data" + if manually_labeled: + url = f"{url}?labelled=manual" + return url diff --git a/setup.py b/setup.py index e28dec4..ee7fa4d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.13", + version="0.1.14", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", @@ -26,6 +26,12 @@ package_dir={"": "."}, packages=find_packages("."), install_requires=[ + "transformers==4.12.2", + "torch==1.10.0", + "torchvision==0.11.1", + "torchaudio==0.10.0", + "nltk==3.6.5", + "scikit-learn==1.0.1", "certifi==2021.5.30", "spacy==3.1.3", "pandas==1.3.3", From ab674bf2777f3b4d6cae282bd10312ca6438811e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 17 Nov 2021 00:15:34 +0100 Subject: [PATCH 22/35] closed beta version --- onetask/__init__.py | 133 ++++++++++++++++++++++++++++--------------- onetask/api_calls.py | 9 +++ onetask/auto_lf.py | 1 + onetask/embedding.py | 110 +++++++++++++++++++---------------- 
onetask/settings.py | 4 ++ requirements.txt | 93 ++++++++++++++++++++++++++++++ setup.py | 105 +++++++++++++++++++++++++++++----- 7 files changed, 347 insertions(+), 108 deletions(-) create mode 100644 requirements.txt diff --git a/onetask/__init__.py b/onetask/__init__.py index 8c590c2..e836b2e 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -7,11 +7,14 @@ from tqdm import tqdm import json from onetask import api_calls, settings, util, auto_lf, embedding +import numpy as np +from bertopic import BERTopic +from collections import defaultdict class Client: def __init__( - self, user_name: str, password: str, project_id: str, stage: str = "prod" + self, user_name: str, password: str, project_id: str, stage: str = "beta" ): settings.set_stage(stage) self.session_token = api_calls.create_session_token( @@ -25,6 +28,12 @@ def __init__( else: msg.fail("Could not log in. Please check your username and password.") + def _get_unique_attributes(self): + attributes = api_calls.GetUniqueAttributes( + self.project_id, self.session_token + ).attributes + return attributes + def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: project_id, name, source_code, docs = util.unpack_python_function( lf, self.project_id @@ -38,13 +47,7 @@ def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: else: msg.good(f"Registered labeling function '{name}'.") - def get_attributes(self): - attributes = api_calls.GetUniqueAttributes( - self.project_id, self.session_token - ).attributes - return attributes - - def get_records(self, manually_labeled=True): + def get_records(self, manually_labeled=True) -> pd.DataFrame: records = api_calls.GetRecords( self.project_id, self.session_token, manually_labeled=manually_labeled ).records @@ -52,65 +55,103 @@ def get_records(self, manually_labeled=True): fetched_df = pd.DataFrame(records) if len(fetched_df) > 0: df = fetched_df["data"].apply(pd.Series) - if manually_labeled: - df["label"] = fetched_df["label"] + 
df["label"] = fetched_df["label"] return df else: + msg.warn("Empty result") return fetched_df # empty df - def generate_regex_labeling_functions( - self, nlp, attribute, min_precision=0.8, filter_stopwords=False - ): - records = self.get_records() - if len(records) > 0: - candidates = auto_lf.derive_regex_candidates( - nlp, records, attribute, filter_stopwords - ) - return auto_lf.create_regex_fns( - records, candidates, attribute, min_precision - ) + def get_embeddings(self, config_string): + embeddings = api_calls.GetEmbeddings( + self.project_id, self.session_token, config_string + ).embeddings + + fetched_embeddings = pd.DataFrame(embeddings) + if len(fetched_embeddings) > 0: + df = fetched_embeddings["data"].apply(pd.Series) + df[config_string] = fetched_embeddings["embedding"] + return df else: - msg.fail("No manually labeled records available!") + msg.warn("Empty result") + return fetched_embeddings + + def generate_embeddings(self, attribute_configs_dict, file_path=None): + if not file_path: + file_path = f"embeddings_{self.project_id}.json" - def generate_embeddings(self, attribute, config_string): msg.info("Loading schema") - attributes = self.get_attributes() + attributes = self._get_unique_attributes() unique_attribute = None - data_type = None for attr in attributes: if attr["unique"]: unique_attribute = attr["name"] - if attr["name"] == attribute: - data_type = attr["data_type"] + + embedding_name = "-".join(list(attribute_configs_dict.values())) if unique_attribute: msg.info("Loading records") records = self.get_records(manually_labeled=False) - docs = np.stack(records[attribute]) - export = records[[unique_attribute, attribute]].to_dict(orient="records") - msg.info("Loading embedding model") - model = embedding.get_fitted_model_by_config_string( - data_type, config_string, docs - ) - if model: - msg.info("Starting embedding procedure") - for idx, row in tqdm( - enumerate(export), total=len(export), desc="Embedding records..." 
- ): - row[config_string] = model.encode(row[attribute]).tolist() - del row[attribute] - export[idx] = row - msg.good("Finished embedding procedure") - file_path = ( - f"embeddings_{attribute}_{config_string}_{self.project_id}.json" + embedding_concat = defaultdict(list) + for attribute, config_string in attribute_configs_dict.items(): + vals = np.stack(records[attribute]) + records_subset = records[[unique_attribute, attribute]].to_dict( + orient="records" + ) + msg.info(f"Loading embedding model {config_string} for {attribute}") + model = embedding.get_fitted_model_by_config_string(config_string, vals) + if model: + msg.info("Starting embedding procedure") + for idx, row in tqdm( + enumerate(records_subset), + total=len(records_subset), + desc="Embedding records...", + ): + embedding_concat[idx].extend( + model.encode(row[attribute]).tolist() + ) + msg.good(f"Finished embedding procedure. Storing to {file_path}") + export = [] + for unique_val, embedding_vector in embedding_concat.items(): + export.append( + {unique_attribute: unique_val, embedding_name: embedding_vector} ) - with open(file_path, "w") as file: - json.dump(export, file) + with open(file_path, "w") as file: + json.dump(export, file) else: msg.fail( "Currently, you must have exactly one unique attribute for embedding generation. 
Please validate this in the web app under 'Settings'" ) + def model_topics(self, attribute, config_string): + msg.info("Loading embeddings") + embeddings_df = self.get_embeddings(config_string) + if len(embeddings_df) > 0: + docs = embeddings_df[attribute].tolist() + embeddings = np.stack(embeddings_df[config_string]) + + msg.info("Fitting Topic Model") + model = BERTopic(verbose=True, n_gram_range=[1, 2], top_n_words=30) + model.fit(docs, embeddings) + msg.good("Finished training") + msg.info( + "Further docs: https://maartengr.github.io/BERTopic/tutorial/visualization/visualization.html" + ) + return model + + def generate_regex_labeling_functions( + self, nlp, attribute, min_precision=0.8, filter_stopwords=False + ): + records = self.get_records() + if len(records) > 0: + candidates = auto_lf.derive_regex_candidates( + nlp, records, attribute, filter_stopwords + ) + return auto_lf.create_regex_fns( + records, candidates, attribute, min_precision + ) + else: + msg.fail("No manually labeled records available!") + def display_generated_labeling_functions( self, lf_df: pd.DataFrame, label: Optional[str] = None ): diff --git a/onetask/api_calls.py b/onetask/api_calls.py index c880e18..06d1938 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -125,3 +125,12 @@ def __init__(self, project_id, session_token, manually_labeled): session_token, ) self.records = self.execute() + + +class GetEmbeddings(GetRequest): + def __init__(self, project_id, session_token, config_string): + super().__init__( + settings.get_embeddings_url(project_id, config_string), + session_token, + ) + self.embeddings = self.execute() diff --git a/onetask/auto_lf.py b/onetask/auto_lf.py index 5065b07..5d733cf 100644 --- a/onetask/auto_lf.py +++ b/onetask/auto_lf.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- from collections import defaultdict from tqdm import tqdm from collections import Counter diff --git a/onetask/embedding.py b/onetask/embedding.py index 6a70867..797cc58 100644 --- 
a/onetask/embedding.py +++ b/onetask/embedding.py @@ -1,29 +1,32 @@ +# -*- coding: utf-8 -*- from abc import ABC, abstractmethod -from transformers import AutoTokenizer, AutoModel -import torch -import nltk -from nltk.tokenize import sent_tokenize +from sentence_transformers import SentenceTransformer, models +from torch import nn +import numpy as np from wasabi import msg from sklearn.feature_extraction.text import CountVectorizer +from sklearn.preprocessing import OneHotEncoder +from transformers import logging +logging.set_verbosity_error() -def get_fitted_model_by_config_string(data_type, config_string, records): - if data_type == "str": - if config_string == "boc": - return BoCEmbedder(records) - elif config_string == "bow": - return BoWEmbedder(records) - elif config_string == "tfidf": - raise NotImplementedError("TFIDF is not implemented yet") - else: - try: - return BERTEmbedder(records, config_string) - except: - msg.fail( - f"Embedding '{config_string}' is unknown. Please check https://onetask.readme.io/ for more information" - ) + +def get_fitted_model_by_config_string(config_string, records): + if config_string == "identity": + return IdentityEmbedder(records) + elif config_string == "boc": + return BoCEmbedder(records) + elif config_string == "bow": + return BoWEmbedder(records) + elif config_string == "onehot": + return OneHotEmbedder(records) else: - msg.fail(f"Currently unsupported data type {data_type} of attribute.") + try: + return DocumentEmbedder(records, config_string) + except: + msg.fail( + f"Embedding '{config_string}' is unknown. 
Please check https://onetask.readme.io/ for more information" + ) class Embedder(ABC): @@ -39,6 +42,17 @@ def fit(self, records): pass +class IdentityEmbedder(Embedder): + def __init__(self, records): + super().__init__(records) + + def fit(self, records): + pass + + def encode(self, document): + return np.array([document]) + + class BoCEmbedder(Embedder): def __init__(self, records): self.model = CountVectorizer(analyzer="char") @@ -63,39 +77,37 @@ def encode(self, document): return self.model.transform([document]).toarray()[0] -class BERTEmbedder(Embedder): - def __init__(self, records, configuration_string: str = "bert-base-uncased"): - self.tokenizer = AutoTokenizer.from_pretrained(configuration_string) - self.model = AutoModel.from_pretrained( - configuration_string, output_hidden_states=True +class OneHotEmbedder(Embedder): + def __init__(self, records): + self.model = OneHotEncoder() + super().__init__(records) + + def fit(self, records): + self.model.fit(records.reshape(-1, 1)) + + def encode(self, document): + return self.model.transform([[document]]).toarray()[0] + + +class DocumentEmbedder(Embedder): + def __init__(self, records, configuration_string: str = "distilbert-base-uncased"): + word_embedding_model = models.Transformer(configuration_string) + pooling_model = models.Pooling( + word_embedding_model.get_word_embedding_dimension() + ) + dense_model = models.Dense( + in_features=pooling_model.get_sentence_embedding_dimension(), + out_features=256, + activation_function=nn.Tanh(), + ) + + self.model = SentenceTransformer( + modules=[word_embedding_model, pooling_model, dense_model] ) - self.model.eval() super().__init__(records) - nltk.download("punkt") def fit(self, records): pass def encode(self, document: str): - embeddings = [] - for sentence in sent_tokenize(document): - encoded_dict = self.tokenizer.encode_plus( - sentence, return_tensors="pt", max_length=512, truncation=True - ) - with torch.no_grad(): - outputs = self.model(**encoded_dict) - 
embedding = self.mean_pooling(outputs, encoded_dict["attention_mask"]) - embeddings.append(embedding) - embedding_output = torch.mean(torch.stack(embeddings), axis=0) - return embedding_output.cpu().numpy()[0] - - def mean_pooling(self, model_output, attention_mask): - token_embeddings = model_output[ - 0 - ] # First element of model_output contains all token embeddings - input_mask_expanded = ( - attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - ) - sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) - sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) - return sum_embeddings / sum_mask + return self.model.encode(document) diff --git a/onetask/settings.py b/onetask/settings.py index 658ab36..fac1765 100644 --- a/onetask/settings.py +++ b/onetask/settings.py @@ -37,6 +37,10 @@ def get_schema_url(project_id): return f"{get_project_url(project_id)}/schema" +def get_embeddings_url(project_id, config_string): + return f"{get_project_url(project_id)}/embeddings/{config_string}" + + def get_data_url(project_id, manually_labeled): url = f"{get_project_url(project_id)}/data" if manually_labeled: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c791d3a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,93 @@ +appnope==0.1.2 +argcomplete==1.12.3 +argon2-cffi==21.1.0 +attrs==21.2.0 +backcall==0.2.0 +bertopic==0.9.3 +bleach==4.1.0 +certifi==2021.10.8 +cffi==1.15.0 +charset-normalizer==2.0.7 +click==8.0.3 +Cython==0.29.24 +debugpy==1.5.1 +decorator==5.1.0 +defusedxml==0.7.1 +entrypoints==0.3 +filelock==3.4.0 +hdbscan==0.8.27 +huggingface-hub==0.1.2 +idna==3.3 +importlib-metadata==4.8.2 +importlib-resources==5.4.0 +jedi==0.18.0 +Jinja2==3.0.3 +joblib==1.1.0 +jsonschema==4.2.1 +llvmlite==0.37.0 +MarkupSafe==2.0.1 +matplotlib-inline==0.1.3 +mistune==0.8.4 +mypy-extensions==0.4.3 +nbclient==0.5.8 +nbconvert==6.3.0 +nbformat==5.1.3 +nest-asyncio==1.5.1 +nltk==3.6.5 +notebook==6.4.5 +numba==0.54.1 
+numpy==1.20.3 +packaging==21.2 +pandas==1.3.4 +pandocfilters==1.5.0 +parso==0.8.2 +pathspec==0.9.0 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==8.4.0 +platformdirs==2.4.0 +plotly==4.14.2 +prometheus-client==0.12.0 +prompt-toolkit==3.0.22 +ptyprocess==0.7.0 +pycparser==2.21 +Pygments==2.10.0 +pynndescent==0.5.5 +pyparsing==2.4.7 +pyrsistent==0.18.0 +python-dateutil==2.8.2 +pytz==2021.3 +PyYAML==5.4.1 +pyzmq==22.3.0 +qtconsole==5.2.0 +QtPy==1.11.2 +regex==2021.11.10 +requests==2.26.0 +retrying==1.3.3 +sacremoses==0.0.46 +scikit-learn==1.0.1 +scipy==1.7.2 +Send2Trash==1.8.0 +sentence-transformers==2.1.0 +sentencepiece==0.1.96 +six==1.16.0 +terminado==0.12.1 +testpath==0.5.0 +threadpoolctl==3.0.0 +tokenizers==0.10.3 +tomli==1.2.2 +torch==1.10.0 +torchvision==0.11.1 +tornado==6.1 +tqdm==4.62.3 +traitlets==5.1.1 +transformers==4.12.4 +typed-ast==1.5.0 +typing-extensions==4.0.0 +umap-learn==0.5.2 +urllib3==1.26.7 +wasabi==0.8.2 +wcwidth==0.2.5 +webencodings==0.5.1 +widgetsnbextension==3.5.2 +zipp==3.6.0 diff --git a/setup.py b/setup.py index ee7fa4d..53b1a47 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.1.14", + version="0.2.0", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", @@ -19,26 +19,105 @@ url="https://github.com/onetask-ai/onetask-python", keywords=["onetask", "machine learning", "supervised learning", "python"], classifiers=[ - "Development Status :: 3 - Alpha", + "Development Status :: 3 - Beta", "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", ], package_dir={"": "."}, packages=find_packages("."), install_requires=[ - "transformers==4.12.2", - "torch==1.10.0", - "torchvision==0.11.1", - "torchaudio==0.10.0", + "appnope==0.1.2", + "argcomplete==1.12.3", + "argon2-cffi==21.1.0", + "attrs==21.2.0", + "backcall==0.2.0", + "bertopic==0.9.3", + "bleach==4.1.0", + "certifi==2021.10.8", + "cffi==1.15.0", + "charset-normalizer==2.0.7", + 
"click==8.0.3", + "Cython==0.29.24", + "debugpy==1.5.1", + "decorator==5.1.0", + "defusedxml==0.7.1", + "entrypoints==0.3", + "filelock==3.4.0", + "hdbscan==0.8.27", + "huggingface-hub==0.1.2", + "idna==3.3", + "importlib-metadata==4.8.2", + "importlib-resources==5.4.0", + "jedi==0.18.0", + "Jinja2==3.0.3", + "joblib==1.1.0", + "jsonschema==4.2.1", + "llvmlite==0.37.0", + "MarkupSafe==2.0.1", + "matplotlib-inline==0.1.3", + "mistune==0.8.4", + "mypy-extensions==0.4.3", + "nbclient==0.5.8", + "nbconvert==6.3.0", + "nbformat==5.1.3", + "nest-asyncio==1.5.1", "nltk==3.6.5", + "notebook==6.4.5", + "numba==0.54.1", + "numpy==1.20.3", + "packaging==21.2", + "pandas==1.3.4", + "pandocfilters==1.5.0", + "parso==0.8.2", + "pathspec==0.9.0", + "pexpect==4.8.0", + "pickleshare==0.7.5", + "Pillow==8.4.0", + "platformdirs==2.4.0", + "plotly==4.14.2", + "prometheus-client==0.12.0", + "prompt-toolkit==3.0.22", + "ptyprocess==0.7.0", + "pycparser==2.21", + "Pygments==2.10.0", + "pynndescent==0.5.5", + "pyparsing==2.4.7", + "pyrsistent==0.18.0", + "python-dateutil==2.8.2", + "pytz==2021.3", + "PyYAML==5.4.1", + "pyzmq==22.3.0", + "qtconsole==5.2.0", + "QtPy==1.11.2", + "regex==2021.11.10", + "requests==2.26.0", + "retrying==1.3.3", + "sacremoses==0.0.46", "scikit-learn==1.0.1", - "certifi==2021.5.30", - "spacy==3.1.3", - "pandas==1.3.3", - "chardet==4.0.0", - "idna==2.10", - "requests==2.25.1", - "urllib3==1.26.5", + "scipy==1.7.2", + "Send2Trash==1.8.0", + "sentence-transformers==2.1.0", + "sentencepiece==0.1.96", + "six==1.16.0", + "terminado==0.12.1", + "testpath==0.5.0", + "threadpoolctl==3.0.0", + "tokenizers==0.10.3", + "tomli==1.2.2", + "torch==1.10.0", + "torchvision==0.11.1", + "tornado==6.1", + "tqdm==4.62.3", + "traitlets==5.1.1", + "transformers==4.12.4", + "typed-ast==1.5.0", + "typing-extensions==4.0.0", + "umap-learn==0.5.2", + "urllib3==1.26.7", "wasabi==0.8.2", + "wcwidth==0.2.5", + "webencodings==0.5.1", + "widgetsnbextension==3.5.2", + "zipp==3.6.0" ], ) From 
512929abb02a4cd7ed09a0b151cbebc84c7afa95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 17 Nov 2021 00:16:34 +0100 Subject: [PATCH 23/35] correct classifier --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 53b1a47..89afc71 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ url="https://github.com/onetask-ai/onetask-python", keywords=["onetask", "machine learning", "supervised learning", "python"], classifiers=[ - "Development Status :: 3 - Beta", + "Development Status :: 3 - Alpha", "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", ], From 31aa7d19c457a7d4797a1662082b01e7ec80d504 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 17 Nov 2021 01:00:40 +0100 Subject: [PATCH 24/35] adds documentation --- onetask/__init__.py | 96 +++++++++++++++++++++++++++++++++++++++++---- setup.py | 2 +- 2 files changed, 90 insertions(+), 8 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index e836b2e..56219a2 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Callable, Optional +from typing import Callable, Dict, List, Optional from wasabi import msg import pandas as pd import numpy as np @@ -13,6 +13,15 @@ class Client: + """ + Python Client for the onetask API. If you have any questions, please contact our support. + + Args: + user_name (str): The email with which you've been registered at onetask + password (str): Your password for onetask + project_id (str): The unique identifier for a project, can be found in the url after projects/ + stage (str): The onetask system staging environment [beta, test, dev, local] + """ def __init__( self, user_name: str, password: str, project_id: str, stage: str = "beta" ): @@ -28,13 +37,26 @@ def __init__( else: msg.fail("Could not log in. 
Please check your username and password.") - def _get_unique_attributes(self): + def _get_unique_attributes(self) -> List[Dict[str, np.union1d[str, bool]]]: + """ + Get the record schema for your project shown in the web app under 'Settings' + + Returns: + List[Dict[str, np.union1d[str, bool]]]: each record schema element + """ attributes = api_calls.GetUniqueAttributes( self.project_id, self.session_token ).attributes return attributes def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: + """ + Send a local labeling function to the onetask application. Please make sure that the function fits the desired structure (for more information, please visit onetask.readme.io/reference) + + Args: + lf (Callable): The function object you want to send to the system + autoexecute (bool, optional): If true, the function is automatically executed when entered in the system. Defaults to True. + """ project_id, name, source_code, docs = util.unpack_python_function( lf, self.project_id ) @@ -48,6 +70,15 @@ def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: msg.good(f"Registered labeling function '{name}'.") def get_records(self, manually_labeled=True) -> pd.DataFrame: + """ + Get the records of your project. + + Args: + manually_labeled (bool, optional): If true, only manually labeled records are returned. Defaults to True. 
+ + Returns: + pd.DataFrame: containing the record attributes and the labels + """ records = api_calls.GetRecords( self.project_id, self.session_token, manually_labeled=manually_labeled ).records @@ -61,7 +92,16 @@ def get_records(self, manually_labeled=True) -> pd.DataFrame: msg.warn("Empty result") return fetched_df # empty df - def get_embeddings(self, config_string): + def get_embeddings(self, config_string: str) -> pd.DataFrame: + """ + Get the embeddings of your project of a configuration string + + Args: + config_string (str): The name of your embedding + + Returns: + pd.DataFrame: containing the record attributes and the embedding vectors + """ embeddings = api_calls.GetEmbeddings( self.project_id, self.session_token, config_string ).embeddings @@ -75,7 +115,16 @@ def get_embeddings(self, config_string): msg.warn("Empty result") return fetched_embeddings - def generate_embeddings(self, attribute_configs_dict, file_path=None): + def generate_embeddings(self, attribute_configs_dict: Dict[str, str], file_path: Optional[str]=None) -> None: + """ + ---EXPERIMENTAL--- + + Create new embeddings to upload into your project. + + Args: + attribute_configs_dict (Dict[str, str]): describe which attribute should be embedded using which technique or model. + file_path (Optional[str], optional): path where the embeddings should be stored to. Defaults to 'embeddings_{project_id}.json'. + """ if not file_path: file_path = f"embeddings_{self.project_id}.json" @@ -122,7 +171,19 @@ def generate_embeddings(self, attribute_configs_dict, file_path=None): "Currently, you must have exactly one unique attribute for embedding generation. Please validate this in the web app under 'Settings'" ) - def model_topics(self, attribute, config_string): + def model_topics(self, attribute: str, config_string: str) -> BERTopic: + """ + ---EXPERIMENTAL--- + + Apply a BERTopic to your data to do topic modelling. 
Further docs: https://maartengr.github.io/BERTopic/tutorial/visualization/visualization.html + + Args: + attribute (str): the name of the string attribute you want to model + config_string (str): name of the embedding vector in the web application that you want to make use of. This MUST be a BERT-related embedding to work properly. + + Returns: + BERTopic: BERTopic object that can be called for topic modelling + """ msg.info("Loading embeddings") embeddings_df = self.get_embeddings(config_string) if len(embeddings_df) > 0: @@ -139,8 +200,22 @@ def model_topics(self, attribute, config_string): return model def generate_regex_labeling_functions( - self, nlp, attribute, min_precision=0.8, filter_stopwords=False - ): + self, nlp, attribute: str, min_precision:Optional[float]=0.8, filter_stopwords:Optional[bool]=False + ) -> pd.DataFrame: + """ + ---EXPERIMENTAL--- + + Autogenerate labeling functions containing regular expressions to model your data. Uses spacy to model the linguistics of your data. + + Args: + nlp (spacy.lang): nlp object of spacy for the specific language (e.g. en_core_web_sm) + attribute (str): the name of the attribute that should be analyzed for regular expressions + min_precision (Optional[float], optional): needed precision to generate a labeling function. Defaults to 0.8. + filter_stopwords (Optional[bool], optional): if set to true, stop words like 'this', 'that' etc. will be removed. Defaults to False. + + Returns: + pd.DataFrame: [description] + """ records = self.get_records() if len(records) > 0: candidates = auto_lf.derive_regex_candidates( @@ -155,6 +230,13 @@ def generate_regex_labeling_functions( def display_generated_labeling_functions( self, lf_df: pd.DataFrame, label: Optional[str] = None ): + """ + Helper function to display the autogenerated labeling functions + + Args: + lf_df (pd.DataFrame): outcome of client.generate_regex_labeling_functions + label (Optional[str], optional): filter option to only show one label. 
Defaults to None. + """ if label is not None: lf_df = lf_df.loc[lf_df["label"] == label] for _, row in lf_df.iterrows(): diff --git a/setup.py b/setup.py index 89afc71..b88427b 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.2.0", + version="0.2.1", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From a032ac08a2df0b9966ad9e39f350996c7262aa7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <62521561+jhoetter@users.noreply.github.com> Date: Wed, 17 Nov 2021 15:43:00 +0100 Subject: [PATCH 25/35] Update README.md --- README.md | 88 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 6468317..48d1955 100644 --- a/README.md +++ b/README.md @@ -4,46 +4,56 @@ Official Python SDK for onetask. -## [](https://github.com/onetask-ai/onetask-python#installation)Installation +## Installation You can use pip to install the library: `$ pip install onetask` -You can clone the repository and run the setup.py script: +Alternatively, you can clone the repository and run the setup.py script: `$ python setup.py install` -## [](https://github.com/onetask-ai/onetask-python#usage)Usage +## Usage -The SDK is currently focused solely on labeling functions. You can register your own functions or let our system generate suggestions on which you can build. In the near future, we'll extend the Python SDK to include programmatic imports and exports, data access, and many more. +The SDK currently offers the following functions: +- registering local Python functions as labeling functions in our system (you can of course also develop such functions within our web system) +- generating embeddings for your attributes, e.g. 
free texts or structured data containing categories and numbers +- _experimental_: autogenerating labeling functions from manually labeled records in your project +- _experimental_: topic modeling using BERT embeddings that you have registered in your project -You begin by creating a `Client` object. The `Client` will generate and store a session token for you based on your user name, password, and project id. The project id can be found in the URL, e.g. https://app.dev.onetask.ai/app/projects/**03f7d82c-f14c-4f0f-a1ff-59533bab30cc**/overview. Simply copy and paste this into the following pattern: +All of this is also documented in the [onetask Documentation](https://onetask.readme.io/reference/getting-started), with additional screenshots to guide you through the process. + + +### Instantiating a Client object + +You begin by creating a `Client` object. The `Client` will generate and store a session token for you based on your user name, password, and project id. The project id can be found in the URL, e.g. https://app.beta.onetask.ai/app/projects/**03f7d82c-f14c-4f0f-a1ff-59533bab30cc**/overview. Simply copy and paste this into the following pattern: ```python from onetask import Client username = "your-username" -project_id = "your-project-id" password = "your-password" -stage="beta" # if you have onetask on local, you can also set stage to "local" +project_id = "your-project-id" +stage = "beta" # if you have onetask on local, you can also set stage to "local" client = Client(username, password, project_id, stage) ``` -Once you correctly instantiated your Client, you can start accessing our GraphQL endpoints. Please always ensure that your labeling functions: +Once you correctly instantiated your Client, you can start using it for the various functions provided in the SDK. 
-return label names that also exist in your project definition -have exactly one parameter; we execute labeling functions on a record-basis -If you need an import statement in your labeling functions, please check if it is given in the [whitelisted libraries](https://onetask.readme.io/reference/whitelisted-libraries). If you need a library that we have not yet whitelisted, feel free to reach out to us. -The most straightforward way to create and register a labeling function is as follows: +### Registering local Python labeling functions +You can register functions e.g. from your local Jupyter Notebook using our SDK. When doing so, please always ensure that your labeling functions: +- return label names that also exist in your project definition +- have exactly one parameter; we execute labeling functions on a record-basis +- If you need an import statement in your labeling functions, please check if it is given in the [whitelisted libraries](https://onetask.readme.io/reference/whitelisted-libraries). If you need a library that we have not yet whitelisted, feel free to reach out to us. + +An example to register your custom labeling function is as follows: ```python def my_labeling_function(record): """ - This is my first labeling function. Yay! - Its purpose is to detect a list of values in the records that tend to - occur in urgent messages. + Detect a list of values in the records that tend to occur in urgent messages. """ keywords = ["asap", "as soon as possible", "urgent"] @@ -53,10 +63,54 @@ def my_labeling_function(record): return "Urgent" ``` -You can then enter them using the client as follows: +You can then enter them using the client: ```python client.register_lf(my_labeling_function) ``` -And that's it. You should now be able to see your labeling function in the web application. 
For further steps, please refer to our [readme.io](https://onetask.readme.io/reference/setting-up-the-python-sdk) documentation +The labeling function is then automatically executed once registered, where you can always change and re-run it. + +### Generating embeddings + +One of the main features of onetask is to apply both Weak Supervision and Active Learning jointly. To build the best possible Active Learning Weak Sources, you can generate embeddings for your attributes using the SDK. To do so, you have to first upload your data in our web application and select a unique attribute (see our [documentation](https://onetask.readme.io/reference/create-your-project) for further reference on how to set this up). + +Once this is done, you can easily generate embedding files. Imagine you have the following attributes in your records: +- `headline`: an english text describing e.g. the news of a paper (e.g. _"5 footballers that should have won the ballon d'or"_, ...) +- `running_id`: a unique identifier for each headline, i.e. a simple number (e.g. 1, 2, 3, ...) + +You can then call the client object to generate an embedding file using a dictionary of attribute/configuration string pairs: +```python +client.generate_embeddings({"headline": "distilbert-base-uncased"}) +``` + +This will generate an embedding JSON-file as follows: + +```json +[ + { + "running_id": 1, + "distilbert-base-uncased": [0.123456789, "..."] + }, + { + "running_id": 2, + "distilbert-base-uncased": [0.234567891, "..."] + }, +] +``` + +You can upload this file to your project in the overview tab of your project. 
+ +The following configuration strings are available to configure how your attributes are embedded: +| Configuration String | Data Type | Explanation | +|----------------------|-------------------------------|----------------------------------------------------------------------------------------------| +| identity | integer, float | No transformation | +| onehot | category (low-entropy string) | one-hot encodes attribute | +| bow | string | Bag of Words transformation | +| boc | string | Bag of Characters transformation | +| _huggingface_ | string | Huggingface-based transformation. You can use any available huggingface configuration string | + +If you want to embed multiple attributes (which makes sense e.g. when you have structured data), you can provide multiple key/value pairs in your input dictionary. The resulting embeddings will be concatenated into one vector. + +### Outlook +In the near future, we'll extend the Python SDK to include programmatic imports and exports, data access, and many more. If you have any requests, feel free to [contact us](https://www.onetask.ai/contact-us). From 68e310cee7c0fdceef115ab501262157d2d88d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <62521561+jhoetter@users.noreply.github.com> Date: Wed, 17 Nov 2021 15:55:55 +0100 Subject: [PATCH 26/35] Update README.md --- README.md | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 48d1955..45056b8 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ Alternatively, you can clone the repository and run the setup.py script: The SDK currently offers the following functions: - registering local Python functions as labeling functions in our system (you can of course also develop such functions within our web system) -- generating embeddings for your attributes, e.g. 
free texts or structured data containing categories and numbers +- _experimental_: generating embeddings for your attributes, e.g. free texts or structured data containing categories and numbers - _experimental_: autogenerating labeling functions from manually labeled records in your project - _experimental_: topic modeling using BERT embeddings that you have registered in your project @@ -71,7 +71,7 @@ client.register_lf(my_labeling_function) The labeling function is then automatically executed once registered, where you can always change and re-run it. -### Generating embeddings +### Generating embeddings (experimental) One of the main features of onetask is to apply both Weak Supervision and Active Learning jointly. To build the best possible Active Learning Weak Sources, you can generate embeddings for your attributes using the SDK. To do so, you have to first upload your data in our web application and select a unique attribute (see our [documentation](https://onetask.readme.io/reference/create-your-project) for further reference on how to set this up). @@ -108,9 +108,44 @@ The following configuration strings are available to configure how your attribut | onehot | category (low-entropy string) | one-hot encodes attribute | | bow | string | Bag of Words transformation | | boc | string | Bag of Characters transformation | -| _huggingface_ | string | Huggingface-based transformation. You can use any available huggingface configuration string | +| _huggingface_ | string | Huggingface-based transformation. You can use any available [huggingface](https://huggingface.co/) configuration string | If you want to embed multiple attributes (which makes sense e.g. when you have structured data), you can provide multiple key/value pairs in your input dictionary. The resulting embeddings will be concatenated into one vector. 
-### Outlook + +### Autogenerating labeling functions (experimental) + +As you manually label data, onetask can help you to analyze both the explicitic and implicit data patterns. Our first approach for explicit pattern detection is to find regular expressions in free text attributes you provide. They are being mined using linguistic analysis, therefore you need to provide a spacy nlp object for the respective language of your free text. + +If you have an english free text, you can implement the mining as follows: +```python +import spacy # you need to also download the en_core_web_sm file using $ python -m spacy download en_core_web_sm + +nlp = spacy.load("en_core_web_sm") +lf_df = client.generate_regex_labeling_functions(nlp, "headline") +``` + +This creates a DataFrame containing mined regular expressions. You can display them in a convenient way: + +```python +client.display_generated_labeling_functions(lf_df) +``` + +**Caution**: The quality and quantity of mined regular expressions heavily depends on how much data you have labeled and how diverse your dataset is. We have tested the feature on various datasets and found it to be very helpful. If you have problems autogenerating labeling functions in your project, contact us. + + +### Topic Modeling using BERT embeddings (experimental) + +As onetask lets you put insights of explorative analysis into programmatic data labeling, we also provide topic modeling. We use the [BERTopic](https://github.com/MaartenGr/BERTopic) library for topic modeling, and provide an easy access to your projects data and embeddings. Once you uploaded BERT embeddings to your project (such that can be created using a huggingface configuration string), you can create a topic model: + +```python +topic_model = client.model_topics("headline", "distilbert-base-uncased") +``` + +The `topic_model` provides various methods to explore the different keywords and topics. 
You can also find further documentation [here](https://maartengr.github.io/BERTopic/api/bertopic.html). + +## Outlook and Feature Requests In the near future, we'll extend the Python SDK to include programmatic imports and exports, data access, and many more. If you have any requests, feel free to [contact us](https://www.onetask.ai/contact-us). + +## Support +If you need help, feel free to join our Slack Community channel. It is currently only available via invitation. From 69322936f62c253bcbbc8d16ac793cbcb58ed58e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <62521561+jhoetter@users.noreply.github.com> Date: Wed, 17 Nov 2021 15:59:31 +0100 Subject: [PATCH 27/35] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 45056b8..17e02bf 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # onetask API for Python -Official Python SDK for onetask. +This is the official Python SDK for onetask, your IDE for programmatic data labeling. 
## Installation From 76aea511eea4edd33bc4f2b3d7648e3c8ea6b499 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= <62521561+jhoetter@users.noreply.github.com> Date: Wed, 17 Nov 2021 16:39:30 +0100 Subject: [PATCH 28/35] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 17e02bf..e4af00c 100644 --- a/README.md +++ b/README.md @@ -119,7 +119,9 @@ As you manually label data, onetask can help you to analyze both the explicitic If you have an english free text, you can implement the mining as follows: ```python -import spacy # you need to also download the en_core_web_sm file using $ python -m spacy download en_core_web_sm +import spacy +# you need to also download the en_core_web_sm file +# using $ python -m spacy download en_core_web_sm nlp = spacy.load("en_core_web_sm") lf_df = client.generate_regex_labeling_functions(nlp, "headline") From 973771c82dfedb4c3192225aacd1d05f73f69a4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Mon, 22 Nov 2021 17:28:03 +0100 Subject: [PATCH 29/35] bugfix np.union1d -> Union --- onetask/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 56219a2..750be76 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Callable, Dict, List, Optional +from typing import Callable, Dict, List, Optional, Union from wasabi import msg import pandas as pd import numpy as np @@ -37,12 +37,12 @@ def __init__( else: msg.fail("Could not log in. 
Please check your username and password.") - def _get_unique_attributes(self) -> List[Dict[str, np.union1d[str, bool]]]: + def _get_unique_attributes(self) -> List[Dict[str, Union[str, bool]]]: """ Get the record schema for your project shown in the web app under 'Settings' Returns: - List[Dict[str, np.union1d[str, bool]]]: each record schema element + List[Dict[str, Union[str, bool]]]: each record schema element """ attributes = api_calls.GetUniqueAttributes( self.project_id, self.session_token From a8160152083534b53fd259f364339937744d8daa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Mon, 22 Nov 2021 17:28:46 +0100 Subject: [PATCH 30/35] update version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b88427b..3b76291 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.2.1", + version="0.2.2", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 6bb83fbbdd6259521aa5ac84f8a10916bcfa3aa7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Fri, 26 Nov 2021 15:39:26 +0100 Subject: [PATCH 31/35] solves embedding identifier bug --- onetask/__init__.py | 19 +++++++++++++------ setup.py | 4 ++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 750be76..c774c12 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -22,6 +22,7 @@ class Client: project_id (str): The unique identifier for a project, can be found in the url after projects/ stage (str): The onetask system staging environment [beta, test, dev, local] """ + def __init__( self, user_name: str, password: str, project_id: str, stage: str = "beta" ): @@ -115,10 +116,12 @@ def get_embeddings(self, config_string: str) -> pd.DataFrame: msg.warn("Empty result") return fetched_embeddings - def generate_embeddings(self, attribute_configs_dict: Dict[str, str], 
file_path: Optional[str]=None) -> None: + def generate_embeddings( + self, attribute_configs_dict: Dict[str, str], file_path: Optional[str] = None + ) -> None: """ ---EXPERIMENTAL--- - + Create new embeddings to upload into your project. Args: @@ -150,12 +153,12 @@ def generate_embeddings(self, attribute_configs_dict: Dict[str, str], file_path: model = embedding.get_fitted_model_by_config_string(config_string, vals) if model: msg.info("Starting embedding procedure") - for idx, row in tqdm( + for _, row in tqdm( enumerate(records_subset), total=len(records_subset), desc="Embedding records...", ): - embedding_concat[idx].extend( + embedding_concat[row[unique_attribute]].extend( model.encode(row[attribute]).tolist() ) msg.good(f"Finished embedding procedure. Storing to {file_path}") @@ -200,11 +203,15 @@ def model_topics(self, attribute: str, config_string: str) -> BERTopic: return model def generate_regex_labeling_functions( - self, nlp, attribute: str, min_precision:Optional[float]=0.8, filter_stopwords:Optional[bool]=False + self, + nlp, + attribute: str, + min_precision: Optional[float] = 0.8, + filter_stopwords: Optional[bool] = False, ) -> pd.DataFrame: """ ---EXPERIMENTAL--- - + Autogenerate labeling functions containing regular expressions to model your data. Uses spacy to model the linguistics of your data. 
Args: diff --git a/setup.py b/setup.py index 3b76291..18792a9 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.2.2", + version="0.2.3", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", @@ -118,6 +118,6 @@ "wcwidth==0.2.5", "webencodings==0.5.1", "widgetsnbextension==3.5.2", - "zipp==3.6.0" + "zipp==3.6.0", ], ) From 07892a69707473fc89033cdcf0bb05e097138387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Fri, 3 Dec 2021 14:54:33 +0100 Subject: [PATCH 32/35] adds is_programmatic enrichment to fetching record data --- onetask/__init__.py | 14 ++++++++++---- onetask/api_calls.py | 4 ++-- onetask/settings.py | 7 ++----- setup.py | 2 +- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index c774c12..926e5a4 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -70,24 +70,30 @@ def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: else: msg.good(f"Registered labeling function '{name}'.") - def get_records(self, manually_labeled=True) -> pd.DataFrame: + def get_records(self, keep_unlabeled=False, keep_programmatic=False): """ Get the records of your project. + Args: - manually_labeled (bool, optional): If true, only manually labeled records are returned. Defaults to True. + keep_unlabeled (bool, optional): If true, you will receive all records, even if they are not labeled yet. Defaults to False. + keep_programmatic (bool, optional): if true, you will receive also the programmatically labeled records. Defaults to False. 
Returns: - pd.DataFrame: containing the record attributes and the labels + [type]: [description] """ records = api_calls.GetRecords( - self.project_id, self.session_token, manually_labeled=manually_labeled + self.project_id, + self.session_token, + keep_unlabeled=keep_unlabeled, + keep_programmatic=keep_programmatic, ).records fetched_df = pd.DataFrame(records) if len(fetched_df) > 0: df = fetched_df["data"].apply(pd.Series) df["label"] = fetched_df["label"] + df["is_programmatic"] = fetched_df["is_programmatic"] return df else: msg.warn("Empty result") diff --git a/onetask/api_calls.py b/onetask/api_calls.py index 06d1938..33afe49 100644 --- a/onetask/api_calls.py +++ b/onetask/api_calls.py @@ -119,9 +119,9 @@ def __init__(self, project_id, session_token): class GetRecords(GetRequest): - def __init__(self, project_id, session_token, manually_labeled): + def __init__(self, project_id, session_token, keep_unlabeled, keep_programmatic): super().__init__( - settings.get_data_url(project_id, manually_labeled), + settings.get_data_url(project_id, keep_unlabeled, keep_programmatic), session_token, ) self.records = self.execute() diff --git a/onetask/settings.py b/onetask/settings.py index fac1765..ab9dfda 100644 --- a/onetask/settings.py +++ b/onetask/settings.py @@ -41,8 +41,5 @@ def get_embeddings_url(project_id, config_string): return f"{get_project_url(project_id)}/embeddings/{config_string}" -def get_data_url(project_id, manually_labeled): - url = f"{get_project_url(project_id)}/data" - if manually_labeled: - url = f"{url}?labelled=manual" - return url +def get_data_url(project_id, keep_unlabeled, keep_programmatic): + return f"{get_project_url(project_id)}/data?keep_unlabeled={keep_unlabeled}&keep_programmatic={keep_programmatic}" diff --git a/setup.py b/setup.py index 18792a9..8ee05fd 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.2.3", + version="0.2.4", author="onetask", author_email="info@onetask.ai", 
description="Official Python SDK for the onetask API", From f0ab02a85aca685523d86a08f23339f80abfe8b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 8 Dec 2021 14:58:16 +0100 Subject: [PATCH 33/35] minor bugfix generate_embeddings --- onetask/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 926e5a4..4a2b3e7 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -148,7 +148,7 @@ def generate_embeddings( if unique_attribute: msg.info("Loading records") - records = self.get_records(manually_labeled=False) + records = self.get_records(keep_programmatic=False, keep_unlabeled=False) embedding_concat = defaultdict(list) for attribute, config_string in attribute_configs_dict.items(): vals = np.stack(records[attribute]) diff --git a/setup.py b/setup.py index 8ee05fd..e471e78 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.2.4", + version="0.2.5", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From ea810a3092a029d5b30f6af9e9a5f17567e0b901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Wed, 8 Dec 2021 16:41:10 +0100 Subject: [PATCH 34/35] bugfix for empty records at embedding creation --- onetask/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onetask/__init__.py b/onetask/__init__.py index 4a2b3e7..24488a6 100644 --- a/onetask/__init__.py +++ b/onetask/__init__.py @@ -148,7 +148,7 @@ def generate_embeddings( if unique_attribute: msg.info("Loading records") - records = self.get_records(keep_programmatic=False, keep_unlabeled=False) + records = self.get_records(keep_programmatic=False, keep_unlabeled=True) embedding_concat = defaultdict(list) for attribute, config_string in attribute_configs_dict.items(): vals = np.stack(records[attribute]) diff --git a/setup.py b/setup.py 
index e471e78..575600f 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name="onetask", - version="0.2.5", + version="0.2.6", author="onetask", author_email="info@onetask.ai", description="Official Python SDK for the onetask API", From 2b28f48be33a50067eda743aa129f5153f8642d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20H=C3=B6tter?= Date: Sat, 7 May 2022 14:33:47 +0200 Subject: [PATCH 35/35] update to kern --- .gitignore | 2 + README.md | 163 +++++--------------- kern/__init__.py | 48 ++++++ kern/api_calls.py | 53 +++++++ kern/authentication.py | 27 ++++ {onetask => kern}/exceptions.py | 35 +++-- kern/settings.py | 29 ++++ onetask/__init__.py | 263 -------------------------------- onetask/api_calls.py | 136 ----------------- onetask/auto_lf.py | 152 ------------------ onetask/embedding.py | 113 -------------- onetask/settings.py | 45 ------ onetask/util.py | 35 ----- requirements.txt | 100 ++---------- setup.py | 114 +++----------- 15 files changed, 246 insertions(+), 1069 deletions(-) create mode 100644 kern/__init__.py create mode 100644 kern/api_calls.py create mode 100644 kern/authentication.py rename {onetask => kern}/exceptions.py (59%) create mode 100644 kern/settings.py delete mode 100644 onetask/__init__.py delete mode 100644 onetask/api_calls.py delete mode 100644 onetask/auto_lf.py delete mode 100644 onetask/embedding.py delete mode 100644 onetask/settings.py delete mode 100644 onetask/util.py diff --git a/.gitignore b/.gitignore index ddb1114..2c995a6 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ +.vscode/ + # Jupyter *.ipynb diff --git a/README.md b/README.md index e4af00c..7351df0 100644 --- a/README.md +++ b/README.md @@ -1,153 +1,66 @@ -[![Python 3.8](https://img.shields.io/badge/python-3.8-blue.svg)](https://www.python.org/downloads/release/python-380/) +![kern-python](https://uploads-ssl.webflow.com/61e47fafb12bd56b40022a49/62766400bd3c57b579d289bf_kern-python%20Banner.png) +[![Python 
3.9](https://img.shields.io/badge/python-3.9-blue.svg)](https://www.python.org/downloads/release/python-390/) -# onetask API for Python +# Kern AI API for Python -This is the official Python SDK for onetask, your IDE for programmatic data labeling. +This is the official Python SDK for Kern AI, your IDE for programmatic data enrichment and management. ## Installation -You can use pip to install the library: - -`$ pip install onetask` - -Alternatively, you can clone the repository and run the setup.py script: - -`$ python setup.py install` +You can set up this library via either running `$ pip install kern-python-client`, or via cloning this repository and running `$ pip install -r requirements.txt` in your repository. ## Usage - -The SDK currently offers the following functions: -- registering local Python functions as labeling functions in our system (you can of course also develop such functions within our web system) -- _experimental_: generating embeddings for your attributes, e.g. free texts or structured data containing categories and numbers -- _experimental_: autogenerating labeling functions from manually labeled records in your project -- _experimental_: topic modeling using BERT embeddings that you have registered in your project - -All of this is also documented in the [onetask Documentation](https://onetask.readme.io/reference/getting-started), with additional screenshots to guide you through the process. - - -### Instantiating a Client object - -You begin by creating a `Client` object. The `Client` will generate and store a session token for you based on your user name, password, and project id. The project id can be found in the URL, e.g. https://app.beta.onetask.ai/app/projects/**03f7d82c-f14c-4f0f-a1ff-59533bab30cc**/overview. 
Simply copy and paste this into the following pattern:
+Once you have installed the package, you can access the application from any Python terminal as follows:
 
 ```python
-from onetask import Client
+from kern import Client
 
 username = "your-username"
 password = "your-password"
-project_id = "your-project-id"
-stage = "beta" # if you have onetask on local, you can also set stage to "local"
-client = Client(username, password, project_id, stage)
-```
-
-Once you correctly instantiated your Client, you can start using it for the various functions provided in the SDK.
+project_id = "your-project-id" # can be found in the URL of the web application
 
-
-### Registering local Python labeling functions
-
-You can register functions e.g. from your local Jupyter Notebook using our SDK. When doing so, please always ensure that your labeling functions:
-- return label names that also exist in your project definition
-- have exactly one parameter; we execute labeling functions on a record-basis
-- If you need an import statement in your labeling functions, please check if it is given in the [whitelisted libraries](https://onetask.readme.io/reference/whitelisted-libraries). If you need a library that we have not yet whitelisted, feel free to reach out to us.
-
-An example to register your custom labeling function is as follows:
+client = Client(username, password, project_id)
+# if you run the application locally, please use the following instead:
+# client = Client(username, password, project_id, uri="http://localhost:4455")
 ```
 
-You can then enter them using the client:
-
+Now, you can easily fetch the data from your project:
 ```python
-client.register_lf(my_labeling_function)
+df = client.fetch_export()
 ```
 
-The labeling function is then automatically executed once registered, where you can always change and re-run it.
+The `df` contains data of the following scheme:
+- all your record attributes are stored as columns, e.g. `headline` or `running_id` if you uploaded records like `{"headline": "some text", "running_id": 1234}`
+- per labeling task, three columns:
+  - `____MANUAL`: those are the manually set labels of your records
+  - `____WEAK SUPERVISION`: those are the weakly supervised labels of your records
+  - `____WEAK SUPERVISION_confidence`: those are the probabilities of your weakly supervised labels
 
-### Generating embeddings (experimental)
+With the `client`, you easily integrate your data into any kind of system; be it a custom implementation, an AutoML system or a plain data analytics framework 🚀
 
-One of the main features of onetask is to apply both Weak Supervision and Active Learning jointly. To build the best possible Active Learning Weak Sources, you can generate embeddings for your attributes using the SDK. To do so, you have to first upload your data in our web application and select a unique attribute (see our [documentation](https://onetask.readme.io/reference/create-your-project) for further reference on how to set this up).
+## Roadmap
+- [ ] Register information sources via wrappers
+- [ ] Fetch project statistics
 
-Once this is done, you can easily generate embedding files. 
- """ - keywords = ["asap", "as soon as possible", "urgent"] - - message_lower = record["message"].lower() - for keyword in keywords: - if keyword in message_lower: - return "Urgent" +client = Client(username, password, project_id) +# if you run the application locally, please the following instead: +# client = Client(username, password, project_id, uri="http://localhost:4455") ``` -You can then enter them using the client: - +Now, you can easily fetch the data from your project: ```python -client.register_lf(my_labeling_function) +df = client.fetch_export() ``` -The labeling function is then automatically executed once registered, where you can always change and re-run it. +The `df` contains data of the following scheme: +- all your record attributes are stored as columns, e.g. `headline` or `running_id` if you uploaded records like `{"headline": "some text", "running_id": 1234}` +- per labeling task three columns: + - `____MANUAL`: those are the manually set labels of your records + - `____WEAK SUPERVISION`: those are the weakly supervised labels of your records + - `____WEAK SUPERVISION_confidence`: those are the probabilities or your weakly supervised labels -### Generating embeddings (experimental) +With the `client`, you easily integrate your data into any kind of system; may it be a custom implementation, an AutoML system or a plain data analytics framework 🚀 -One of the main features of onetask is to apply both Weak Supervision and Active Learning jointly. To build the best possible Active Learning Weak Sources, you can generate embeddings for your attributes using the SDK. To do so, you have to first upload your data in our web application and select a unique attribute (see our [documentation](https://onetask.readme.io/reference/create-your-project) for further reference on how to set this up). +## Roadmap +- [ ] Register information sources via wrappers +- [ ] Fetch project statistics -Once this is done, you can easily generate embedding files. 
Imagine you have the following attributes in your records: -- `headline`: an english text describing e.g. the news of a paper (e.g. _"5 footballers that should have won the ballon d'or"_, ...) -- `running_id`: a unique identifier for each headline, i.e. a simple number (e.g. 1, 2, 3, ...) -You can then call the client object to generate an embedding file using a dictionary of attribute/configuration string pairs: -```python -client.generate_embeddings({"headline": "distilbert-base-uncased"}) -``` - -This will generate an embedding JSON-file as follows: - -```json -[ - { - "running_id": 1, - "distilbert-base-uncased": [0.123456789, "..."] - }, - { - "running_id": 2, - "distilbert-base-uncased": [0.234567891, "..."] - }, -] -``` +If you want to have something added, feel free to open an [issue](https://github.com/code-kern-ai/kern-python/issues). -You can upload this file to your project in the overview tab of your project. +## Contributing +Contributions are what make the open source community such an amazing place to learn, inspire, and create. Any contributions you make are **greatly appreciated**. -The following configuration strings are available to configure how your attributes are embedded: -| Configuration String | Data Type | Explanation | -|----------------------|-------------------------------|----------------------------------------------------------------------------------------------| -| identity | integer, float | No transformation | -| onehot | category (low-entropy string) | one-hot encodes attribute | -| bow | string | Bag of Words transformation | -| boc | string | Bag of Characters transformation | -| _huggingface_ | string | Huggingface-based transformation. You can use any available [huggingface](https://huggingface.co/) configuration string | +If you have a suggestion that would make this better, please fork the repo and create a pull request. You can also simply open an issue with the tag "enhancement". 
+Don't forget to give the project a star! Thanks again! -If you want to embed multiple attributes (which makes sense e.g. when you have structured data), you can provide multiple key/value pairs in your input dictionary. The resulting embeddings will be concatenated into one vector. - - -### Autogenerating labeling functions (experimental) - -As you manually label data, onetask can help you to analyze both the explicitic and implicit data patterns. Our first approach for explicit pattern detection is to find regular expressions in free text attributes you provide. They are being mined using linguistic analysis, therefore you need to provide a spacy nlp object for the respective language of your free text. - -If you have an english free text, you can implement the mining as follows: -```python -import spacy -# you need to also download the en_core_web_sm file -# using $ python -m spacy download en_core_web_sm - -nlp = spacy.load("en_core_web_sm") -lf_df = client.generate_regex_labeling_functions(nlp, "headline") -``` - -This creates a DataFrame containing mined regular expressions. You can display them in a convenient way: - -```python -client.display_generated_labeling_functions(lf_df) -``` - -**Caution**: The quality and quantity of mined regular expressions heavily depends on how much data you have labeled and how diverse your dataset is. We have tested the feature on various datasets and found it to be very helpful. If you have problems autogenerating labeling functions in your project, contact us. - - -### Topic Modeling using BERT embeddings (experimental) - -As onetask lets you put insights of explorative analysis into programmatic data labeling, we also provide topic modeling. We use the [BERTopic](https://github.com/MaartenGr/BERTopic) library for topic modeling, and provide an easy access to your projects data and embeddings. 
Once you uploaded BERT embeddings to your project (such that can be created using a huggingface configuration string), you can create a topic model: - -```python -topic_model = client.model_topics("headline", "distilbert-base-uncased") -``` +1. Fork the Project +2. Create your Feature Branch (`git checkout -b feature/AmazingFeature`) +3. Commit your Changes (`git commit -m 'Add some AmazingFeature'`) +4. Push to the Branch (`git push origin feature/AmazingFeature`) +5. Open a Pull Request -The `topic_model` provides various methods to explore the different keywords and topics. You can also find further documentation [here](https://maartengr.github.io/BERTopic/api/bertopic.html). +And please don't forget to leave a ⭐ if you like the work! -## Outlook and Feature Requests -In the near future, we'll extend the Python SDK to include programmatic imports and exports, data access, and many more. If you have any requests, feel free to [contact us](https://www.onetask.ai/contact-us). +## License +Distributed under the MIT License. See LICENSE.txt for more information. -## Support -If you need help, feel free to join our Slack Community channel. It is currently only available via invitation. +## Contact +This library is developed and maintained by [kern.ai](https://github.com/code-kern-ai). If you want to provide us with feedback or have some questions, don't hesitate to contact us. We're super happy to help ✌️ diff --git a/kern/__init__.py b/kern/__init__.py new file mode 100644 index 0000000..dbd1e5b --- /dev/null +++ b/kern/__init__.py @@ -0,0 +1,48 @@ +# -*- coding: utf-8 -*- + +from wasabi import msg +import pandas as pd +from kern import authentication, api_calls, settings, exceptions +from typing import Optional + + +class Client: + """Client object which can be used to directly address the Kern AI API. + + Args: + user_name (str): Your username for the application. + password (str): The respective password. Do not share this! 
+ project_id (str): The link to your project. This can be found in the URL in an active project. + uri (str, optional): Link to the host of the application. Defaults to "https://app.kern.ai". + + Raises: + exceptions.get_api_exception_class: If your credentials are incorrect, an exception is raised. + """ + + def __init__( + self, user_name: str, password: str, project_id: str, uri="https://app.kern.ai" + ): + settings.set_base_uri(uri) + self.session_token = authentication.create_session_token( + user_name=user_name, password=password + ) + if self.session_token is not None: + msg.good("Logged in to system.") + else: + msg.fail(f"Could not log in at {uri}. Please check username and password.") + raise exceptions.get_api_exception_class(401) + self.project_id = project_id + + def fetch_export(self, num_samples: Optional[int] = None) -> pd.DataFrame: + """Collects the export data of your project (i.e. the same data if you would export in the web app). + + Args: + num_samples (Optional[int], optional): If set, only the first `num_samples` records are collected. Defaults to None. + + Returns: + pd.DataFrame: DataFrame containing your record data. 
For more details, see https://docs.kern.ai + """ + url = settings.get_export_url(self.project_id, num_samples=num_samples) + api_response = api_calls.get_request(url, self.session_token) + df = pd.read_json(api_response) + return df diff --git a/kern/api_calls.py b/kern/api_calls.py new file mode 100644 index 0000000..047a8e2 --- /dev/null +++ b/kern/api_calls.py @@ -0,0 +1,53 @@ +# -*- coding: utf-8 -*- +from json.decoder import JSONDecodeError +import pkg_resources +from kern import exceptions +import requests +from typing import Any, Dict + +try: + version = pkg_resources.get_distribution("kern-python-client").version +except pkg_resources.DistributionNotFound: + version = "noversion" + + +def post_request(url: str, body: Dict[str, Any], session_token: str) -> str: + headers = _build_headers(session_token) + response = requests.post(url=url, json=body, headers=headers) + return _handle_response(response) + + +def get_request(url: str, session_token: str) -> str: + headers = _build_headers(session_token) + response = requests.get(url=url, headers=headers) + return _handle_response(response) + + +def _build_headers(session_token: str) -> Dict[str, str]: + return { + "Content-Type": "application/json", + "User-Agent": f"python-sdk-{version}", + "Authorization": f"Bearer {session_token}", + } + + +def _handle_response(response: requests.Response) -> str: + status_code = response.status_code + if status_code == 200: + json_data = response.json() + return json_data + else: + try: + json_data = response.json() + error_code = json_data.get("error_code") + error_message = json_data.get("error_message") + except JSONDecodeError: + error_code = 500 + error_message = "The server was unable to process the provided data." 
+ + exception = exceptions.get_api_exception_class( + status_code=status_code, + error_code=error_code, + error_message=error_message, + ) + raise exception diff --git a/kern/authentication.py b/kern/authentication.py new file mode 100644 index 0000000..5424aba --- /dev/null +++ b/kern/authentication.py @@ -0,0 +1,27 @@ +# -*- coding: utf-8 -*- +from kern import settings +import requests + + +def create_session_token(user_name: str, password: str) -> str: + headers = {"Accept": "application/json"} + action_url = ( + requests.get(settings.get_authentication_url(), headers=headers) + .json() + .get("ui") + .get("action") + ) + session_token = ( + requests.post( + action_url, + headers=headers, + json={ + "method": "password", + "password": password, + "password_identifier": user_name, + }, + ) + .json() + .get("session_token") + ) + return session_token diff --git a/onetask/exceptions.py b/kern/exceptions.py similarity index 59% rename from onetask/exceptions.py rename to kern/exceptions.py index d7c9126..d7f8db4 100644 --- a/onetask/exceptions.py +++ b/kern/exceptions.py @@ -1,45 +1,44 @@ # -*- coding: utf-8 -*- from typing import Optional - -class ClientError(Exception): +# https://developer.mozilla.org/en-US/docs/Web/HTTP/Status#client_error_responses +class SDKError(Exception): def __init__(self, message: Optional[str] = None): if message is None: - message = "Please check the documentation." + message = ( + "Please check the SDK documentation at https://docs.kern.ai/reference." + ) super().__init__(message) -class ParameterError(ClientError): +# 401 Unauthorized +class UnauthorizedError(SDKError): pass -# https://developer.mozilla.org/en-US/docs/Web/HTTP/Status#client_error_responses -class APIError(Exception): - def __init__(self, message: Optional[str] = None): - if message is None: - message = "Please check the API reference." 
- super().__init__(message) - - -# 401 Unauthorized -class UnauthorizedError(APIError): +# 404 Not Found +class NotFoundError(SDKError): pass # 500 Server Error -class InternalServerError(APIError): +class InternalServerError(SDKError): pass -RESPONSE_CODES_API_EXCEPTION_MAP = {401: UnauthorizedError, 500: InternalServerError} +RESPONSE_CODES_API_EXCEPTION_MAP = { + 401: UnauthorizedError, + 404: NotFoundError, + 500: InternalServerError, +} def get_api_exception_class( status_code: int, error_code: Optional[str] = None, error_message: Optional[str] = None, -) -> APIError: - exception_or_dict = RESPONSE_CODES_API_EXCEPTION_MAP.get(status_code, APIError) +) -> SDKError: + exception_or_dict = RESPONSE_CODES_API_EXCEPTION_MAP.get(status_code, SDKError) if isinstance(exception_or_dict, dict): exception_class = exception_or_dict.get(error_code, exception_or_dict["*"]) else: diff --git a/kern/settings.py b/kern/settings.py new file mode 100644 index 0000000..11688e8 --- /dev/null +++ b/kern/settings.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +BASE_URI: str + + +def set_base_uri(uri: str): + global BASE_URI + BASE_URI = uri + + +def add_query_params(url: str, **kwargs) -> str: + set_question_mark = False + for key, value in kwargs.items(): + if value is not None: + if not set_question_mark: + url = f"{url}?{key}={value}" + set_question_mark = True + else: + url = f"{url}&{key}={value}" + return url + + +def get_authentication_url() -> str: + return f"{BASE_URI}/.ory/kratos/public/self-service/login/api" + + +def get_export_url(project_id: str, **kwargs) -> str: + url = f"{BASE_URI}/api/project/{project_id}/export" + url = add_query_params(url, **kwargs) + return url diff --git a/onetask/__init__.py b/onetask/__init__.py deleted file mode 100644 index 24488a6..0000000 --- a/onetask/__init__.py +++ /dev/null @@ -1,263 +0,0 @@ -# -*- coding: utf-8 -*- - -from typing import Callable, Dict, List, Optional, Union -from wasabi import msg -import pandas as pd -import numpy as 
np -from tqdm import tqdm -import json -from onetask import api_calls, settings, util, auto_lf, embedding -import numpy as np -from bertopic import BERTopic -from collections import defaultdict - - -class Client: - """ - Python Client for the onetask API. If you have any questions, please contact our support. - - Args: - user_name (str): The email with which you've been registered at onetask - password (str): Your password for onetask - project_id (str): The unique identifier for a project, can be found in the url after projects/ - stage (str): The onetask system staging environment [beta, test, dev, local] - """ - - def __init__( - self, user_name: str, password: str, project_id: str, stage: str = "beta" - ): - settings.set_stage(stage) - self.session_token = api_calls.create_session_token( - user_name=user_name, password=password - ) - self.project_id = project_id - if self.session_token is not None: - msg.good("Logged in to system.") - if not api_calls.GetProjectExists(project_id, self.session_token).exists: - msg.fail(f"Project with ID {self.project_id} does not exist.") - else: - msg.fail("Could not log in. Please check your username and password.") - - def _get_unique_attributes(self) -> List[Dict[str, Union[str, bool]]]: - """ - Get the record schema for your project shown in the web app under 'Settings' - - Returns: - List[Dict[str, Union[str, bool]]]: each record schema element - """ - attributes = api_calls.GetUniqueAttributes( - self.project_id, self.session_token - ).attributes - return attributes - - def register_lf(self, lf: Callable, autoexecute: bool = True) -> None: - """ - Send a local labeling function to the onetask application. Please make sure that the function fits the desired structure (for more information, please visit onetask.readme.io/reference) - - Args: - lf (Callable): The function object you want to send to the system - autoexecute (bool, optional): If true, the function is automatically executed when entered in the system. 
Defaults to True. - """ - project_id, name, source_code, docs = util.unpack_python_function( - lf, self.project_id - ) - if api_calls.PostLabelingFunction( - project_id, name, source_code, docs, autoexecute, self.session_token - ).already_exists: - msg.warn( - f"Labeling function '{name}' already exists. It has not been entered again!" - ) - else: - msg.good(f"Registered labeling function '{name}'.") - - def get_records(self, keep_unlabeled=False, keep_programmatic=False): - """ - Get the records of your project. - - - Args: - keep_unlabeled (bool, optional): If true, you will receive all records, even if they are not labeled yet. Defaults to False. - keep_programmatic (bool, optional): if true, you will receive also the programmatically labeled records. Defaults to False. - - Returns: - [type]: [description] - """ - records = api_calls.GetRecords( - self.project_id, - self.session_token, - keep_unlabeled=keep_unlabeled, - keep_programmatic=keep_programmatic, - ).records - - fetched_df = pd.DataFrame(records) - if len(fetched_df) > 0: - df = fetched_df["data"].apply(pd.Series) - df["label"] = fetched_df["label"] - df["is_programmatic"] = fetched_df["is_programmatic"] - return df - else: - msg.warn("Empty result") - return fetched_df # empty df - - def get_embeddings(self, config_string: str) -> pd.DataFrame: - """ - Get the embeddings of your project of a configuration string - - Args: - config_string (str): The name of your embedding - - Returns: - pd.DataFrame: containing the record attributes and the embedding vectors - """ - embeddings = api_calls.GetEmbeddings( - self.project_id, self.session_token, config_string - ).embeddings - - fetched_embeddings = pd.DataFrame(embeddings) - if len(fetched_embeddings) > 0: - df = fetched_embeddings["data"].apply(pd.Series) - df[config_string] = fetched_embeddings["embedding"] - return df - else: - msg.warn("Empty result") - return fetched_embeddings - - def generate_embeddings( - self, attribute_configs_dict: Dict[str, 
str], file_path: Optional[str] = None - ) -> None: - """ - ---EXPERIMENTAL--- - - Create new embeddings to upload into your project. - - Args: - attribute_configs_dict (Dict[str, str]): describe which attribute should be embedded using which technique or model. - file_path (Optional[str], optional): path where the embeddings should be stored to. Defaults to 'embeddings_{project_id}.json'. - """ - if not file_path: - file_path = f"embeddings_{self.project_id}.json" - - msg.info("Loading schema") - attributes = self._get_unique_attributes() - unique_attribute = None - for attr in attributes: - if attr["unique"]: - unique_attribute = attr["name"] - - embedding_name = "-".join(list(attribute_configs_dict.values())) - - if unique_attribute: - msg.info("Loading records") - records = self.get_records(keep_programmatic=False, keep_unlabeled=True) - embedding_concat = defaultdict(list) - for attribute, config_string in attribute_configs_dict.items(): - vals = np.stack(records[attribute]) - records_subset = records[[unique_attribute, attribute]].to_dict( - orient="records" - ) - msg.info(f"Loading embedding model {config_string} for {attribute}") - model = embedding.get_fitted_model_by_config_string(config_string, vals) - if model: - msg.info("Starting embedding procedure") - for _, row in tqdm( - enumerate(records_subset), - total=len(records_subset), - desc="Embedding records...", - ): - embedding_concat[row[unique_attribute]].extend( - model.encode(row[attribute]).tolist() - ) - msg.good(f"Finished embedding procedure. Storing to {file_path}") - export = [] - for unique_val, embedding_vector in embedding_concat.items(): - export.append( - {unique_attribute: unique_val, embedding_name: embedding_vector} - ) - with open(file_path, "w") as file: - json.dump(export, file) - else: - msg.fail( - "Currently, you must have exactly one unique attribute for embedding generation. 
Please validate this in the web app under 'Settings'" - ) - - def model_topics(self, attribute: str, config_string: str) -> BERTopic: - """ - ---EXPERIMENTAL--- - - Apply a BERTopic to your data to do topic modelling. Further docs: https://maartengr.github.io/BERTopic/tutorial/visualization/visualization.html - - Args: - attribute (str): the name of the string attribute you want to model - config_string (str): name of the embedding vector in the web application that you want to make use of. This MUST be a BERT-related embedding to work properly. - - Returns: - BERTopic: BERTopic object that can be called for topic modelling - """ - msg.info("Loading embeddings") - embeddings_df = self.get_embeddings(config_string) - if len(embeddings_df) > 0: - docs = embeddings_df[attribute].tolist() - embeddings = np.stack(embeddings_df[config_string]) - - msg.info("Fitting Topic Model") - model = BERTopic(verbose=True, n_gram_range=[1, 2], top_n_words=30) - model.fit(docs, embeddings) - msg.good("Finished training") - msg.info( - "Further docs: https://maartengr.github.io/BERTopic/tutorial/visualization/visualization.html" - ) - return model - - def generate_regex_labeling_functions( - self, - nlp, - attribute: str, - min_precision: Optional[float] = 0.8, - filter_stopwords: Optional[bool] = False, - ) -> pd.DataFrame: - """ - ---EXPERIMENTAL--- - - Autogenerate labeling functions containing regular expressions to model your data. Uses spacy to model the linguistics of your data. - - Args: - nlp (spacy.lang): nlp object of spacy for the specific language (e.g. en_core_web_sm) - attribute (str): the name of the attribute that should be analyzed for regular expressions - min_precision (Optional[float], optional): needed precision to generate a labeling function. Defaults to 0.8. - filter_stopwords (Optional[bool], optional): if set to true, stop words like 'this', 'that' etc. will be removed. Defaults to False. 
- - Returns: - pd.DataFrame: [description] - """ - records = self.get_records() - if len(records) > 0: - candidates = auto_lf.derive_regex_candidates( - nlp, records, attribute, filter_stopwords - ) - return auto_lf.create_regex_fns( - records, candidates, attribute, min_precision - ) - else: - msg.fail("No manually labeled records available!") - - def display_generated_labeling_functions( - self, lf_df: pd.DataFrame, label: Optional[str] = None - ): - """ - Helper function to display the autogenerated labeling functions - - Args: - lf_df (pd.DataFrame): outcome of client.generate_regex_labeling_functions - label (Optional[str], optional): filter option to only show one label. Defaults to None. - """ - if label is not None: - lf_df = lf_df.loc[lf_df["label"] == label] - for _, row in lf_df.iterrows(): - est_coverage = row["est_coverage"] - est_precision = row["est_precision"] - code = row["code"] - msg.info( - f"Coverage: {est_coverage * 100}% | Precision: {est_precision * 100}%" - ) - print(code) - print() diff --git a/onetask/api_calls.py b/onetask/api_calls.py deleted file mode 100644 index 33afe49..0000000 --- a/onetask/api_calls.py +++ /dev/null @@ -1,136 +0,0 @@ -# -*- coding: utf-8 -*- -from json.decoder import JSONDecodeError -import pkg_resources -from onetask import exceptions, settings -import requests - -try: - version = pkg_resources.get_distribution("onetask").version -except pkg_resources.DistributionNotFound: - version = "noversion" - - -# no call to the onetask system, therefore include it here -def create_session_token(user_name: str, password: str): - headers = {"Accept": "application/json"} - action_url = ( - requests.get(settings.get_authentication_url(), headers=headers) - .json() - .get("ui") - .get("action") - ) - session_token = ( - requests.post( - action_url, - headers=headers, - json={ - "method": "password", - "password": password, - "password_identifier": user_name, - }, - ) - .json() - .get("session_token") - ) - - return 
session_token - - -def build_headers(session_token): - return { - "Content-Type": "application/json", - "User-Agent": f"python-sdk-{version}", - "Authorization": f"Bearer {session_token}", - } - - -def handle_response(response): - status_code = response.status_code - if status_code == 200: - json_data = response.json() - return json_data - else: - try: - json_data = response.json() - error_code = json_data.get("error_code") - error_message = json_data.get("error_message") - except JSONDecodeError: - error_code = 500 - error_message = "The server was unable to process the provided data." - - exception = exceptions.get_api_exception_class( - status_code=status_code, - error_code=error_code, - error_message=error_message, - ) - raise exception - - -class PostRequest: - def __init__(self, url, body, session_token): - self.url = url - self.body = body - self.session_token = session_token - - def execute(self): - response = requests.post( - url=self.url, json=self.body, headers=build_headers(self.session_token) - ) - return handle_response(response) - - -class GetRequest: - def __init__(self, url, session_token): - self.url = url - self.session_token = session_token - - def execute(self): - response = requests.get(url=self.url, headers=build_headers(self.session_token)) - return handle_response(response) - - -class PostLabelingFunction(PostRequest): - def __init__( - self, project_id, name, function, description, autoexecute, session_token - ): - - body = { - "project_id": project_id, - "name": name, - "function": function, - "description": description, - "autoexecute": autoexecute, - } - - super().__init__(settings.get_post_lf_url(), body, session_token) - self.already_exists = self.execute()["already_exists"] - - -class GetProjectExists(GetRequest): - def __init__(self, project_id, session_token): - super().__init__(settings.get_project_url(project_id), session_token) - self.exists = self.execute() - - -class GetUniqueAttributes(GetRequest): - def __init__(self, 
project_id, session_token): - super().__init__(settings.get_schema_url(project_id), session_token) - self.attributes = self.execute() - - -class GetRecords(GetRequest): - def __init__(self, project_id, session_token, keep_unlabeled, keep_programmatic): - super().__init__( - settings.get_data_url(project_id, keep_unlabeled, keep_programmatic), - session_token, - ) - self.records = self.execute() - - -class GetEmbeddings(GetRequest): - def __init__(self, project_id, session_token, config_string): - super().__init__( - settings.get_embeddings_url(project_id, config_string), - session_token, - ) - self.embeddings = self.execute() diff --git a/onetask/auto_lf.py b/onetask/auto_lf.py deleted file mode 100644 index 5d733cf..0000000 --- a/onetask/auto_lf.py +++ /dev/null @@ -1,152 +0,0 @@ -# -*- coding: utf-8 -*- -from collections import defaultdict -from tqdm import tqdm -from collections import Counter -import re -from collections import defaultdict -import numpy as np -import pandas as pd -from wasabi import msg - - -def derive_regex_candidates(nlp, df, attribute, filter_stopwords): - if len(df) < 100: - msg.warn( - "Only very few records to analyze; it's best to continue labeling further records before analysis" - ) - - def normalize_token(token): - if "d" in token.shape_ and not "x" in token.shape_: - return token.shape_.replace("d", "[0-9]") - else: - return token.text - - def is_relevant_token(token): - conditions = [token.is_punct, token.is_bracket, len(token.text) == 1] - if filter_stopwords: - conditions.append(token.is_stop) - return not any(conditions) - - candidates = [] - for text in tqdm(df[attribute], total=len(df)): - doc = nlp(text.lower()) - for token in doc: - if is_relevant_token(token): - has_children = False - for token_left in token.lefts: - if is_relevant_token(token_left): - prefix = "^" if token_left.idx == 0 else " " - suffix = "$" if token.idx == len(doc) - 1 else " " - candidate = 
f"{prefix}{normalize_token(token_left)}.*?{normalize_token(token)}{suffix}" - candidates.append(candidate) - has_children = True - for token_right in token.rights: - if is_relevant_token(token_right): - prefix = "^" if token.idx == 0 else " " - suffix = "$" if token_right.idx == len(doc) - 1 else " " - candidate = f"{prefix}{normalize_token(token)}.*?{normalize_token(token_right)}{suffix}" - candidates.append(candidate) - has_children = True - if not has_children: - prefix = "^" if token.idx == 0 else " " - suffix = "$" if token.idx == len(doc) - 1 else " " - candidate = f"{prefix}{normalize_token(token)}{suffix}" - candidates.append(candidate) - return [regex for regex, _ in Counter(candidates).most_common(100)] - - -def create_regex_fns(df, candidates, regex_col, min_precision, label_col="label"): - n = len(df) - - def calc_min_cov(x): - return 0.3 / (x ** 0.5) - - min_coverage = calc_min_cov(n) - - def regex_explainer(regex, attribute): - description = "" - terms = regex.replace("^", "").replace("$", "").split(".*?") - if "^" in regex: - description += f"attribute '{attribute}' starts with term '{terms[0]}'" - if len(terms) > 1: - for term in terms[1:]: - description += f" (in-)directly followed by term '{term}'" - if "$" in regex: - description += " and then ends" - elif "$" in regex: - description += ( - f"attribute '{attribute}' somewhere contains term '{terms[0]}'" - ) - if len(terms) > 1: - for term in terms[1:]: - description += f" (in-)directly followed by term '{term}'" - description += " and then ends" - else: - description += ( - f"attribute '{attribute}' somewhere contains term '{terms[0]}'" - ) - if len(terms) > 1: - for term in terms[1:]: - description += f" (in-)directly followed by term '{term}'" - if "[0-9]" in regex: - description += ", where [0-9] is an arbitrary number" - description += "." 
- return description - - def build_regex_lf(regex, attribute, prediction, iteration, escape_regex): - - if escape_regex: - _regex = f"re.escape('{regex}')" - else: - _regex = f"r'{regex}'" - source_code = f""" -def regex_{iteration}(record): - '''{regex_explainer(regex, attribute)}''' - import re - if re.search({_regex}, record['{attribute}'].lower()): - return '{prediction}' - -client.register_lf(regex_{iteration}) - """ - - return source_code.strip() - - regex_nr = 1 - rows = [] - for regex in candidates: - labels = defaultdict(int) - escape_regex = False - for text, label in zip(df[regex_col], df[label_col]): - try: - if re.search(regex, text.lower()): - labels[label] += 1 - except: # there is sadly no better way (I know of) to handle this other than using a plain except - escape_regex = True - if re.search(re.escape(regex), text.lower()): - labels[label] += 1 - coverage = sum(labels.values()) - if coverage > 0: - regex_prediction, max_count = None, 0 - for prediction, count in labels.items(): - if count > max_count: - max_count = count - regex_prediction = prediction - precision = np.round(labels[regex_prediction] / coverage, 2) - coverage = np.round(coverage / len(df), 2) - if precision >= min_precision and coverage >= min_coverage: - lf = build_regex_lf( - regex, regex_col, regex_prediction, regex_nr, escape_regex - ) - regex_nr += 1 - rows.append( - { - "est_coverage": coverage, - "est_precision": precision, - "label": regex_prediction, - "code": lf, - } - ) - lf_df = pd.DataFrame(rows) - lf_df["priority"] = (lf_df["est_coverage"] ** 2) * lf_df["est_precision"] - lf_df = lf_df.sort_values(by="priority", ascending=False) - return lf_df diff --git a/onetask/embedding.py b/onetask/embedding.py deleted file mode 100644 index 797cc58..0000000 --- a/onetask/embedding.py +++ /dev/null @@ -1,113 +0,0 @@ -# -*- coding: utf-8 -*- -from abc import ABC, abstractmethod -from sentence_transformers import SentenceTransformer, models -from torch import nn -import numpy as 
np -from wasabi import msg -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.preprocessing import OneHotEncoder -from transformers import logging - -logging.set_verbosity_error() - - -def get_fitted_model_by_config_string(config_string, records): - if config_string == "identity": - return IdentityEmbedder(records) - elif config_string == "boc": - return BoCEmbedder(records) - elif config_string == "bow": - return BoWEmbedder(records) - elif config_string == "onehot": - return OneHotEmbedder(records) - else: - try: - return DocumentEmbedder(records, config_string) - except: - msg.fail( - f"Embedding '{config_string}' is unknown. Please check https://onetask.readme.io/ for more information" - ) - - -class Embedder(ABC): - def __init__(self, records): - self.fit(records) - - @abstractmethod - def encode(self, document): - pass - - @abstractmethod - def fit(self, records): - pass - - -class IdentityEmbedder(Embedder): - def __init__(self, records): - super().__init__(records) - - def fit(self, records): - pass - - def encode(self, document): - return np.array([document]) - - -class BoCEmbedder(Embedder): - def __init__(self, records): - self.model = CountVectorizer(analyzer="char") - super().__init__(records) - - def fit(self, records): - self.model.fit(records) - - def encode(self, document): - return self.model.transform([document]).toarray()[0] - - -class BoWEmbedder(Embedder): - def __init__(self, records): - self.model = CountVectorizer(min_df=0.1) - super().__init__(records) - - def fit(self, records): - self.model.fit(records) - - def encode(self, document): - return self.model.transform([document]).toarray()[0] - - -class OneHotEmbedder(Embedder): - def __init__(self, records): - self.model = OneHotEncoder() - super().__init__(records) - - def fit(self, records): - self.model.fit(records.reshape(-1, 1)) - - def encode(self, document): - return self.model.transform([[document]]).toarray()[0] - - -class DocumentEmbedder(Embedder): - def 
__init__(self, records, configuration_string: str = "distilbert-base-uncased"): - word_embedding_model = models.Transformer(configuration_string) - pooling_model = models.Pooling( - word_embedding_model.get_word_embedding_dimension() - ) - dense_model = models.Dense( - in_features=pooling_model.get_sentence_embedding_dimension(), - out_features=256, - activation_function=nn.Tanh(), - ) - - self.model = SentenceTransformer( - modules=[word_embedding_model, pooling_model, dense_model] - ) - super().__init__(records) - - def fit(self, records): - pass - - def encode(self, document: str): - return self.model.encode(document) diff --git a/onetask/settings.py b/onetask/settings.py deleted file mode 100644 index ab9dfda..0000000 --- a/onetask/settings.py +++ /dev/null @@ -1,45 +0,0 @@ -# -*- coding: utf-8 -*- -STAGE: str - - -def set_stage(stage): - global STAGE - STAGE = stage - - -def get_base_url(): - global STAGE - if STAGE == "beta": - return "https://app.beta.onetask.ai" - elif STAGE == "test": - return "https://app.test.onetask.ai" - elif STAGE == "dev": - return "https://app.dev.onetask.ai" - elif STAGE == "local": - return "http://localhost:4455" - else: - return STAGE - - -def get_authentication_url(): - return f"{get_base_url()}/.ory/kratos/public/self-service/login/api" - - -def get_project_url(project_id): - return f"{get_base_url()}/api/project/{project_id}" - - -def get_post_lf_url(): - return f"{get_base_url()}/labelfunction" - - -def get_schema_url(project_id): - return f"{get_project_url(project_id)}/schema" - - -def get_embeddings_url(project_id, config_string): - return f"{get_project_url(project_id)}/embeddings/{config_string}" - - -def get_data_url(project_id, keep_unlabeled, keep_programmatic): - return f"{get_project_url(project_id)}/data?keep_unlabeled={keep_unlabeled}&keep_programmatic={keep_programmatic}" diff --git a/onetask/util.py b/onetask/util.py deleted file mode 100644 index 68c733a..0000000 --- a/onetask/util.py +++ /dev/null @@ -1,35 
+0,0 @@ -# -*- coding: utf-8 -*- -import inspect -from typing import Callable -import re -from onetask import exceptions - - -def unpack_python_function(fn: Callable, project_id: str): - def check_signature(source_code: str) -> None: - # validate that only one parameter is given - - parameters = re.search(r"\((.*?)\):", source_code).group(1).split(",") - if parameters == [""]: - number_parameters = 0 - else: - number_parameters = len(parameters) - if number_parameters != 1: - raise exceptions.ParameterError( - f"{number_parameters} parameters provided. Please use exactly one." - ) - - name = fn.__name__ - replace_operations = { - f"def {name}(": "def lf(", - f' """{fn.__doc__}"""\n': "", - " ": "\t", - } - source_code = inspect.getsource(fn) - for key, value in replace_operations.items(): - source_code = source_code.replace(key, value) - docs = inspect.getdoc(fn) or "" # default - - check_signature(source_code) - - return project_id, name, source_code, docs diff --git a/requirements.txt b/requirements.txt index c791d3a..45f9731 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,93 +1,19 @@ -appnope==0.1.2 -argcomplete==1.12.3 -argon2-cffi==21.1.0 -attrs==21.2.0 -backcall==0.2.0 -bertopic==0.9.3 -bleach==4.1.0 +black==22.3.0 certifi==2021.10.8 -cffi==1.15.0 -charset-normalizer==2.0.7 -click==8.0.3 -Cython==0.29.24 -debugpy==1.5.1 -decorator==5.1.0 -defusedxml==0.7.1 -entrypoints==0.3 -filelock==3.4.0 -hdbscan==0.8.27 -huggingface-hub==0.1.2 +charset-normalizer==2.0.12 +click==8.1.3 idna==3.3 -importlib-metadata==4.8.2 -importlib-resources==5.4.0 -jedi==0.18.0 -Jinja2==3.0.3 -joblib==1.1.0 -jsonschema==4.2.1 -llvmlite==0.37.0 -MarkupSafe==2.0.1 -matplotlib-inline==0.1.3 -mistune==0.8.4 mypy-extensions==0.4.3 -nbclient==0.5.8 -nbconvert==6.3.0 -nbformat==5.1.3 -nest-asyncio==1.5.1 -nltk==3.6.5 -notebook==6.4.5 -numba==0.54.1 -numpy==1.20.3 -packaging==21.2 -pandas==1.3.4 -pandocfilters==1.5.0 -parso==0.8.2 +numpy==1.22.3 +pandas==1.4.2 pathspec==0.9.0 
-pexpect==4.8.0 -pickleshare==0.7.5 -Pillow==8.4.0 -platformdirs==2.4.0 -plotly==4.14.2 -prometheus-client==0.12.0 -prompt-toolkit==3.0.22 -ptyprocess==0.7.0 -pycparser==2.21 -Pygments==2.10.0 -pynndescent==0.5.5 -pyparsing==2.4.7 -pyrsistent==0.18.0 +platformdirs==2.5.2 python-dateutil==2.8.2 -pytz==2021.3 -PyYAML==5.4.1 -pyzmq==22.3.0 -qtconsole==5.2.0 -QtPy==1.11.2 -regex==2021.11.10 -requests==2.26.0 -retrying==1.3.3 -sacremoses==0.0.46 -scikit-learn==1.0.1 -scipy==1.7.2 -Send2Trash==1.8.0 -sentence-transformers==2.1.0 -sentencepiece==0.1.96 +pytz==2022.1 +requests==2.27.1 six==1.16.0 -terminado==0.12.1 -testpath==0.5.0 -threadpoolctl==3.0.0 -tokenizers==0.10.3 -tomli==1.2.2 -torch==1.10.0 -torchvision==0.11.1 -tornado==6.1 -tqdm==4.62.3 -traitlets==5.1.1 -transformers==4.12.4 -typed-ast==1.5.0 -typing-extensions==4.0.0 -umap-learn==0.5.2 -urllib3==1.26.7 -wasabi==0.8.2 -wcwidth==0.2.5 -webencodings==0.5.1 -widgetsnbextension==3.5.2 -zipp==3.6.0 +tinycss2==1.1.1 +tomli==2.0.1 +typing_extensions==4.2.0 +urllib3==1.26.9 +wasabi==0.9.1 diff --git a/setup.py b/setup.py index 575600f..3977945 100644 --- a/setup.py +++ b/setup.py @@ -9,15 +9,15 @@ long_description = file.read() setup( - name="onetask", - version="0.2.6", - author="onetask", - author_email="info@onetask.ai", - description="Official Python SDK for the onetask API", + name="kern-python-client", + version="0.0.1", + author="jhoetter", + author_email="johannes.hoetter@kern.ai", + description="Official Python SDK for the Kern AI API", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/onetask-ai/onetask-python", - keywords=["onetask", "machine learning", "supervised learning", "python"], + url="https://github.com/code-kern-ai/kern-python", + keywords=["kern", "machine learning", "supervised learning", "python"], classifiers=[ "Development Status :: 3 - Alpha", "Programming Language :: Python :: 3", @@ -26,98 +26,22 @@ package_dir={"": "."}, 
packages=find_packages("."), install_requires=[ - "appnope==0.1.2", - "argcomplete==1.12.3", - "argon2-cffi==21.1.0", - "attrs==21.2.0", - "backcall==0.2.0", - "bertopic==0.9.3", - "bleach==4.1.0", "certifi==2021.10.8", - "cffi==1.15.0", - "charset-normalizer==2.0.7", - "click==8.0.3", - "Cython==0.29.24", - "debugpy==1.5.1", - "decorator==5.1.0", - "defusedxml==0.7.1", - "entrypoints==0.3", - "filelock==3.4.0", - "hdbscan==0.8.27", - "huggingface-hub==0.1.2", + "charset-normalizer==2.0.12", + "click==8.1.3", "idna==3.3", - "importlib-metadata==4.8.2", - "importlib-resources==5.4.0", - "jedi==0.18.0", - "Jinja2==3.0.3", - "joblib==1.1.0", - "jsonschema==4.2.1", - "llvmlite==0.37.0", - "MarkupSafe==2.0.1", - "matplotlib-inline==0.1.3", - "mistune==0.8.4", - "mypy-extensions==0.4.3", - "nbclient==0.5.8", - "nbconvert==6.3.0", - "nbformat==5.1.3", - "nest-asyncio==1.5.1", - "nltk==3.6.5", - "notebook==6.4.5", - "numba==0.54.1", - "numpy==1.20.3", - "packaging==21.2", - "pandas==1.3.4", - "pandocfilters==1.5.0", - "parso==0.8.2", + "numpy==1.22.3", + "pandas==1.4.2", "pathspec==0.9.0", - "pexpect==4.8.0", - "pickleshare==0.7.5", - "Pillow==8.4.0", - "platformdirs==2.4.0", - "plotly==4.14.2", - "prometheus-client==0.12.0", - "prompt-toolkit==3.0.22", - "ptyprocess==0.7.0", - "pycparser==2.21", - "Pygments==2.10.0", - "pynndescent==0.5.5", - "pyparsing==2.4.7", - "pyrsistent==0.18.0", + "platformdirs==2.5.2", "python-dateutil==2.8.2", - "pytz==2021.3", - "PyYAML==5.4.1", - "pyzmq==22.3.0", - "qtconsole==5.2.0", - "QtPy==1.11.2", - "regex==2021.11.10", - "requests==2.26.0", - "retrying==1.3.3", - "sacremoses==0.0.46", - "scikit-learn==1.0.1", - "scipy==1.7.2", - "Send2Trash==1.8.0", - "sentence-transformers==2.1.0", - "sentencepiece==0.1.96", + "pytz==2022.1", + "requests==2.27.1", "six==1.16.0", - "terminado==0.12.1", - "testpath==0.5.0", - "threadpoolctl==3.0.0", - "tokenizers==0.10.3", - "tomli==1.2.2", - "torch==1.10.0", - "torchvision==0.11.1", - "tornado==6.1", - 
"tqdm==4.62.3", - "traitlets==5.1.1", - "transformers==4.12.4", - "typed-ast==1.5.0", - "typing-extensions==4.0.0", - "umap-learn==0.5.2", - "urllib3==1.26.7", - "wasabi==0.8.2", - "wcwidth==0.2.5", - "webencodings==0.5.1", - "widgetsnbextension==3.5.2", - "zipp==3.6.0", + "tinycss2==1.1.1", + "tomli==2.0.1", + "typing_extensions==4.2.0", + "urllib3==1.26.9", + "wasabi==0.9.1", ], )