From 981cb6ab55d8309c765bf42e2dda718988aa9225 Mon Sep 17 00:00:00 2001 From: FelixKirschKern Date: Mon, 2 Oct 2023 14:19:02 +0200 Subject: [PATCH 1/7] implements overwriting and default value of ws stats --- app.py | 6 +- controller/integration.py | 140 ++++++++++++++++++++++++++------------ submodules/model | 2 +- 3 files changed, 104 insertions(+), 44 deletions(-) diff --git a/app.py b/app.py index ef54340..feed7a9 100644 --- a/app.py +++ b/app.py @@ -1,5 +1,6 @@ from fastapi import FastAPI, HTTPException, responses, status from pydantic import BaseModel +from typing import Union, Dict, Optional from controller import stats from controller import integration @@ -14,6 +15,7 @@ class WeakSupervisionRequest(BaseModel): labeling_task_id: str user_id: str weak_supervision_task_id: str + overwrite_weak_supervision: Optional[Union[float, Dict[str, float]]] class TaskStatsRequest(BaseModel): @@ -31,6 +33,7 @@ class SourceStatsRequest(BaseModel): class ExportWsStatsRequest(BaseModel): project_id: str labeling_task_id: str + overwrite_weak_supervision: Optional[Union[float, Dict[str, float]]] @app.post("/fit_predict") @@ -43,6 +46,7 @@ def weakly_supervise( request.labeling_task_id, request.user_id, request.weak_supervision_task_id, + request.overwrite_weak_supervision, ) general.remove_and_refresh_session(session_token) return responses.PlainTextResponse(status_code=status.HTTP_200_OK) @@ -80,7 +84,7 @@ def calculate_source_stats( def export_ws_stats(request: ExportWsStatsRequest) -> responses.PlainTextResponse: session_token = general.get_ctx_token() status_code, message = integration.export_weak_supervision_stats( - request.project_id, request.labeling_task_id + request.project_id, request.labeling_task_id, request.overwrite_weak_supervision ) general.remove_and_refresh_session(session_token) diff --git a/controller/integration.py b/controller/integration.py index 9f6251d..d66c35f 100644 --- a/controller/integration.py +++ b/controller/integration.py @@ -1,5 +1,5 @@ import os -from typing import Any, Dict, List, Tuple +from typing import Any, Dict, List, Tuple, Optional, Union import traceback import pandas as pd import pickle @@ -18,19 +18,62 @@ labeling_task, record_label_association, weak_supervision, + labeling_task_label, + information_source, ) +NO_LABEL_WS_PRECISION = 0.8 + + +def __create_stats_lkp( + project_id: str, + labeling_task_id: str, + overwrite_weak_supervision: Union[float, Dict[str, float]], +) -> Dict[Any, Any]: + if isinstance(overwrite_weak_supervision, float): + ws_weights = {} + for heuristic_id in information_source.get_all_ids_by_labeling_task_id( + project_id, labeling_task_id + ): + ws_weights[str(heuristic_id)] = overwrite_weak_supervision + else: + ws_weights = overwrite_weak_supervision + + ws_stats = {} + for heuristic_id in ws_weights: + label_ids = labeling_task_label.get_all_ids(project_id, labeling_task_id) + for (label_id,) in label_ids: + ws_stats[(heuristic_id, str(label_id))] = { + "precision": ws_weights[heuristic_id] + } + return ws_stats + def fit_predict( - project_id: str, labeling_task_id: str, user_id: str, weak_supervision_task_id: str + project_id: str, + labeling_task_id: str, + user_id: str, + weak_supervision_task_id: str, + overwrite_weak_supervision: Optional[Dict[Any, Any]] = None, ): + stats_lkp = None + if overwrite_weak_supervision is not None: + stats_lkp = __create_stats_lkp( + project_id, labeling_task_id, overwrite_weak_supervision + ) + elif not record_label_association.is_any_record_manually_labeled_by_lt_id( + project_id, labeling_task_id + ): + stats_lkp = __create_stats_lkp( + project_id, labeling_task_id, NO_LABEL_WS_PRECISION + ) + task_type, df = collect_data(project_id, labeling_task_id, True) try: if task_type == enums.LabelingTaskType.CLASSIFICATION.value: - results = integrate_classification(df) - + results = integrate_classification(df, stats_lkp) else: - results = integrate_extraction(df) + results = integrate_extraction(df, stats_lkp) weak_supervision.store_data( project_id, labeling_task_id, @@ -52,46 +95,59 @@ def fit_predict( def export_weak_supervision_stats( - project_id: str, labeling_task_id: str + project_id: str, + labeling_task_id: str, + overwrite_weak_supervision: Optional[Union[float, Dict[str, float]]] = None, ) -> Tuple[int, str]: + if overwrite_weak_supervision is not None: + ws_stats = __create_stats_lkp( + project_id, labeling_task_id, overwrite_weak_supervision + ) + elif not record_label_association.is_any_record_manually_labeled_by_lt_id( + project_id, labeling_task_id + ): + ws_stats = __create_stats_lkp( + project_id, labeling_task_id, NO_LABEL_WS_PRECISION + ) + else: + task_type, df = collect_data(project_id, labeling_task_id, False) + try: + if task_type == enums.LabelingTaskType.CLASSIFICATION.value: + cnlm = util.get_cnlm_from_df(df) + stats_df = cnlm.quality_metrics() + elif task_type == enums.LabelingTaskType.INFORMATION_EXTRACTION.value: + enlm = util.get_enlm_from_df(df) + stats_df = enlm.quality_metrics() + else: + return 404, f"Task type {task_type} not implemented" - task_type, df = collect_data(project_id, labeling_task_id, False) - try: - if task_type == enums.LabelingTaskType.CLASSIFICATION.value: - cnlm = util.get_cnlm_from_df(df) - stats_df = cnlm.quality_metrics() - elif task_type == enums.LabelingTaskType.INFORMATION_EXTRACTION.value: - enlm = util.get_enlm_from_df(df) - stats_df = enlm.quality_metrics() - else: - return 404, f"Task type {task_type} not implemented" + if len(stats_df) != 0: + ws_stats = stats_df.set_index(["identifier", "label_name"]).to_dict( + orient="index" + ) + else: + return 404, "Can't compute weak supervision" - if len(stats_df) != 0: - stats_lkp = stats_df.set_index(["identifier", "label_name"]).to_dict( - orient="index" - ) - else: - return 404, "Can't compute weak supervision" + except Exception: + print(traceback.format_exc(), flush=True) + general.rollback() + return 500, "Internal server error" - os.makedirs(os.path.join("/inference", project_id), exist_ok=True) - with open( - os.path.join( - "/inference", project_id, f"weak-supervision-{labeling_task_id}.pkl" - ), - "wb", - ) as f: - pickle.dump(stats_lkp, f) + os.makedirs(os.path.join("/inference", project_id), exist_ok=True) + with open( + os.path.join( + "/inference", project_id, f"weak-supervision-{labeling_task_id}.pkl" + ), + "wb", + ) as f: + pickle.dump(ws_stats, f) - except Exception: - print(traceback.format_exc(), flush=True) - general.rollback() - return 500, "Internal server error" return 200, "OK" -def integrate_classification(df: pd.DataFrame): +def integrate_classification(df: pd.DataFrame, stats_lkp: Dict[Any, Any] = None): cnlm = util.get_cnlm_from_df(df) - weak_supervision_results = cnlm.weakly_supervise() + weak_supervision_results = cnlm.weakly_supervise(stats_lkp) return_values = defaultdict(list) for record_id, ( label_id, @@ -128,12 +184,12 @@ def collect_data( query_results = [] if labeling_task_item.task_type == enums.LabelingTaskType.CLASSIFICATION.value: - for information_source in labeling_task_item.information_sources: - if only_selected and not information_source.is_selected: + for information_source_item in labeling_task_item.information_sources: + if only_selected and not information_source_item.is_selected: continue results = ( record_label_association.get_all_classifications_for_information_source( - project_id, information_source.id + project_id, information_source_item.id ) ) query_results.extend(results) @@ -149,11 +205,11 @@ def collect_data( labeling_task_item.task_type == enums.LabelingTaskType.INFORMATION_EXTRACTION.value ): - for information_source in labeling_task_item.information_sources: - if only_selected and not information_source.is_selected: + for information_source_item in labeling_task_item.information_sources: + if only_selected and not information_source_item.is_selected: continue results = record_label_association.get_all_extraction_tokens_for_information_source( - project_id, information_source.id + project_id, information_source_item.id ) query_results.extend(results) diff --git a/submodules/model b/submodules/model index bfd0695..8d4a9fc 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit bfd06954dfb5669e3c812f406ecf69c83dd38991 +Subproject commit 8d4a9fce378b617eac3f390ed7c3c396989bda5a From d0d71f9aada1616624c7b67ae9475d8de5969c4d Mon Sep 17 00:00:00 2001 From: FelixKirschKern Date: Mon, 2 Oct 2023 16:52:46 +0200 Subject: [PATCH 2/7] adds extraction task to weak supervision overwrite --- controller/integration.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/controller/integration.py b/controller/integration.py index d66c35f..d5dff89 100644 --- a/controller/integration.py +++ b/controller/integration.py @@ -159,9 +159,9 @@ def integrate_classification(df: pd.DataFrame, stats_lkp: Dict[Any, Any] = None) return return_values -def integrate_extraction(df: pd.DataFrame): +def integrate_extraction(df: pd.DataFrame, stats_lkp: Dict[Any, Any] = None): enlm = util.get_enlm_from_df(df) - weak_supervision_results = enlm.weakly_supervise() + weak_supervision_results = enlm.weakly_supervise(stats_lkp) return_values = defaultdict(list) for record_id, preds in weak_supervision_results.items(): for pred in preds: From cc3d1f289698420cb089064e62ca2333b5ff7dd9 Mon Sep 17 00:00:00 2001 From: FelixKirschKern Date: Mon, 9 Oct 2023 09:37:01 +0200 Subject: [PATCH 3/7] updates submodule --- controller/integration.py | 4 ++-- submodules/model | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/controller/integration.py b/controller/integration.py index d5dff89..5a664ec 100644 --- a/controller/integration.py +++ b/controller/integration.py @@ -61,7 +61,7 @@ def fit_predict( stats_lkp = __create_stats_lkp( project_id, labeling_task_id, overwrite_weak_supervision ) - elif not record_label_association.is_any_record_manually_labeled_by_lt_id( + elif not record_label_association.is_any_record_manually_labeled( project_id, labeling_task_id ): stats_lkp = __create_stats_lkp( @@ -103,7 +103,7 @@ def export_weak_supervision_stats( ws_stats = __create_stats_lkp( project_id, labeling_task_id, overwrite_weak_supervision ) - elif not record_label_association.is_any_record_manually_labeled_by_lt_id( + elif not record_label_association.is_any_record_manually_labeled( project_id, labeling_task_id ): ws_stats = __create_stats_lkp( diff --git a/submodules/model b/submodules/model index 8d4a9fc..7ecd2d2 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 8d4a9fce378b617eac3f390ed7c3c396989bda5a +Subproject commit 7ecd2d2e118aae9aa272c5dd57368c560528b3c6 From 45b1452b111433f4116bb1425c3640d627b3ff86 Mon Sep 17 00:00:00 2001 From: FelixKirschKern Date: Mon, 9 Oct 2023 14:17:31 +0200 Subject: [PATCH 4/7] pr comments, changed attribute name for clarity --- controller/integration.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/controller/integration.py b/controller/integration.py index 5a664ec..29612cc 100644 --- a/controller/integration.py +++ b/controller/integration.py @@ -25,7 +25,7 @@ NO_LABEL_WS_PRECISION = 0.8 -def __create_stats_lkp( +def __create_quality_metrics( project_id: str, labeling_task_id: str, overwrite_weak_supervision: Union[float, Dict[str, float]], @@ -56,24 +56,24 @@ def fit_predict( weak_supervision_task_id: str, overwrite_weak_supervision: Optional[Dict[Any, Any]] = None, ): - stats_lkp = None + quality_metrics_overwrite = None if overwrite_weak_supervision is not None: - stats_lkp = __create_stats_lkp( + quality_metrics_overwrite = __create_quality_metrics( project_id, labeling_task_id, overwrite_weak_supervision ) elif not record_label_association.is_any_record_manually_labeled( project_id, labeling_task_id ): - stats_lkp = __create_stats_lkp( + quality_metrics_overwrite = __create_quality_metrics( project_id, labeling_task_id, NO_LABEL_WS_PRECISION ) task_type, df = collect_data(project_id, labeling_task_id, True) try: if task_type == enums.LabelingTaskType.CLASSIFICATION.value: - results = integrate_classification(df, stats_lkp) + results = integrate_classification(df, quality_metrics_overwrite) else: - results = integrate_extraction(df, stats_lkp) + results = integrate_extraction(df, quality_metrics_overwrite) weak_supervision.store_data( project_id, labeling_task_id, @@ -100,13 +100,13 @@ def export_weak_supervision_stats( overwrite_weak_supervision: Optional[Union[float, Dict[str, float]]] = None, ) -> Tuple[int, str]: if overwrite_weak_supervision is not None: - ws_stats = __create_stats_lkp( + ws_stats = __create_quality_metrics( project_id, labeling_task_id, overwrite_weak_supervision ) elif not record_label_association.is_any_record_manually_labeled( project_id, labeling_task_id ): - ws_stats = __create_stats_lkp( + ws_stats = __create_quality_metrics( project_id, labeling_task_id, NO_LABEL_WS_PRECISION ) else: @@ -145,9 +145,11 @@ def export_weak_supervision_stats( return 200, "OK" -def integrate_classification(df: pd.DataFrame, stats_lkp: Dict[Any, Any] = None): +def integrate_classification( + df: pd.DataFrame, quality_metrics_overwrite: Dict[Any, Any] = None +): cnlm = util.get_cnlm_from_df(df) - weak_supervision_results = cnlm.weakly_supervise(stats_lkp) + weak_supervision_results = cnlm.weakly_supervise(quality_metrics_overwrite) return_values = defaultdict(list) for record_id, ( label_id, @@ -159,9 +161,11 @@ def integrate_classification(df: pd.DataFrame, stats_lkp: Dict[Any, Any] = None) return return_values -def integrate_extraction(df: pd.DataFrame, stats_lkp: Dict[Any, Any] = None): +def integrate_extraction( + df: pd.DataFrame, quality_metrics_overwrite: Dict[Any, Any] = None +): enlm = util.get_enlm_from_df(df) - weak_supervision_results = enlm.weakly_supervise(stats_lkp) + weak_supervision_results = enlm.weakly_supervise(quality_metrics_overwrite) return_values = defaultdict(list) for record_id, preds in weak_supervision_results.items(): for pred in preds: From 0c75c2000384cd1c5aecaf7c07fbadbc5b8a42cf Mon Sep 17 00:00:00 2001 From: FelixKirschKern Date: Tue, 10 Oct 2023 11:31:11 +0200 Subject: [PATCH 5/7] pr comments, typing --- controller/integration.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/controller/integration.py b/controller/integration.py index 29612cc..954b2e3 100644 --- a/controller/integration.py +++ b/controller/integration.py @@ -29,7 +29,7 @@ def __create_quality_metrics( project_id: str, labeling_task_id: str, overwrite_weak_supervision: Union[float, Dict[str, float]], -) -> Dict[Any, Any]: +) -> Dict[Tuple[str, str], Dict[str, float]]: if isinstance(overwrite_weak_supervision, float): ws_weights = {} for heuristic_id in information_source.get_all_ids_by_labeling_task_id( @@ -54,7 +54,7 @@ def fit_predict( labeling_task_id: str, user_id: str, weak_supervision_task_id: str, - overwrite_weak_supervision: Optional[Dict[Any, Any]] = None, + overwrite_weak_supervision: Optional[Union[float, Dict[str, float]]] = None, ): quality_metrics_overwrite = None if overwrite_weak_supervision is not None: @@ -146,7 +146,8 @@ def export_weak_supervision_stats( def integrate_classification( - df: pd.DataFrame, quality_metrics_overwrite: Dict[Any, Any] = None + df: pd.DataFrame, + quality_metrics_overwrite: Optional[Dict[Tuple[str, str], Dict[str, float]]] = None, ): cnlm = util.get_cnlm_from_df(df) weak_supervision_results = cnlm.weakly_supervise(quality_metrics_overwrite) @@ -162,7 +163,8 @@ def integrate_classification( def integrate_extraction( - df: pd.DataFrame, quality_metrics_overwrite: Dict[Any, Any] = None + df: pd.DataFrame, + quality_metrics_overwrite: Optional[Dict[Tuple[str, str], Dict[str, float]]] = None, ): enlm = util.get_enlm_from_df(df) weak_supervision_results = enlm.weakly_supervise(quality_metrics_overwrite) From 48a7e7c6fcb1754cb2d8e2b70bcf5f837eb4d061 Mon Sep 17 00:00:00 2001 From: FelixKirschKern Date: Tue, 10 Oct 2023 11:31:59 +0200 Subject: [PATCH 6/7] updates weak_nlp version --- requirements.txt | 2 +- requirements/requirements.in | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c7d03cd..2daeb60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -107,5 +107,5 @@ urllib3==1.26.16 # requests uvicorn==0.22.0 # via -r requirements/common-requirements.txt -weak-nlp==0.0.12 +weak-nlp==0.0.13 # via -r requirements/requirements.in diff --git a/requirements/requirements.in b/requirements/requirements.in index ba36fb9..71279f9 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,2 +1,2 @@ -r common-requirements.txt -weak-nlp==0.0.12 +weak-nlp==0.0.13 From 7c11df5c8322de45125969de99ec6e37fe1b90d4 Mon Sep 17 00:00:00 2001 From: FelixKirschKern Date: Tue, 10 Oct 2023 12:42:30 +0200 Subject: [PATCH 7/7] updated submodule --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 7ecd2d2..5c96146 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 7ecd2d2e118aae9aa272c5dd57368c560528b3c6 +Subproject commit 5c96146323de5150450c192ba1583917bc53076d