From 40fee617e9a1f6ade08d0c90469549fb1e97f3a9 Mon Sep 17 00:00:00 2001 From: felix0496 Date: Thu, 24 Nov 2022 14:59:05 +0100 Subject: [PATCH 1/7] save active learner as pickle --- run_ml.py | 39 +++++++++++++++++++++++++++++++++------ util.py | 13 ++++++++++--- 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/run_ml.py b/run_ml.py index 70f312b..6f9904a 100644 --- a/run_ml.py +++ b/run_ml.py @@ -1,20 +1,33 @@ #!/usr/bin/env python3 +import os import sys import util import requests -from collections import defaultdict import pandas as pd +import pickle CONSTANT__OUTSIDE = "OUTSIDE" # enum from graphql-gateway; if it changes, the extraction service breaks! -def run_classification(corpus_embeddings, corpus_labels, corpus_ids, training_ids): +def run_classification( + information_source_id, + corpus_embeddings, + corpus_labels, + corpus_ids, + training_ids, +): from active_transfer_learning import ATLClassifier classifier = ATLClassifier() prediction_probabilities = classifier.fit_predict( corpus_embeddings, corpus_labels, corpus_ids, training_ids ) + pickle_path = os.path.join( + "/inference", f"active-learning-{information_source_id}.pkl" + ) + with open(pickle_path, "wb") as f: + pickle.dump(classifier, f) + print("Saved model to disk", flush=True) prediction_indices = prediction_probabilities.argmax(axis=1) predictions_with_probabilities = [] @@ -48,7 +61,9 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): corpus_embeddings, corpus_labels, corpus_ids, training_ids ) ml_results_by_record_id = {} - for record_id, prediction, probability in zip(corpus_ids, predictions, probabilities): + for record_id, prediction, probability in zip( + corpus_ids, predictions, probabilities + ): df = pd.DataFrame( list(zip(prediction, probability)), columns=["prediction", "probability"], @@ -67,7 +82,9 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): if row.prediction != row.next: prob = df.loc[start_idx:idx].probability.mean() end_idx = idx + 1 - predictions_with_probabilities.append([float(prob), row.prediction, start_idx, end_idx]) + predictions_with_probabilities.append( + [float(prob), row.prediction, start_idx, end_idx] + ) new_start_idx = True ml_results_by_record_id[record_id] = predictions_with_probabilities if len(ml_results_by_record_id) == 0: @@ -79,7 +96,13 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): _, payload_url = sys.argv print("Preparing data for machine learning.") - corpus_embeddings, corpus_labels, corpus_ids, training_ids = util.get_corpus() + ( + information_source_id, + corpus_embeddings, + corpus_labels, + corpus_ids, + training_ids, + ) = util.get_corpus() is_extractor = any([isinstance(val, list) for val in corpus_labels["manual"]]) if is_extractor: @@ -90,7 +113,11 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): else: print("Running classifier.") ml_results_by_record_id = run_classification( - corpus_embeddings, corpus_labels, corpus_ids, training_ids + information_source_id, + corpus_embeddings, + corpus_labels, + corpus_ids, + training_ids, ) print("Finished execution.") diff --git a/util.py b/util.py index 6660c4a..c5be89b 100644 --- a/util.py +++ b/util.py @@ -20,6 +20,7 @@ def get_corpus(): with open("input.json", "r") as infile: input_data = json.load(infile) + information_source_id = input_data["information_source_id"] embedding_type = input_data["embedding_type"] embedding_name = input_data["embedding_name"] labels = input_data["labels"] @@ -45,10 +46,16 @@ def get_corpus(): if x != "data" ] } - except: + except Exception: print("Can't parse the embedding. Please contact the support.") raise ValueError("Can't parse the embedding. Please contact the support.") - return embeddings, labels, ids, training_ids + return ( + information_source_id, + embeddings, + labels, + ids, + training_ids, + ) def transform_corpus_classification_inference(embeddings): @@ -94,7 +101,7 @@ def transform_corpus_extraction_fit( for _, row in df_labels.loc[df_labels.idx == idx].iterrows(): for token_idx in row.token_list: label_vector[token_idx] = row.label_name - np.place(label_vector, label_vector == None, CONSTANT__OUTSIDE) + np.place(label_vector, label_vector is None, CONSTANT__OUTSIDE) labels_prepared.append(label_vector.tolist()) keep_idxs = list(df_labels.idx.unique()) From abb457ff74d5d18ab8ee77016fdb52b3b9e432dc Mon Sep 17 00:00:00 2001 From: felix0496 Date: Fri, 25 Nov 2022 18:51:43 +0100 Subject: [PATCH 2/7] change structure and use relative imports --- run.sh | 2 +- run_ml.py | 6 +++--- util/__init__.py | 0 .../active_transfer_learning.py | 2 +- util.py => util/util.py | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 util/__init__.py rename active_transfer_learning.py => util/active_transfer_learning.py (99%) rename util.py => util/util.py (99%) diff --git a/run.sh b/run.sh index 3fe7f26..672dbc6 100755 --- a/run.sh +++ b/run.sh @@ -1,6 +1,6 @@ #!/bin/bash /usr/bin/curl -s "$1" > input.json; -/usr/bin/curl -s echo "$2" >> active_transfer_learning.py; +/usr/bin/curl -s echo "$2" >> util/active_transfer_learning.py; /usr/bin/curl -s "$3" > embedding.csv.bz2; /usr/local/bin/python run_ml.py "$4"; diff --git a/run_ml.py b/run_ml.py index 6f9904a..798b8ae 100644 --- a/run_ml.py +++ b/run_ml.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import os import sys -import util +from util import util import requests import pandas as pd import pickle @@ -16,7 +16,7 @@ def run_classification( corpus_ids, training_ids, ): - from active_transfer_learning import ATLClassifier + from util.active_transfer_learning import ATLClassifier classifier = ATLClassifier() prediction_probabilities = classifier.fit_predict( @@ -54,7 +54,7 @@ def run_classification( def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): - from active_transfer_learning import ATLExtractor + from util.active_transfer_learning import ATLExtractor extractor = ATLExtractor() predictions, probabilities = extractor.fit_predict( diff --git a/util/__init__.py b/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/active_transfer_learning.py b/util/active_transfer_learning.py similarity index 99% rename from active_transfer_learning.py rename to util/active_transfer_learning.py index 67bf7ba..c1aec26 100644 --- a/active_transfer_learning.py +++ b/util/active_transfer_learning.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -import util +from . import util from typing import Callable, List, Optional diff --git a/util.py b/util/util.py similarity index 99% rename from util.py rename to util/util.py index c5be89b..c4a6653 100644 --- a/util.py +++ b/util/util.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from run_ml import CONSTANT__OUTSIDE +CONSTANT__OUTSIDE = "OUTSIDE" pd.options.mode.chained_assignment = None # default='warn' From 188e3215625acddddae33c184140faa02bb66abf Mon Sep 17 00:00:00 2001 From: felix0496 Date: Tue, 29 Nov 2022 13:39:13 +0100 Subject: [PATCH 3/7] changes scikit-learn version --- requirements.txt | 2 +- requirements/requirements.in | 2 +- util/util.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 46cd4fc..ddf1e7e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -111,7 +111,7 @@ s3transfer==0.6.0 # via # -r requirements/torch-cpu-requirements.txt # boto3 -scikit-learn==1.0.2 +scikit-learn==1.1.2 # via # -r requirements/requirements.in # sequencelearn diff --git a/requirements/requirements.in b/requirements/requirements.in index adf3ae3..90d1b7a 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,4 +1,4 @@ -r torch-cpu-requirements.txt -scikit-learn==1.0.2 +scikit-learn==1.1.2 scipy==1.9.0 sequencelearn==0.0.9 \ No newline at end of file diff --git a/util/util.py b/util/util.py index c4a6653..5129ba1 100644 --- a/util/util.py +++ b/util/util.py @@ -101,7 +101,7 @@ def transform_corpus_extraction_fit( for _, row in df_labels.loc[df_labels.idx == idx].iterrows(): for token_idx in row.token_list: label_vector[token_idx] = row.label_name - np.place(label_vector, label_vector is None, CONSTANT__OUTSIDE) + np.place(label_vector, label_vector == None, CONSTANT__OUTSIDE) labels_prepared.append(label_vector.tolist()) keep_idxs = list(df_labels.idx.unique()) From b6fb7ec4bff79abbe7e74a527588dc3b1abe647b Mon Sep 17 00:00:00 2001 From: felix0496 Date: Thu, 12 Jan 2023 14:02:55 +0100 Subject: [PATCH 4/7] write activelearner to pickle for extraction tasks --- run_ml.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/run_ml.py b/run_ml.py index 798b8ae..a6fa781 100644 --- a/run_ml.py +++ b/run_ml.py @@ -53,13 +53,22 @@ def run_classification( return ml_results_by_record_id -def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): +def run_extraction( + information_source_id, corpus_embeddings, corpus_labels, corpus_ids, training_ids +): from util.active_transfer_learning import ATLExtractor extractor = ATLExtractor() predictions, probabilities = extractor.fit_predict( corpus_embeddings, corpus_labels, corpus_ids, training_ids ) + pickle_path = os.path.join( + "/inference", f"active-learning-{information_source_id}.pkl" + ) + with open(pickle_path, "wb") as f: + pickle.dump(extractor, f) + print("Saved model to disk", flush=True) + ml_results_by_record_id = {} for record_id, prediction, probability in zip( corpus_ids, predictions, probabilities @@ -108,7 +117,11 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): if is_extractor: print("Running extractor.") ml_results_by_record_id = run_extraction( - corpus_embeddings, corpus_labels, corpus_ids, training_ids + information_source_id, + corpus_embeddings, + corpus_labels, + corpus_ids, + training_ids, ) else: print("Running classifier.") From 74756e6b2918171c3363b69519a77012c63699ff Mon Sep 17 00:00:00 2001 From: felix0496 Date: Mon, 16 Jan 2023 10:41:40 +0100 Subject: [PATCH 5/7] save activelearner only if is_managed --- run_ml.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/run_ml.py b/run_ml.py index a6fa781..ec14436 100644 --- a/run_ml.py +++ b/run_ml.py @@ -22,12 +22,13 @@ def run_classification( prediction_probabilities = classifier.fit_predict( corpus_embeddings, corpus_labels, corpus_ids, training_ids ) - pickle_path = os.path.join( - "/inference", f"active-learning-{information_source_id}.pkl" - ) - with open(pickle_path, "wb") as f: - pickle.dump(classifier, f) - print("Saved model to disk", flush=True) + if os.path.exists("/inference"): + pickle_path = os.path.join( + "/inference", f"active-learning-{information_source_id}.pkl" + ) + with open(pickle_path, "wb") as f: + pickle.dump(classifier, f) + print("Saved model to disk", flush=True) prediction_indices = prediction_probabilities.argmax(axis=1) predictions_with_probabilities = [] @@ -62,12 +63,13 @@ def run_extraction( predictions, probabilities = extractor.fit_predict( corpus_embeddings, corpus_labels, corpus_ids, training_ids ) - pickle_path = os.path.join( - "/inference", f"active-learning-{information_source_id}.pkl" - ) - with open(pickle_path, "wb") as f: - pickle.dump(extractor, f) - print("Saved model to disk", flush=True) + if os.path.exists("/inference"): + pickle_path = os.path.join( + "/inference", f"active-learning-{information_source_id}.pkl" + ) + with open(pickle_path, "wb") as f: + pickle.dump(extractor, f) + print("Saved model to disk", flush=True) ml_results_by_record_id = {} for record_id, prediction, probability in zip( From b4858ded0fcb33681519f34780870db5a1a49197 Mon Sep 17 00:00:00 2001 From: felix0496 Date: Mon, 16 Jan 2023 16:03:37 +0100 Subject: [PATCH 6/7] renames pickle file --- run_ml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_ml.py b/run_ml.py index ec14436..0579645 100644 --- a/run_ml.py +++ b/run_ml.py @@ -24,7 +24,7 @@ def run_classification( ) if os.path.exists("/inference"): pickle_path = os.path.join( - "/inference", f"active-learning-{information_source_id}.pkl" + "/inference", f"active-learner-{information_source_id}.pkl" ) with open(pickle_path, "wb") as f: pickle.dump(classifier, f) @@ -65,7 +65,7 @@ def run_extraction( ) if os.path.exists("/inference"): pickle_path = os.path.join( - "/inference", f"active-learning-{information_source_id}.pkl" + "/inference", f"active-learner-{information_source_id}.pkl" ) with open(pickle_path, "wb") as f: pickle.dump(extractor, f) From 20675edb14f5a9eeda44a7b911fa5fefa07981ad Mon Sep 17 00:00:00 2001 From: felix0496 Date: Mon, 30 Jan 2023 16:40:58 +0100 Subject: [PATCH 7/7] pr comments --- run_ml.py | 21 ++++++++++++--------- util/util.py | 2 +- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/run_ml.py b/run_ml.py index 0579645..a0c8e39 100644 --- a/run_ml.py +++ b/run_ml.py @@ -5,16 +5,15 @@ import requests import pandas as pd import pickle - -CONSTANT__OUTSIDE = "OUTSIDE" # enum from graphql-gateway; if it changes, the extraction service breaks! +from typing import List, Dict, Tuple, Any def run_classification( - information_source_id, - corpus_embeddings, - corpus_labels, - corpus_ids, - training_ids, + information_source_id: str, + corpus_embeddings: Dict[str, List[List[float]]], + corpus_labels: List[str], + corpus_ids: List[str], + training_ids: List[str], ): from util.active_transfer_learning import ATLClassifier @@ -55,7 +54,11 @@ def run_classification( def run_extraction( - information_source_id, corpus_embeddings, corpus_labels, corpus_ids, training_ids + information_source_id: str, + corpus_embeddings: Dict[str, List[Any]], + corpus_labels: List[Tuple[str, str, List[Any]]], + corpus_ids: List[str], + training_ids: List[str], ): from util.active_transfer_learning import ATLExtractor @@ -83,7 +86,7 @@ def run_extraction( predictions_with_probabilities = [] new_start_idx = True for idx, row in df.loc[ - (df.prediction != CONSTANT__OUTSIDE) + (df.prediction != util.CONSTANT__OUTSIDE) & (df.prediction.isin(extractor.label_names)) & (df.probability > extractor.min_confidence) ].iterrows(): diff --git a/util/util.py b/util/util.py index 5129ba1..eb238a6 100644 --- a/util/util.py +++ b/util/util.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -CONSTANT__OUTSIDE = "OUTSIDE" +CONSTANT__OUTSIDE = "OUTSIDE" # enum from graphql-gateway; if it changes, the extraction service breaks! pd.options.mode.chained_assignment = None # default='warn'