diff --git a/requirements.txt b/requirements.txt index 74aad05..3514d73 100644 --- a/requirements.txt +++ b/requirements.txt @@ -107,7 +107,7 @@ s3transfer==0.6.0 # via # -r requirements/torch-cpu-requirements.txt # boto3 -scikit-learn==1.0.2 +scikit-learn==1.1.2 # via # -r requirements/requirements.in # sequencelearn diff --git a/requirements/requirements.in b/requirements/requirements.in index adf3ae3..90d1b7a 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -1,4 +1,4 @@ -r torch-cpu-requirements.txt -scikit-learn==1.0.2 +scikit-learn==1.1.2 scipy==1.9.0 sequencelearn==0.0.9 \ No newline at end of file diff --git a/run.sh b/run.sh index 3fe7f26..672dbc6 100755 --- a/run.sh +++ b/run.sh @@ -1,6 +1,6 @@ #!/bin/bash /usr/bin/curl -s "$1" > input.json; -/usr/bin/curl -s echo "$2" >> active_transfer_learning.py; +/usr/bin/curl -s echo "$2" >> util/active_transfer_learning.py; /usr/bin/curl -s "$3" > embedding.csv.bz2; /usr/local/bin/python run_ml.py "$4"; diff --git a/run_ml.py b/run_ml.py index 70f312b..a0c8e39 100644 --- a/run_ml.py +++ b/run_ml.py @@ -1,20 +1,33 @@ #!/usr/bin/env python3 +import os import sys -import util +from util import util import requests -from collections import defaultdict import pandas as pd +import pickle +from typing import List, Dict, Tuple, Any -CONSTANT__OUTSIDE = "OUTSIDE" # enum from graphql-gateway; if it changes, the extraction service breaks! - -def run_classification(corpus_embeddings, corpus_labels, corpus_ids, training_ids): - from active_transfer_learning import ATLClassifier +def run_classification( + information_source_id: str, + corpus_embeddings: Dict[str, List[List[float]]], + corpus_labels: List[str], + corpus_ids: List[str], + training_ids: List[str], +): + from util.active_transfer_learning import ATLClassifier classifier = ATLClassifier() prediction_probabilities = classifier.fit_predict( corpus_embeddings, corpus_labels, corpus_ids, training_ids ) + if os.path.exists("/inference"): + pickle_path = os.path.join( + "/inference", f"active-learner-{information_source_id}.pkl" + ) + with open(pickle_path, "wb") as f: + pickle.dump(classifier, f) + print("Saved model to disk", flush=True) prediction_indices = prediction_probabilities.argmax(axis=1) predictions_with_probabilities = [] @@ -40,15 +53,31 @@ def run_classification(corpus_embeddings, corpus_labels, corpus_ids, training_id return ml_results_by_record_id -def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): - from active_transfer_learning import ATLExtractor +def run_extraction( + information_source_id: str, + corpus_embeddings: Dict[str, List[Any]], + corpus_labels: List[Tuple[str, str, List[Any]]], + corpus_ids: List[str], + training_ids: List[str], +): + from util.active_transfer_learning import ATLExtractor extractor = ATLExtractor() predictions, probabilities = extractor.fit_predict( corpus_embeddings, corpus_labels, corpus_ids, training_ids ) + if os.path.exists("/inference"): + pickle_path = os.path.join( + "/inference", f"active-learner-{information_source_id}.pkl" + ) + with open(pickle_path, "wb") as f: + pickle.dump(extractor, f) + print("Saved model to disk", flush=True) + ml_results_by_record_id = {} - for record_id, prediction, probability in zip(corpus_ids, predictions, probabilities): + for record_id, prediction, probability in zip( + corpus_ids, predictions, probabilities + ): df = pd.DataFrame( list(zip(prediction, probability)), columns=["prediction", "probability"], @@ -57,7 +86,7 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): predictions_with_probabilities = [] new_start_idx = True for idx, row in df.loc[ - (df.prediction != CONSTANT__OUTSIDE) + (df.prediction != util.CONSTANT__OUTSIDE) & (df.prediction.isin(extractor.label_names)) & (df.probability > extractor.min_confidence) ].iterrows(): @@ -67,7 +96,9 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): if row.prediction != row.next: prob = df.loc[start_idx:idx].probability.mean() end_idx = idx + 1 - predictions_with_probabilities.append([float(prob), row.prediction, start_idx, end_idx]) + predictions_with_probabilities.append( + [float(prob), row.prediction, start_idx, end_idx] + ) new_start_idx = True ml_results_by_record_id[record_id] = predictions_with_probabilities if len(ml_results_by_record_id) == 0: @@ -79,18 +110,32 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids): _, payload_url = sys.argv print("Preparing data for machine learning.") - corpus_embeddings, corpus_labels, corpus_ids, training_ids = util.get_corpus() + ( + information_source_id, + corpus_embeddings, + corpus_labels, + corpus_ids, + training_ids, + ) = util.get_corpus() is_extractor = any([isinstance(val, list) for val in corpus_labels["manual"]]) if is_extractor: print("Running extractor.") ml_results_by_record_id = run_extraction( - corpus_embeddings, corpus_labels, corpus_ids, training_ids + information_source_id, + corpus_embeddings, + corpus_labels, + corpus_ids, + training_ids, ) else: print("Running classifier.") ml_results_by_record_id = run_classification( - corpus_embeddings, corpus_labels, corpus_ids, training_ids + information_source_id, + corpus_embeddings, + corpus_labels, + corpus_ids, + training_ids, ) print("Finished execution.") diff --git a/util/__init__.py b/util/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/active_transfer_learning.py b/util/active_transfer_learning.py similarity index 99% rename from active_transfer_learning.py rename to util/active_transfer_learning.py index 67bf7ba..c1aec26 100644 --- a/active_transfer_learning.py +++ b/util/active_transfer_learning.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -import util +from . import util from typing import Callable, List, Optional diff --git a/util.py b/util/util.py similarity index 91% rename from util.py rename to util/util.py index 6660c4a..eb238a6 100644 --- a/util.py +++ b/util/util.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from run_ml import CONSTANT__OUTSIDE +CONSTANT__OUTSIDE = "OUTSIDE" # enum from graphql-gateway; if it changes, the extraction service breaks! pd.options.mode.chained_assignment = None # default='warn' @@ -20,6 +20,7 @@ def get_corpus(): with open("input.json", "r") as infile: input_data = json.load(infile) + information_source_id = input_data["information_source_id"] embedding_type = input_data["embedding_type"] embedding_name = input_data["embedding_name"] labels = input_data["labels"] @@ -45,10 +46,16 @@ def get_corpus(): if x != "data" ] } - except: + except Exception: print("Can't parse the embedding. Please contact the support.") raise ValueError("Can't parse the embedding. Please contact the support.") - return embeddings, labels, ids, training_ids + return ( + information_source_id, + embeddings, + labels, + ids, + training_ids, + ) def transform_corpus_classification_inference(embeddings):