2 changes: 1 addition & 1 deletion requirements.txt
@@ -107,7 +107,7 @@ s3transfer==0.6.0
# via
# -r requirements/torch-cpu-requirements.txt
# boto3
-scikit-learn==1.0.2
+scikit-learn==1.1.2
# via
# -r requirements/requirements.in
# sequencelearn
2 changes: 1 addition & 1 deletion requirements/requirements.in
@@ -1,4 +1,4 @@
-r torch-cpu-requirements.txt
-scikit-learn==1.0.2
+scikit-learn==1.1.2
scipy==1.9.0
sequencelearn==0.0.9
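One caveat with this version bump, given the pickling added below in run_ml.py: scikit-learn does not guarantee that models pickled under one release load correctly under another, so any service that unpickles these models should pin scikit-learn==1.1.2 as well.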
2 changes: 1 addition & 1 deletion run.sh
@@ -1,6 +1,6 @@
#!/bin/bash

/usr/bin/curl -s "$1" > input.json;
-/usr/bin/curl -s echo "$2" >> active_transfer_learning.py;
+/usr/bin/curl -s echo "$2" >> util/active_transfer_learning.py;
/usr/bin/curl -s "$3" > embedding.csv.bz2;
/usr/local/bin/python run_ml.py "$4";
73 changes: 59 additions & 14 deletions run_ml.py
@@ -1,20 +1,33 @@
#!/usr/bin/env python3
+import os
import sys
-import util
+from util import util
import requests
from collections import defaultdict
import pandas as pd
+import pickle
+from typing import List, Dict, Tuple, Any

-CONSTANT__OUTSIDE = "OUTSIDE"  # enum from graphql-gateway; if it changes, the extraction service breaks!
-
-def run_classification(corpus_embeddings, corpus_labels, corpus_ids, training_ids):
-    from active_transfer_learning import ATLClassifier
+def run_classification(
+    information_source_id: str,
+    corpus_embeddings: Dict[str, List[List[float]]],
+    corpus_labels: List[str],
+    corpus_ids: List[str],
+    training_ids: List[str],
+):
+    from util.active_transfer_learning import ATLClassifier

    classifier = ATLClassifier()
    prediction_probabilities = classifier.fit_predict(
        corpus_embeddings, corpus_labels, corpus_ids, training_ids
    )
+    if os.path.exists("/inference"):
+        pickle_path = os.path.join(
+            "/inference", f"active-learner-{information_source_id}.pkl"
+        )
+        with open(pickle_path, "wb") as f:
+            pickle.dump(classifier, f)
+        print("Saved model to disk", flush=True)

    prediction_indices = prediction_probabilities.argmax(axis=1)
    predictions_with_probabilities = []
@@ -40,15 +53,31 @@ def run_classification(corpus_embeddings, corpus_labels, corpus_ids, training_ids):
    return ml_results_by_record_id


-def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids):
-    from active_transfer_learning import ATLExtractor
+def run_extraction(
+    information_source_id: str,
+    corpus_embeddings: Dict[str, List[Any]],
+    corpus_labels: List[Tuple[str, str, List[Any]]],
+    corpus_ids: List[str],
+    training_ids: List[str],
+):
+    from util.active_transfer_learning import ATLExtractor

    extractor = ATLExtractor()
    predictions, probabilities = extractor.fit_predict(
        corpus_embeddings, corpus_labels, corpus_ids, training_ids
    )
+    if os.path.exists("/inference"):
+        pickle_path = os.path.join(
+            "/inference", f"active-learner-{information_source_id}.pkl"
+        )
+        with open(pickle_path, "wb") as f:
+            pickle.dump(extractor, f)
+        print("Saved model to disk", flush=True)

    ml_results_by_record_id = {}
-    for record_id, prediction, probability in zip(corpus_ids, predictions, probabilities):
+    for record_id, prediction, probability in zip(
+        corpus_ids, predictions, probabilities
+    ):
        df = pd.DataFrame(
            list(zip(prediction, probability)),
            columns=["prediction", "probability"],
@@ -57,7 +86,7 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids):
        predictions_with_probabilities = []
        new_start_idx = True
        for idx, row in df.loc[
-            (df.prediction != CONSTANT__OUTSIDE)
+            (df.prediction != util.CONSTANT__OUTSIDE)
            & (df.prediction.isin(extractor.label_names))
            & (df.probability > extractor.min_confidence)
        ].iterrows():
@@ -67,7 +96,9 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids):
            if row.prediction != row.next:
                prob = df.loc[start_idx:idx].probability.mean()
                end_idx = idx + 1
-                predictions_with_probabilities.append([float(prob), row.prediction, start_idx, end_idx])
+                predictions_with_probabilities.append(
+                    [float(prob), row.prediction, start_idx, end_idx]
+                )
                new_start_idx = True
        ml_results_by_record_id[record_id] = predictions_with_probabilities
    if len(ml_results_by_record_id) == 0:
@@ -79,18 +110,32 @@ def run_extraction(corpus_embeddings, corpus_labels, corpus_ids, training_ids):
_, payload_url = sys.argv
print("Preparing data for machine learning.")

-corpus_embeddings, corpus_labels, corpus_ids, training_ids = util.get_corpus()
+(
+    information_source_id,
+    corpus_embeddings,
+    corpus_labels,
+    corpus_ids,
+    training_ids,
+) = util.get_corpus()
is_extractor = any([isinstance(val, list) for val in corpus_labels["manual"]])

if is_extractor:
    print("Running extractor.")
    ml_results_by_record_id = run_extraction(
-        corpus_embeddings, corpus_labels, corpus_ids, training_ids
+        information_source_id,
+        corpus_embeddings,
+        corpus_labels,
+        corpus_ids,
+        training_ids,
    )
else:
    print("Running classifier.")
    ml_results_by_record_id = run_classification(
-        corpus_embeddings, corpus_labels, corpus_ids, training_ids
+        information_source_id,
+        corpus_embeddings,
+        corpus_labels,
+        corpus_ids,
+        training_ids,
    )

print("Finished execution.")
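The new /inference blocks mean a separate inference service that mounts the same volume can reload a trained learner later. Below is a minimal sketch of that consumer side, assuming the path layout written above; the load_active_learner helper is illustrative, not part of this PR.

import os
import pickle

def load_active_learner(information_source_id: str):
    # Mirrors the path written by run_ml.py: /inference/active-learner-<id>.pkl
    pickle_path = os.path.join(
        "/inference", f"active-learner-{information_source_id}.pkl"
    )
    if not os.path.exists(pickle_path):
        return None  # never trained, or the volume is not mounted
    # Unpickling needs util.active_transfer_learning on the import path,
    # since pickle stores the class by module-qualified name.
    with open(pickle_path, "rb") as f:
        return pickle.load(f)  # an ATLClassifier or ATLExtractor instance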
Empty file added util/__init__.py
Empty file.
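The empty util/__init__.py turns the util directory into an importable package, which is what lets run_ml.py use "from util import util" and lets the relocated util/active_transfer_learning.py use the relative "from . import util".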
active_transfer_learning.py → util/active_transfer_learning.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod
-import util
+from . import util
from typing import Callable, List, Optional


13 changes: 10 additions & 3 deletions util.py → util/util.py
@@ -2,7 +2,7 @@
import numpy as np
import pandas as pd

-from run_ml import CONSTANT__OUTSIDE
+CONSTANT__OUTSIDE = "OUTSIDE"  # enum from graphql-gateway; if it changes, the extraction service breaks!

pd.options.mode.chained_assignment = None # default='warn'

@@ -20,6 +20,7 @@
def get_corpus():
    with open("input.json", "r") as infile:
        input_data = json.load(infile)
+    information_source_id = input_data["information_source_id"]
    embedding_type = input_data["embedding_type"]
    embedding_name = input_data["embedding_name"]
    labels = input_data["labels"]
@@ -45,10 +46,16 @@ def get_corpus():
                if x != "data"
            ]
        }
-    except:
+    except Exception:
        print("Can't parse the embedding. Please contact the support.")
        raise ValueError("Can't parse the embedding. Please contact the support.")
-    return embeddings, labels, ids, training_ids
+    return (
+        information_source_id,
+        embeddings,
+        labels,
+        ids,
+        training_ids,
+    )


def transform_corpus_classification_inference(embeddings):
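For reference, get_corpus() now expects information_source_id in input.json next to the fields it already read. Below is a minimal sketch of such a payload, based only on the keys visible in this diff; the values are made-up placeholders, and fields not shown here (embeddings, ids, training ids) are omitted rather than guessed.

import json

# Hypothetical minimal payload; keys taken from the diff, values invented.
payload = {
    "information_source_id": "some-information-source-id",
    "embedding_type": "...",
    "embedding_name": "...",
    "labels": {"manual": []},  # list-valued labels route to the extractor path
}
with open("input.json", "w") as f:
    json.dump(payload, f)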