Skip to content

Commit

Permalink
added support for OneClassSVM for single class classification / outli…
Browse files Browse the repository at this point in the history
…er detection #12
  • Loading branch information
davidberenstein1957 committed Dec 30, 2022
1 parent 786f7c0 commit ad7a4db
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 20 deletions.
57 changes: 57 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,63 @@ print(nlp("I am looking for furniture and kitchen equipment.")._.cats)
#
# [{"label": "furniture", "score": 0.92}, {"label": "kitchen", "score": 0.91}]
```
### Outlier detection
Sometimes it is useful to be able to do outlier detection or binary classification. This can be approached using
a binary training dataset. Alternatively, I have also implemented support for a `OneClassSVM` for [outlier detection using a single label](https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html). Note that this method does not return probabilities; the data is still formatted as label-score value pairs to ensure uniformity.

Approach 1:
```python
import spacy
import classy_classification

data_binary = {
"inlier": ["This text is about chairs.",
"Couches, benches and televisions.",
"I really need to get a new sofa."],
"outlier": ["Text about kitchen equipment",
"This text is about politics",
"Comments about AI and stuff."]
}

nlp = spacy.load("en_core_web_md")
nlp.add_pipe(
"text_categorizer",
config={
"data": data_binary,
}
)

print(nlp("This text is a random text")._.cats)

# Output:
#
# [{'inlier': 0.2926672385488411, 'outlier': 0.707332761451159}]
```

Approach 2:
```python
import spacy
import classy_classification

data_singular = {
"furniture": ["This text is about chairs.",
"Couches, benches and televisions.",
"I really need to get a new sofa.",
"We have a new dinner table."]
}
nlp = spacy.load("en_core_web_md")
nlp.add_pipe(
"text_categorizer",
config={
"data": data_singular,
}
)

print(nlp("This text is a random text")._.cats)

# Output:
#
# [{'furniture': 0, 'not_furniture': 1}]
```
## Sentence-transformer embeddings
```python
import spacy
Expand Down
53 changes: 34 additions & 19 deletions classy_classification/classifiers/classy_skeleton.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.svm import SVC, OneClassSVM
from spacy.language import Language
from spacy.tokens import Doc, Span

Expand Down Expand Up @@ -49,7 +49,7 @@ def __init__(
}.
"""
self.multi_label = multi_label
self.set_config(config)

self.data = data
self.name = name
self.nlp = nlp
Expand All @@ -63,12 +63,17 @@ def __init__(
if include_doc:
Doc.set_extension("cats", default=None, force=True)
self.set_training_data()
self.set_config(config)
self.set_classification_model()

def set_training_data(self):
    """Placeholder hook that concrete subclasses must override.

    Raises:
        NotImplementedError: always; the base class provides no implementation.
    """
    # It is the subclass (not a superclass) that has to supply this method.
    raise NotImplementedError("Needs to be overwritten by subclass")

def set_config(self, config: Union[dict, None] = None):
    """Placeholder hook that concrete subclasses must override.

    The constructor calls this hook as ``self.set_config(config)``, so the
    base signature accepts ``config`` too. Without it, a subclass that
    forgot to override the hook would fail with a ``TypeError`` about the
    argument count instead of the intended ``NotImplementedError``.

    Args:
        config (Union[dict, None], optional): configuration passed through by
            the constructor; ignored by this base implementation.

    Raises:
        NotImplementedError: always; the base class provides no implementation.
    """
    raise NotImplementedError("Needs to be overwritten by subclass")

def set_classification_model(self, config: dict = None):
    """Placeholder hook that concrete subclasses must override.

    The signature mirrors the overriding implementation in this file, which
    accepts an optional ``config``; calls without arguments keep working.

    Args:
        config (dict, optional): ignored by this base implementation.

    Raises:
        NotImplementedError: always; the base class provides no implementation.
    """
    raise NotImplementedError("Needs to be overwritten by subclass")
Expand All @@ -94,9 +99,16 @@ def get_prediction(self, embeddings: List[List]) -> List[dict]:
Returns:
List[dict]: list of key-class proba-value dict
"""
pred_result = self.clf.predict_proba(embeddings)

return self.proba_to_dict(pred_result)
if len(self.label_list) > 1:
pred_result = self.clf.predict_proba(embeddings)
pred_result = self.proba_to_dict(pred_result)
else:
pred_result = self.clf.predict(embeddings)
label = self.label_list[0]
pred_result = [
{label: 1, f"not_{label}": 0} if pred == 1 else {label: 0, f"not_{label}": 1} for pred in pred_result
]
return pred_result


class ClassySkeletonFewShot(ClassySkeleton):
Expand All @@ -110,12 +122,18 @@ def set_config(self, config: Union[dict, None] = None):
"""

if config is None:
config = {
"C": [1, 2, 5, 10, 20, 100],
"kernel": ["linear", "rbf"],
"max_cross_validation_folds": 5,
"seed": None,
}
if len(self.label_list) > 1:
config = {
"C": [1, 2, 5, 10, 20, 100],
"kernel": ["linear", "rbf", "poly"],
"max_cross_validation_folds": 5,
"seed": None,
}
else:
config = {
"nu": 0.1,
"kernel": "rbf",
}

self.config = config

Expand Down Expand Up @@ -163,18 +181,15 @@ def set_classification_model(self, config: dict = None):
scoring="f1_weighted",
verbose=self.verbose,
)

self.clf.fit(self.X, self.y)
elif len(self.label_list) == 1:
raise NotImplementedError(
"I have not managed to take an in-depth look into probabilistic predictions for single class"
" classification yet. Feel free to provide your input on"
" https://github.com/Pandora-Intelligence/classy-classification/issues/12."
)
if self.multi_label:
raise ValueError("Cannot apply one class classification with multiple-labels.")
self.clf = OneClassSVM(verbose=self.verbose, **self.config)
self.clf.fit(self.X)
else:
raise ValueError("Provide input data with Dict[key, List].")

self.clf.fit(self.X, self.y)

def proba_to_dict(self, pred_results: List[List]) -> List[dict]:
"""converts probability prediciton to a formatted key-class proba-value list
Expand Down
2 changes: 1 addition & 1 deletion classy_classification/classifiers/classy_standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def __init__(
self.model = model
self.device = device
self.verbose = verbose
self.set_config(config)
self.set_embedding_model()
self.set_training_data()
self.set_config(config)
self.set_classification_model()
19 changes: 19 additions & 0 deletions tests/test_spacy_external_single_label.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pytest
import spacy

from classy_classification.examples.data import training_data_single_class


@pytest.fixture
def spacy_external_single_label():
    """Return a blank English pipeline with a ``text_categorizer`` pipe.

    The pipe is configured with the single-class training data only.
    """
    pipeline = spacy.blank("en")
    categorizer_config = {"data": training_data_single_class}
    pipeline.add_pipe("text_categorizer", config=categorizer_config)
    return pipeline


def test_spacy_external_single_label(spacy_external_single_label):
    """Smoke-test single-text and batched prediction on the single-class pipeline."""
    texts = training_data_single_class["politics"]
    _ = spacy_external_single_label(texts[0])
    # Language.pipe is lazy: it must be consumed, otherwise no document is
    # processed and the batched path is never exercised.
    _ = list(spacy_external_single_label.pipe(texts))
16 changes: 16 additions & 0 deletions tests/test_spacy_internal_single_label.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest
import spacy

from classy_classification.examples.data import training_data_single_class


@pytest.fixture(params=["en_core_web_md", "en_core_web_trf"])
def spacy_internal_single_label(request):
    """Return a loaded spaCy pipeline with a ``text_categorizer`` pipe.

    Parametrized over the model names listed in the decorator; the pipe is
    configured with the single-class training data only.
    """
    model_name = request.param
    pipeline = spacy.load(model_name)
    categorizer_config = {"data": training_data_single_class}
    pipeline.add_pipe("text_categorizer", config=categorizer_config)
    return pipeline


def test_spacy_internal_single_label(spacy_internal_single_label):
    """Smoke-test single-text and batched prediction on the single-class pipeline."""
    texts = training_data_single_class["politics"]
    _ = spacy_internal_single_label(texts[0])
    # Language.pipe is lazy: it must be consumed, otherwise no document is
    # processed and the batched path is never exercised.
    _ = list(spacy_internal_single_label.pipe(texts))
15 changes: 15 additions & 0 deletions tests/test_standalone_single_label.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest

from classy_classification import ClassyClassifier
from classy_classification.examples.data import training_data_single_class


@pytest.fixture
def standalone_single_label():
    """Return a ``ClassyClassifier`` built from the single-class training data."""
    return ClassyClassifier(data=training_data_single_class)


def test_standalone_single_label(standalone_single_label):
    """Smoke-test the standalone classifier: one direct call, one ``pipe`` call."""
    politics_texts = training_data_single_class["politics"]
    # Results are intentionally discarded; the test only checks nothing raises.
    standalone_single_label(politics_texts[0])
    standalone_single_label.pipe(politics_texts)

0 comments on commit ad7a4db

Please sign in to comment.