added support for OneClassSVM for single class classification / outli…

…er detection #12
davidberenstein1957 · Dec 30, 2022 · ad7a4db · ad7a4db
1 parent 786f7c0
commit ad7a4db
Show file tree

Hide file tree

Showing 6 changed files with 142 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -140,6 +140,63 @@ print(nlp("I am looking for furniture and kitchen equipment.")._.cats)
 #
 # [{"label": "furniture", "score": 0.92}, {"label": "kitchen", "score": 0.91}]
 ```
+### Outlier detection
+Sometimes it is worthwhile to be able to do outlier detection or binary classification. This can be approached either by using
+a binary training dataset or, as I have also implemented, by using a `OneClassSVM` for [outlier detection using a single label](https://scikit-learn.org/stable/modules/generated/sklearn.svm.OneClassSVM.html). Note that this method does not return probabilities, but the data is formatted as label-score pairs to ensure uniformity.
+
+Approach 1:
+```python
+import spacy
+import classy_classification
+
+data_binary = {
+    "inlier": ["This text is about chairs.",
+               "Couches, benches and televisions.",
+               "I really need to get a new sofa."],
+    "outlier": ["Text about kitchen equipment",
+                "This text is about politics",
+                "Comments about AI and stuff."]
+}
+
+nlp = spacy.load("en_core_web_md")
+nlp.add_pipe(
+    "text_categorizer",
+    config={
+        "data": data_binary,
+    }
+)
+
+print(nlp("This text is a random text")._.cats)
+
+# Output:
+#
+# [{'inlier': 0.2926672385488411, 'outlier': 0.707332761451159}]
+```
+Approach 2:
+```python
+import spacy
+import classy_classification
+
+data_singular = {
+    "furniture": ["This text is about chairs.",
+               "Couches, benches and televisions.",
+               "I really need to get a new sofa.",
+               "We have a new dinner table."]
+}
+nlp = spacy.load("en_core_web_md")
+nlp.add_pipe(
+    "text_categorizer",
+    config={
+        "data": data_singular,
+    }
+)
+
+print(nlp("This text is a random text")._.cats)
+
+# Output:
+#
+# [{'furniture': 0, 'not_furniture': 1}]
+```
 ## Sentence-transfomer embeddings
 ```python
 import spacy

diff --git a/classy_classification/classifiers/classy_skeleton.py b/classy_classification/classifiers/classy_skeleton.py
@@ -6,7 +6,7 @@
 from sklearn import preprocessing
 from sklearn.model_selection import GridSearchCV
 from sklearn.multiclass import OneVsRestClassifier
-from sklearn.svm import SVC
+from sklearn.svm import SVC, OneClassSVM
 from spacy.language import Language
 from spacy.tokens import Doc, Span
 
@@ -49,7 +49,7 @@ def __init__(
                 }.
         """
         self.multi_label = multi_label
-        self.set_config(config)
+
         self.data = data
         self.name = name
         self.nlp = nlp
@@ -63,12 +63,17 @@ def __init__(
         if include_doc:
             Doc.set_extension("cats", default=None, force=True)
         self.set_training_data()
+        self.set_config(config)
         self.set_classification_model()
 
     def set_training_data(self):
         """Overwritten by super class"""
         raise NotImplementedError("Needs to be overwritten by superclass")
 
+    def set_config(self):
+        """Overwritten by super class"""
+        raise NotImplementedError("Needs to be overwritten by superclass")
+
     def set_classification_model(self):
         """Overwritten by super class"""
         raise NotImplementedError("Needs to be overwritten by superclass")
@@ -94,9 +99,16 @@ def get_prediction(self, embeddings: List[List]) -> List[dict]:
         Returns:
             List[dict]: list of key-class proba-value dict
         """
-        pred_result = self.clf.predict_proba(embeddings)
-
-        return self.proba_to_dict(pred_result)
+        if len(self.label_list) > 1:
+            pred_result = self.clf.predict_proba(embeddings)
+            pred_result = self.proba_to_dict(pred_result)
+        else:
+            pred_result = self.clf.predict(embeddings)
+            label = self.label_list[0]
+            pred_result = [
+                {label: 1, f"not_{label}": 0} if pred == 1 else {label: 0, f"not_{label}": 1} for pred in pred_result
+            ]
+        return pred_result
 
 
 class ClassySkeletonFewShot(ClassySkeleton):
@@ -110,12 +122,18 @@ def set_config(self, config: Union[dict, None] = None):
         """
 
         if config is None:
-            config = {
-                "C": [1, 2, 5, 10, 20, 100],
-                "kernel": ["linear", "rbf"],
-                "max_cross_validation_folds": 5,
-                "seed": None,
-            }
+            if len(self.label_list) > 1:
+                config = {
+                    "C": [1, 2, 5, 10, 20, 100],
+                    "kernel": ["linear", "rbf", "poly"],
+                    "max_cross_validation_folds": 5,
+                    "seed": None,
+                }
+            else:
+                config = {
+                    "nu": 0.1,
+                    "kernel": "rbf",
+                }
 
         self.config = config
 
@@ -163,18 +181,15 @@ def set_classification_model(self, config: dict = None):
                 scoring="f1_weighted",
                 verbose=self.verbose,
             )
-
+            self.clf.fit(self.X, self.y)
         elif len(self.label_list) == 1:
-            raise NotImplementedError(
-                "I have not managed to take an in-depth look into probabilistic predictions for single class"
-                " classification yet. Feel free to provide your input on"
-                " https://github.com/Pandora-Intelligence/classy-classification/issues/12."
-            )
+            if self.multi_label:
+                raise ValueError("Cannot apply one class classification with multiple-labels.")
+            self.clf = OneClassSVM(verbose=self.verbose, **self.config)
+            self.clf.fit(self.X)
         else:
             raise ValueError("Provide input data with Dict[key, List].")
 
-        self.clf.fit(self.X, self.y)
-
     def proba_to_dict(self, pred_results: List[List]) -> List[dict]:
         """converts probability prediciton to a formatted key-class proba-value list
 

diff --git a/classy_classification/classifiers/classy_standalone.py b/classy_classification/classifiers/classy_standalone.py
@@ -63,7 +63,7 @@ def __init__(
         self.model = model
         self.device = device
         self.verbose = verbose
-        self.set_config(config)
         self.set_embedding_model()
         self.set_training_data()
+        self.set_config(config)
         self.set_classification_model()
diff --git a/tests/test_spacy_external_single_label.py b/tests/test_spacy_external_single_label.py
@@ -0,0 +1,19 @@
+import pytest
+import spacy
+
+from classy_classification.examples.data import training_data_single_class
+
+
+@pytest.fixture
+def spacy_external_single_label():
+    nlp = spacy.blank("en")
+    nlp.add_pipe(
+        "text_categorizer",
+        config={"data": training_data_single_class},
+    )
+    return nlp
+
+
+def test_spacy_external_single_label(spacy_external_single_label):
+    _ = spacy_external_single_label(training_data_single_class["politics"][0])
+    _ = spacy_external_single_label.pipe(training_data_single_class["politics"])
diff --git a/tests/test_spacy_internal_single_label.py b/tests/test_spacy_internal_single_label.py
@@ -0,0 +1,16 @@
+import pytest
+import spacy
+
+from classy_classification.examples.data import training_data_single_class
+
+
+@pytest.fixture(params=["en_core_web_md", "en_core_web_trf"])
+def spacy_internal_single_label(request):
+    nlp = spacy.load(request.param)
+    nlp.add_pipe("text_categorizer", config={"data": training_data_single_class})
+    return nlp
+
+
+def test_spacy_internal_single_label(spacy_internal_single_label):
+    _ = spacy_internal_single_label(training_data_single_class["politics"][0])
+    _ = spacy_internal_single_label.pipe(training_data_single_class["politics"])
diff --git a/tests/test_standalone_single_label.py b/tests/test_standalone_single_label.py
@@ -0,0 +1,15 @@
+import pytest
+
+from classy_classification import ClassyClassifier
+from classy_classification.examples.data import training_data_single_class
+
+
+@pytest.fixture
+def standalone_single_label():
+    classifier = ClassyClassifier(data=training_data_single_class)
+    return classifier
+
+
+def test_standalone_single_label(standalone_single_label):
+    _ = standalone_single_label(training_data_single_class["politics"][0])
+    _ = standalone_single_label.pipe(training_data_single_class["politics"])