Merge pull request #1799 from doccano/enhancement/datasetExport

Enhancement/dataset export
doccano · Apr 25, 2022 · 4144d22 · 4144d22
2 parents 6013413 + a6928fd
commit 4144d22
Show file tree

Hide file tree

Showing 37 changed files with 1,444 additions and 1,233 deletions.
diff --git a/backend/data_export/celery_tasks.py b/backend/data_export/celery_tasks.py
@@ -1,20 +1,64 @@
+import os
+import shutil
+import uuid
+
 from celery import shared_task
 from celery.utils.log import get_task_logger
 from django.conf import settings
 from django.shortcuts import get_object_or_404
 
-from .pipeline.factories import create_repository, create_writer
+from .pipeline.dataset import Dataset
+from .pipeline.factories import create_formatter, create_labels, create_writer
 from .pipeline.services import ExportApplicationService
-from projects.models import Project
+from data_export.models import ExportedExample
+from projects.models import Member, Project
 
 logger = get_task_logger(__name__)
 
 
+def create_collaborative_dataset(project: Project, dirpath: str, confirmed_only: bool, formatters, writer):
+    """Export the project's examples into a single file named ``all.<extension>``.
+
+    Args:
+        project: Project whose examples are exported.
+        dirpath: Directory in which the export file is written.
+        confirmed_only: If true, export ``ExportedExample.objects.confirmed(project)``;
+            otherwise export every example of the project.
+        formatters: Formatters handed to ``ExportApplicationService``.
+        writer: Writer handed to ``ExportApplicationService``; its ``extension``
+            attribute becomes the file suffix.
+    """
+    is_text_project = project.is_text_project
+    manager = ExportedExample.objects
+    examples = manager.confirmed(project) if confirmed_only else manager.filter(project=project)
+    dataset = Dataset(examples, create_labels(project, examples), is_text_project)
+
+    filepath = os.path.join(dirpath, f"all.{writer.extension}")
+    ExportApplicationService(dataset, formatters, writer).export(filepath)
+
+
+def create_individual_dataset(project: Project, dirpath: str, confirmed_only: bool, formatters, writer):
+    """Export one file per project member, named ``<member.username>.<extension>``.
+
+    Args:
+        project: Project whose examples are exported.
+        dirpath: Directory in which the per-member files are written.
+        confirmed_only: If true, each member's file is built from
+            ``ExportedExample.objects.confirmed(project, user=member.user)``;
+            otherwise from every example of the project.
+        formatters: Formatters handed to ``ExportApplicationService``.
+        writer: Writer handed to ``ExportApplicationService``; its ``extension``
+            attribute becomes the file suffix.
+    """
+    is_text_project = project.is_text_project
+    for member in Member.objects.filter(project=project):
+        # A new queryset is built on every iteration, one per output file.
+        examples = (
+            ExportedExample.objects.confirmed(project, user=member.user)
+            if confirmed_only
+            else ExportedExample.objects.filter(project=project)
+        )
+        member_labels = create_labels(project, examples, member.user)
+        dataset = Dataset(examples, member_labels, is_text_project)
+
+        target = os.path.join(dirpath, f"{member.username}.{writer.extension}")
+        ExportApplicationService(dataset, formatters, writer).export(target)
+
+
 @shared_task
-def export_dataset(project_id, file_format: str, export_approved=False):
+def export_dataset(project_id, file_format: str, confirmed_only=False):
+    """Export a project's dataset into a zip archive and return the archive path.
+
+    A uniquely named working directory is created under ``settings.MEDIA_ROOT``,
+    filled by the collaborative or the individual exporter (depending on
+    ``project.collaborative_annotation``), zipped, and then removed.
+
+    Args:
+        project_id: Primary key of the project to export.
+        file_format: Format name passed to ``create_formatter`` and ``create_writer``.
+        confirmed_only: Forwarded to the dataset exporters.
+
+    Returns:
+        The path returned by ``shutil.make_archive`` for the created zip file.
+    """
     project = get_object_or_404(Project, pk=project_id)
-    repository = create_repository(project, file_format)
-    writer = create_writer(file_format)(settings.MEDIA_ROOT)
-    service = ExportApplicationService(repository, writer)
-    filepath = service.export(export_approved)
-    return filepath
+    dirpath = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()))
+    os.makedirs(dirpath, exist_ok=True)
+    try:
+        formatters = create_formatter(project, file_format)
+        writer = create_writer(file_format)
+        if project.collaborative_annotation:
+            create_collaborative_dataset(project, dirpath, confirmed_only, formatters, writer)
+        else:
+            create_individual_dataset(project, dirpath, confirmed_only, formatters, writer)
+        zip_file = shutil.make_archive(dirpath, "zip", dirpath)
+    finally:
+        # Remove the working directory even when the export fails, so a failed
+        # task does not leave a partial export behind under MEDIA_ROOT.
+        shutil.rmtree(dirpath)
+    return zip_file
diff --git a/backend/data_export/models.py b/backend/data_export/models.py
@@ -0,0 +1,81 @@
+from typing import Any, Dict, Protocol, Tuple
+
+from django.db import models
+
+from examples.models import Example
+from labels.models import Category, Relation, Span, TextLabel
+from projects.models import Project
+
+DATA = "data"
+
+
+class ExportedExampleManager(models.Manager):
+    """Manager that adds a query for confirmed examples."""
+
+    def confirmed(self, project: Project, user=None):
+        """Return the examples of ``project`` that are treated as confirmed.
+
+        For a collaborative project the result is every example of the project
+        except those matched by ``states=None``. Otherwise the result is the
+        examples of the project whose ``states__confirmed_by`` is ``user``.
+
+        Args:
+            project: Project whose examples are queried.
+            user: User whose confirmations are looked up; required when the
+                project is not collaborative.
+
+        Raises:
+            ValueError: If the project is not collaborative and ``user`` is None.
+        """
+        if project.collaborative_annotation:
+            return self.filter(project=project).exclude(states=None)
+        # Explicit check instead of ``assert``: asserts are removed under
+        # ``python -O``, which would silently filter on ``confirmed_by=None``.
+        if user is None:
+            raise ValueError("user must be given for a non-collaborative project.")
+        return self.filter(project=project, states__confirmed_by=user)
+
+
+class ExportedExample(Example):
+    """Proxy of ``Example`` that can serialize itself for export."""
+
+    objects = ExportedExampleManager()
+
+    class Meta:
+        proxy = True
+
+    def to_dict(self, is_text_project=True) -> Dict[str, Any]:
+        """Return the example as a flat dict.
+
+        The dict holds ``id``, the content under the ``DATA`` key (``text`` when
+        ``is_text_project`` is true, ``upload_name`` otherwise) and every entry
+        of ``meta``. ``meta`` is merged last, so its values win on key collisions.
+        """
+        if is_text_project:
+            content = self.text
+        else:
+            content = self.upload_name
+        return {"id": self.id, DATA: content, **self.meta}
+
+
+class ExportedLabel(Protocol):
+    """Structural type describing a label model that can be exported.
+
+    It declares an ``objects`` manager and three serialization methods. The
+    bodies given here only raise ``NotImplementedError``.
+    """
+
+    # Model manager used to query the label records.
+    objects: models.Manager
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the label as a dict; this default raises ``NotImplementedError``."""
+        raise NotImplementedError("Please implement this method in the subclass.")
+
+    def to_string(self) -> str:
+        """Return the label as a string; this default raises ``NotImplementedError``."""
+        raise NotImplementedError("Please implement this method in the subclass.")
+
+    def to_tuple(self) -> Tuple:
+        """Return the label as a tuple; this default raises ``NotImplementedError``."""
+        raise NotImplementedError("Please implement this method in the subclass.")
+
+
+class ExportedCategory(Category):
+    """Proxy of ``Category`` that is exported as the text of its label."""
+
+    class Meta:
+        proxy = True
+
+    def to_string(self) -> str:
+        """Return the ``text`` of the attached ``label``."""
+        label = self.label
+        return label.text
+
+
+class ExportedSpan(Span):
+    """Proxy of ``Span`` that can be exported as a dict or as a tuple."""
+
+    class Meta:
+        proxy = True
+
+    def to_dict(self):
+        """Return ``id``, label text, ``start_offset`` and ``end_offset`` as a dict."""
+        return dict(
+            id=self.id,
+            label=self.label.text,
+            start_offset=self.start_offset,
+            end_offset=self.end_offset,
+        )
+
+    def to_tuple(self):
+        """Return ``(start_offset, end_offset, label text)``."""
+        start, end = self.start_offset, self.end_offset
+        return start, end, self.label.text
+
+
+class ExportedRelation(Relation):
+    """Proxy of ``Relation`` that can be exported as a dict."""
+
+    class Meta:
+        proxy = True
+
+    def to_dict(self):
+        """Return the relation ``id``, the ``id`` of both ends and the ``text`` of its type."""
+        relation_id = self.id
+        source = self.from_id
+        target = self.to_id
+        return {
+            "id": relation_id,
+            "from_id": source.id,
+            "to_id": target.id,
+            "type": self.type.text,
+        }
+
+
+class ExportedText(TextLabel):
+    """Proxy of ``TextLabel`` that is exported as its ``text``."""
+
+    class Meta:
+        proxy = True
+
+    def to_string(self) -> str:
+        """Return the ``text`` attribute unchanged."""
+        text = self.text
+        return text
diff --git a/backend/data_export/pipeline/catalog.py b/backend/data_export/pipeline/catalog.py
@@ -1,10 +1,7 @@
 from collections import defaultdict
+from pathlib import Path
 from typing import Dict, List, Type
 
-from pydantic import BaseModel
-from typing_extensions import Literal
-
-from . import examples
 from projects.models import (
     DOCUMENT_CLASSIFICATION,
     IMAGE_CLASSIFICATION,
@@ -14,6 +11,8 @@
     SPEECH2TEXT,
 )
 
+EXAMPLE_DIR = Path(__file__).parent.resolve() / "examples"
+
 
 class Format:
     name = ""
@@ -27,40 +26,18 @@ def dict(cls):
 
 class CSV(Format):
     name = "CSV"
-    extension = "csv"
 
 
 class FastText(Format):
     name = "fastText"
-    extension = "txt"
 
 
 class JSON(Format):
     name = "JSON"
-    extension = "json"
 
 
 class JSONL(Format):
     name = "JSONL"
-    extension = "jsonl"
-
-
-class IntentAndSlot(Format):
-    name = "JSONL(intent and slot)"
-    extension = "jsonl"
-
-
-class JSONLRelation(Format):
-    name = "JSONL(relation)"
-    extension = "jsonl"
-
-
-class OptionDelimiter(BaseModel):
-    delimiter: Literal[",", "\t", ";", "|", " "] = ","
-
-
-class OptionNone(BaseModel):
-    pass
 
 
 class Options:
@@ -69,35 +46,46 @@ class Options:
     @classmethod
     def filter_by_task(cls, task_name: str):
         options = cls.options[task_name]
-        return [
-            {**file_format.dict(), **option.schema(), "example": example} for file_format, option, example in options
-        ]
+        return [{**file_format.dict(), "example": example} for file_format, example in options]
 
     @classmethod
-    def register(cls, task: str, file_format: Type[Format], option: Type[BaseModel], example: str):
-        cls.options[task].append((file_format, option, example))
+    def register(cls, task: str, file_format: Type[Format], file: Path):
+        example = cls.load_example(file)
+        cls.options[task].append((file_format, example))
+
+    @staticmethod
+    def load_example(file):
+        """Return the whole content of ``file``, read as UTF-8 text."""
+        with open(file, encoding="utf-8") as stream:
+            content = stream.read()
+        return content
 
 
 # Text Classification
-Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV)
-Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionNone, examples.Category_fastText)
-Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionNone, examples.Category_JSON)
-Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionNone, examples.Category_JSONL)
+TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR / "text_classification"
+Options.register(DOCUMENT_CLASSIFICATION, CSV, TEXT_CLASSIFICATION_DIR / "example.csv")
+Options.register(DOCUMENT_CLASSIFICATION, FastText, TEXT_CLASSIFICATION_DIR / "example.txt")
+Options.register(DOCUMENT_CLASSIFICATION, JSON, TEXT_CLASSIFICATION_DIR / "example.json")
+Options.register(DOCUMENT_CLASSIFICATION, JSONL, TEXT_CLASSIFICATION_DIR / "example.jsonl")
 
 # Sequence Labeling
-Options.register(SEQUENCE_LABELING, JSONL, OptionNone, examples.Offset_JSONL)
-Options.register(SEQUENCE_LABELING, JSONLRelation, OptionNone, examples.ENTITY_AND_RELATION_JSONL)
+SEQUENCE_LABELING_DIR = EXAMPLE_DIR / "sequence_labeling"
+RELATION_EXTRACTION_DIR = EXAMPLE_DIR / "relation_extraction"
+Options.register(SEQUENCE_LABELING, JSONL, SEQUENCE_LABELING_DIR / "example.jsonl")
+Options.register(SEQUENCE_LABELING, JSONL, RELATION_EXTRACTION_DIR / "example.jsonl")
 
 # Sequence to sequence
-Options.register(SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV)
-Options.register(SEQ2SEQ, JSON, OptionNone, examples.Text_JSON)
-Options.register(SEQ2SEQ, JSONL, OptionNone, examples.Text_JSONL)
+SEQ2SEQ_DIR = EXAMPLE_DIR / "sequence_to_sequence"
+Options.register(SEQ2SEQ, CSV, SEQ2SEQ_DIR / "example.csv")
+Options.register(SEQ2SEQ, JSON, SEQ2SEQ_DIR / "example.json")
+Options.register(SEQ2SEQ, JSONL, SEQ2SEQ_DIR / "example.jsonl")
 
 # Intent detection and slot filling
-Options.register(INTENT_DETECTION_AND_SLOT_FILLING, IntentAndSlot, OptionNone, examples.INTENT_JSONL)
+INTENT_DETECTION_DIR = EXAMPLE_DIR / "intent_detection"
+Options.register(INTENT_DETECTION_AND_SLOT_FILLING, JSONL, INTENT_DETECTION_DIR / "example.jsonl")
 
 # Image Classification
-Options.register(IMAGE_CLASSIFICATION, JSONL, OptionNone, examples.CategoryImageClassification)
+IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR / "image_classification"
+Options.register(IMAGE_CLASSIFICATION, JSONL, IMAGE_CLASSIFICATION_DIR / "example.jsonl")
 
 # Speech to Text
-Options.register(SPEECH2TEXT, JSONL, OptionNone, examples.Speech2Text)
+SPEECH2TEXT_DIR = EXAMPLE_DIR / "speech_to_text"
+Options.register(SPEECH2TEXT, JSONL, SPEECH2TEXT_DIR / "example.jsonl")
diff --git a/backend/data_export/pipeline/data.py b/backend/data_export/pipeline/data.py
diff --git a/backend/data_export/pipeline/dataset.py b/backend/data_export/pipeline/dataset.py
@@ -0,0 +1,24 @@
+from typing import Any, Dict, Iterator, List
+
+import pandas as pd
+from django.db.models.query import QuerySet
+
+from .labels import Labels
+from data_export.models import ExportedExample
+
+
+class Dataset:
+    """Iterable that joins examples with their labels, one dict per example."""
+
+    def __init__(self, examples: QuerySet[ExportedExample], labels: List[Labels], is_text_project=True):
+        """Store the examples, the label collections and the project-kind flag."""
+        self.is_text_project = is_text_project
+        self.labels = labels
+        self.examples = examples
+
+    def __iter__(self) -> Iterator[Dict[str, Any]]:
+        """Yield, for each example, its ``to_dict`` output updated with the
+        result of ``find_by(example.id)`` from every entry of ``labels``, in order.
+        """
+        for example in self.examples:
+            record = example.to_dict(self.is_text_project)
+            for label_group in self.labels:
+                record.update(**label_group.find_by(example.id))
+            yield record
+
+    def to_dataframe(self) -> pd.DataFrame:
+        """Build a ``pandas.DataFrame`` from the dicts produced by iterating this dataset."""
+        frame = pd.DataFrame(self)
+        return frame