Skip to content

Commit

Permalink
Merge pull request #1799 from doccano/enhancement/datasetExport
Browse files — browse the repository at this point in the history
Enhancement/dataset export
  • Loading branch information
Hironsan committed Apr 25, 2022
2 parents 6013413 + a6928fd commit 4144d22
Show file tree
Hide file tree
Showing 37 changed files with 1,444 additions and 1,233 deletions.
60 changes: 52 additions & 8 deletions backend/data_export/celery_tasks.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,64 @@
import os
import shutil
import uuid

from celery import shared_task
from celery.utils.log import get_task_logger
from django.conf import settings
from django.shortcuts import get_object_or_404

from .pipeline.factories import create_repository, create_writer
from .pipeline.dataset import Dataset
from .pipeline.factories import create_formatter, create_labels, create_writer
from .pipeline.services import ExportApplicationService
from projects.models import Project
from data_export.models import ExportedExample
from projects.models import Member, Project

logger = get_task_logger(__name__)


def create_collaborative_dataset(project: Project, dirpath: str, confirmed_only: bool, formatters, writer):
    """Export a collaboratively annotated project into a single file.

    The output is written to ``<dirpath>/all.<writer.extension>``.

    Args:
        project: Project whose examples are exported.
        dirpath: Directory that receives the export file.
        confirmed_only: When true, export only what
            ``ExportedExample.objects.confirmed(project)`` returns;
            otherwise export every example of the project.
        formatters: Passed through to ``ExportApplicationService``.
        writer: Passed through to ``ExportApplicationService``; its
            ``extension`` attribute supplies the file suffix.
    """
    is_text_project = project.is_text_project
    manager = ExportedExample.objects
    examples = manager.confirmed(project) if confirmed_only else manager.filter(project=project)
    dataset = Dataset(examples, create_labels(project, examples), is_text_project)

    target = os.path.join(dirpath, f"all.{writer.extension}")
    ExportApplicationService(dataset, formatters, writer).export(target)


def create_individual_dataset(project: Project, dirpath: str, confirmed_only: bool, formatters, writer):
    """Export one file per project member.

    For every ``Member`` of *project* a file named
    ``<member.username>.<writer.extension>`` is written into *dirpath*.

    Args:
        project: Project whose examples are exported.
        dirpath: Directory that receives the export files.
        confirmed_only: When true, each member's file is built from
            ``ExportedExample.objects.confirmed(project, user=member.user)``;
            otherwise from all examples of the project.
        formatters: Passed through to ``ExportApplicationService``.
        writer: Passed through to ``ExportApplicationService``; its
            ``extension`` attribute supplies the file suffix.
    """
    is_text_project = project.is_text_project
    for member in Member.objects.filter(project=project):
        examples = (
            ExportedExample.objects.confirmed(project, user=member.user)
            if confirmed_only
            else ExportedExample.objects.filter(project=project)
        )
        # Labels are always requested for this member's user, even when
        # the example set is not restricted to that user.
        member_labels = create_labels(project, examples, member.user)
        dataset = Dataset(examples, member_labels, is_text_project)

        exporter = ExportApplicationService(dataset, formatters, writer)
        exporter.export(os.path.join(dirpath, f"{member.username}.{writer.extension}"))


@shared_task
def export_dataset(project_id, file_format: str, confirmed_only=False):
    """Export a project's dataset and return the path of the resulting zip.

    The export files are first written into a uniquely named staging
    directory under ``settings.MEDIA_ROOT``. That directory is then
    archived next to itself as ``<staging dir>.zip`` and removed.

    Args:
        project_id: Primary key of the ``Project`` to export.
        file_format: Format name handed to ``create_formatter`` and
            ``create_writer``.
        confirmed_only: Forwarded to the dataset builders to restrict the
            export to confirmed examples.

    Returns:
        The archive path returned by ``shutil.make_archive``.
    """
    project = get_object_or_404(Project, pk=project_id)
    dirpath = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()))
    os.makedirs(dirpath, exist_ok=True)
    try:
        formatters = create_formatter(project, file_format)
        writer = create_writer(file_format)
        if project.collaborative_annotation:
            create_collaborative_dataset(project, dirpath, confirmed_only, formatters, writer)
        else:
            create_individual_dataset(project, dirpath, confirmed_only, formatters, writer)
        # base_name == root_dir: the archive lands beside the staging
        # directory, so removing the directory below does not touch it.
        return shutil.make_archive(dirpath, "zip", dirpath)
    finally:
        # The staging directory is scratch space only; remove it even when
        # the export or archiving step raised, so failed runs do not pile
        # up directories under MEDIA_ROOT.
        shutil.rmtree(dirpath)
81 changes: 81 additions & 0 deletions backend/data_export/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from typing import Any, Dict, Protocol, Tuple

from django.db import models

from examples.models import Example
from labels.models import Category, Relation, Span, TextLabel
from projects.models import Project

DATA = "data"


class ExportedExampleManager(models.Manager):
    """Manager that adds a "confirmed examples" query for exports."""

    def confirmed(self, project: Project, user=None):
        """Return the confirmed examples of *project*.

        Args:
            project: Project whose examples are queried.
            user: User whose confirmations count. Required when the
                project is not collaborative; unused otherwise.

        Returns:
            For a collaborative project, the project's examples excluding
            those whose ``states`` is ``None``. Otherwise, the project's
            examples filtered by ``states__confirmed_by=user``.

        Raises:
            ValueError: If the project is not collaborative and *user* is
                ``None``.
        """
        if project.collaborative_annotation:
            return self.filter(project=project).exclude(states=None)
        # An explicit check instead of ``assert``: assertions are removed
        # under ``python -O``, which would let ``None`` through to the
        # ``states__confirmed_by`` lookup below.
        if user is None:
            raise ValueError("user is required to select confirmed examples of a non-collaborative project.")
        return self.filter(project=project, states__confirmed_by=user)


class ExportedExample(Example):
    """Proxy model over ``Example`` that adds export helpers."""

    objects = ExportedExampleManager()

    def to_dict(self, is_text_project=True) -> Dict[str, Any]:
        """Return this example as a flat dictionary.

        The result holds ``"id"``, the ``DATA`` key (``self.text`` for a
        text project, ``self.upload_name`` otherwise) and then every entry
        of ``self.meta``. Meta entries are merged last, so a meta key that
        collides with an earlier key replaces its value.
        """
        if is_text_project:
            content = self.text
        else:
            content = self.upload_name
        return {"id": self.id, DATA: content, **self.meta}

    class Meta:
        proxy = True


class ExportedLabel(Protocol):
    """Structural interface for label objects used by the exporter.

    Each method raises ``NotImplementedError`` here; concrete label
    classes supply the representations they support.
    """

    objects: models.Manager

    def to_dict(self) -> Dict[str, Any]:
        """Return the label as a dictionary."""
        raise NotImplementedError("Please implement this method in the subclass.")

    def to_string(self) -> str:
        """Return the label as a string."""
        raise NotImplementedError("Please implement this method in the subclass.")

    def to_tuple(self) -> Tuple:
        """Return the label as a tuple."""
        raise NotImplementedError("Please implement this method in the subclass.")


class ExportedCategory(Category):
    """Proxy model over ``Category`` with a string export form."""

    def to_string(self) -> str:
        """Return the text of the category's label type."""
        label_type = self.label
        return label_type.text

    class Meta:
        proxy = True


class ExportedSpan(Span):
    """Proxy model over ``Span`` with dictionary and tuple export forms."""

    def to_dict(self):
        """Return the span's id, label text and offsets as a dictionary."""
        return dict(
            id=self.id,
            label=self.label.text,
            start_offset=self.start_offset,
            end_offset=self.end_offset,
        )

    def to_tuple(self):
        """Return ``(start_offset, end_offset, label text)``."""
        return (self.start_offset, self.end_offset, self.label.text)

    class Meta:
        proxy = True


class ExportedRelation(Relation):
    """Proxy model over ``Relation`` with a dictionary export form."""

    def to_dict(self):
        """Return the relation's id, endpoint ids and type text."""
        source = self.from_id
        target = self.to_id
        return {
            "id": self.id,
            "from_id": source.id,
            "to_id": target.id,
            "type": self.type.text,
        }

    class Meta:
        proxy = True


class ExportedText(TextLabel):
    """Proxy model over ``TextLabel`` with a string export form."""

    def to_string(self) -> str:
        """Return the label's ``text`` attribute."""
        text = self.text
        return text

    class Meta:
        proxy = True
74 changes: 31 additions & 43 deletions backend/data_export/pipeline/catalog.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Type

from pydantic import BaseModel
from typing_extensions import Literal

from . import examples
from projects.models import (
DOCUMENT_CLASSIFICATION,
IMAGE_CLASSIFICATION,
Expand All @@ -14,6 +11,8 @@
SPEECH2TEXT,
)

EXAMPLE_DIR = Path(__file__).parent.resolve() / "examples"


class Format:
name = ""
Expand All @@ -27,40 +26,18 @@ def dict(cls):

class CSV(Format):
name = "CSV"
extension = "csv"


class FastText(Format):
name = "fastText"
extension = "txt"


class JSON(Format):
name = "JSON"
extension = "json"


class JSONL(Format):
name = "JSONL"
extension = "jsonl"


class IntentAndSlot(Format):
name = "JSONL(intent and slot)"
extension = "jsonl"


class JSONLRelation(Format):
name = "JSONL(relation)"
extension = "jsonl"


class OptionDelimiter(BaseModel):
delimiter: Literal[",", "\t", ";", "|", " "] = ","


class OptionNone(BaseModel):
pass


class Options:
Expand All @@ -69,35 +46,46 @@ class Options:
@classmethod
def filter_by_task(cls, task_name: str):
    """Return the export options registered for *task_name*.

    Each registered ``(file_format, example)`` pair becomes one
    dictionary: the entries of ``file_format.dict()`` plus an
    ``"example"`` key holding the example text.
    """
    described = []
    for file_format, example in cls.options[task_name]:
        entry = {**file_format.dict()}
        entry["example"] = example
        described.append(entry)
    return described

@classmethod
def register(cls, task: str, file_format: Type[Format], file: Path):
    """Register *file_format* as an export option for *task*.

    The example text is read from *file* with ``load_example`` at
    registration time and stored together with the format.
    """
    entry = (file_format, cls.load_example(file))
    cls.options[task].append(entry)

@staticmethod
def load_example(file):
with open(file, encoding="utf-8") as f:
return f.read()


# Text Classification
Options.register(DOCUMENT_CLASSIFICATION, CSV, OptionDelimiter, examples.Category_CSV)
Options.register(DOCUMENT_CLASSIFICATION, FastText, OptionNone, examples.Category_fastText)
Options.register(DOCUMENT_CLASSIFICATION, JSON, OptionNone, examples.Category_JSON)
Options.register(DOCUMENT_CLASSIFICATION, JSONL, OptionNone, examples.Category_JSONL)
TEXT_CLASSIFICATION_DIR = EXAMPLE_DIR / "text_classification"
Options.register(DOCUMENT_CLASSIFICATION, CSV, TEXT_CLASSIFICATION_DIR / "example.csv")
Options.register(DOCUMENT_CLASSIFICATION, FastText, TEXT_CLASSIFICATION_DIR / "example.txt")
Options.register(DOCUMENT_CLASSIFICATION, JSON, TEXT_CLASSIFICATION_DIR / "example.json")
Options.register(DOCUMENT_CLASSIFICATION, JSONL, TEXT_CLASSIFICATION_DIR / "example.jsonl")

# Sequence Labeling
Options.register(SEQUENCE_LABELING, JSONL, OptionNone, examples.Offset_JSONL)
Options.register(SEQUENCE_LABELING, JSONLRelation, OptionNone, examples.ENTITY_AND_RELATION_JSONL)
SEQUENCE_LABELING_DIR = EXAMPLE_DIR / "sequence_labeling"
RELATION_EXTRACTION_DIR = EXAMPLE_DIR / "relation_extraction"
Options.register(SEQUENCE_LABELING, JSONL, SEQUENCE_LABELING_DIR / "example.jsonl")
Options.register(SEQUENCE_LABELING, JSONL, RELATION_EXTRACTION_DIR / "example.jsonl")

# Sequence to sequence
Options.register(SEQ2SEQ, CSV, OptionDelimiter, examples.Text_CSV)
Options.register(SEQ2SEQ, JSON, OptionNone, examples.Text_JSON)
Options.register(SEQ2SEQ, JSONL, OptionNone, examples.Text_JSONL)
SEQ2SEQ_DIR = EXAMPLE_DIR / "sequence_to_sequence"
Options.register(SEQ2SEQ, CSV, SEQ2SEQ_DIR / "example.csv")
Options.register(SEQ2SEQ, JSON, SEQ2SEQ_DIR / "example.json")
Options.register(SEQ2SEQ, JSONL, SEQ2SEQ_DIR / "example.jsonl")

# Intent detection and slot filling
Options.register(INTENT_DETECTION_AND_SLOT_FILLING, IntentAndSlot, OptionNone, examples.INTENT_JSONL)
INTENT_DETECTION_DIR = EXAMPLE_DIR / "intent_detection"
Options.register(INTENT_DETECTION_AND_SLOT_FILLING, JSONL, INTENT_DETECTION_DIR / "example.jsonl")

# Image Classification
Options.register(IMAGE_CLASSIFICATION, JSONL, OptionNone, examples.CategoryImageClassification)
IMAGE_CLASSIFICATION_DIR = EXAMPLE_DIR / "image_classification"
Options.register(IMAGE_CLASSIFICATION, JSONL, IMAGE_CLASSIFICATION_DIR / "example.jsonl")

# Speech to Text
Options.register(SPEECH2TEXT, JSONL, OptionNone, examples.Speech2Text)
SPEECH2TEXT_DIR = EXAMPLE_DIR / "speech_to_text"
Options.register(SPEECH2TEXT, JSONL, SPEECH2TEXT_DIR / "example.jsonl")
18 changes: 0 additions & 18 deletions backend/data_export/pipeline/data.py

This file was deleted.

24 changes: 24 additions & 0 deletions backend/data_export/pipeline/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from typing import Any, Dict, Iterator, List

import pandas as pd
from django.db.models.query import QuerySet

from .labels import Labels
from data_export.models import ExportedExample


class Dataset:
    """Iterable view that joins examples with their labels for export.

    Args:
        examples: Examples to export.
        labels: Label collections; each is queried with ``find_by`` using
            the example id.
        is_text_project: Forwarded to ``ExportedExample.to_dict``.
    """

    def __init__(self, examples: QuerySet[ExportedExample], labels: List[Labels], is_text_project=True):
        self.examples = examples
        self.labels = labels
        self.is_text_project = is_text_project

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Yield one dictionary per example, with label data merged in.

        Label collections are applied in list order, so a later
        collection overwrites keys set by an earlier one or by the
        example itself.
        """
        for example in self.examples:
            row = example.to_dict(self.is_text_project)
            for label_group in self.labels:
                found = label_group.find_by(example.id)
                row.update(**found)
            yield row

    def to_dataframe(self) -> pd.DataFrame:
        """Return the rows produced by iterating this dataset as a DataFrame."""
        frame = pd.DataFrame(self)
        return frame

0 comments on commit 4144d22

Please sign in to comment.