-
-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1799 from doccano/enhancement/datasetExport
Enhancement/dataset export
- Loading branch information
Showing 37 changed files with 1,444 additions and 1,233 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,64 @@ | ||
import os | ||
import shutil | ||
import uuid | ||
|
||
from celery import shared_task | ||
from celery.utils.log import get_task_logger | ||
from django.conf import settings | ||
from django.shortcuts import get_object_or_404 | ||
|
||
from .pipeline.factories import create_repository, create_writer | ||
from .pipeline.dataset import Dataset | ||
from .pipeline.factories import create_formatter, create_labels, create_writer | ||
from .pipeline.services import ExportApplicationService | ||
from projects.models import Project | ||
from data_export.models import ExportedExample | ||
from projects.models import Member, Project | ||
|
||
logger = get_task_logger(__name__) | ||
|
||
|
||
def create_collaborative_dataset(project: Project, dirpath: str, confirmed_only: bool, formatters, writer):
    """Export a project's examples into a single shared file.

    The result is written to ``all.<writer.extension>`` inside ``dirpath``.

    Args:
        project: Project whose examples are exported.
        dirpath: Directory that receives the exported file.
        confirmed_only: When true, export only the examples returned by
            ``ExportedExample.objects.confirmed(project)``; otherwise export
            every example that belongs to the project.
        formatters: Passed through to ``ExportApplicationService``.
        writer: Passed through to ``ExportApplicationService``; its
            ``extension`` attribute supplies the file suffix.
    """
    text_project = project.is_text_project
    example_manager = ExportedExample.objects
    selected = example_manager.confirmed(project) if confirmed_only else example_manager.filter(project=project)
    dataset = Dataset(selected, create_labels(project, selected), text_project)

    target = os.path.join(dirpath, f"all.{writer.extension}")
    ExportApplicationService(dataset, formatters, writer).export(target)
|
||
|
||
def create_individual_dataset(project: Project, dirpath: str, confirmed_only: bool, formatters, writer):
    """Export one file per project member.

    For every ``Member`` of ``project`` a file named
    ``<member.username>.<writer.extension>`` is written into ``dirpath``.

    Args:
        project: Project whose examples are exported.
        dirpath: Directory that receives the exported files.
        confirmed_only: When true, each member's file is built from
            ``ExportedExample.objects.confirmed(project, user=member.user)``;
            otherwise from every example that belongs to the project.
        formatters: Passed through to ``ExportApplicationService``.
        writer: Passed through to ``ExportApplicationService``; its
            ``extension`` attribute supplies the file suffix.
    """
    text_project = project.is_text_project
    example_manager = ExportedExample.objects
    for member in Member.objects.filter(project=project):
        # The example query is rebuilt for every member, exactly as before;
        # the labels are always restricted to the member's user.
        selected = (
            example_manager.confirmed(project, user=member.user)
            if confirmed_only
            else example_manager.filter(project=project)
        )
        member_labels = create_labels(project, selected, member.user)
        dataset = Dataset(selected, member_labels, text_project)

        target = os.path.join(dirpath, f"{member.username}.{writer.extension}")
        ExportApplicationService(dataset, formatters, writer).export(target)
|
||
|
||
@shared_task
def export_dataset(project_id, file_format: str, confirmed_only=False):
    """Export a project's dataset as a zip archive and return the archive path.

    The export is written into a uniquely named working directory under
    ``settings.MEDIA_ROOT``, zipped next to that directory, and the working
    directory is then removed.

    Args:
        project_id: Primary key of the project to export.
        file_format: Format name handed to ``create_formatter`` and
            ``create_writer``.
        confirmed_only: Forwarded to the dataset builders to restrict the
            export to confirmed examples.

    Returns:
        Path of the zip file produced by ``shutil.make_archive``.
    """
    project = get_object_or_404(Project, pk=project_id)
    dirpath = os.path.join(settings.MEDIA_ROOT, str(uuid.uuid4()))
    os.makedirs(dirpath, exist_ok=True)
    try:
        formatters = create_formatter(project, file_format)
        writer = create_writer(file_format)
        if project.collaborative_annotation:
            create_collaborative_dataset(project, dirpath, confirmed_only, formatters, writer)
        else:
            create_individual_dataset(project, dirpath, confirmed_only, formatters, writer)
        return shutil.make_archive(dirpath, "zip", dirpath)
    finally:
        # Always remove the working directory, even when the export or the
        # archiving step raises, so failed tasks do not leak files under
        # MEDIA_ROOT.
        shutil.rmtree(dirpath)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
from typing import Any, Dict, Protocol, Tuple | ||
|
||
from django.db import models | ||
|
||
from examples.models import Example | ||
from labels.models import Category, Relation, Span, TextLabel | ||
from projects.models import Project | ||
|
||
DATA = "data" | ||
|
||
|
||
class ExportedExampleManager(models.Manager):
    """Manager that adds an export-oriented ``confirmed`` query."""

    def confirmed(self, project: Project, user=None):
        """Return the confirmed examples of ``project``.

        For a project with ``collaborative_annotation`` set, this is every
        example of the project except those matched by ``states=None``.
        Otherwise it is every example of the project that has a state with
        ``confirmed_by`` equal to ``user``.

        Args:
            project: Project whose examples are queried.
            user: User whose confirmations are considered; required when the
                project is not collaboratively annotated.

        Raises:
            ValueError: If the project is not collaboratively annotated and
                ``user`` is None.
        """
        if project.collaborative_annotation:
            return self.filter(project=project).exclude(states=None)
        # An explicit check instead of ``assert``: assertions are stripped
        # under ``python -O``, which would silently filter on a None user.
        if user is None:
            raise ValueError("user is required when the project is not collaboratively annotated.")
        return self.filter(project=project, states__confirmed_by=user)
|
||
|
||
class ExportedExample(Example):
    """Proxy of ``Example`` that adds export helpers."""

    objects = ExportedExampleManager()

    def to_dict(self, is_text_project=True) -> Dict[str, Any]:
        """Return the example as a flat dictionary.

        The dictionary holds the example ``id``, the payload under the
        ``DATA`` key (``text`` for text projects, ``upload_name`` otherwise)
        and every entry of ``meta``. Entries of ``meta`` are merged last, so
        they take precedence over same-named keys.
        """
        if is_text_project:
            payload = self.text
        else:
            payload = self.upload_name
        return {"id": self.id, DATA: payload, **self.meta}

    class Meta:
        proxy = True
|
||
|
||
class ExportedLabel(Protocol):
    """Structural interface for exportable labels.

    Each method raises ``NotImplementedError`` here; concrete label classes
    supply the representations they support.
    """

    # Manager used to query the label type.
    objects: models.Manager

    def to_dict(self) -> Dict[str, Any]:
        """Return the label as a dictionary."""
        raise NotImplementedError("Please implement this method in the subclass.")

    def to_string(self) -> str:
        """Return the label as a string."""
        raise NotImplementedError("Please implement this method in the subclass.")

    def to_tuple(self) -> Tuple:
        """Return the label as a tuple."""
        raise NotImplementedError("Please implement this method in the subclass.")
|
||
|
||
class ExportedCategory(Category):
    """Proxy of ``Category`` that adds an export representation."""

    def to_string(self) -> str:
        """Return the text of the category's label."""
        label_type = self.label
        return label_type.text

    class Meta:
        proxy = True
|
||
|
||
class ExportedSpan(Span):
    """Proxy of ``Span`` that adds export representations."""

    def to_dict(self):
        """Return the span's id, label text and offsets as a dictionary."""
        return dict(
            id=self.id,
            label=self.label.text,
            start_offset=self.start_offset,
            end_offset=self.end_offset,
        )

    def to_tuple(self):
        """Return ``(start_offset, end_offset, label text)``."""
        return (self.start_offset, self.end_offset, self.label.text)

    class Meta:
        proxy = True
|
||
|
||
class ExportedRelation(Relation):
    """Proxy of ``Relation`` that adds an export representation."""

    def to_dict(self):
        """Return the relation's id, endpoint ids and type text as a dictionary."""
        return {
            "id": self.id,
            "from_id": self.from_id.id,
            "to_id": self.to_id.id,
            "type": self.type.text,
        }

    class Meta:
        proxy = True
|
||
|
||
class ExportedText(TextLabel):
    """Proxy of ``TextLabel`` that adds an export representation."""

    def to_string(self) -> str:
        """Return the label's ``text``."""
        content = self.text
        return content

    class Meta:
        proxy = True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from typing import Any, Dict, Iterator, List | ||
|
||
import pandas as pd | ||
from django.db.models.query import QuerySet | ||
|
||
from .labels import Labels | ||
from data_export.models import ExportedExample | ||
|
||
|
||
class Dataset:
    """Iterable that joins examples with their labels, one dictionary per example."""

    def __init__(self, examples: QuerySet[ExportedExample], labels: List[Labels], is_text_project=True):
        """Store the examples, the label collections and the project kind.

        Args:
            examples: Examples to export.
            labels: Label collections; each is asked for the labels of every
                example through ``find_by``.
            is_text_project: Forwarded to ``ExportedExample.to_dict``.
        """
        self.examples = examples
        self.labels = labels
        self.is_text_project = is_text_project

    def __iter__(self) -> Iterator[Dict[str, Any]]:
        """Yield one dictionary per example, enriched with its labels."""
        for example in self.examples:
            row = example.to_dict(self.is_text_project)
            for label_collection in self.labels:
                # Later collections overwrite same-named keys of earlier ones.
                found = label_collection.find_by(example.id)
                row.update(**found)
            yield row

    def to_dataframe(self) -> pd.DataFrame:
        """Build a DataFrame from the rows produced by iterating this dataset."""
        return pd.DataFrame(self)
Oops, something went wrong.