Skip to content

Commit

Permalink
Merge pull request #102 from doccano/feature/support-import-data
Browse files Browse the repository at this point in the history
Support import data
  • Loading branch information
Hironsan committed Oct 2, 2022
2 parents 9aef713 + 2a62565 commit de49c6c
Show file tree
Hide file tree
Showing 9 changed files with 584 additions and 0 deletions.
74 changes: 74 additions & 0 deletions doccano_client/clients/data_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from __future__ import annotations

import pathlib
from typing import List

from requests_toolbelt import MultipartEncoder

from doccano_client.client import DoccanoClient
from doccano_client.models.data_import import AvailableTask, Option


class DataImportClient:
    """Client for interacting with the Doccano data import API.

    Every request is delegated to the ``DoccanoClient`` passed to the
    constructor.
    """

    def __init__(self, client: DoccanoClient):
        """Store the client used to send requests.

        Args:
            client (DoccanoClient): The client that performs the HTTP calls.
        """
        self._client = client

    def list_options(self, project_id: int) -> List[Option]:
        """Return all upload options.

        Args:
            project_id (int): The id of the project

        Returns:
            List[Option]: The list of the upload options.
        """
        resource = f"projects/{project_id}/catalog"
        response = self._client.get(resource)
        # Each element of the JSON response is parsed into one Option.
        return [Option.parse_obj(option) for option in response.json()]

    def upload(self, file_path: str | pathlib.Path) -> str:
        """Upload a file to the server.

        Args:
            file_path (str | pathlib.Path): The path to the file to upload

        Returns:
            str: The id of the uploaded file
        """
        resource = "fp/process/"
        path = pathlib.Path(file_path)
        with path.open("rb") as f:
            encoder = MultipartEncoder(fields={"filepond": (path.name, f)})
            headers = {"Content-Type": encoder.content_type, "Accept": "*/*"}
            # The request must be sent while the file is still open, because
            # the encoder was handed the open file handle rather than its bytes.
            response = self._client.post(resource, data=encoder, headers=headers)
        # The response body is the upload id itself.
        return response.content.decode()

    def delete(self, upload_id: str) -> None:
        """Delete the uploaded file from the server.

        Args:
            upload_id (str): The id of the uploaded file
        """
        resource = "fp/revert/"
        headers = {"Content-Type": "text/plain", "Accept": "*/*"}
        # The upload id is sent as the raw request body.
        self._client.delete(resource, data=upload_id, headers=headers)

    def ingest(self, project_id: int, upload_ids: List[str], task: AvailableTask, format: str) -> str:
        """Ingest the uploaded files into the project.

        Args:
            project_id (int): The id of the project
            upload_ids (List[str]): The ids of the uploaded files
            task (AvailableTask): The project's task name
            format (str): The format of the uploaded files

        Returns:
            str: The celery task id
        """
        resource = f"projects/{project_id}/upload"
        data = {"uploadIds": upload_ids, "task": task, "format": format}
        response = self._client.post(resource, json=data)
        # "task_id" is read straight from the JSON response; it is a string
        # identifier, not an integer.
        return response.json()["task_id"]
26 changes: 26 additions & 0 deletions doccano_client/models/data_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from enum import Enum
from typing import Any, Dict

from pydantic import BaseModel


class Option(BaseModel):
    """One upload option, as parsed by ``DataImportClient.list_options``.

    Each element of the ``projects/{project_id}/catalog`` response is turned
    into an instance of this model.
    """

    # NOTE(review): the meaning of each field is not visible from this file;
    # confirm against the server's catalog response before relying on it.
    task_id: str
    name: str
    display_name: str
    example: str
    accept_types: str
    properties: Dict[str, Any]


class AvailableTask(str, Enum):
    """Task names accepted as the ``task`` argument of ``DataImportClient.ingest``.

    The enum also derives from ``str``; each member's value is the string
    placed under the ``"task"`` key of the ingest request payload.
    """

    DOCUMENT_CLASSIFICATION = "DocumentClassification"
    SEQUENCE_LABELING = "SequenceLabeling"
    SEQ2SEQ = "Seq2seq"
    SPEECH2TEXT = "Speech2text"
    IMAGE_CLASSIFICATION = "ImageClassification"
    BOUNDING_BOX = "BoundingBox"
    SEGMENTATION = "Segmentation"
    IMAGE_CAPTIONING = "ImageCaptioning"
    INTENT_DETECTION_AND_SLOT_FILLING = "IntentDetectionAndSlotFilling"
    RELATION_EXTRACTION = "RelationExtraction"
3 changes: 3 additions & 0 deletions tests/clients/data/classification.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"text": "exampleA", "labels": ["positive"], "meta": {"wikiPageID": 1}}
{"text": "exampleB", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}}
{"text": "exampleC", "labels": [], "meta": {"wikiPageID": 3}}
43 changes: 43 additions & 0 deletions tests/clients/test_data_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pathlib

import vcr

from doccano_client.client import DoccanoClient
from doccano_client.clients.data_import import DataImportClient
from doccano_client.models.data_import import Option
from tests.conftest import cassettes_path


class TestDataImportClient:
    """Cassette-backed tests for ``DataImportClient``."""

    @classmethod
    def setup_class(cls):
        """Log in once and share the client, project id and sample file."""
        login_cassette = str(cassettes_path / "data_import/login.yaml")
        with vcr.use_cassette(login_cassette, mode="once"):
            doccano = DoccanoClient("http://localhost:8000")
            doccano.login(username="admin", password="password")
        cls.client = DataImportClient(doccano)
        cls.project_id = 16
        cls.file_path = pathlib.Path(__file__).parent / "data/classification.json"

    def test_list_options(self):
        """The catalog yields a non-empty list made only of ``Option`` objects."""
        cassette = str(cassettes_path / "data_import/options.yaml")
        with vcr.use_cassette(cassette, mode="once"):
            options = self.client.list_options(self.project_id)
        assert len(options) > 0
        for option in options:
            assert isinstance(option, Option)

    def test_upload(self):
        """Uploading the sample file returns a string upload id."""
        cassette = str(cassettes_path / "data_import/upload.yaml")
        with vcr.use_cassette(cassette, mode="once"):
            upload_id = self.client.upload(self.file_path)
        assert upload_id is not None
        assert isinstance(upload_id, str)

    def test_delete(self):
        """A freshly uploaded file can be deleted without raising."""
        cassette = str(cassettes_path / "data_import/delete.yaml")
        with vcr.use_cassette(cassette, mode="once"):
            uploaded = self.client.upload(self.file_path)
            self.client.delete(uploaded)

    def test_ingest(self):
        """Ingesting an uploaded file returns a string task id."""
        cassette = str(cassettes_path / "data_import/ingest.yaml")
        with vcr.use_cassette(cassette, mode="once"):
            uploaded = self.client.upload(self.file_path)
            task_id = self.client.ingest(self.project_id, [uploaded], task="DocumentClassification", format="JSONL")
        assert task_id is not None
        assert isinstance(task_id, str)
117 changes: 117 additions & 0 deletions tests/fixtures/cassettes/data_import/delete.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
interactions:
- request:
body: !!python/object/new:_io.BytesIO
state: !!python/tuple
- !!binary |
LS0wYzAzNjM2OGU0Yzc0OGI0YmM4ZTMyZjdhNzBlYmU1OQ0KQ29udGVudC1EaXNwb3NpdGlvbjog
Zm9ybS1kYXRhOyBuYW1lPSJmaWxlcG9uZCI7IGZpbGVuYW1lPSJjbGFzc2lmaWNhdGlvbi5qc29u
Ig0KDQp7InRleHQiOiAiZXhhbXBsZUEiLCAibGFiZWxzIjogWyJwb3NpdGl2ZSJdLCAibWV0YSI6
IHsid2lraVBhZ2VJRCI6IDF9fQp7InRleHQiOiAiZXhhbXBsZUIiLCAibGFiZWxzIjogWyJwb3Np
dGl2ZSIsICJuZWdhdGl2ZSJdLCAibWV0YSI6IHsid2lraVBhZ2VJRCI6IDJ9fQp7InRleHQiOiAi
ZXhhbXBsZUMiLCAibGFiZWxzIjogW10sICJtZXRhIjogeyJ3aWtpUGFnZUlEIjogM319Cg0KLS0w
YzAzNjM2OGU0Yzc0OGI0YmM4ZTMyZjdhNzBlYmU1OS0tDQo=
- 0
- null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '377'
Content-Type:
- multipart/form-data; boundary=0c036368e4c748b4bc8e32f7a70ebe59
Cookie:
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b;
sessionid=fcjn46n6aqmya549fego0q7igo89cxks
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/fp/process/
response:
body:
string: HmoLcxgwo9QLFHN27DBKgx
headers:
Allow:
- POST, OPTIONS
Connection:
- close
Content-Length:
- '22'
Content-Type:
- text/plain
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 04:44:37 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
- request:
body: HmoLcxgwo9QLFHN27DBKgx
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '22'
Content-Type:
- text/plain
Cookie:
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b;
sessionid=fcjn46n6aqmya549fego0q7igo89cxks
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b
referer:
- http://localhost:8000
method: DELETE
uri: http://localhost:8000/v1/fp/revert/
response:
body:
string: ''
headers:
Allow:
- DELETE, OPTIONS
Connection:
- close
Content-Length:
- '4'
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 04:44:37 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 204
message: No Content
version: 1
120 changes: 120 additions & 0 deletions tests/fixtures/cassettes/data_import/ingest.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
interactions:
- request:
body: !!python/object/new:_io.BytesIO
state: !!python/tuple
- !!binary |
LS0wOGE2MGVjNzQzYzE0NjM0OTQ4ZjJhZWFmYTJiZWY3Yg0KQ29udGVudC1EaXNwb3NpdGlvbjog
Zm9ybS1kYXRhOyBuYW1lPSJmaWxlcG9uZCI7IGZpbGVuYW1lPSJjbGFzc2lmaWNhdGlvbi5qc29u
Ig0KDQp7InRleHQiOiAiZXhhbXBsZUEiLCAibGFiZWxzIjogWyJwb3NpdGl2ZSJdLCAibWV0YSI6
IHsid2lraVBhZ2VJRCI6IDF9fQp7InRleHQiOiAiZXhhbXBsZUIiLCAibGFiZWxzIjogWyJwb3Np
dGl2ZSIsICJuZWdhdGl2ZSJdLCAibWV0YSI6IHsid2lraVBhZ2VJRCI6IDJ9fQp7InRleHQiOiAi
ZXhhbXBsZUMiLCAibGFiZWxzIjogW10sICJtZXRhIjogeyJ3aWtpUGFnZUlEIjogM319Cg0KLS0w
OGE2MGVjNzQzYzE0NjM0OTQ4ZjJhZWFmYTJiZWY3Yi0tDQo=
- 0
- null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '377'
Content-Type:
- multipart/form-data; boundary=08a60ec743c14634948f2aeafa2bef7b
Cookie:
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b;
sessionid=fcjn46n6aqmya549fego0q7igo89cxks
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/fp/process/
response:
body:
string: dntzCMUgfA3sFzXXmSjGe6
headers:
Allow:
- POST, OPTIONS
Connection:
- close
Content-Length:
- '22'
Content-Type:
- text/plain
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 04:42:25 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
- request:
body: '{"uploadIds": ["dntzCMUgfA3sFzXXmSjGe6"], "task": "DocumentClassification",
"format": "JSONL"}'
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '94'
Cookie:
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b;
sessionid=fcjn46n6aqmya549fego0q7igo89cxks
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/projects/16/upload
response:
body:
string: '{"task_id":"713c6904-8a07-44bc-8411-333672fb77d5"}'
headers:
Allow:
- POST, OPTIONS
Connection:
- close
Content-Length:
- '50'
Content-Type:
- application/json
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 04:42:26 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
version: 1
Loading

0 comments on commit de49c6c

Please sign in to comment.