Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Support import data #102

Merged
merged 1 commit into from
Oct 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions doccano_client/clients/data_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
from __future__ import annotations

import pathlib
from typing import List

from requests_toolbelt import MultipartEncoder

from doccano_client.client import DoccanoClient
from doccano_client.models.data_import import AvailableTask, Option


class DataImportClient:
    """Client for interacting with the Doccano data import API"""

    def __init__(self, client: DoccanoClient):
        """Wrap an existing client; every request below is sent through it.

        Args:
            client (DoccanoClient): The client used to issue the HTTP requests
        """
        self._client = client

    def list_options(self, project_id: int) -> List[Option]:
        """Return all upload options

        Args:
            project_id (int): The id of the project

        Returns:
            List[Option]: The list of the upload options.
        """
        resource = f"projects/{project_id}/catalog"
        response = self._client.get(resource)
        # Each element of the catalog response describes one upload option.
        return [Option.parse_obj(option) for option in response.json()]

    def upload(self, file_path: str) -> str:
        """Upload a file to the server

        Args:
            file_path (str): The path to the file to upload. The value is
                passed to ``pathlib.Path``, so a ``pathlib.Path`` works too.

        Returns:
            str: The id of the uploaded file
        """
        resource = "fp/process/"
        path = pathlib.Path(file_path)
        with path.open("rb") as f:
            # The request is sent inside the ``with`` block so the file handle
            # is still open while the multipart body is read from it.
            m = MultipartEncoder(fields={"filepond": (path.name, f)})
            headers = {"Content-Type": m.content_type, "Accept": "*/*"}
            response = self._client.post(resource, data=m, headers=headers)
            # The upload id is the raw response body.
            return response.content.decode()

    def delete(self, upload_id: str) -> None:
        """Delete the uploaded file from the server

        Args:
            upload_id (str): The id of the uploaded file
        """
        resource = "fp/revert/"
        # The upload id travels as the plain-text request body.
        headers = {"Content-Type": "text/plain", "Accept": "*/*"}
        self._client.delete(resource, data=upload_id, headers=headers)

    def ingest(self, project_id: int, upload_ids: List[str], task: AvailableTask, format: str) -> str:
        """Ingest the uploaded files into the project

        Args:
            project_id (int): The id of the project
            upload_ids (List[str]): The ids of the uploaded files
            task (AvailableTask): The project's task name
            format (str): The format of the uploaded files

        Returns:
            str: The celery task id
        """
        # NOTE: ``format`` shadows the builtin, but it is part of the public
        # keyword interface and therefore kept as is.
        resource = f"projects/{project_id}/upload"
        data = {"uploadIds": upload_ids, "task": task, "format": format}
        response = self._client.post(resource, json=data)
        return response.json()["task_id"]
26 changes: 26 additions & 0 deletions doccano_client/models/data_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
from enum import Enum
from typing import Any, Dict

from pydantic import BaseModel


class Option(BaseModel):
    """One entry of a project's data import catalog.

    ``DataImportClient.list_options`` builds one instance per element of the
    ``projects/{project_id}/catalog`` response via ``Option.parse_obj``.
    """

    # NOTE(review): the field meanings below are inferred from the field
    # names only -- confirm against the server's catalog schema.
    task_id: str  # presumably the task type this option belongs to
    name: str  # presumably the format identifier
    display_name: str  # presumably the human-readable label for the format
    example: str  # presumably a sample of a file in this format
    accept_types: str  # presumably the accepted MIME types / extensions
    properties: Dict[str, Any]  # free-form mapping; schema not visible here


class AvailableTask(str, Enum):
    """Task names used as the ``task`` argument of ``DataImportClient.ingest``.

    Because of the ``str`` mix-in every member is also a ``str`` instance and
    compares equal to its plain string value.
    """

    DOCUMENT_CLASSIFICATION = "DocumentClassification"
    SEQUENCE_LABELING = "SequenceLabeling"
    SEQ2SEQ = "Seq2seq"
    SPEECH2TEXT = "Speech2text"
    IMAGE_CLASSIFICATION = "ImageClassification"
    BOUNDING_BOX = "BoundingBox"
    SEGMENTATION = "Segmentation"
    IMAGE_CAPTIONING = "ImageCaptioning"
    INTENT_DETECTION_AND_SLOT_FILLING = "IntentDetectionAndSlotFilling"
    RELATION_EXTRACTION = "RelationExtraction"
3 changes: 3 additions & 0 deletions tests/clients/data/classification.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"text": "exampleA", "labels": ["positive"], "meta": {"wikiPageID": 1}}
{"text": "exampleB", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}}
{"text": "exampleC", "labels": [], "meta": {"wikiPageID": 3}}
43 changes: 43 additions & 0 deletions tests/clients/test_data_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import pathlib

import vcr

from doccano_client.client import DoccanoClient
from doccano_client.clients.data_import import DataImportClient
from doccano_client.models.data_import import Option
from tests.conftest import cassettes_path


class TestDataImportClient:
    """Tests for DataImportClient that run against recorded VCR cassettes."""

    @classmethod
    def setup_class(cls):
        """Log in once and share the client, project id and sample file."""
        login_cassette = str(cassettes_path / "data_import/login.yaml")
        with vcr.use_cassette(login_cassette, mode="once"):
            doccano = DoccanoClient("http://localhost:8000")
            doccano.login(username="admin", password="password")
        cls.client = DataImportClient(doccano)
        cls.project_id = 16
        cls.file_path = pathlib.Path(__file__).parent / "data/classification.json"

    def test_list_options(self):
        """The catalog is non-empty and every entry is parsed into an Option."""
        cassette = str(cassettes_path / "data_import/options.yaml")
        with vcr.use_cassette(cassette, mode="once"):
            options = self.client.list_options(self.project_id)
        assert len(options) > 0
        for option in options:
            assert isinstance(option, Option)

    def test_upload(self):
        """Uploading the sample file yields a string upload id."""
        cassette = str(cassettes_path / "data_import/upload.yaml")
        with vcr.use_cassette(cassette, mode="once"):
            uploaded = self.client.upload(self.file_path)
        assert uploaded is not None
        assert isinstance(uploaded, str)

    def test_delete(self):
        """An uploaded file can be reverted without raising."""
        cassette = str(cassettes_path / "data_import/delete.yaml")
        with vcr.use_cassette(cassette, mode="once"):
            uploaded = self.client.upload(self.file_path)
            self.client.delete(uploaded)

    def test_ingest(self):
        """Ingesting an uploaded file returns the id of the started task."""
        cassette = str(cassettes_path / "data_import/ingest.yaml")
        with vcr.use_cassette(cassette, mode="once"):
            uploaded = self.client.upload(self.file_path)
            started = self.client.ingest(
                self.project_id,
                [uploaded],
                task="DocumentClassification",
                format="JSONL",
            )
        assert started is not None
        assert isinstance(started, str)
117 changes: 117 additions & 0 deletions tests/fixtures/cassettes/data_import/delete.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
interactions:
- request:
body: !!python/object/new:_io.BytesIO
state: !!python/tuple
- !!binary |
LS0wYzAzNjM2OGU0Yzc0OGI0YmM4ZTMyZjdhNzBlYmU1OQ0KQ29udGVudC1EaXNwb3NpdGlvbjog
Zm9ybS1kYXRhOyBuYW1lPSJmaWxlcG9uZCI7IGZpbGVuYW1lPSJjbGFzc2lmaWNhdGlvbi5qc29u
Ig0KDQp7InRleHQiOiAiZXhhbXBsZUEiLCAibGFiZWxzIjogWyJwb3NpdGl2ZSJdLCAibWV0YSI6
IHsid2lraVBhZ2VJRCI6IDF9fQp7InRleHQiOiAiZXhhbXBsZUIiLCAibGFiZWxzIjogWyJwb3Np
dGl2ZSIsICJuZWdhdGl2ZSJdLCAibWV0YSI6IHsid2lraVBhZ2VJRCI6IDJ9fQp7InRleHQiOiAi
ZXhhbXBsZUMiLCAibGFiZWxzIjogW10sICJtZXRhIjogeyJ3aWtpUGFnZUlEIjogM319Cg0KLS0w
YzAzNjM2OGU0Yzc0OGI0YmM4ZTMyZjdhNzBlYmU1OS0tDQo=
- 0
- null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '377'
Content-Type:
- multipart/form-data; boundary=0c036368e4c748b4bc8e32f7a70ebe59
Cookie:
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b;
sessionid=fcjn46n6aqmya549fego0q7igo89cxks
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/fp/process/
response:
body:
string: HmoLcxgwo9QLFHN27DBKgx
headers:
Allow:
- POST, OPTIONS
Connection:
- close
Content-Length:
- '22'
Content-Type:
- text/plain
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 04:44:37 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
- request:
body: HmoLcxgwo9QLFHN27DBKgx
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '22'
Content-Type:
- text/plain
Cookie:
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b;
sessionid=fcjn46n6aqmya549fego0q7igo89cxks
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b
referer:
- http://localhost:8000
method: DELETE
uri: http://localhost:8000/v1/fp/revert/
response:
body:
string: ''
headers:
Allow:
- DELETE, OPTIONS
Connection:
- close
Content-Length:
- '4'
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 04:44:37 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 204
message: No Content
version: 1
120 changes: 120 additions & 0 deletions tests/fixtures/cassettes/data_import/ingest.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
interactions:
- request:
body: !!python/object/new:_io.BytesIO
state: !!python/tuple
- !!binary |
LS0wOGE2MGVjNzQzYzE0NjM0OTQ4ZjJhZWFmYTJiZWY3Yg0KQ29udGVudC1EaXNwb3NpdGlvbjog
Zm9ybS1kYXRhOyBuYW1lPSJmaWxlcG9uZCI7IGZpbGVuYW1lPSJjbGFzc2lmaWNhdGlvbi5qc29u
Ig0KDQp7InRleHQiOiAiZXhhbXBsZUEiLCAibGFiZWxzIjogWyJwb3NpdGl2ZSJdLCAibWV0YSI6
IHsid2lraVBhZ2VJRCI6IDF9fQp7InRleHQiOiAiZXhhbXBsZUIiLCAibGFiZWxzIjogWyJwb3Np
dGl2ZSIsICJuZWdhdGl2ZSJdLCAibWV0YSI6IHsid2lraVBhZ2VJRCI6IDJ9fQp7InRleHQiOiAi
ZXhhbXBsZUMiLCAibGFiZWxzIjogW10sICJtZXRhIjogeyJ3aWtpUGFnZUlEIjogM319Cg0KLS0w
OGE2MGVjNzQzYzE0NjM0OTQ4ZjJhZWFmYTJiZWY3Yi0tDQo=
- 0
- null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '377'
Content-Type:
- multipart/form-data; boundary=08a60ec743c14634948f2aeafa2bef7b
Cookie:
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b;
sessionid=fcjn46n6aqmya549fego0q7igo89cxks
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/fp/process/
response:
body:
string: dntzCMUgfA3sFzXXmSjGe6
headers:
Allow:
- POST, OPTIONS
Connection:
- close
Content-Length:
- '22'
Content-Type:
- text/plain
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 04:42:25 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
- request:
body: '{"uploadIds": ["dntzCMUgfA3sFzXXmSjGe6"], "task": "DocumentClassification",
"format": "JSONL"}'
headers:
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '94'
Cookie:
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b;
sessionid=fcjn46n6aqmya549fego0q7igo89cxks
User-Agent:
- python-requests/2.28.1
X-CSRFToken:
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b
accept:
- application/json
content-type:
- application/json
referer:
- http://localhost:8000
method: POST
uri: http://localhost:8000/v1/projects/16/upload
response:
body:
string: '{"task_id":"713c6904-8a07-44bc-8411-333672fb77d5"}'
headers:
Allow:
- POST, OPTIONS
Connection:
- close
Content-Length:
- '50'
Content-Type:
- application/json
Cross-Origin-Opener-Policy:
- same-origin
Date:
- Sun, 02 Oct 2022 04:42:26 GMT
Referrer-Policy:
- same-origin
Server:
- gunicorn
Vary:
- Accept, Origin, Cookie
X-Content-Type-Options:
- nosniff
X-Frame-Options:
- DENY
status:
code: 200
message: OK
version: 1
Loading