-
Notifications
You must be signed in to change notification settings - Fork 61
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #102 from doccano/feature/support-import-data
Support import data
- Loading branch information
Showing 9 changed files with 584 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
from __future__ import annotations | ||
|
||
import pathlib | ||
from typing import List | ||
|
||
from requests_toolbelt import MultipartEncoder | ||
|
||
from doccano_client.client import DoccanoClient | ||
from doccano_client.models.data_import import AvailableTask, Option | ||
|
||
|
||
class DataImportClient:
    """Client for interacting with the Doccano data import API.

    Every request is sent through the wrapped client; this class only builds
    the resource paths, headers and payloads.
    """

    def __init__(self, client: DoccanoClient):
        """Keep a reference to the client used to send requests.

        Args:
            client (DoccanoClient): The client every request is delegated to.
        """
        self._client = client

    def list_options(self, project_id: int) -> List[Option]:
        """Return all upload options.

        Args:
            project_id (int): The id of the project

        Returns:
            List[Option]: The list of the upload options.
        """
        resource = f"projects/{project_id}/catalog"
        response = self._client.get(resource)
        return [Option.parse_obj(option) for option in response.json()]

    def upload(self, file_path: str | pathlib.Path) -> str:
        """Upload a file to the server.

        Args:
            file_path (str | pathlib.Path): The path to the file to upload

        Returns:
            str: The id of the uploaded file (the decoded response body)
        """
        resource = "fp/process/"
        path = pathlib.Path(file_path)
        with path.open("rb") as f:
            # The request is sent inside the ``with`` block because the
            # encoder is handed the open file object, not its contents.
            m = MultipartEncoder(fields={"filepond": (path.name, f)})
            headers = {"Content-Type": m.content_type, "Accept": "*/*"}
            response = self._client.post(resource, data=m, headers=headers)
        return response.content.decode()

    def delete(self, upload_id: str) -> None:
        """Delete the uploaded file from the server.

        Args:
            upload_id (str): The id of the uploaded file
        """
        resource = "fp/revert/"
        # The upload id is sent as the plain-text request body.
        headers = {"Content-Type": "text/plain", "Accept": "*/*"}
        self._client.delete(resource, data=upload_id, headers=headers)

    def ingest(self, project_id: int, upload_ids: List[str], task: AvailableTask, format: str) -> str:
        """Ingest the uploaded files into the project.

        Args:
            project_id (int): The id of the project
            upload_ids (List[str]): The ids of the uploaded files
            task (AvailableTask): The project's task name
            format (str): The format of the uploaded files

        Returns:
            str: The celery task id, taken from the ``task_id`` key of the
                JSON response
        """
        resource = f"projects/{project_id}/upload"
        data = {"uploadIds": upload_ids, "task": task, "format": format}
        response = self._client.post(resource, json=data)
        return response.json()["task_id"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
from enum import Enum | ||
from typing import Any, Dict | ||
|
||
from pydantic import BaseModel | ||
|
||
|
||
class Option(BaseModel):
    """One upload option, as parsed from an entry of a project's catalog.

    Every field is declared without a default value. The field names are
    expected to match the keys of the catalog entry being parsed.
    """

    task_id: str
    name: str
    display_name: str
    example: str
    accept_types: str
    # Free-form mapping; its keys and value types are not constrained here.
    properties: Dict[str, Any]
|
||
|
||
class AvailableTask(str, Enum):
    """Task names that can be passed when ingesting uploaded files.

    The enum mixes in ``str``, so each member is itself a string and compares
    equal to its plain value (for example ``"DocumentClassification"``).
    """

    DOCUMENT_CLASSIFICATION = "DocumentClassification"
    SEQUENCE_LABELING = "SequenceLabeling"
    SEQ2SEQ = "Seq2seq"
    SPEECH2TEXT = "Speech2text"
    IMAGE_CLASSIFICATION = "ImageClassification"
    BOUNDING_BOX = "BoundingBox"
    SEGMENTATION = "Segmentation"
    IMAGE_CAPTIONING = "ImageCaptioning"
    INTENT_DETECTION_AND_SLOT_FILLING = "IntentDetectionAndSlotFilling"
    RELATION_EXTRACTION = "RelationExtraction"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"text": "exampleA", "labels": ["positive"], "meta": {"wikiPageID": 1}} | ||
{"text": "exampleB", "labels": ["positive", "negative"], "meta": {"wikiPageID": 2}} | ||
{"text": "exampleC", "labels": [], "meta": {"wikiPageID": 3}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
import pathlib | ||
|
||
import vcr | ||
|
||
from doccano_client.client import DoccanoClient | ||
from doccano_client.clients.data_import import DataImportClient | ||
from doccano_client.models.data_import import Option | ||
from tests.conftest import cassettes_path | ||
|
||
|
||
class TestDataImportClient:
    """Tests for DataImportClient, each run against a VCR cassette."""

    @classmethod
    def setup_class(cls):
        """Log in once and share the client, project id and sample file."""
        with vcr.use_cassette(str(cassettes_path / "data_import/login.yaml"), mode="once"):
            client = DoccanoClient("http://localhost:8000")
            client.login(username="admin", password="password")
            cls.client = DataImportClient(client)
            cls.project_id = 16
            cls.file_path = pathlib.Path(__file__).parent / "data/classification.json"

    def test_list_options(self):
        """The catalog returns a non-empty list of Option objects."""
        with vcr.use_cassette(str(cassettes_path / "data_import/options.yaml"), mode="once"):
            response = self.client.list_options(self.project_id)
            assert len(response) > 0
            assert all(isinstance(option, Option) for option in response)

    def test_upload(self):
        """Uploading the sample file yields a string upload id."""
        with vcr.use_cassette(str(cassettes_path / "data_import/upload.yaml"), mode="once"):
            upload_id = self.client.upload(self.file_path)
            assert upload_id is not None
            assert isinstance(upload_id, str)

    def test_delete(self):
        """An uploaded file can be deleted; passes if no call raises."""
        with vcr.use_cassette(str(cassettes_path / "data_import/delete.yaml"), mode="once"):
            upload_id = self.client.upload(self.file_path)
            self.client.delete(upload_id)

    def test_ingest(self):
        """Ingesting an uploaded file yields a string task id."""
        with vcr.use_cassette(str(cassettes_path / "data_import/ingest.yaml"), mode="once"):
            upload_id = self.client.upload(self.file_path)
            task_id = self.client.ingest(self.project_id, [upload_id], task="DocumentClassification", format="JSONL")
            assert task_id is not None
            assert isinstance(task_id, str)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
interactions: | ||
- request: | ||
body: !!python/object/new:_io.BytesIO | ||
state: !!python/tuple | ||
- !!binary | | ||
LS0wYzAzNjM2OGU0Yzc0OGI0YmM4ZTMyZjdhNzBlYmU1OQ0KQ29udGVudC1EaXNwb3NpdGlvbjog | ||
Zm9ybS1kYXRhOyBuYW1lPSJmaWxlcG9uZCI7IGZpbGVuYW1lPSJjbGFzc2lmaWNhdGlvbi5qc29u | ||
Ig0KDQp7InRleHQiOiAiZXhhbXBsZUEiLCAibGFiZWxzIjogWyJwb3NpdGl2ZSJdLCAibWV0YSI6 | ||
IHsid2lraVBhZ2VJRCI6IDF9fQp7InRleHQiOiAiZXhhbXBsZUIiLCAibGFiZWxzIjogWyJwb3Np | ||
dGl2ZSIsICJuZWdhdGl2ZSJdLCAibWV0YSI6IHsid2lraVBhZ2VJRCI6IDJ9fQp7InRleHQiOiAi | ||
ZXhhbXBsZUMiLCAibGFiZWxzIjogW10sICJtZXRhIjogeyJ3aWtpUGFnZUlEIjogM319Cg0KLS0w | ||
YzAzNjM2OGU0Yzc0OGI0YmM4ZTMyZjdhNzBlYmU1OS0tDQo= | ||
- 0 | ||
- null | ||
headers: | ||
Accept: | ||
- '*/*' | ||
Accept-Encoding: | ||
- gzip, deflate | ||
Connection: | ||
- keep-alive | ||
Content-Length: | ||
- '377' | ||
Content-Type: | ||
- multipart/form-data; boundary=0c036368e4c748b4bc8e32f7a70ebe59 | ||
Cookie: | ||
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b; | ||
sessionid=fcjn46n6aqmya549fego0q7igo89cxks | ||
User-Agent: | ||
- python-requests/2.28.1 | ||
X-CSRFToken: | ||
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b | ||
referer: | ||
- http://localhost:8000 | ||
method: POST | ||
uri: http://localhost:8000/v1/fp/process/ | ||
response: | ||
body: | ||
string: HmoLcxgwo9QLFHN27DBKgx | ||
headers: | ||
Allow: | ||
- POST, OPTIONS | ||
Connection: | ||
- close | ||
Content-Length: | ||
- '22' | ||
Content-Type: | ||
- text/plain | ||
Cross-Origin-Opener-Policy: | ||
- same-origin | ||
Date: | ||
- Sun, 02 Oct 2022 04:44:37 GMT | ||
Referrer-Policy: | ||
- same-origin | ||
Server: | ||
- gunicorn | ||
Vary: | ||
- Origin, Cookie | ||
X-Content-Type-Options: | ||
- nosniff | ||
X-Frame-Options: | ||
- DENY | ||
status: | ||
code: 200 | ||
message: OK | ||
- request: | ||
body: HmoLcxgwo9QLFHN27DBKgx | ||
headers: | ||
Accept: | ||
- '*/*' | ||
Accept-Encoding: | ||
- gzip, deflate | ||
Connection: | ||
- keep-alive | ||
Content-Length: | ||
- '22' | ||
Content-Type: | ||
- text/plain | ||
Cookie: | ||
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b; | ||
sessionid=fcjn46n6aqmya549fego0q7igo89cxks | ||
User-Agent: | ||
- python-requests/2.28.1 | ||
X-CSRFToken: | ||
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b | ||
referer: | ||
- http://localhost:8000 | ||
method: DELETE | ||
uri: http://localhost:8000/v1/fp/revert/ | ||
response: | ||
body: | ||
string: '' | ||
headers: | ||
Allow: | ||
- DELETE, OPTIONS | ||
Connection: | ||
- close | ||
Content-Length: | ||
- '4' | ||
Cross-Origin-Opener-Policy: | ||
- same-origin | ||
Date: | ||
- Sun, 02 Oct 2022 04:44:37 GMT | ||
Referrer-Policy: | ||
- same-origin | ||
Server: | ||
- gunicorn | ||
Vary: | ||
- Origin, Cookie | ||
X-Content-Type-Options: | ||
- nosniff | ||
X-Frame-Options: | ||
- DENY | ||
status: | ||
code: 204 | ||
message: No Content | ||
version: 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,120 @@ | ||
interactions: | ||
- request: | ||
body: !!python/object/new:_io.BytesIO | ||
state: !!python/tuple | ||
- !!binary | | ||
LS0wOGE2MGVjNzQzYzE0NjM0OTQ4ZjJhZWFmYTJiZWY3Yg0KQ29udGVudC1EaXNwb3NpdGlvbjog | ||
Zm9ybS1kYXRhOyBuYW1lPSJmaWxlcG9uZCI7IGZpbGVuYW1lPSJjbGFzc2lmaWNhdGlvbi5qc29u | ||
Ig0KDQp7InRleHQiOiAiZXhhbXBsZUEiLCAibGFiZWxzIjogWyJwb3NpdGl2ZSJdLCAibWV0YSI6 | ||
IHsid2lraVBhZ2VJRCI6IDF9fQp7InRleHQiOiAiZXhhbXBsZUIiLCAibGFiZWxzIjogWyJwb3Np | ||
dGl2ZSIsICJuZWdhdGl2ZSJdLCAibWV0YSI6IHsid2lraVBhZ2VJRCI6IDJ9fQp7InRleHQiOiAi | ||
ZXhhbXBsZUMiLCAibGFiZWxzIjogW10sICJtZXRhIjogeyJ3aWtpUGFnZUlEIjogM319Cg0KLS0w | ||
OGE2MGVjNzQzYzE0NjM0OTQ4ZjJhZWFmYTJiZWY3Yi0tDQo= | ||
- 0 | ||
- null | ||
headers: | ||
Accept: | ||
- '*/*' | ||
Accept-Encoding: | ||
- gzip, deflate | ||
Connection: | ||
- keep-alive | ||
Content-Length: | ||
- '377' | ||
Content-Type: | ||
- multipart/form-data; boundary=08a60ec743c14634948f2aeafa2bef7b | ||
Cookie: | ||
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b; | ||
sessionid=fcjn46n6aqmya549fego0q7igo89cxks | ||
User-Agent: | ||
- python-requests/2.28.1 | ||
X-CSRFToken: | ||
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b | ||
referer: | ||
- http://localhost:8000 | ||
method: POST | ||
uri: http://localhost:8000/v1/fp/process/ | ||
response: | ||
body: | ||
string: dntzCMUgfA3sFzXXmSjGe6 | ||
headers: | ||
Allow: | ||
- POST, OPTIONS | ||
Connection: | ||
- close | ||
Content-Length: | ||
- '22' | ||
Content-Type: | ||
- text/plain | ||
Cross-Origin-Opener-Policy: | ||
- same-origin | ||
Date: | ||
- Sun, 02 Oct 2022 04:42:25 GMT | ||
Referrer-Policy: | ||
- same-origin | ||
Server: | ||
- gunicorn | ||
Vary: | ||
- Origin, Cookie | ||
X-Content-Type-Options: | ||
- nosniff | ||
X-Frame-Options: | ||
- DENY | ||
status: | ||
code: 200 | ||
message: OK | ||
- request: | ||
body: '{"uploadIds": ["dntzCMUgfA3sFzXXmSjGe6"], "task": "DocumentClassification", | ||
"format": "JSONL"}' | ||
headers: | ||
Accept-Encoding: | ||
- gzip, deflate | ||
Connection: | ||
- keep-alive | ||
Content-Length: | ||
- '94' | ||
Cookie: | ||
- csrftoken=dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b; | ||
sessionid=fcjn46n6aqmya549fego0q7igo89cxks | ||
User-Agent: | ||
- python-requests/2.28.1 | ||
X-CSRFToken: | ||
- dt7iPmKOH48xSezmmypld4izNQtiMVcEA64Qf4a7sSVlTIToGyXjnsQHJRYHuy2b | ||
accept: | ||
- application/json | ||
content-type: | ||
- application/json | ||
referer: | ||
- http://localhost:8000 | ||
method: POST | ||
uri: http://localhost:8000/v1/projects/16/upload | ||
response: | ||
body: | ||
string: '{"task_id":"713c6904-8a07-44bc-8411-333672fb77d5"}' | ||
headers: | ||
Allow: | ||
- POST, OPTIONS | ||
Connection: | ||
- close | ||
Content-Length: | ||
- '50' | ||
Content-Type: | ||
- application/json | ||
Cross-Origin-Opener-Policy: | ||
- same-origin | ||
Date: | ||
- Sun, 02 Oct 2022 04:42:26 GMT | ||
Referrer-Policy: | ||
- same-origin | ||
Server: | ||
- gunicorn | ||
Vary: | ||
- Accept, Origin, Cookie | ||
X-Content-Type-Options: | ||
- nosniff | ||
X-Frame-Options: | ||
- DENY | ||
status: | ||
code: 200 | ||
message: OK | ||
version: 1 |
Oops, something went wrong.