diff --git a/cli/commands/resource/validate.spec.ts b/cli/commands/resource/validate.spec.ts
index 9bd3b432..4dbc78da 100644
--- a/cli/commands/resource/validate.spec.ts
+++ b/cli/commands/resource/validate.spec.ts
@@ -9,9 +9,10 @@ useRecording()
describe("resource validate", () => {
it("should validate a valid resource", async () => {
+ const csvPath = await writeTempFile("id,name\n1,alice\n2,bob")
const resourceContent = JSON.stringify({
name: "test-resource",
- path: "data.csv",
+ path: basename(csvPath),
})
const resourcePath = await writeTempFile(resourceContent)
diff --git a/cli/commands/schema/infer.tsx b/cli/commands/schema/infer.tsx
index dfbd5ff9..ee770f61 100644
--- a/cli/commands/schema/infer.tsx
+++ b/cli/commands/schema/infer.tsx
@@ -79,6 +79,11 @@ export const inferSchemaCommand = new Command("infer")
loadTable(resource, { denormalized: true }),
)
+ if (!table) {
+ session.terminate("Could not load table")
+ process.exit(1)
+ }
+
const inferredSchema = await session.task(
"Inferring schema",
inferSchemaFromTable(table, options),
@@ -86,7 +91,7 @@ export const inferSchemaCommand = new Command("infer")
if (isEmptyObject(inferredSchema)) {
session.terminate("Could not infer schema")
- process.exit(1) // typescript ignore never return type above
+ process.exit(1)
}
await session.render(inferredSchema, )
diff --git a/cli/commands/table/convert.tsx b/cli/commands/table/convert.tsx
index 6f45d941..5c0649d2 100644
--- a/cli/commands/table/convert.tsx
+++ b/cli/commands/table/convert.tsx
@@ -142,6 +142,11 @@ export const convertTableCommand = new Command("convert")
loadTable(resource, options),
)
+ if (!table) {
+ session.terminate("Could not load table")
+ process.exit(1)
+ }
+
if (options.query) {
table = queryTable(table, options.query)
}
diff --git a/cli/commands/table/describe.tsx b/cli/commands/table/describe.tsx
index 63fc4a56..8b5ba6da 100644
--- a/cli/commands/table/describe.tsx
+++ b/cli/commands/table/describe.tsx
@@ -93,6 +93,11 @@ export const describeTableCommand = new Command("describe")
loadTable(resource, options),
)
+ if (!table) {
+ session.terminate("Could not load table")
+ process.exit(1)
+ }
+
if (options.query) {
table = queryTable(table, options.query)
}
diff --git a/cli/commands/table/explore.tsx b/cli/commands/table/explore.tsx
index 39b28687..ee4db088 100644
--- a/cli/commands/table/explore.tsx
+++ b/cli/commands/table/explore.tsx
@@ -92,6 +92,11 @@ export const exploreTableCommand = new Command("explore")
loadTable(resource, { denormalized: true }),
)
+ if (!table) {
+ session.terminate("Could not load table")
+ process.exit(1)
+ }
+
if (!schema && resource.schema) {
schema = await session.task(
"Loading schema",
diff --git a/cli/commands/table/script.tsx b/cli/commands/table/script.tsx
index 4718bad4..c8f5dddf 100644
--- a/cli/commands/table/script.tsx
+++ b/cli/commands/table/script.tsx
@@ -94,6 +94,11 @@ export const scriptTableCommand = new Command("script")
loadTable(resource, options),
)
+ if (!table) {
+ session.terminate("Could not load table")
+ process.exit(1)
+ }
+
if (options.query) {
table = queryTable(table, options.query)
}
diff --git a/cli/commands/table/validate.tsx b/cli/commands/table/validate.tsx
index 7d553c84..386f5dd6 100644
--- a/cli/commands/table/validate.tsx
+++ b/cli/commands/table/validate.tsx
@@ -99,6 +99,11 @@ export const validateTableCommand = new Command("validate")
loadTable(resource, { denormalized: true }),
)
+ if (!table) {
+ session.terminate("Could not load table")
+ process.exit(1)
+ }
+
if (!schema && resource.schema) {
schema = await session.task(
"Loading schema",
diff --git a/cloud/components/Report/Error/Error.tsx b/cloud/components/Report/Error/Error.tsx
index db079bd3..3c2a8471 100644
--- a/cloud/components/Report/Error/Error.tsx
+++ b/cloud/components/Report/Error/Error.tsx
@@ -14,7 +14,7 @@ import {
} from "./Cell.tsx"
import { FieldNameError, FieldTypeError } from "./Field.tsx"
import { FieldsExtraError, FieldsMissingError } from "./Fields.tsx"
-import { BytesError, HashError } from "./File.tsx"
+import { BytesError, EncodingError, HashError } from "./File.tsx"
import { MetadataError } from "./Metadata.tsx"
import { RowUniqueError } from "./Row.tsx"
@@ -31,6 +31,8 @@ export function Error(props: {
      return <BytesError error={props.error} />
    case "file/hash":
      return <HashError error={props.error} />
+    case "file/encoding":
+      return <EncodingError error={props.error} />
    case "fields/missing":
      return <FieldsMissingError error={props.error} />
case "fields/extra":
diff --git a/cloud/components/Report/Error/File.tsx b/cloud/components/Report/Error/File.tsx
index 2e907788..179c1fbb 100644
--- a/cloud/components/Report/Error/File.tsx
+++ b/cloud/components/Report/Error/File.tsx
@@ -35,3 +35,20 @@ export function HashError(props: { error: errorTypes.HashError }) {
)
}
+
+export function EncodingError(props: { error: errorTypes.EncodingError }) {
+ const { t } = useTranslation()
+
+  return (
+    <span>
+      {t("File encoding")} {t("is expected to be")}{" "}
+      <strong>
+        {props.error.encoding}
+      </strong>{" "}
+      {t("but it is actually")}{" "}
+      <strong>
+        {props.error.actualEncoding}
+      </strong>
+    </span>
+  )
+}
diff --git a/cloud/locales/de.json b/cloud/locales/de.json
index 3de71ba1..14b4b813 100644
--- a/cloud/locales/de.json
+++ b/cloud/locales/de.json
@@ -51,6 +51,7 @@
"are not expected": "werden nicht erwartet",
"File size": "Dateigröße",
"File hash": "Datei-Hash",
+ "File encoding": "Datei-Kodierung",
"Field name": "Feldname",
"at": "bei",
"The cell values of the fields": "Die Zellwerte der Felder",
diff --git a/cloud/locales/en.json b/cloud/locales/en.json
index e5d066e5..72ea6d2d 100644
--- a/cloud/locales/en.json
+++ b/cloud/locales/en.json
@@ -51,6 +51,7 @@
"are not expected": "are not expected",
"File size": "File size",
"File hash": "File hash",
+ "File encoding": "File encoding",
"Field name": "Field name",
"at": "at",
"The cell values of the fields": "The cell values of the fields",
diff --git a/cloud/locales/es.json b/cloud/locales/es.json
index c8f6eb9d..9810ec87 100644
--- a/cloud/locales/es.json
+++ b/cloud/locales/es.json
@@ -51,6 +51,7 @@
"are not expected": "no se esperan",
"File size": "Tamaño del archivo",
"File hash": "Hash del archivo",
+ "File encoding": "Codificación del archivo",
"Field name": "Nombre del campo",
"at": "en",
"The cell values of the fields": "Los valores de celda de los campos",
diff --git a/cloud/locales/fr.json b/cloud/locales/fr.json
index cd5a3d7a..81f2078d 100644
--- a/cloud/locales/fr.json
+++ b/cloud/locales/fr.json
@@ -51,6 +51,7 @@
"are not expected": "ne sont pas attendus",
"File size": "Taille du fichier",
"File hash": "Hash du fichier",
+ "File encoding": "Encodage du fichier",
"Field name": "Nom du champ",
"at": "à",
"The cell values of the fields": "Les valeurs de cellule des champs",
diff --git a/cloud/locales/it.json b/cloud/locales/it.json
index 65f9d679..150a4122 100644
--- a/cloud/locales/it.json
+++ b/cloud/locales/it.json
@@ -51,6 +51,7 @@
"are not expected": "non sono previsti",
"File size": "Dimensione file",
"File hash": "Hash file",
+ "File encoding": "Codifica file",
"Field name": "Nome campo",
"at": "a",
"The cell values of the fields": "I valori delle celle dei campi",
diff --git a/cloud/locales/pt.json b/cloud/locales/pt.json
index e1fe124d..f80054a9 100644
--- a/cloud/locales/pt.json
+++ b/cloud/locales/pt.json
@@ -51,6 +51,7 @@
"are not expected": "não são esperados",
"File size": "Tamanho do arquivo",
"File hash": "Hash do arquivo",
+ "File encoding": "Codificação do arquivo",
"Field name": "Nome do campo",
"at": "em",
"The cell values of the fields": "Os valores das células dos campos",
diff --git a/cloud/locales/ru.json b/cloud/locales/ru.json
index 8323b0b2..fac2a1fa 100644
--- a/cloud/locales/ru.json
+++ b/cloud/locales/ru.json
@@ -51,6 +51,7 @@
"are not expected": "не ожидаются",
"File size": "Размер файла",
"File hash": "Хеш файла",
+ "File encoding": "Кодировка файла",
"Field name": "Имя поля",
"at": "в",
"The cell values of the fields": "Значения ячеек полей",
diff --git a/cloud/locales/uk.json b/cloud/locales/uk.json
index 37ada8a0..9bd3e518 100644
--- a/cloud/locales/uk.json
+++ b/cloud/locales/uk.json
@@ -51,6 +51,7 @@
"are not expected": "не очікуються",
"File size": "Розмір файлу",
"File hash": "Хеш файлу",
+ "File encoding": "Кодування файлу",
"Field name": "Ім'я поля",
"at": "в",
"The cell values of the fields": "Значення комірок полів",
diff --git a/core/package/assert.ts b/core/package/assert.ts
index 26d5a44c..d8eb1595 100644
--- a/core/package/assert.ts
+++ b/core/package/assert.ts
@@ -1,7 +1,7 @@
import { AssertionError } from "../error/index.ts"
import type { Descriptor } from "../general/index.ts"
import type { Package } from "./Package.ts"
-import { validatePackageDescriptor } from "./validate.ts"
+import { validatePackageMetadata } from "./validate.ts"
/**
* Assert a Package descriptor (JSON Object) against its profile
@@ -12,10 +12,7 @@ export async function assertPackage(
basepath?: string
},
) {
- const { errors, dataPackage } = await validatePackageDescriptor(
- source,
- options,
- )
+ const { errors, dataPackage } = await validatePackageMetadata(source, options)
if (!dataPackage) throw new AssertionError(errors)
return dataPackage
diff --git a/core/package/fixtures/generated/validatePackageMetadata-should-validate-camtrap-dp-144_2984677073/recording.har b/core/package/fixtures/generated/validatePackageMetadata-should-validate-camtrap-dp-144_2984677073/recording.har
new file mode 100644
index 00000000..7ae6732d
--- /dev/null
+++ b/core/package/fixtures/generated/validatePackageMetadata-should-validate-camtrap-dp-144_2984677073/recording.har
@@ -0,0 +1,156 @@
+{
+ "log": {
+ "_recordingName": "validatePackageMetadata-should validate camtrap dp (#144)",
+ "creator": {
+ "comment": "persister:fs",
+ "name": "Polly.JS",
+ "version": "6.0.6"
+ },
+ "entries": [
+ {
+ "_id": "4306f83bec3be19c1183804f9d081277",
+ "_order": 0,
+ "cache": {},
+ "request": {
+ "bodySize": 0,
+ "cookies": [],
+ "headers": [],
+ "headersSize": 109,
+ "httpVersion": "HTTP/1.1",
+ "method": "GET",
+ "queryString": [],
+ "url": "https://raw.githubusercontent.com/tdwg/camtrap-dp/refs/tags/1.0.2/example/datapackage.json"
+ },
+ "response": {
+ "bodySize": 2603,
+ "content": {
+ "mimeType": "text/plain; charset=utf-8",
+ "size": 2603,
+          "text": "{\n  \"resources\": [\n    {\n      \"name\": \"deployments\",\n      \"path\": \"deployments.csv\",\n      \"profile\": \"tabular-data-resource\",\n      \"format\": \"csv\",\n      \"mediatype\": \"text/csv\",\n      \"encoding\": \"utf-8\",\n      \"schema\": \"https://raw.githubusercontent.com/tdwg/camtrap-dp/1.0.2/deployments-table-schema.json\"\n    },\n    {\n      \"name\": \"media\",\n      \"path\": \"media.csv\",\n      \"profile\": \"tabular-data-resource\",\n      \"format\": \"csv\",\n      \"mediatype\": \"text/csv\",\n      \"encoding\": \"utf-8\",\n      \"schema\": \"https://raw.githubusercontent.com/tdwg/camtrap-dp/1.0.2/media-table-schema.json\"\n    },\n    {\n      \"name\": \"observations\",\n      \"path\": \"observations.csv\",\n      \"profile\": \"tabular-data-resource\",\n      \"format\": \"csv\",\n      \"mediatype\": \"text/csv\",\n      \"encoding\": \"utf-8\",\n      \"schema\": \"https://raw.githubusercontent.com/tdwg/camtrap-dp/1.0.2/observations-table-schema.json\"\n    },\n    {\n      \"name\": \"individuals\",\n      \"description\": \"Custom table/resource not part of the Camtrap DP model. Included to showcase that extending with more resources is possible.\",\n      \"data\": [\n        {\n          \"id\": 1,\n          \"individualName\": \"Reinaert\",\n          \"scientificName\": \"Vulpes vulpes\"\n        }\n      ]\n    }\n  ],\n  \"profile\": \"https://raw.githubusercontent.com/tdwg/camtrap-dp/1.0.2/camtrap-dp-profile.json\",\n  \"name\": \"camtrap-dp-example-dataset\",\n  \"id\": \"7cca70f5-ef8c-4f86-85fb-8f070937d7ab\",\n  \"created\": \"2023-02-06T11:23:03Z\",\n  \"title\": \"Sample from: MICA - Muskrat and coypu camera trap observations in Belgium, the Netherlands and Germany\",\n  \"contributors\": [\n    {\n      \"title\": \"Axel Neukermans\",\n      \"email\": \"axel.neukermans@inbo.be\",\n      \"path\": \"https://orcid.org/0000-0003-0272-9180\",\n      \"role\": \"contributor\",\n      \"organization\": \"Research Institute for Nature and Forest (INBO)\"\n    },\n    {\n      \"title\": \"Danny Van der beeck\",\n      \"email\": \"daniel.vanderbeeck@gmail.com\"\n    },\n    {\n      \"title\": \"Emma Cartuyvels\",\n      \"email\": \"emma.cartuyvels@inbo.be\",\n      \"role\": \"principalInvestigator\",\n      \"organization\": \"Research Institute for Nature and Forest (INBO)\"\n    },\n    {\n      \"title\": \"Peter Desmet\",\n      \"email\": \"peter.desmet@inbo.be\",\n      \"path\": \"https://orcid.org/0000-0002-8442-8025\",\n      \"role\": \"contact\",\n      \"organization\": \"Research Institute for Nature and Forest (INBO)\"\n    },\n    {\n      \"title\": \"Research Institute for Nature and Forest (INBO)\",\n      \"path\": \"https://inbo.be\",\n      \"role\": \"rightsHolder\"\n    },\n    {\n      \"title\": \"Research Institute for Nature and Forest (INBO)\",\n      \"path\": \"https://inbo.be\",\n      \"role\": \"publisher\"\n    }\n  ],\n  \"description\": \"MICA - Muskrat and coypu camera trap observations in Belgium, the Netherlands and Germany is an occurrence dataset published by the Research Institute of Nature and Forest (INBO). It is part of the LIFE project MICA, in which innovative techniques are tested for a more efficient control of muskrat and coypu populations, both invasive species. This dataset is a sample of the original dataset and serves as an example of a Camera Trap Data Package (Camtrap DP).\",\n  \"version\": \"1.0.2\",\n  \"keywords\": [\n    \"camera traps\",\n    \"public awareness campaign\",\n    \"flood protection\",\n    \"flood control\",\n    \"damage prevention\",\n    \"animal damage\",\n    \"pest control\",\n    \"invasive alien species\",\n    \"muskrat\",\n    \"coypu\"\n  ],\n  \"image\": \"\",\n  \"homepage\": \"https://camtrap-dp.tdwg.org/example/\",\n  \"sources\": [\n    {\n      \"title\": \"Agouti\",\n      \"path\": \"https://www.agouti.eu\",\n      \"email\": \"agouti@wur.nl\",\n      \"version\": \"v3.21\"\n    }\n  ],\n  \"licenses\": [\n    {\n      \"name\": \"CC0-1.0\",\n      \"scope\": \"data\"\n    },\n    {\n      \"path\": \"http://creativecommons.org/licenses/by/4.0/\",\n      \"scope\": \"media\"\n    }\n  ],\n  \"bibliographicCitation\": \"Desmet P, Neukermans A, Van der beeck D, Cartuyvels E (2022). Sample from: MICA - Muskrat and coypu camera trap observations in Belgium, the Netherlands and Germany. Version 1.0.2. Research Institute for Nature and Forest (INBO). Dataset. https://camtrap-dp.tdwg.org/example/\",\n  \"project\": {\n    \"id\": \"86cabc14-d475-4439-98a7-e7b590bed60e\",\n    \"title\": \"Management of Invasive Coypu and muskrAt in Europe\",\n    \"acronym\": \"MICA\",\n    \"description\": \"Invasive alien species such as the coypu and muskrat pose a major threat to biodiversity and cost millions of euros annually. By feeding on rushes and reeds, these animals cause serious damage to the environment in which they live and endangered species suffer from habitat loss. The disappearance of reeds and digging in dikes represents a safety risk for humans in the lowland areas. With the LIFE project MICA (), the partners from the participating countries want to develop a transnational plan for the management of coypu and muskrat populations in Europe and aim to reduce their population. The objective of an effective population control of coypu and muskrat is to protect lowlands from flooding, to prevent crop damage and loss of biodiversity. The objective of the project is to serve as a pilot and demonstration project in which ‘best practices’ are tested and new techniques are developed for a more efficient control of muskrat and coypu populations. By involving organisations from Belgium, Germany and the Netherlands, the project also promotes international cooperation and knowledge exchange in the field of muskrat and coypu management.\",\n    \"samplingDesign\": \"targeted\",\n    \"path\": \"https://lifemica.eu\",\n    \"captureMethod\": [\n      \"activityDetection\",\n      \"timeLapse\"\n    ],\n    \"individualAnimals\": false,\n    \"observationLevel\": [\n      \"media\",\n      \"event\"\n    ]\n  },\n  \"coordinatePrecision\": 0.001,\n  \"spatial\": {\n    \"type\": \"Polygon\",\n    \"coordinates\": [\n      [\n        [\n          4.013,\n          50.699\n        ],\n        [\n          5.659,\n          50.699\n        ],\n        [\n          5.659,\n          51.496\n        ],\n        [\n          4.013,\n          51.496\n        ],\n        [\n          4.013,\n          50.699\n        ]\n      ]\n    ]\n  },\n  \"temporal\": {\n    \"start\": \"2020-05-30\",\n    \"end\": \"2021-04-18\"\n  },\n  \"taxonomic\": [\n    {\n      \"scientificName\": \"Anas platyrhynchos\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/DGP6\",\n      \"taxonRank\": \"species\",\n      \"vernacularNames\": {\n        \"eng\": \"mallard\",\n        \"nld\": \"wilde eend\"\n      }\n    },\n    {\n      \"scientificName\": \"Anas strepera\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/DGPL\",\n      \"taxonRank\": \"species\",\n      \"vernacularNames\": {\n        \"eng\": \"gadwall\",\n        \"nld\": \"krakeend\"\n      }\n    },\n    {\n      \"scientificName\": \"Ardea\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/32FH\",\n      \"taxonRank\": \"genus\",\n      \"vernacularNames\": {\n        \"eng\": \"great herons\",\n        \"nld\": \"reigers\"\n      }\n    },\n    {\n      \"scientificName\": \"Ardea cinerea\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/GCHS\",\n      \"taxonRank\": \"species\",\n      \"vernacularNames\": {\n        \"eng\": \"grey heron\",\n        \"nld\": \"blauwe reiger\"\n      }\n    },\n    {\n      \"scientificName\": \"Aves\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/V2\",\n      \"taxonRank\": \"class\",\n      \"vernacularNames\": {\n        \"eng\": \"bird sp.\",\n        \"nld\": \"vogel\"\n      }\n    },\n    {\n      \"scientificName\": \"Homo sapiens\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/6MB3T\",\n      \"taxonRank\": \"species\",\n      \"vernacularNames\": {\n        \"eng\": \"human\",\n        \"nld\": \"mens\"\n      }\n    },\n    {\n      \"scientificName\": \"Martes foina\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/3Y9VW\",\n      \"taxonRank\": \"species\",\n      \"vernacularNames\": {\n        \"eng\": \"beech marten\",\n        \"nld\": \"steenmarter\"\n      }\n    },\n    {\n      \"scientificName\": \"Mustela putorius\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/44QYC\",\n      \"taxonRank\": \"species\",\n      \"vernacularNames\": {\n        \"eng\": \"European polecat\",\n        \"nld\": \"bunzing\"\n      }\n    },\n    {\n      \"scientificName\": \"Rattus norvegicus\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/4RM67\",\n      \"taxonRank\": \"species\",\n      \"vernacularNames\": {\n        \"eng\": \"brown rat\",\n        \"nld\": \"bruine rat\"\n      }\n    },\n    {\n      \"scientificName\": \"Vulpes vulpes\",\n      \"taxonID\": \"https://www.checklistbank.org/dataset/COL2023/taxon/5BSG3\",\n      \"taxonRank\": \"species\",\n      \"vernacularNames\": {\n        \"eng\": \"red fox\",\n        \"nld\": \"vos\"\n      }\n    }\n  ],\n  \"relatedIdentifiers\": [\n    {\n      \"relationType\": \"IsDerivedFrom\",\n      \"relatedIdentifier\": \"https://doi.org/10.15468/5tb6ze\",\n      \"resourceTypeGeneral\": \"Dataset\",\n      \"relatedIdentifierType\": \"DOI\"\n    },\n    {\n      \"relationType\": \"IsSupplementTo\",\n      \"relatedIdentifier\": \"https://inbo.github.io/camtrapdp/\",\n      \"resourceTypeGeneral\": \"Software\",\n      \"relatedIdentifierType\": \"URL\"\n    }\n  ],\n  \"references\": [ ]\n}\n"
+ },
+ "cookies": [],
+ "headers": [
+ {
+ "name": "accept-ranges",
+ "value": "bytes"
+ },
+ {
+ "name": "access-control-allow-origin",
+ "value": "*"
+ },
+ {
+ "name": "cache-control",
+ "value": "max-age=300"
+ },
+ {
+ "name": "connection",
+ "value": "keep-alive"
+ },
+ {
+ "name": "content-encoding",
+ "value": "gzip"
+ },
+ {
+ "name": "content-length",
+ "value": "2603"
+ },
+ {
+ "name": "content-security-policy",
+ "value": "default-src 'none'; style-src 'unsafe-inline'; sandbox"
+ },
+ {
+ "name": "content-type",
+ "value": "text/plain; charset=utf-8"
+ },
+ {
+ "name": "cross-origin-resource-policy",
+ "value": "cross-origin"
+ },
+ {
+ "name": "date",
+ "value": "Fri, 24 Oct 2025 12:04:41 GMT"
+ },
+ {
+ "name": "etag",
+ "value": "W/\"568d81aa4b4f1148ac5358387a8367b5853f76cb76dfb282d001809dbe43e173\""
+ },
+ {
+ "name": "expires",
+ "value": "Fri, 24 Oct 2025 12:09:41 GMT"
+ },
+ {
+ "name": "source-age",
+ "value": "0"
+ },
+ {
+ "name": "strict-transport-security",
+ "value": "max-age=31536000"
+ },
+ {
+ "name": "vary",
+ "value": "Authorization,Accept-Encoding"
+ },
+ {
+ "name": "via",
+ "value": "1.1 varnish"
+ },
+ {
+ "name": "x-cache",
+ "value": "MISS"
+ },
+ {
+ "name": "x-cache-hits",
+ "value": "0"
+ },
+ {
+ "name": "x-content-type-options",
+ "value": "nosniff"
+ },
+ {
+ "name": "x-fastly-request-id",
+ "value": "c27c933a46fb9b15459264fba7573facffd24599"
+ },
+ {
+ "name": "x-frame-options",
+ "value": "deny"
+ },
+ {
+ "name": "x-github-request-id",
+ "value": "54DE:431DC:29FED3:2E50D4:68FB6B59"
+ },
+ {
+ "name": "x-served-by",
+ "value": "cache-lis1490035-LIS"
+ },
+ {
+ "name": "x-timer",
+ "value": "S1761307482.672912,VS0,VE181"
+ },
+ {
+ "name": "x-xss-protection",
+ "value": "1; mode=block"
+ }
+ ],
+ "headersSize": 902,
+ "httpVersion": "HTTP/1.1",
+ "redirectURL": "",
+ "status": 200,
+ "statusText": "OK"
+ },
+ "startedDateTime": "2025-10-24T12:04:41.482Z",
+ "time": 398,
+ "timings": {
+ "blocked": -1,
+ "connect": -1,
+ "dns": -1,
+ "receive": 0,
+ "send": 0,
+ "ssl": -1,
+ "wait": 398
+ }
+ }
+ ],
+ "pages": [],
+ "version": "1.2"
+ }
+}
diff --git a/core/package/index.ts b/core/package/index.ts
index 924098f7..0d6e37bc 100644
--- a/core/package/index.ts
+++ b/core/package/index.ts
@@ -2,8 +2,11 @@ export type { Package } from "./Package.ts"
export { assertPackage } from "./assert.ts"
export { loadPackageDescriptor } from "./load.ts"
export { savePackageDescriptor } from "./save.ts"
-export { validatePackageDescriptor } from "./validate.ts"
+export { validatePackageMetadata } from "./validate.ts"
export { convertPackageFromDescriptor } from "./convert/fromDescriptor.ts"
export { convertPackageToDescriptor } from "./convert/toDescriptor.ts"
export type { Contributor } from "./Contributor.ts"
export { mergePackages } from "./merge.ts"
+
+// TODO: Remove in v2
+export { validatePackageMetadata as validatePackageDescriptor } from "./validate.ts"
diff --git a/core/package/validate.spec.ts b/core/package/validate.spec.ts
index efbf54a7..4f54e79e 100644
--- a/core/package/validate.spec.ts
+++ b/core/package/validate.spec.ts
@@ -1,11 +1,11 @@
import { useRecording } from "@dpkit/test"
import { describe, expect, it } from "vitest"
import { loadDescriptor } from "../general/index.ts"
-import { validatePackageDescriptor } from "./validate.ts"
+import { validatePackageMetadata } from "./validate.ts"
useRecording()
-describe("validatePackageDescriptor", () => {
+describe("validatePackageMetadata", () => {
it("returns valid result for valid package", async () => {
const descriptor = {
name: "example-package",
@@ -17,7 +17,7 @@ describe("validatePackageDescriptor", () => {
],
}
- const { valid, errors } = await validatePackageDescriptor(descriptor)
+ const { valid, errors } = await validatePackageMetadata(descriptor)
expect(valid).toBe(true)
expect(errors).toEqual([])
@@ -29,7 +29,7 @@ describe("validatePackageDescriptor", () => {
resources: "not-an-array", // Should be an array
}
- const { valid, errors } = await validatePackageDescriptor(descriptor)
+ const { valid, errors } = await validatePackageMetadata(descriptor)
expect(valid).toBe(false)
expect(errors.length).toBeGreaterThan(0)
@@ -46,7 +46,7 @@ describe("validatePackageDescriptor", () => {
"https://raw.githubusercontent.com/tdwg/camtrap-dp/refs/tags/1.0.2/example/datapackage.json",
)
- const { valid } = await validatePackageDescriptor(descriptor)
+ const { valid } = await validatePackageMetadata(descriptor)
expect(valid).toBe(true)
})
})
diff --git a/core/package/validate.ts b/core/package/validate.ts
index 40236bc5..ef36e28f 100644
--- a/core/package/validate.ts
+++ b/core/package/validate.ts
@@ -8,7 +8,7 @@ const DEFAULT_PROFILE = "https://datapackage.org/profiles/1.0/datapackage.json"
/**
* Validate a Package descriptor (JSON Object) against its profile
*/
-export async function validatePackageDescriptor(
+export async function validatePackageMetadata(
source: Descriptor | Package,
options?: {
basepath?: string
diff --git a/core/resource/assert.ts b/core/resource/assert.ts
index f30207a7..a05c0ae1 100644
--- a/core/resource/assert.ts
+++ b/core/resource/assert.ts
@@ -1,7 +1,7 @@
import { AssertionError } from "../error/index.ts"
import type { Descriptor } from "../general/index.ts"
import type { Resource } from "./Resource.ts"
-import { validateResourceDescriptor } from "./validate.ts"
+import { validateResourceMetadata } from "./validate.ts"
/**
* Assert a Resource descriptor (JSON Object) against its profile
@@ -12,7 +12,7 @@ export async function assertResource(
basepath?: string
},
) {
- const { errors, resource } = await validateResourceDescriptor(source, options)
+ const { errors, resource } = await validateResourceMetadata(source, options)
if (!resource) throw new AssertionError(errors)
return resource
diff --git a/core/resource/index.ts b/core/resource/index.ts
index b5b8abe1..085b83e2 100644
--- a/core/resource/index.ts
+++ b/core/resource/index.ts
@@ -3,7 +3,7 @@ export { inferResourceName, inferResourceFormat } from "./infer.ts"
export { assertResource } from "./assert.ts"
export { loadResourceDescriptor } from "./load.ts"
export { saveResourceDescriptor } from "./save.ts"
-export { validateResourceDescriptor } from "./validate.ts"
+export { validateResourceMetadata } from "./validate.ts"
export { convertResourceFromDescriptor } from "./convert/fromDescriptor.ts"
export { convertResourceToDescriptor } from "./convert/toDescriptor.ts"
export type { Source } from "./Source.ts"
@@ -11,3 +11,6 @@ export type { License } from "./License.ts"
export { loadResourceDialect } from "./dialect.ts"
export { loadResourceSchema } from "./schema.ts"
export { isRemoteResource } from "./helpers.ts"
+
+// TODO: Remove in v2
+export { validateResourceMetadata as validateResourceDescriptor } from "./validate.ts"
diff --git a/core/resource/validate.spec.ts b/core/resource/validate.spec.ts
index a6bf646d..04a06d73 100644
--- a/core/resource/validate.spec.ts
+++ b/core/resource/validate.spec.ts
@@ -1,7 +1,7 @@
import { describe, expect, it } from "vitest"
-import { validateResourceDescriptor } from "./validate.ts"
+import { validateResourceMetadata } from "./validate.ts"
-describe("validateResourceDescriptor", () => {
+describe("validateResourceMetadata", () => {
it("returns valid result for valid resource", async () => {
const descriptor = {
name: "example-resource",
@@ -10,7 +10,7 @@ describe("validateResourceDescriptor", () => {
encoding: "utf-8",
}
- const result = await validateResourceDescriptor(descriptor)
+ const result = await validateResourceMetadata(descriptor)
expect(result.valid).toBe(true)
expect(result.errors).toEqual([])
@@ -22,7 +22,7 @@ describe("validateResourceDescriptor", () => {
path: true, // Should be a string or array of strings
}
- const result = await validateResourceDescriptor(invalidResource)
+ const result = await validateResourceMetadata(invalidResource)
expect(result.valid).toBe(false)
expect(result.errors.length).toBeGreaterThan(0)
diff --git a/core/resource/validate.ts b/core/resource/validate.ts
index 10cc2efa..7ef1d3eb 100644
--- a/core/resource/validate.ts
+++ b/core/resource/validate.ts
@@ -11,7 +11,7 @@ const DEFAULT_PROFILE = "https://datapackage.org/profiles/1.0/dataresource.json"
/**
* Validate a Resource descriptor (JSON Object) against its profile
*/
-export async function validateResourceDescriptor(
+export async function validateResourceMetadata(
source: Descriptor | Resource,
options?: {
basepath?: string
diff --git a/csv/table/load.ts b/csv/table/load.ts
index f5d61c85..b451dc66 100644
--- a/csv/table/load.ts
+++ b/csv/table/load.ts
@@ -71,6 +71,15 @@ function getScanOptions(resource: Partial<Resource>, dialect?: Dialect) {
if (resource.encoding) {
options.encoding = resource.encoding
+
+ // Polars supports only utf-8 and utf-8-lossy encodings
+ if (options.encoding === "utf-8") {
+ options.encoding = "utf8"
+ }
+
+ if (options.encoding !== "utf8") {
+ throw new Error(`Encoding ${options.encoding} for CSV files is not supported`)
+ }
}
options.skipRows = getRowsToSkip(dialect)
diff --git a/file/error/Encoding.ts b/file/error/Encoding.ts
new file mode 100644
index 00000000..57217338
--- /dev/null
+++ b/file/error/Encoding.ts
@@ -0,0 +1,7 @@
+import type { BaseFileError } from "./Base.ts"
+
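+// Emitted when the declared file encoding does not match the encoding detected from the file content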
+export interface EncodingError extends BaseFileError {
+ type: "file/encoding"
+ encoding: string
+ actualEncoding: string
+}
diff --git a/file/error/File.ts b/file/error/File.ts
index 997380d7..d079a57b 100644
--- a/file/error/File.ts
+++ b/file/error/File.ts
@@ -1,4 +1,5 @@
import type { BytesError } from "./Bytes.ts"
+import type { EncodingError } from "./Encoding.ts"
import type { HashError } from "./Hash.ts"
-export type FileError = BytesError | HashError
+export type FileError = BytesError | HashError | EncodingError
diff --git a/file/error/index.ts b/file/error/index.ts
index 1e4f6ce1..547d82e2 100644
--- a/file/error/index.ts
+++ b/file/error/index.ts
@@ -1,3 +1,4 @@
export type * from "./File.ts"
export type * from "./Bytes.ts"
+export type * from "./Encoding.ts"
export type * from "./Hash.ts"
diff --git a/file/file/infer.spec.ts b/file/file/infer.spec.ts
index 0068483c..aca85bad 100644
--- a/file/file/infer.spec.ts
+++ b/file/file/infer.spec.ts
@@ -4,71 +4,71 @@ import { inferFileBytes, inferFileEncoding, inferFileHash } from "./infer.ts"
import { writeTempFile } from "./temp.ts"
vi.mock("./fetch.ts", () => ({
- prefetchFile: vi.fn(),
+ prefetchFiles: vi.fn(),
}))
describe("inferFileHash", () => {
- let mockPrefetchFile: ReturnType
+ let mockPrefetchFiles: ReturnType
let tempFilePath: string
beforeEach(async () => {
- mockPrefetchFile = vi.mocked(fetchModule.prefetchFile)
+ mockPrefetchFiles = vi.mocked(fetchModule.prefetchFiles)
tempFilePath = await writeTempFile("Hello, World!")
vi.clearAllMocks()
})
it("should compute sha256 hash by default", async () => {
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await inferFileHash("https://example.com/file.txt")
- expect(mockPrefetchFile).toHaveBeenCalledWith(
+ expect(mockPrefetchFiles).toHaveBeenCalledWith(
"https://example.com/file.txt",
)
expect(result).toMatch(/^sha256:[a-f0-9]{64}$/)
})
it("should compute md5 hash when specified", async () => {
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await inferFileHash("https://example.com/file.txt", {
hashType: "md5",
})
- expect(mockPrefetchFile).toHaveBeenCalledWith(
+ expect(mockPrefetchFiles).toHaveBeenCalledWith(
"https://example.com/file.txt",
)
expect(result).toMatch(/^md5:[a-f0-9]{32}$/)
})
it("should compute sha1 hash when specified", async () => {
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await inferFileHash("https://example.com/file.txt", {
hashType: "sha1",
})
- expect(mockPrefetchFile).toHaveBeenCalledWith(
+ expect(mockPrefetchFiles).toHaveBeenCalledWith(
"https://example.com/file.txt",
)
expect(result).toMatch(/^sha1:[a-f0-9]{40}$/)
})
it("should compute sha512 hash when specified", async () => {
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await inferFileHash("https://example.com/file.txt", {
hashType: "sha512",
})
- expect(mockPrefetchFile).toHaveBeenCalledWith(
+ expect(mockPrefetchFiles).toHaveBeenCalledWith(
"https://example.com/file.txt",
)
expect(result).toMatch(/^sha512:[a-f0-9]{128}$/)
})
it("should compute consistent hashes for same content", async () => {
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result1 = await inferFileHash("https://example.com/file.txt")
const result2 = await inferFileHash("https://example.com/file.txt")
@@ -78,20 +78,20 @@ describe("inferFileHash", () => {
})
describe("inferFileBytes", () => {
- let mockPrefetchFile: ReturnType
+ let mockPrefetchFiles: ReturnType
beforeEach(() => {
- mockPrefetchFile = vi.mocked(fetchModule.prefetchFile)
+ mockPrefetchFiles = vi.mocked(fetchModule.prefetchFiles)
vi.clearAllMocks()
})
it("should return file size in bytes", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await inferFileBytes("https://example.com/file.txt")
- expect(mockPrefetchFile).toHaveBeenCalledWith(
+ expect(mockPrefetchFiles).toHaveBeenCalledWith(
"https://example.com/file.txt",
)
expect(result).toBe(13)
@@ -99,7 +99,7 @@ describe("inferFileBytes", () => {
it("should handle empty files", async () => {
const tempFilePath = await writeTempFile("")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await inferFileBytes("https://example.com/empty.txt")
@@ -108,7 +108,7 @@ describe("inferFileBytes", () => {
it("should handle larger files", async () => {
const tempFilePath = await writeTempFile("x".repeat(10000))
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await inferFileBytes("https://example.com/large.txt")
@@ -119,11 +119,11 @@ describe("inferFileBytes", () => {
const tempFilePath = await writeTempFile(
Buffer.from([0xff, 0xd8, 0xff, 0xe0]),
)
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await inferFileBytes("https://example.com/file.bin")
- expect(mockPrefetchFile).toHaveBeenCalledWith(
+ expect(mockPrefetchFiles).toHaveBeenCalledWith(
"https://example.com/file.bin",
)
expect(result).toBe(4)
diff --git a/file/file/infer.ts b/file/file/infer.ts
index 6aaaf087..0f5acb98 100644
--- a/file/file/infer.ts
+++ b/file/file/infer.ts
@@ -1,38 +1,52 @@
import { stat } from "node:fs/promises"
import chardet from "chardet"
-import { hashFile } from "hasha"
+import * as hasha from "hasha"
import { isBinaryFile } from "isbinaryfile"
-import { prefetchFile } from "./fetch.ts"
+import pMap from "p-map"
+import { concatFileStreams } from "../stream/concat.ts"
+import { loadFileStream } from "../stream/index.ts"
+import { prefetchFiles } from "./fetch.ts"
import { loadFile } from "./load.ts"
export type HashType = "md5" | "sha1" | "sha256" | "sha512"
+export async function inferFileBytes(path: string | string[]) {
+ const localPaths = await prefetchFiles(path)
+
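+  // A resource may span multiple files (multipart); report the combined size of all parts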
+ let bytes = 0
+ for (const localPath of localPaths) {
+ const result = await stat(localPath)
+ bytes += result.size
+ }
+
+ return bytes
+}
+
export async function inferFileHash(
- path: string,
+ path: string | string[],
options?: { hashType?: HashType },
) {
- const localPath = await prefetchFile(path)
const algorithm = options?.hashType ?? "sha256"
+ const localPaths = await prefetchFiles(path)
- const result = await hashFile(localPath, { algorithm })
- return `${algorithm}:${result}`
-}
-
-export async function inferFileBytes(path: string) {
- const localPath = await prefetchFile(path)
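+  // Concatenate all parts into one stream so a multipart resource hashes as a single logical file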
+ const streams = await pMap(localPaths, async path => loadFileStream(path))
+ const stream = concatFileStreams(streams)
- const result = await stat(localPath)
- return result.size
+ const hash = await hasha.hash(stream, { algorithm })
+ return `${algorithm}:${hash}`
}
export async function inferFileEncoding(
- path: string,
+ path: string | string[],
options?: { sampleBytes?: number; confidencePercent?: number },
) {
const maxBytes = options?.sampleBytes ?? 10_000
- const confidencePercent = options?.confidencePercent ?? 75
+ const confidencePercent = options?.confidencePercent ?? 80
+
+ const firstPath = Array.isArray(path) ? path[0] : path
+ if (!firstPath) return undefined
- const buffer = await loadFile(path, { maxBytes })
+ const buffer = await loadFile(firstPath, { maxBytes })
const isBinary = await isBinaryFile(buffer)
if (!isBinary) {
diff --git a/file/file/validate.spec.ts b/file/file/validate.spec.ts
index fb8e79fe..e6e219a5 100644
--- a/file/file/validate.spec.ts
+++ b/file/file/validate.spec.ts
@@ -5,24 +5,24 @@ import { writeTempFile } from "./temp.ts"
import { validateFile } from "./validate.ts"
vi.mock("./fetch.ts", () => ({
- prefetchFile: vi.fn(),
+ prefetchFiles: vi.fn(),
}))
describe("validateFile", () => {
- let mockPrefetchFile: ReturnType
+ let mockPrefetchFiles: ReturnType
beforeEach(() => {
- mockPrefetchFile = vi.mocked(fetchModule.prefetchFile)
+ mockPrefetchFiles = vi.mocked(fetchModule.prefetchFiles)
vi.clearAllMocks()
})
it("should return valid result when no validation options provided", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await validateFile("https://example.com/file.txt")
- expect(mockPrefetchFile).toHaveBeenCalledWith(
+ expect(mockPrefetchFiles).toHaveBeenCalledWith(
"https://example.com/file.txt",
)
expect(result).toEqual({ valid: true, errors: [] })
@@ -30,7 +30,7 @@ describe("validateFile", () => {
it("should validate bytes successfully when they match", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await validateFile("https://example.com/file.txt", {
bytes: 13,
@@ -41,7 +41,7 @@ describe("validateFile", () => {
it("should return error when bytes do not match", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await validateFile("https://example.com/file.txt", {
bytes: 1024,
@@ -58,7 +58,7 @@ describe("validateFile", () => {
it("should validate hash successfully when it matches", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const actualHash = await inferFileHash(tempFilePath, { hashType: "md5" })
@@ -71,7 +71,7 @@ describe("validateFile", () => {
it("should return error when hash does not match", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const actualHash = await inferFileHash(tempFilePath, { hashType: "md5" })
@@ -90,7 +90,7 @@ describe("validateFile", () => {
it("should validate sha256 hash", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const actualHash = await inferFileHash(tempFilePath, { hashType: "sha256" })
@@ -103,7 +103,7 @@ describe("validateFile", () => {
it("should validate sha1 hash", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const actualHash = await inferFileHash(tempFilePath, { hashType: "sha1" })
@@ -116,7 +116,7 @@ describe("validateFile", () => {
it("should validate sha512 hash", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const actualHash = await inferFileHash(tempFilePath, { hashType: "sha512" })
@@ -129,7 +129,7 @@ describe("validateFile", () => {
it("should validate both bytes and hash when both match", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const actualHash = await inferFileHash(tempFilePath, { hashType: "md5" })
@@ -143,7 +143,7 @@ describe("validateFile", () => {
it("should return multiple errors when both bytes and hash do not match", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const actualHash = await inferFileHash(tempFilePath, { hashType: "md5" })
@@ -168,7 +168,7 @@ describe("validateFile", () => {
it("should return error when only bytes mismatch", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const actualHash = await inferFileHash(tempFilePath, { hashType: "md5" })
@@ -184,7 +184,7 @@ describe("validateFile", () => {
it("should return error when only hash mismatch", async () => {
const tempFilePath = await writeTempFile("Hello, World!")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await validateFile("https://example.com/file.txt", {
bytes: 13,
@@ -198,17 +198,17 @@ describe("validateFile", () => {
it("should handle local file paths", async () => {
const tempFilePath = await writeTempFile("x".repeat(2048))
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await validateFile("/local/path/file.txt", { bytes: 2048 })
- expect(mockPrefetchFile).toHaveBeenCalledWith("/local/path/file.txt")
+ expect(mockPrefetchFiles).toHaveBeenCalledWith("/local/path/file.txt")
expect(result).toEqual({ valid: true, errors: [] })
})
it("should handle empty file validation", async () => {
const tempFilePath = await writeTempFile("")
- mockPrefetchFile.mockResolvedValue(tempFilePath)
+ mockPrefetchFiles.mockResolvedValue([tempFilePath])
const result = await validateFile("https://example.com/empty.txt", {
bytes: 0,
diff --git a/file/file/validate.ts b/file/file/validate.ts
index fc892dc6..ea0e5cbb 100644
--- a/file/file/validate.ts
+++ b/file/file/validate.ts
@@ -1,16 +1,16 @@
import type { FileError } from "../error/index.ts"
-import { prefetchFile } from "./fetch.ts"
-import { inferFileBytes, inferFileHash } from "./infer.ts"
+import { prefetchFiles } from "./fetch.ts"
+import { inferFileBytes, inferFileEncoding, inferFileHash } from "./infer.ts"
export async function validateFile(
- path: string,
- options?: { bytes?: number; hash?: string },
+ path?: string | string[],
+ options?: { bytes?: number; hash?: string; encoding?: string },
) {
const errors: FileError[] = []
- const localPath = await prefetchFile(path)
+ const localPaths = await prefetchFiles(path)
if (options?.bytes) {
- const bytes = await inferFileBytes(localPath)
+ const bytes = await inferFileBytes(localPaths)
if (bytes !== options.bytes) {
errors.push({
type: "file/bytes",
@@ -22,9 +22,10 @@ export async function validateFile(
if (options?.hash) {
const [_hashValue, hashType = "md5"] = options.hash.split(":").toReversed()
- // TODO: figure out how we should handle other hash types
- // @ts-ignore
- const hash = await inferFileHash(localPath, { hashType })
+ const hash = await inferFileHash(localPaths, {
+ hashType: hashType as any,
+ })
+
if (hash !== options.hash) {
errors.push({
type: "file/hash",
@@ -34,6 +35,18 @@ export async function validateFile(
}
}
+ if (options?.encoding) {
+ const encoding = await inferFileEncoding(localPaths)
+
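+    // inferFileEncoding may return undefined (e.g. for binary files); only report detected mismatches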
+ if (encoding && encoding !== options.encoding) {
+ errors.push({
+ type: "file/encoding",
+ encoding: options.encoding,
+ actualEncoding: encoding,
+ })
+ }
+ }
+
const valid = errors.length === 0
return { valid, errors }
}
diff --git a/file/package.json b/file/package.json
index a5899235..c99ced6f 100644
--- a/file/package.json
+++ b/file/package.json
@@ -29,7 +29,12 @@
"exit-hook": "^4.0.0",
"hasha": "^6.0.0",
"isbinaryfile": "^5.0.4",
+ "multistream": "^4.1.0",
+ "p-map": "^7.0.3",
"tempy": "3.1.0",
"tiny-invariant": "^1.3.3"
+ },
+ "devDependencies": {
+ "@types/multistream": "4.1.3"
}
}
diff --git a/file/stream/concat.ts b/file/stream/concat.ts
new file mode 100644
index 00000000..4ee77fa8
--- /dev/null
+++ b/file/stream/concat.ts
@@ -0,0 +1,6 @@
+import type { Readable } from "node:stream"
+import { default as Multistream } from "multistream"
+
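+// Chain the readable streams end-to-end so multipart files can be read as one sequential stream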
+export function concatFileStreams(streams: Readable[]) {
+ return new Multistream(streams)
+}
diff --git a/file/stream/index.ts b/file/stream/index.ts
index 471bbcc2..9bb335db 100644
--- a/file/stream/index.ts
+++ b/file/stream/index.ts
@@ -1,2 +1,3 @@
+export { concatFileStreams } from "./concat.ts"
export { loadFileStream } from "./load.ts"
export { saveFileStream } from "./save.ts"
diff --git a/file/stream/load.ts b/file/stream/load.ts
index 5a426130..4a96e43d 100644
--- a/file/stream/load.ts
+++ b/file/stream/load.ts
@@ -3,7 +3,7 @@ import { Readable, Transform } from "node:stream"
import { isRemotePath } from "@dpkit/core"
export async function loadFileStream(
- pathOrPaths: string | string[],
+ path: string | string[],
options?: {
index?: number
maxBytes?: number
@@ -11,17 +11,17 @@ export async function loadFileStream(
) {
const index = options?.index ?? 0
- const paths = Array.isArray(pathOrPaths) ? pathOrPaths : [pathOrPaths]
- const path = paths[index]
+ const paths = Array.isArray(path) ? path : [path]
+ const indexPath = paths[index]
- if (!path) {
- throw new Error(`Cannot stream resource ${path} at index ${index}`)
+ if (!indexPath) {
+ throw new Error(`Cannot stream resource ${indexPath} at index ${index}`)
}
- const isRemote = isRemotePath(path)
+ const isRemote = isRemotePath(indexPath)
const stream = isRemote
- ? await loadRemoteFileStream(path, options)
- : await loadLocalFileStream(path, options)
+ ? await loadRemoteFileStream(indexPath, options)
+ : await loadLocalFileStream(indexPath, options)
return stream
}
diff --git a/lib/package/fixtures/issue-153/datapackage.json b/lib/package/fixtures/issue-153/datapackage.json
new file mode 100644
index 00000000..44a2ca9f
--- /dev/null
+++ b/lib/package/fixtures/issue-153/datapackage.json
@@ -0,0 +1,223 @@
+{
+ "name": "example_package",
+ "id": "115f49c1-8603-463e-a908-68de98327266",
+ "licenses": [
+ {
+ "name": "CC0-1.0",
+ "path": "https://creativecommons.org/publicdomain/zero/1.0/",
+ "title": "CC0 1.0"
+ }
+ ],
+ "version": "1.0",
+ "created": "2021-03-02T17:22:33Z",
+ "spatial": null,
+ "temporal": {
+ "start": "2020-01-01",
+ "end": "2021-01-10"
+ },
+ "resources": [
+ {
+ "name": "deployments",
+ "path": "deployments.csv",
+ "profile": "tabular-data-resource",
+ "title": "Camera trap deployments",
+ "format": "csv",
+ "mediatype": "text/csv",
+ "encoding": "utf-8",
+ "schema": {
+ "fields": [
+ {
+ "name": "deployment_id",
+ "type": "string",
+ "constraints": {
+ "required": true,
+ "unique": true
+ }
+ },
+ {
+ "name": "longitude",
+ "type": "number",
+ "constraints": {
+ "required": true,
+ "minimum": -180,
+ "maximum": 180
+ }
+ },
+ {
+ "name": "latitude",
+ "constraints": {
+ "required": true
+ }
+ },
+ {
+ "name": "start",
+ "type": "date",
+ "format": "%x",
+ "constraints": {
+ "required": true
+ }
+ },
+ {
+ "name": "comments",
+ "type": "string",
+ "constraints": {
+ "required": false
+ }
+ }
+ ],
+ "missingValues": ["", "NA", "NaN"],
+ "primaryKey": "deployment_id"
+ }
+ },
+ {
+ "name": "observations",
+ "path": ["observations_1.tsv", "observations_2.tsv"],
+ "profile": "tabular-data-resource",
+ "title": "Camera trap observations",
+ "format": "csv",
+ "mediatype": "text/csv",
+ "encoding": "utf-8",
+ "dialect": {
+ "delimiter": "\t"
+ },
+ "schema": {
+ "fields": [
+ {
+ "name": "observation_id",
+ "type": "string",
+ "constraints": {
+ "required": true,
+ "unique": true
+ }
+ },
+ {
+ "name": "deployment_id",
+ "type": "string",
+ "constraints": {
+ "required": true
+ }
+ },
+ {
+ "name": "timestamp",
+ "type": "datetime",
+ "format": "%Y-%m-%dT%H:%M:%S%Z",
+ "constraints": {
+ "required": true
+ }
+ },
+ {
+ "name": "scientific_name",
+ "type": "string",
+ "constraints": {
+ "required": false
+ }
+ },
+ {
+ "name": "count",
+ "type": "integer",
+ "constraints": {
+ "required": false,
+ "minimum": 1
+ }
+ },
+ {
+ "name": "life_stage",
+ "type": "string",
+ "constraints": {
+ "required": false,
+ "enum": ["adult", "subadult", "juvenile", "offspring", "unknown"]
+ }
+ },
+ {
+ "name": "comments",
+ "type": "string",
+ "constraints": {
+ "required": false
+ }
+ }
+ ],
+ "missingValues": ["", "NA", "NaN"],
+ "primaryKey": "observation_id",
+ "foreignKeys": [
+ {
+ "fields": "deployment_id",
+ "reference": {
+ "resource": "deployments",
+ "fields": "deployment_id"
+ }
+ }
+ ]
+ }
+ },
+ {
+ "name": "media",
+ "data": [
+ {
+ "media_id": "aed5fa71-3ed4-4284-a6ba-3550d1a4de8d",
+ "deployment_id": "1",
+ "observation_id": "1-1",
+ "timestamp": "2020-09-28 02:14:59+02:00",
+ "file_path": "https://multimedia.agouti.eu/assets/aed5fa71-3ed4-4284-a6ba-3550d1a4de8d/file"
+ },
+ {
+ "media_id": "da81a501-8236-4cbd-aa95-4bc4b10a05df",
+ "deployment_id": "1",
+ "observation_id": "1-1",
+ "timestamp": "2020-09-28 02:15:00+02:00",
+ "file_path": "https://multimedia.agouti.eu/assets/da81a501-8236-4cbd-aa95-4bc4b10a05df/file"
+ },
+ {
+ "media_id": "0ba57608-3cf1-49d6-a5a2-fe680851024d",
+ "deployment_id": "1",
+ "observation_id": "1-1",
+ "timestamp": "2020-09-28 02:15:01+02:00",
+ "file_path": "https://multimedia.agouti.eu/assets/0ba57608-3cf1-49d6-a5a2-fe680851024d/file"
+ }
+ ],
+ "profile": "tabular-data-resource",
+ "title": "Camera trap media files",
+ "schema": {
+ "fields": [
+ {
+ "name": "media_id",
+ "type": "string"
+ },
+ {
+ "name": "deployment_id",
+ "type": "string"
+ },
+ {
+ "name": "observation_id",
+ "type": "string"
+ },
+ {
+ "name": "timestamp",
+ "type": "datetime",
+ "format": "%Y-%m-%d %H:%M:%S%z"
+ },
+ {
+ "name": "file_path",
+ "type": "string"
+ }
+ ],
+ "primaryKey": "media_id",
+ "foreignKeys": [
+ {
+ "fields": "deployment_id",
+ "reference": {
+ "resource": "deployments",
+ "fields": "deployment_id"
+ }
+ },
+ {
+ "fields": "observation_id",
+ "reference": {
+ "resource": "observations",
+ "fields": "observation_id"
+ }
+ }
+ ]
+ }
+ }
+ ]
+}
diff --git a/lib/package/fixtures/issue-153/deployments.csv b/lib/package/fixtures/issue-153/deployments.csv
new file mode 100644
index 00000000..9197708e
--- /dev/null
+++ b/lib/package/fixtures/issue-153/deployments.csv
@@ -0,0 +1,4 @@
+deployment_id,longitude,latitude,start,comments
+1,4.61612,50.76698,09/25/20,
+2,4.64286,50.82716,10/01/20,"On ""forêt"" road."
+3,bad,50.81860,10/05/20,"Malfunction/no photos, data"
diff --git a/lib/package/fixtures/issue-153/observations_1.tsv b/lib/package/fixtures/issue-153/observations_1.tsv
new file mode 100644
index 00000000..047d10b5
--- /dev/null
+++ b/lib/package/fixtures/issue-153/observations_1.tsv
@@ -0,0 +1,4 @@
+observation_id deployment_id timestamp scientific_name count life_stage comments
+1-1 1 2020-09-28T00:13:07Z Capreolus capreolus 1 juvenile Comment 1
+1-2 1 2020-09-28T15:59:17Z Capreolus capreolus 1 adult Comment 2
+1-3 1 2020-09-28T16:35:23Z Lepus europaeus 1 adult Comment 3
diff --git a/lib/package/fixtures/issue-153/observations_2.tsv b/lib/package/fixtures/issue-153/observations_2.tsv
new file mode 100644
index 00000000..883ec4b3
--- /dev/null
+++ b/lib/package/fixtures/issue-153/observations_2.tsv
@@ -0,0 +1,6 @@
+observation_id deployment_id timestamp scientific_name count life_stage comments
+1-4 1 2020-09-28T17:04:04Z Lepus europaeus 1 adult NA
+1-5 1 2020-09-28T19:19:54Z Sus scrofa 2 unknown NA
+2-1 2 2021-10-01T01:25:06Z Sus scrofa 1 unknown Duplicate
+2-2 2 2021-10-01T01:25:06Z Sus scrofa 1 unknown Duplicate
+2-3 2 2021-10-01T04:47:30Z Sus scrofa 1 unknown NA
diff --git a/lib/package/index.ts b/lib/package/index.ts
index 39f8f5db..9786294d 100644
--- a/lib/package/index.ts
+++ b/lib/package/index.ts
@@ -1,4 +1,4 @@
export { loadPackage } from "./load.ts"
export { savePackage } from "./save.ts"
export { inferPackage } from "./infer.ts"
-export { validatePackage } from "./validate.ts"
+export { validatePackage, validatePackageData } from "./validate.ts"
diff --git a/lib/package/validate.spec.ts b/lib/package/validate.spec.ts
index 8f787ada..ef097e63 100644
--- a/lib/package/validate.spec.ts
+++ b/lib/package/validate.spec.ts
@@ -170,4 +170,21 @@ describe("validatePackage", () => {
expect(error.resource).toBe("error-resource")
})
})
+
+ it("should detect bad cell type (issue-153)", async () => {
+ const dataPackage = "lib/package/fixtures/issue-153/datapackage.json"
+
+ const result = await validatePackage(dataPackage)
+
+ expect(result.valid).toBe(false)
+ expect(result.errors).toEqual([
+ {
+ rowNumber: 3,
+ type: "cell/type",
+ fieldName: "longitude",
+ cell: "bad",
+ resource: "deployments",
+ },
+ ])
+ })
})
diff --git a/lib/package/validate.ts b/lib/package/validate.ts
index 56941400..cf8f2109 100644
--- a/lib/package/validate.ts
+++ b/lib/package/validate.ts
@@ -1,10 +1,7 @@
import type { Descriptor, Package } from "@dpkit/core"
import { loadDescriptor, validatePackageDescriptor } from "@dpkit/core"
import { dpkit } from "../plugin.ts"
-import { validateResource } from "../resource/index.ts"
-
-// TODO: Improve implementation
-// TODO: Support multipart resources? (clarify on the specs level)
+import { validateResourceData } from "../resource/index.ts"
export async function validatePackage(
  source: string | Descriptor | Partial<Package>,
@@ -43,15 +40,24 @@ export async function validatePackage(
}
}
- const resourceErrors = (
+ return await validatePackageData(dataPackage)
+}
+
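+// Validate the data of every resource in parallel, tagging each error with its resource name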
+export async function validatePackageData(dataPackage: Package) {
+ const errors = (
await Promise.all(
dataPackage.resources.map(async resource => {
- const { errors } = await validateResource(resource)
- return errors.map(error => ({ ...error, resource: resource.name }))
+ try {
+ const { errors } = await validateResourceData(resource)
+ return errors.map(error => ({ ...error, resource: resource.name }))
+ } catch (error) {
+ const message = error instanceof Error ? error.message : String(error)
+ throw new Error(`[${resource.name}] ${message}`)
+ }
}),
)
).flat()
- const resourceValid = !resourceErrors.length
- return { valid: resourceValid, errors: resourceErrors }
+ const valid = !errors.length
+ return { valid, errors: errors }
}
diff --git a/lib/resource/index.ts b/lib/resource/index.ts
index ea494c50..1f1da403 100644
--- a/lib/resource/index.ts
+++ b/lib/resource/index.ts
@@ -1,2 +1,2 @@
export { inferResource } from "./infer.ts"
-export { validateResource } from "./validate.ts"
+export { validateResource, validateResourceData } from "./validate.ts"
diff --git a/lib/resource/validate.ts b/lib/resource/validate.ts
index fae3a089..ec74b4e4 100644
--- a/lib/resource/validate.ts
+++ b/lib/resource/validate.ts
@@ -1,14 +1,12 @@
import type { Descriptor, Resource } from "@dpkit/core"
import { loadResourceSchema } from "@dpkit/core"
-import { loadDescriptor, validateResourceDescriptor } from "@dpkit/core"
+import { loadDescriptor, validateResourceMetadata } from "@dpkit/core"
import { validateFile } from "@dpkit/file"
import { validateTable } from "@dpkit/table"
import type { InferSchemaOptions } from "@dpkit/table"
import { inferSchema } from "../schema/index.ts"
import { loadTable } from "../table/index.ts"
-// TODO: Support multipart resources? (clarify on the specs level)
-
export async function validateResource(
  source: string | Descriptor | Partial<Resource>,
options?: InferSchemaOptions & { basepath?: string },
@@ -22,7 +20,7 @@ export async function validateResource(
basepath = result.basepath
}
- const { valid, errors, resource } = await validateResourceDescriptor(
+ const { valid, errors, resource } = await validateResourceMetadata(
descriptor,
{ basepath },
)
@@ -31,24 +29,33 @@ export async function validateResource(
return { valid, errors }
}
- if (resource.bytes || resource.hash) {
- if (typeof resource.path === "string") {
- return await validateFile(resource.path, {
- bytes: resource.bytes,
- hash: resource.hash,
- })
- }
+ return await validateResourceData(resource, options)
+}
+
+export async function validateResourceData(
+  resource: Partial<Resource>,
+ options?: InferSchemaOptions,
+) {
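+  // Check file integrity (bytes/hash/encoding) first; skip table validation if the file is broken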
+ const fileReport = await validateFile(resource.path, {
+ bytes: resource.bytes,
+ hash: resource.hash,
+ encoding: resource.encoding,
+ })
+
+ if (!fileReport.valid) {
+ return fileReport
}
- try {
- // TODO: rebase on not-rasing?
- // It will raise if the resource is not a table
+ const table = await loadTable(resource, { denormalized: true })
+ if (table) {
let schema = await loadResourceSchema(resource.schema)
if (!schema) schema = await inferSchema(resource, options)
+ const tableReport = await validateTable(table, { schema })
- const table = await loadTable(resource, { denormalized: true })
- return await validateTable(table, { schema })
- } catch {}
+ if (!tableReport.valid) {
+ return tableReport
+ }
+ }
return { valid: true, errors: [] }
}
diff --git a/lib/schema/infer.ts b/lib/schema/infer.ts
index bd8b7f8b..aa2f1b85 100644
--- a/lib/schema/infer.ts
+++ b/lib/schema/infer.ts
@@ -1,4 +1,4 @@
-import type { Resource, Schema } from "@dpkit/core"
+import type { Resource } from "@dpkit/core"
import type { InferSchemaOptions } from "@dpkit/table"
import { inferSchemaFromTable } from "@dpkit/table"
import { dpkit } from "../plugin.ts"
@@ -7,7 +7,7 @@ import { loadTable } from "../table/index.ts"
export async function inferSchema(
  resource: Partial<Resource>,
options?: InferSchemaOptions,
-): Promise<Schema> {
+) {
for (const plugin of dpkit.plugins) {
const schema = await plugin.inferSchema?.(resource, options)
if (schema) {
@@ -16,6 +16,10 @@ export async function inferSchema(
}
const table = await loadTable(resource, { denormalized: true })
+ if (!table) {
+ return undefined
+ }
+
const schema = await inferSchemaFromTable(table, options)
return schema
}
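
Callers of `inferSchema` must now handle the non-tabular case explicitly (sketch; the resource literal is an assumption):

```ts
import { inferSchema } from "./lib/schema/index.ts"

const schema = await inferSchema({ name: "logo", path: "logo.png" })
if (!schema) {
  // no plugin produced a schema and the resource is not tabular
}
```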
diff --git a/lib/table/infer.ts b/lib/table/infer.ts
index 42970275..dc25105e 100644
--- a/lib/table/infer.ts
+++ b/lib/table/infer.ts
@@ -1,15 +1,10 @@
-import type { Dialect, Resource, Schema } from "@dpkit/core"
+import type { Resource } from "@dpkit/core"
import { loadResourceDialect, loadResourceSchema } from "@dpkit/core"
-import type { Table } from "@dpkit/table"
import { inferSchemaFromTable } from "@dpkit/table"
import { inferDialect } from "../dialect/index.ts"
import { loadTable } from "./load.ts"
-// TODO: Allow non-tabular resources returning undefined?
-
-export async function inferTable(
- resource: Partial<Resource>,
-): Promise<{ dialect: Dialect; schema: Schema; table: Table }> {
+export async function inferTable(resource: Partial<Resource>) {
let dialect = await loadResourceDialect(resource.dialect)
if (!dialect) {
dialect = await inferDialect(resource)
@@ -20,6 +15,10 @@ export async function inferTable(
{ denormalized: true },
)
+ if (!table) {
+ return undefined
+ }
+
let schema = await loadResourceSchema(resource.schema)
if (!schema) {
schema = await inferSchemaFromTable(table)
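
`inferTable` gets the same `undefined` contract (sketch; the resource literal is an assumption):

```ts
import { inferTable } from "./lib/table/infer.ts"

const inferred = await inferTable({ path: "data.csv" })
if (inferred) {
  const { dialect, schema, table } = inferred
  // dialect/schema come from the descriptor or are inferred from the table
}
```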
diff --git a/lib/table/load.ts b/lib/table/load.ts
index 029e6dc7..4dcd484b 100644
--- a/lib/table/load.ts
+++ b/lib/table/load.ts
@@ -1,11 +1,11 @@
import type { Resource } from "@dpkit/core"
-import type { LoadTableOptions, Table } from "@dpkit/table"
+import type { LoadTableOptions } from "@dpkit/table"
import { dpkit } from "../plugin.ts"
export async function loadTable(
  resource: Partial<Resource>,
options?: LoadTableOptions,
-): Promise<Table> {
+) {
for (const plugin of dpkit.plugins) {
const table = await plugin.loadTable?.(resource, options)
if (table) {
@@ -13,5 +13,5 @@ export async function loadTable(
}
}
- throw new Error(`No plugin can load the table: ${resource}`)
+ return undefined
}
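
`loadTable` no longer throws when no plugin matches, so callers branch on `undefined` instead of wrapping the call in try/catch (sketch; the resource literal is an assumption):

```ts
import { loadTable } from "./lib/table/load.ts"

const table = await loadTable({ path: "data.unknown" })
if (!table) {
  // no plugin recognized the resource; degrade gracefully
}
```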
diff --git a/lib/table/save.ts b/lib/table/save.ts
index 204ed4ec..2332c080 100644
--- a/lib/table/save.ts
+++ b/lib/table/save.ts
@@ -9,5 +9,5 @@ export async function saveTable(table: Table, options: SaveTableOptions) {
}
}
- throw new Error(`No plugin can save the table to the path: ${options.path}`)
+ throw new Error(`No plugin can save the table: ${options.path}`)
}
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index 682983d2..26c67621 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -434,12 +434,22 @@ importers:
isbinaryfile:
specifier: ^5.0.4
version: 5.0.6
+ multistream:
+ specifier: ^4.1.0
+ version: 4.1.0
+ p-map:
+ specifier: ^7.0.3
+ version: 7.0.3
tempy:
specifier: 3.1.0
version: 3.1.0
tiny-invariant:
specifier: ^1.3.3
version: 1.3.3
+ devDependencies:
+ '@types/multistream':
+ specifier: 4.1.3
+ version: 4.1.3
folder:
dependencies:
@@ -2661,6 +2671,9 @@ packages:
'@types/ms@2.1.0':
resolution: {integrity: sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==}
+ '@types/multistream@4.1.3':
+ resolution: {integrity: sha512-t57vmDEJOZuC0M3IrZYfCd9wolTcr3ZTCGk1iwHNosvgBX+7/SMvCGcR8wP9lidpelBZQ12crSuINOxkk0azPA==}
+
'@types/nlcst@2.0.3':
resolution: {integrity: sha512-vSYNSDe6Ix3q+6Z7ri9lyWqgGhJTmzRjZRqyq15N0Z/1/UnVsno9G/N40NBijoYx2seFDIl0+B2mgAb9mezUCA==}
@@ -4608,6 +4621,9 @@ packages:
ms@2.1.3:
resolution: {integrity: sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==}
+ multistream@4.1.0:
+ resolution: {integrity: sha512-J1XDiAmmNpRCBfIWJv+n0ymC4ABcf/Pl+5YvC5B/D2f/2+8PtHvCNxMPKiQcZyi922Hq69J2YOpb1pTywfifyw==}
+
mysql2@3.15.1:
resolution: {integrity: sha512-WZMIRZstT2MFfouEaDz/AGFnGi1A2GwaDe7XvKTdRJEYiAHbOrh4S3d8KFmQeh11U85G+BFjIvS1Di5alusZsw==}
engines: {node: '>= 8.0'}
@@ -4882,6 +4898,9 @@ packages:
resolution: {integrity: sha512-737ZY3yNnXy37FHkQxPzt4UZ2UWPWiCZWLvFZ4fu5cueciegX0zGPnrlY6bwRg4FdQOe9YU8MkmJwGhoMybl8A==}
engines: {node: '>= 0.8'}
+ once@1.4.0:
+ resolution: {integrity: sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==}
+
onetime@5.1.2:
resolution: {integrity: sha512-kbpaSSGJTWdAY5KPVeMOKXSrPtr8C8C7wodJbcsd51jRnmD+GZu8Y0VoU6Dm5Z4vWr0Ig/1NKuWRKf7j5aaYSg==}
engines: {node: '>=6'}
@@ -5377,6 +5396,10 @@ packages:
readable-stream@2.3.8:
resolution: {integrity: sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==}
+ readable-stream@3.6.2:
+ resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==}
+ engines: {node: '>= 6'}
+
readdirp@3.6.0:
resolution: {integrity: sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==}
engines: {node: '>=8.10.0'}
@@ -6523,6 +6546,9 @@ packages:
resolution: {integrity: sha512-42AtmgqjV+X1VpdOfyTGOYRi0/zsoLqtXQckTmqTeybT+BDIbM/Guxo7x3pE2vtpr1ok6xRqM9OpBe+Jyoqyww==}
engines: {node: '>=18'}
+ wrappy@1.0.2:
+ resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==}
+
ws@7.5.10:
resolution: {integrity: sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==}
engines: {node: '>=8.3.0'}
@@ -8578,6 +8604,10 @@ snapshots:
'@types/ms@2.1.0': {}
+ '@types/multistream@4.1.3':
+ dependencies:
+ '@types/node': 24.2.0
+
'@types/nlcst@2.0.3':
dependencies:
'@types/unist': 3.0.3
@@ -11047,6 +11077,11 @@ snapshots:
ms@2.1.3: {}
+ multistream@4.1.0:
+ dependencies:
+ once: 1.4.0
+ readable-stream: 3.6.2
+
mysql2@3.15.1:
dependencies:
aws-ssl-profiles: 1.1.2
@@ -11236,6 +11271,10 @@ snapshots:
on-headers@1.1.0: {}
+ once@1.4.0:
+ dependencies:
+ wrappy: 1.0.2
+
onetime@5.1.2:
dependencies:
mimic-fn: 2.1.0
@@ -11702,6 +11741,12 @@ snapshots:
string_decoder: 1.1.1
util-deprecate: 1.0.2
+ readable-stream@3.6.2:
+ dependencies:
+ inherits: 2.0.4
+ string_decoder: 1.1.1
+ util-deprecate: 1.0.2
+
readdirp@3.6.0:
dependencies:
picomatch: 2.3.1
@@ -12912,6 +12957,8 @@ snapshots:
string-width: 7.2.0
strip-ansi: 7.1.2
+ wrappy@1.0.2: {}
+
ws@7.5.10: {}
ws@8.18.0: {}
diff --git a/table/field/denormalize.ts b/table/field/denormalize.ts
new file mode 100644
index 00000000..e0c3ec2e
--- /dev/null
+++ b/table/field/denormalize.ts
@@ -0,0 +1,23 @@
+import type { Field } from "@dpkit/core"
+import { col, lit, when } from "nodejs-polars"
+import type { Expr } from "nodejs-polars"
+import { stringifyField } from "./stringify.ts"
+
+const DEFAULT_MISSING_VALUE = ""
+
+export function denormalizeField(field: Field, expr?: Expr) {
+ expr = expr ?? col(field.name)
+ expr = stringifyField(field, expr)
+
+ const flattenMissingValues = field.missingValues?.map(it =>
+ typeof it === "string" ? it : it.value,
+ )
+
+ const missingValue = flattenMissingValues?.[0] ?? DEFAULT_MISSING_VALUE
+ expr = when(expr.isNull())
+ .then(lit(missingValue))
+ .otherwise(expr)
+ .alias(field.name)
+
+ return expr
+}
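
A behavior sketch for the new helper (the field literal and values are assumptions; the polars calls are the ones this file uses):

```ts
import pl from "nodejs-polars"
import { denormalizeField } from "./table/field/denormalize.ts"

const field = { name: "age", type: "integer", missingValues: ["n/a"] }
const df = pl.DataFrame({ age: [1, null, 3] })
// stringify the column, then write "n/a" (the first declared missing
// value, or "" by default) wherever the cell is null
const out = df.select(denormalizeField(field as any))
```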
diff --git a/table/field/index.ts b/table/field/index.ts
index c0ecec8e..990d8eb1 100644
--- a/table/field/index.ts
+++ b/table/field/index.ts
@@ -1,5 +1,7 @@
+export { denormalizeField } from "./denormalize.ts"
export { parseField } from "./parse.ts"
export { validateField } from "./validate.ts"
export { matchField } from "./match.ts"
+export { normalizeField } from "./normalize.ts"
export { stringifyField } from "./stringify.ts"
export type { PolarsField } from "./Field.ts"
diff --git a/table/field/normalize.ts b/table/field/normalize.ts
new file mode 100644
index 00000000..f1329b67
--- /dev/null
+++ b/table/field/normalize.ts
@@ -0,0 +1,31 @@
+import type { Field } from "@dpkit/core"
+import { col, lit, when } from "nodejs-polars"
+import type { Expr } from "nodejs-polars"
+import { parseField } from "./parse.ts"
+
+const DEFAULT_MISSING_VALUES = [""]
+
+export function normalizeField(
+ field: Field,
+ expr?: Expr,
+ options?: { dontParse?: boolean },
+) {
+ expr = expr ?? col(field.name)
+
+ const flattenMissingValues =
+ field.missingValues?.map(it => (typeof it === "string" ? it : it.value)) ??
+ DEFAULT_MISSING_VALUES
+
+ if (flattenMissingValues.length) {
+ expr = when(expr.isIn(flattenMissingValues))
+ .then(lit(null))
+ .otherwise(expr)
+ .alias(field.name)
+ }
+
+ if (options?.dontParse) {
+ return expr
+ }
+
+ return parseField(field, expr)
+}
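
The inverse helper: declared missing values become null, then `parseField` converts the type unless `dontParse` is set (sketch; the field literal is an assumption):

```ts
import pl from "nodejs-polars"
import { normalizeField } from "./table/field/normalize.ts"

const field = { name: "age", type: "integer", missingValues: ["n/a"] }
const df = pl.DataFrame({ age: ["1", "n/a", "3"] })
const typed = df.select(normalizeField(field as any))
const stillStrings = df.select(
  normalizeField(field as any, undefined, { dontParse: true }),
)
```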
diff --git a/table/field/parse.ts b/table/field/parse.ts
index 0ac6c831..64a8eb2d 100644
--- a/table/field/parse.ts
+++ b/table/field/parse.ts
@@ -1,5 +1,5 @@
import type { Field } from "@dpkit/core"
-import { col, lit, when } from "nodejs-polars"
+import { col } from "nodejs-polars"
import type { Expr } from "nodejs-polars"
import { parseArrayField } from "./types/array.ts"
import { parseBooleanField } from "./types/boolean.ts"
@@ -17,22 +17,9 @@ import { parseTimeField } from "./types/time.ts"
import { parseYearField } from "./types/year.ts"
import { parseYearmonthField } from "./types/yearmonth.ts"
-const DEFAULT_MISSING_VALUES = [""]
-
export function parseField(field: Field, expr?: Expr) {
expr = expr ?? col(field.name)
- const flattenMissingValues =
- field.missingValues?.map(it => (typeof it === "string" ? it : it.value)) ??
- DEFAULT_MISSING_VALUES
-
- if (flattenMissingValues.length) {
- expr = when(expr.isIn(flattenMissingValues))
- .then(lit(null))
- .otherwise(expr)
- .alias(field.name)
- }
-
switch (field.type) {
case "array":
return parseArrayField(field, expr)
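
After this change `parseField` is type conversion only; missing-value handling moved up into `normalizeField`. A sketch of the division of labor (the field literal is an assumption):

```ts
import { normalizeField, parseField } from "./table/field/index.ts"

const field = { name: "n", type: "integer" } as any
const typeOnly = parseField(field)  // no missing-value substitution anymore
const full = normalizeField(field)  // nulls missing values, then parses
```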
diff --git a/table/field/stringify.ts b/table/field/stringify.ts
index 6b62ad0b..f2d2ad8c 100644
--- a/table/field/stringify.ts
+++ b/table/field/stringify.ts
@@ -1,5 +1,5 @@
import type { Field } from "@dpkit/core"
-import { col, lit, when } from "nodejs-polars"
+import { col } from "nodejs-polars"
import type { Expr } from "nodejs-polars"
import { stringifyArrayField } from "./types/array.ts"
import { stringifyBooleanField } from "./types/boolean.ts"
@@ -17,68 +17,41 @@ import { stringifyTimeField } from "./types/time.ts"
import { stringifyYearField } from "./types/year.ts"
import { stringifyYearmonthField } from "./types/yearmonth.ts"
-const DEFAULT_MISSING_VALUE = ""
-
export function stringifyField(field: Field, expr?: Expr) {
expr = expr ?? col(field.name)
switch (field.type) {
case "array":
- expr = stringifyArrayField(field, expr)
- break
+ return stringifyArrayField(field, expr)
case "boolean":
- expr = stringifyBooleanField(field, expr)
- break
+ return stringifyBooleanField(field, expr)
case "date":
- expr = stringifyDateField(field, expr)
- break
+ return stringifyDateField(field, expr)
case "datetime":
- expr = stringifyDatetimeField(field, expr)
- break
+ return stringifyDatetimeField(field, expr)
case "duration":
- expr = stringifyDurationField(field, expr)
- break
+ return stringifyDurationField(field, expr)
case "geojson":
- expr = stringifyGeojsonField(field, expr)
- break
+ return stringifyGeojsonField(field, expr)
case "geopoint":
- expr = stringifyGeopointField(field, expr)
- break
+ return stringifyGeopointField(field, expr)
case "integer":
- expr = stringifyIntegerField(field, expr)
- break
+ return stringifyIntegerField(field, expr)
case "list":
- expr = stringifyListField(field, expr)
- break
+ return stringifyListField(field, expr)
case "number":
- expr = stringifyNumberField(field, expr)
- break
+ return stringifyNumberField(field, expr)
case "object":
- expr = stringifyObjectField(field, expr)
- break
+ return stringifyObjectField(field, expr)
case "string":
- expr = stringifyStringField(field, expr)
- break
+ return stringifyStringField(field, expr)
case "time":
- expr = stringifyTimeField(field, expr)
- break
+ return stringifyTimeField(field, expr)
case "year":
- expr = stringifyYearField(field, expr)
- break
+ return stringifyYearField(field, expr)
case "yearmonth":
- expr = stringifyYearmonthField(field, expr)
- break
+ return stringifyYearmonthField(field, expr)
+ default:
+ return expr
}
-
- const flattenMissingValues = field.missingValues?.map(it =>
- typeof it === "string" ? it : it.value,
- )
-
- const missingValue = flattenMissingValues?.[0] ?? DEFAULT_MISSING_VALUE
- expr = when(expr.isNull())
- .then(lit(missingValue))
- .otherwise(expr)
- .alias(field.name)
-
- return expr
}
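
The symmetric split on the write side: `stringifyField` is now a pure type-to-string dispatch, and `denormalizeField` layers the missing-value fill on top (sketch; the field literal is an assumption):

```ts
import { denormalizeField, stringifyField } from "./table/field/index.ts"

const field = { name: "flag", type: "boolean" } as any
const asText = stringifyField(field)     // stringification only
const forDisk = denormalizeField(field)  // stringify, then null -> missing value
```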
diff --git a/table/table/denormalize.ts b/table/table/denormalize.ts
index 29ffa762..f39be815 100644
--- a/table/table/denormalize.ts
+++ b/table/table/denormalize.ts
@@ -1,7 +1,7 @@
import type { Field, Schema } from "@dpkit/core"
import { col, lit } from "nodejs-polars"
import type { Expr } from "nodejs-polars"
-import { stringifyField } from "../field/index.ts"
+import { denormalizeField } from "../field/index.ts"
import type { PolarsSchema } from "../schema/index.ts"
import { getPolarsSchema } from "../schema/index.ts"
import type { Table } from "./Table.ts"
@@ -40,10 +40,11 @@ export function denormalizeFields(
if (polarsField) {
expr = col(polarsField.name).alias(field.name)
+ // TODO: Move this logic to denormalizeField?
if (!nativeTypes?.includes(field.type ?? "any")) {
const missingValues = field.missingValues ?? schema.missingValues
const mergedField = { ...field, missingValues }
- expr = stringifyField(mergedField, expr)
+ expr = denormalizeField(mergedField, expr)
}
}
diff --git a/table/table/normalize.ts b/table/table/normalize.ts
index 2121af3d..39cded33 100644
--- a/table/table/normalize.ts
+++ b/table/table/normalize.ts
@@ -3,7 +3,7 @@ import type { Expr } from "nodejs-polars"
import { DataType } from "nodejs-polars"
import { col, lit } from "nodejs-polars"
import { matchField } from "../field/index.ts"
-import { parseField } from "../field/index.ts"
+import { normalizeField } from "../field/index.ts"
import { getPolarsSchema } from "../schema/index.ts"
import type { PolarsSchema } from "../schema/index.ts"
import type { Table } from "./Table.ts"
@@ -14,16 +14,16 @@ export async function normalizeTable(
table: Table,
schema: Schema,
options?: {
- noParse?: boolean
+ dontParse?: boolean
},
) {
- const { noParse } = options ?? {}
+ const { dontParse } = options ?? {}
const head = await table.head(HEAD_ROWS).collect()
const polarsSchema = getPolarsSchema(head.schema)
return table.select(
- ...Object.values(normalizeFields(schema, polarsSchema, { noParse })),
+ ...Object.values(normalizeFields(schema, polarsSchema, { dontParse })),
)
}
@@ -31,10 +31,10 @@ export function normalizeFields(
schema: Schema,
polarsSchema: PolarsSchema,
options?: {
- noParse?: boolean
+ dontParse?: boolean
},
) {
- const { noParse } = options ?? {}
+ const { dontParse } = options ?? {}
  const exprs: Record<string, Expr> = {}
for (const [index, field] of schema.fields.entries()) {
@@ -44,10 +44,11 @@ export function normalizeFields(
if (polarsField) {
expr = col(polarsField.name).alias(field.name)
- if (!noParse && polarsField.type.equals(DataType.String)) {
+ // TODO: Move this logic to normalizeField?
+ if (polarsField.type.equals(DataType.String)) {
const missingValues = field.missingValues ?? schema.missingValues
const mergedField = { ...field, missingValues }
- expr = parseField(mergedField, expr)
+ expr = normalizeField(mergedField, expr, { dontParse })
}
}
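
The renamed option flows through `normalizeTable` (sketch; `table` and `schema` are assumed to come from the surrounding pipeline):

```ts
import type { Schema } from "@dpkit/core"
import type { Table } from "@dpkit/table"
import { normalizeTable } from "./table/table/normalize.ts"

declare const table: Table
declare const schema: Schema

// string columns get their missing values nulled but keep the string
// dtype, which validateFields relies on for its "source:" expressions
const normalized = await normalizeTable(table, schema, { dontParse: true })
```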
diff --git a/table/table/validate.ts b/table/table/validate.ts
index 8cfd7179..9ee59b89 100644
--- a/table/table/validate.ts
+++ b/table/table/validate.ts
@@ -137,13 +137,13 @@ async function validateFields(
const targetNames: string[] = []
const sources = Object.entries(
- normalizeFields(schema, polarsSchema, { noParse: true }),
+ normalizeFields(schema, polarsSchema, { dontParse: true }),
).map(([name, expr]) => {
return expr.alias(`source:${name}`)
})
const targets = Object.entries(
- normalizeFields(schema, polarsSchema, { noParse: false }),
+ normalizeFields(schema, polarsSchema, { dontParse: false }),
).map(([name, expr]) => {
const targetName = `target:${name}`
targetNames.push(targetName)
@@ -179,20 +179,27 @@ async function validateFields(
.collect()
for (const record of errorFrame.toRecords() as any[]) {
+ const typeErrorInFields: string[] = []
for (const [key, value] of Object.entries(record)) {
const [kind, type, name] = key.split(":")
-
if (kind === "error" && value === true && type && name) {
const rowNumber = record.row_nr
// Cell-level errors
if (type.startsWith("cell/")) {
- errors.push({
- rowNumber,
- type: type as any,
- fieldName: name as any,
- cell: (record[`source:${name}`] ?? "").toString(),
- })
+ if (!typeErrorInFields.includes(name)) {
+ errors.push({
+ rowNumber,
+ type: type as any,
+ fieldName: name as any,
+ cell: (record[`source:${name}`] ?? "").toString(),
+ })
+ }
+
+ // A type error is a terminating error for a cell
+ if (type === "cell/type") {
+ typeErrorInFields.push(name)
+ }
}
// Row-level errors