diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/AllErrorsMetaDataValidator.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/AllErrorsMetaDataValidator.scala
index ecfcaf37..ae6d9aab 100644
--- a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/AllErrorsMetaDataValidator.scala
+++ b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/AllErrorsMetaDataValidator.scala
@@ -27,7 +27,7 @@ trait AllErrorsMetaDataValidator extends MetaDataValidator {
         results.reverse
       } else {
         val row = rows.next()
-        val result = validateRow(row, schema)
+        val result = validateRow(row, schema, Some(rows.hasNext))
         validateRows(result :: results)
       }
     }
@@ -36,17 +36,18 @@ trait AllErrorsMetaDataValidator extends MetaDataValidator {
     v.sequence[MetaDataValidation, Any]
   }

-  override protected def rules(row: Row, schema: Schema): MetaDataValidation[List[Any]] = {
+
+  override protected def rules(row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[List[Any]] = {
     val cells: (Int) => Option[Cell] = row.cells.lift

     val v = schema.columnDefinitions.zipWithIndex.map {
       case (columnDefinition, columnIndex) =>
-        validateCell(columnIndex, cells, row, schema)
+        validateCell(columnIndex, cells, row, schema, mayBeLast)
     }

     v.sequence[MetaDataValidation, Any]
   }

-  override protected def rulesForCell(columnIndex: Int, row: Row, schema: Schema): MetaDataValidation[Any] = {
+  override protected def rulesForCell(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[Any] = {

     val columnDefinition = schema.columnDefinitions(columnIndex)

@@ -54,8 +55,8 @@ trait AllErrorsMetaDataValidator extends MetaDataValidator {
     def isOptionDirective: Boolean = columnDefinition.directives.contains(Optional())

     if(row.cells(columnIndex).value.trim.isEmpty && isOptionDirective) true.successNel
-    else columnDefinition.rules.map(_.evaluate(columnIndex, row, schema)).map{ ruleResult:Rule#RuleValidation[Any] => {
+    else columnDefinition.rules.map(_.evaluate(columnIndex, row, schema, mayBeLast)).map{ ruleResult:Rule#RuleValidation[Any] => {
       if(isWarningDirective) toWarnings(ruleResult, row.lineNumber, columnIndex) else toErrors(ruleResult, row.lineNumber, columnIndex)
     }}.sequence[MetaDataValidation, Any]
   }
-}
+}
\ No newline at end of file
diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/FailFastMetaDataValidator.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/FailFastMetaDataValidator.scala
index 24065599..c615be0b 100644
--- a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/FailFastMetaDataValidator.scala
+++ b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/FailFastMetaDataValidator.scala
@@ -32,7 +32,7 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
         results.reverse
       } else {
         val row = rows.next()
-        val result = validateRow(row, schema)
+        val result = validateRow(row, schema, Some(rows.hasNext))
         validateRows(result :: results)
       }
     }
@@ -41,7 +41,7 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
     v.sequence[MetaDataValidation, Any]
   }

-  override protected def rules(row: Row, schema: Schema): MetaDataValidation[List[Any]] = {
+  override protected def rules(row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[List[Any]] = {
     val cells: (Int) => Option[Cell] = row.cells.lift

     @tailrec
@@ -54,7 +54,7 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
           accum.reverse
         case (columnDefinition, columnIndex) :: tail =>
-          validateCell(columnIndex, cells, row, schema) match {
+          validateCell(columnIndex, cells, row, schema, mayBeLast) match {
             case failure @ Failure(_) if(!schema.columnDefinitions(columnIndex).directives.contains(Warning())) =>
               validateRules(List.empty, failure :: accum) //stop on first failure which is not a warning
             case result =>
@@ -67,7 +67,7 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
     v.sequence[MetaDataValidation, Any]
   }

-  override protected def rulesForCell(columnIndex: Int, row: Row, schema: Schema): MetaDataValidation[Any] = {
+  override protected def rulesForCell(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[Any] = {
     val columnDefinition = schema.columnDefinitions(columnIndex)

     def isWarningDirective: Boolean = columnDefinition.directives.contains(Warning())
@@ -76,16 +76,16 @@ trait FailFastMetaDataValidator extends MetaDataValidator {
     @tailrec
     def validateRulesForCell(rules: List[Rule]): MetaDataValidation[Any] = rules match {
       case Nil => true.successNel[FailMessage]
-      case rule :: tail => rule.evaluate(columnIndex, row, schema) match {
+      case rule :: tail => rule.evaluate(columnIndex, row, schema, mayBeLast) match {
        case e@Failure(_) => toErrors(e, row.lineNumber, columnIndex)
        case _ => validateRulesForCell(tail)
      }
    }

-    def validateAllRulesForCell(rules: List[Rule]): MetaDataValidation[Any] = rules.map(_.evaluate(columnIndex, row, schema)).map(toWarnings(_, row.lineNumber, columnIndex)).sequence[MetaDataValidation, Any]
+    def validateAllRulesForCell(rules: List[Rule]): MetaDataValidation[Any] = rules.map(_.evaluate(columnIndex, row, schema, mayBeLast)).map(toWarnings(_, row.lineNumber, columnIndex)).sequence[MetaDataValidation, Any]

     if(row.cells(columnIndex).value.trim.isEmpty && isOptionDirective) true.successNel
     else if(isWarningDirective) validateAllRulesForCell(columnDefinition.rules)
     else validateRulesForCell(columnDefinition.rules)
   }
-}
+}
\ No newline at end of file
diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala
index 84c01a2b..22d11a8d 100644
--- a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala
+++ b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidator.scala
@@ -32,7 +32,6 @@ trait MetaDataValidator {
   type MetaDataValidation[S] = ValidationNel[FailMessage, S]

   def validate(csv: JReader, schema: Schema, progress: Option[ProgressCallback]): MetaDataValidation[Any] = {
-
     //try to find the number of rows for the
     //purposes pf reporting progress
     //can only do that if we can reset()
@@ -53,12 +52,12 @@ trait MetaDataValidator {
   }

   /**
-   * Browse csv File and return all the titleIndex as a list
-   * @param csv the CSV reader
-   * @param schema the Schema
-   * @param columnIndex the index of the column to be return
-   * @return all the element of the column columnIndex
-   */
+    * Browse csv File and return all the titleIndex as a list
+    * @param csv the CSV reader
+    * @param schema the Schema
+    * @param columnIndex the index of the column to be return
+    * @return all the element of the column columnIndex
+    */
   def getColumn(csv: JReader, schema: Schema, columnIndex: Int): List[String] = {
     val separator = schema.globalDirectives.collectFirst {
@@ -134,6 +133,7 @@
trait MetaDataValidator { // if 'no header' is not set and the file is empty - this is an error // if 'no header' is not set and 'permit empty' is not set but the file contains only one line - this is an error + val rowIt = new RowIterator(reader, progress) val maybeNoData = @@ -173,16 +173,25 @@ trait MetaDataValidator { case Left(ts) => //TODO emit all errors not just first! ErrorMessage(ts(0).toString).failureNel[Any] - //ts.toList.map(t => ErrorMessage(t.toString).failureNel[Any]).sequence[MetaDataValidation, Any] + //ts.toList.map(t => ErrorMessage(t.toString).failureNel[Any]).sequence[MetaDataValidation, Any] } } /** - * Return the column at the index columnIndex - * @param rows the row iterator - * @param columnIndex the index of the column - * @return List of string of all element at the columnIndex - */ + * Performs some extra validation when all rows have been validated + * @param csv + * @param schema + * @param progress + * @return + */ + def postValidate(csv: JReader, schema: Schema, progress: Option[ProgressFor]): MetaDataValidation[Any] = true.successNel[FailMessage] + + /** + * Return the column at the index columnIndex + * @param rows the row iterator + * @param columnIndex the index of the column + * @return List of string of all element at the columnIndex + */ def getColumn(rows: Iterator[Row], columnIndex: Int): List[String] = rows.foldLeft(List[String]()){ (acc,row) => acc :+ filename(row, columnIndex) @@ -193,6 +202,7 @@ trait MetaDataValidator { def validateRows(rows: Iterator[Row], schema: Schema): MetaDataValidation[Any] + def validateHeader(header: Row, schema: Schema): Option[MetaDataValidation[Any]] = { val icnc: Option[IgnoreColumnNameCase] = schema.globalDirectives.collectFirst {case i @ IgnoreColumnNameCase() => i } @@ -209,9 +219,9 @@ trait MetaDataValidator { Some(ErrorMessage(s"Metadata header, cannot find the column headers - ${Util.diff(schemaHeader.toSet, headerList.toSet).mkString(", ")} - .${if (icnc.isEmpty) " (Case sensitive)" else ""}").failNel[Any]) } - def validateRow(row: Row, schema: Schema): MetaDataValidation[Any] = { + def validateRow(row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[Any] = { val totalColumnsV = totalColumns(row, schema) - val rulesV = rules(row, schema) + val rulesV = rules(row, schema, mayBeLast) (totalColumnsV |@| rulesV) { _ :: _ } } @@ -224,11 +234,11 @@ trait MetaDataValidator { else ErrorMessage(s"Expected @totalColumns of ${tc.get.numberOfColumns} and found ${row.cells.length} on line ${row.lineNumber}", Some(row.lineNumber), Some(row.cells.length)).failureNel[Any] } - protected def rules(row: Row, schema: Schema): MetaDataValidation[List[Any]] + protected def rules(row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[List[Any]] - protected def validateCell(columnIndex: Int, cells: (Int) => Option[Cell], row: Row, schema: Schema): MetaDataValidation[Any] = { + protected def validateCell(columnIndex: Int, cells: (Int) => Option[Cell], row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[Any] = { cells(columnIndex) match { - case Some(c) => rulesForCell(columnIndex, row, schema) + case Some(c) => rulesForCell(columnIndex, row, schema, mayBeLast) case _ => ErrorMessage(s"Missing value at line: ${row.lineNumber}, column: ${schema.columnDefinitions(columnIndex).id}", Some(row.lineNumber), Some(columnIndex)).failureNel[Any] } } @@ -236,7 +246,7 @@ trait MetaDataValidator { protected def toWarnings(results: Rule#RuleValidation[Any], 
lineNumber: Int, columnIndex: Int): MetaDataValidation[Any] = results.leftMap(_.map(WarningMessage(_, Some(lineNumber), Some(columnIndex)))) protected def toErrors(results: Rule#RuleValidation[Any], lineNumber: Int, columnIndex: Int): MetaDataValidation[Any] = results.leftMap(_.map(ErrorMessage(_, Some(lineNumber), Some(columnIndex)))) - protected def rulesForCell(columnIndex: Int, row: Row, schema: Schema): MetaDataValidation[Any] + protected def rulesForCell(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): MetaDataValidation[Any] protected def countRows(textFile: TextFile): Int = { withReader(textFile) { @@ -278,9 +288,9 @@ trait MetaDataValidator { trait ProgressCallback { /** - * A percentage is always between - * 0 and 100 inclusive - */ + * A percentage is always between + * 0 and 100 inclusive + */ type Percentage = Float @@ -325,4 +335,4 @@ class RowIterator(reader: CSVReader, progress: Option[ProgressFor]) extends Iter override def hasNext: Boolean = current.nonEmpty private def toRow(rowData: Option[Array[String]]): Option[Row] = rowData.map(data => Row(data.toList.map(Cell(_)), index)) -} +} \ No newline at end of file diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/Util.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/Util.scala index 871046de..75d7a1ab 100644 --- a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/Util.scala +++ b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/Util.scala @@ -8,6 +8,7 @@ */ package uk.gov.nationalarchives.csv.validator +import scala.collection.mutable import scalax.file.Path import scalaz._ import Scalaz._ @@ -30,40 +31,40 @@ object Util { def fileNotReadableMessage(file: Path) = SchemaMessage("Unable to read file : " + file.path) /** - * Check if the list l1 contain all element in l2 - * @param l1 the containing list - * @param l2 the contained list - * @tparam A type of the list - * @return true,if the list l1 contain all element in l2 - */ + * Check if the list l1 contain all element in l2 + * @param l1 the containing list + * @param l2 the contained list + * @tparam A type of the list + * @return true,if the list l1 contain all element in l2 + */ def containAll[A](l1: List[A], l2: List[A]): Boolean = l2 forall ((l1.toSet) contains) /** - * Returns Set l1 \ Set l2 (Minus) - * @param l1 - * @param l2 - * @tparam A - * @return - */ + * Returns Set l1 \ Set l2 (Minus) + * @param l1 + * @param l2 + * @tparam A + * @return + */ def minus[A](l1: Set[A], l2: Set[A]): Set[A] = l1.filterNot(l2) /** - * Returns Set l1 \ Set l2 (Minus) - * @param l1 - * @param l2 - * @tparam A - * @return - */ + * Returns Set l1 \ Set l2 (Minus) + * @param l1 + * @param l2 + * @tparam A + * @return + */ def diff[A](l1: Set[A], l2: Set[A]): Set[A] = minus(l1,l2) ++ minus(l2,l1) /** - * List recursively all the files (but not the subfolder) in the folder given as a parameter - * @param folder - * @return List of all filename - */ + * List recursively all the files (but not the subfolder) in the folder given as a parameter + * @param folder + * @return List of all filename + */ def findAllFiles(includeFolder : Boolean,folder: File): Set[File] = { if (folder.exists()){ val these = folder.listFiles.toSet @@ -170,17 +171,17 @@ object Util { def file2PatformDependent(file: String) : String = TypedPath(file).toPlatform.toString /** - * Checks that a filepath exactly matches - * the file path available on disk - * - * This ensures 
case-sensitivity - * and is useful on platforms such as - * Windows NTFS which are case-insensitive, - * where new File("test.txt").exists - * and new File("TEST.TXT").exists - * may both return true when there is - * only one file. - */ + * Checks that a filepath exactly matches + * the file path available on disk + * + * This ensures case-sensitivity + * and is useful on platforms such as + * Windows NTFS which are case-insensitive, + * where new File("test.txt").exists + * and new File("TEST.TXT").exists + * may both return true when there is + * only one file. + */ @tailrec final def caseSensitivePathMatchesFs(f: File): Boolean = { @@ -207,6 +208,7 @@ object Util { val separator: Char = FILE_SEPARATOR private def substitutePath(filename: String): String = { + //TODO Refactor using collect val x = { pathSubstitutions.filter { case (subFrom, _) => filename.contains(subFrom) @@ -216,10 +218,20 @@ object Util { } if(x.isEmpty) filename + else x.head } + private def contentDir(filepath: String): String = { + val dir = pathSubstitutions.collectFirst{ + case (subFrom, subTo) if filepath.contains(subFrom) => subTo + } getOrElse { + (filepath split "content").head + } + dir + separator + "content" + } + def jointPath: String = { val uri_sep: Char = URI_PATH_SEPARATOR @@ -239,6 +251,8 @@ object Util { } def exists(enforceCaseSensitivePathChecks: Boolean = false): Boolean = { + + FileSystem.createFile(FileSystem.convertPath2Platform(substitutePath(jointPath))) match { case scala.util.Success(f) => { val exists = f.exists @@ -252,6 +266,41 @@ object Util { } } + def scanDir(dir: File): Set[File] = findAllFiles(dir, false) + + def findAllFiles(folder: File, includeFolder : Boolean = true): Set[File] = { + if (folder.exists()){ + val these = folder.listFiles.toSet + val head = if (includeFolder) Set(folder) else Nil + (head ++ these.filter( f => if (!includeFolder) f.isFile else true) ++ these.filter(f => f.isDirectory && !(f.getName == "RECYCLER") && !(f.getName == "$RECYCLE.BIN")).flatMap(file => findAllFiles(file, includeFolder))).toSet + } + else + throw new FileNotFoundException(s"Cannot find the folder $folder") + } + + def integrityCheck(fileMap: Map[String, Set[File]], enforceCaseSensitivePathChecks: Boolean = false): Map[String, Set[File]] = { + val contentDirectory = contentDir(jointPath) + val files = fileMap.get(contentDirectory) match { + case None => + val theFiles = FileSystem.createFile(FileSystem.convertPath2Platform(substitutePath(contentDirectory))) match + { + case scala.util.Success(f) => scanDir(f) + case scala.util.Failure(_) => Set[File]() + } + theFiles + case Some(f) => f + } + + // println("file path --->" + jointPath) + + val remainder = FileSystem.createFile(FileSystem.convertPath2Platform(substitutePath(substitutePath(jointPath)))) match { + case scala.util.Success(f) => files - f + case _ => files + } + + fileMap.filterKeys(_ == contentDirectory) + (contentDirectory -> remainder) + } + def expandBasePath: String = { if( basePath.isEmpty || basePath.getOrElse("") == "") FileSystem.file2PlatformIndependent(substitutePath(file)) @@ -259,4 +308,4 @@ object Util { FileSystem.file2PlatformIndependent(substitutePath(jointPath)) } } -} +} \ No newline at end of file diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala index 097ed220..ed6573d7 100644 --- 
a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala +++ b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/api/CsvValidator.scala @@ -8,7 +8,7 @@ */ package uk.gov.nationalarchives.csv.validator.api -import uk.gov.nationalarchives.csv.validator.schema.{IntegrityCheck, Schema, SchemaParser} +import uk.gov.nationalarchives.csv.validator.schema.{Schema, SchemaParser} import scalaz._, Scalaz._ import scalax.file.Path import uk.gov.nationalarchives.csv.validator._ @@ -25,67 +25,34 @@ object CsvValidator { def createValidator(failFast: Boolean, pathSubstitutionsList: List[SubstitutePath], enforceCaseSensitivePathChecksSwitch: Boolean, traceSwitch: Boolean) = { if(failFast) { - new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; } + new CsvValidator with FailFastMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch } } else { - new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch; } + new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = pathSubstitutionsList; val enforceCaseSensitivePathChecks = enforceCaseSensitivePathChecksSwitch; val trace = traceSwitch } } } } /** - * Represent a Text file on disk - * that has both a path and a specific - * encoding. - * - * If no encoding is specified, then UTF-8 will - * be assumed. - */ + * Represent a Text file on disk + * that has both a path and a specific + * encoding. + * + * If no encoding is specified, then UTF-8 will + * be assumed. 
+ */ case class TextFile(file: Path, encoding: JCharset = CsvValidator.DEFAULT_ENCODING) trait CsvValidator extends SchemaParser { this: MetaDataValidator => def validate(csvFile: TextFile, csvSchema: Schema, progress: Option[ProgressCallback]): MetaDataValidation[Any] = { - val integrationValidation: MetaDataValidation[Any] = integrityCheckValidation(csvFile, csvSchema).getOrElse(true.successNel[FailMessage]) - - val metadataValidation:MetaDataValidation[Any] = withReader(csvFile) { + withReader(csvFile) { reader => - validateKnownRows(reader, csvSchema, progress.map(p => ProgressFor(countRows(csvFile), p))) + val totalRows = countRows(csvFile) + validateKnownRows(reader, csvSchema, progress.map(p => {ProgressFor(totalRows, p)} ) ) } - //TODO Combine in a better depending in the strategy FailFast or not - List(integrationValidation, metadataValidation).sequence[MetaDataValidation, Any] } - def integrityCheckValidation(csvFile: TextFile, csvSchema: Schema): Option[MetaDataValidation[Any]] = { - val ic = csvSchema.globalDirectives.collectFirst{ case i @ IntegrityCheck(_, _) => i} - ic.map { integrityCheck => - val filenameColumn = integrityCheck.filepathColumn - val filenameColumnIndex = csvSchema.columnDefinitions.map(x => x.id.value).indexOf(filenameColumn) - val allMetadataFilenames = withReader(csvFile) { - reader => - getColumn(reader, csvSchema, filenameColumnIndex) - }.map(new File(_).getName) - - csvFile.file.parent.map(_ / "content").map { contentPath => - val contentFile = new File(contentPath.toURI) - val includeFolder = integrityCheck.includeFolder - - scala.util.Try(Util.findAllFiles(includeFolder, contentFile)).map{ allContentFiles => - val allContentFilename = allContentFiles.map(_.getName) - if (Util.containAll(allMetadataFilenames,allContentFilename.toList)) - true.successNel[FailMessage] - else - ErrorMessage(s"[Integrity Check], The file(s) ${Util.minus( allContentFilename, allMetadataFilenames.toSet).mkString(" ")} " + - s"are not listed in the metadata content under ${csvFile.file.parent}").failNel[Any] - }.getOrElse { - ErrorMessage(s"[Integrity Check], Cannot find the content folder under ${csvFile.file.parent}").failNel[Any] - } - }.getOrElse { - ErrorMessage(s"[Integrity Check], Cannot find the content folder under ${csvFile.file.parent}").failNel[Any] - } - } - - } def parseSchema(csvSchemaFile: TextFile): ValidationNel[FailMessage, Schema] = { withReader(csvSchemaFile) { @@ -95,4 +62,4 @@ trait CsvValidator extends SchemaParser { } def parseSchema(csvSchema: JReader): ValidationNel[FailMessage, Schema] = parseAndValidate(csvSchema) -} +} \ No newline at end of file diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Rule.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Rule.scala index 030058a7..4395d42c 100644 --- a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Rule.scala +++ b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Rule.scala @@ -29,11 +29,11 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi var explicitColumn: Option[ColumnReference] = None - def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { - if (valid(cellValue(columnIndex, row, schema), schema.columnDefinitions(columnIndex), columnIndex, row, schema)) true.successNel[String] else fail(columnIndex, row, schema) + def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] 
= None): RuleValidation[Any] = { + if (valid(cellValue(columnIndex, row, schema), schema.columnDefinitions(columnIndex), columnIndex, row, schema, mayBeLast)) true.successNel[String] else fail(columnIndex, row, schema) } - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean def fail(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { val columnDefinition = schema.columnDefinitions(columnIndex) @@ -67,19 +67,19 @@ abstract class Rule(name: String, val argProviders: ArgProvider*) extends Positi } case class OrRule(left: Rule, right: Rule) extends Rule("or") { - override def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { - left.evaluate(columnIndex, row, schema) match { + override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = { + left.evaluate(columnIndex, row, schema, mayBeLast) match { case s @ SuccessZ(_) => s - case FailureZ(_) => right.evaluate(columnIndex, row, schema) match { + case FailureZ(_) => right.evaluate(columnIndex, row, schema, mayBeLast) match { case s @ SuccessZ(_) => s case FailureZ(_) => fail(columnIndex, row, schema) } } } - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { - evaluate(columnIndex, row, schema) match { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { + evaluate(columnIndex, row, schema, mayBeLast) match { case FailureZ(_) => false case SuccessZ(_) => true } @@ -90,16 +90,16 @@ case class OrRule(left: Rule, right: Rule) extends Rule("or") { case class ParenthesesRule(rules: List[Rule]) extends Rule("parentheses") { - override def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { + override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = { val v = for (rule <- rules) yield { - rule.evaluate(columnIndex, row, schema) + rule.evaluate(columnIndex, row, schema, mayBeLast) } v.sequence[RuleValidation, Any] } - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { - evaluate(columnIndex, row, schema) match { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { + evaluate(columnIndex, row, schema, mayBeLast) match { case FailureZ(_) => false case SuccessZ(_) => true } @@ -113,7 +113,7 @@ case class ParenthesesRule(rules: List[Rule]) extends Rule("parentheses") { case class IfRule(condition: Rule, rules: List[Rule], elseRules: Option[List[Rule]]) extends Rule("if") { - override def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { + override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = { val (cellValue,idx) = condition.explicitColumn match { case Some(columnRef) => (columnRef.referenceValueEx(columnIndex, row, schema), columnIdentifierToIndex(schema, columnRef.ref)) @@ -122,7 +122,7 @@ case class IfRule(condition: Rule, rules: List[Rule], elseRules: Option[List[Rul } val v = if 
(condition.valid(cellValue, schema.columnDefinitions(columnIndex), idx, row, schema)) { - for (rule <- rules) yield { + for (rule <- rules) yield { rule.evaluate(columnIndex, row, schema) } } else { @@ -138,7 +138,7 @@ case class IfRule(condition: Rule, rules: List[Rule], elseRules: Option[List[Rul v.sequence[RuleValidation, Any] } - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { evaluate(columnIndex, row, schema) match { case FailureZ(_) => false case SuccessZ(_) => true @@ -152,7 +152,7 @@ case class IfRule(condition: Rule, rules: List[Rule], elseRules: Option[List[Rul } case class RegExpRule(regex: String) extends Rule("regex") { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { val regexp = if (columnDefinition.directives.contains(IgnoreCase())) "(?i)" + regex else regex cellValue matches regexp @@ -166,22 +166,59 @@ case class RegExpRule(regex: String) extends Rule("regex") { //TODO note the use of `Seq(rootPath): _*` when extending Rule, this is to workaround this bug https://issues.scala-lang.org/browse/SI-7436. This pattern is repeated below! case class FileExistsRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None)) extends Rule("fileExists", Seq(rootPath): _*) { - def valid(filePath: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema) = { + def valid(filePath: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None) = { + val ruleValue = rootPath.referenceValue(columnIndex, row, schema) val fileExists = ruleValue match { - case Some(rp) => new FileSystem(rp, filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks) + case Some(rp) => new FileSystem(rp, filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks) case None => new FileSystem(filePath, pathSubstitutions).exists(enforceCaseSensitivePathChecks) } - fileExists } override def toError = s"""$ruleName""" + (if (rootPath == Literal(None)) "" else s"""(${rootPath.toError})""") } +case class IntegrityCheckRule(pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean, rootPath: ArgProvider = Literal(None), includeFolder: Boolean = false) extends Rule("integrityCheck", Seq(rootPath): _*) { + + //TODO introduce state, not very functional + var filesMap = Map[String, Set[File]]() + + def valid(filePath: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean]): Boolean = { + + if (!filePath.isEmpty){ + + val ruleValue = rootPath.referenceValue(columnIndex, row, schema) + + filesMap = new FileSystem(ruleValue, filePath, pathSubstitutions).integrityCheck(filesMap, enforceCaseSensitivePathChecks) + val isLastLine = mayBeLast.map(!_).getOrElse(false) + + if (isLastLine) + { filesMap.forall{case (folder,files) => files.isEmpty} } + else + true + } + else + false + + + } + + override def toError = { s"""$ruleName""" + (if (rootPath == Literal(None)) "" else s"""(${rootPath.toError})""") } + + override def 
toValueError(row: Row, columnIndex:Int ) = { + + val extraFiles = filesMap.collect{ case (folder,files) if files.nonEmpty => + files.mkString(", ") + }.mkString(", ") + + s"""files: ${'"'}$extraFiles${'"'} are not listed in the metadata""" + } +} + case class InRule(inValue: ArgProvider) extends Rule("in", Seq(inValue): _*) { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { val ruleValue = inValue.referenceValue(columnIndex, row, schema) val (rv, cv) = if (columnDefinition.directives.contains(IgnoreCase())) (ruleValue.get.toLowerCase, cellValue.toLowerCase) else (ruleValue.get, cellValue) @@ -190,7 +227,7 @@ case class InRule(inValue: ArgProvider) extends Rule("in", Seq(inValue): _*) { } case class IsRule(isValue: ArgProvider) extends Rule("is", Seq(isValue): _*) { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { val ruleValue = isValue.referenceValue(columnIndex, row, schema) val (rv, cv) = if (columnDefinition.directives.contains(IgnoreCase())) (ruleValue.get.toLowerCase, cellValue.toLowerCase) else (ruleValue.get, cellValue) @@ -199,7 +236,7 @@ case class IsRule(isValue: ArgProvider) extends Rule("is", Seq(isValue): _*) { } case class NotRule(notValue: ArgProvider) extends Rule("not", Seq(notValue): _*) { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema,mayBeLast: Option[Boolean] = None): Boolean = { val ruleValue = notValue.referenceValue(columnIndex, row, schema) val (rv, cv) = if (columnDefinition.directives.contains(IgnoreCase())) (ruleValue.get.toLowerCase, cellValue.toLowerCase) else (ruleValue.get, cellValue) @@ -208,7 +245,7 @@ case class NotRule(notValue: ArgProvider) extends Rule("not", Seq(notValue): _*) } case class StartsRule(startsValue: ArgProvider) extends Rule("starts", Seq(startsValue): _*) { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { val ruleValue = startsValue.referenceValue(columnIndex, row, schema) val (rv, cv) = if (columnDefinition.directives.contains(IgnoreCase())) (ruleValue.get.toLowerCase, cellValue.toLowerCase) else (ruleValue.get, cellValue) @@ -217,7 +254,7 @@ case class StartsRule(startsValue: ArgProvider) extends Rule("starts", Seq(start } case class EndsRule(endsValue: ArgProvider) extends Rule("ends", Seq(endsValue): _*) { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { val ruleValue = endsValue.referenceValue(columnIndex, row, schema) val (rv, cv) = if (columnDefinition.directives.contains(IgnoreCase())) (ruleValue.get.toLowerCase, 
cellValue.toLowerCase) else (ruleValue.get, cellValue) @@ -226,20 +263,20 @@ case class EndsRule(endsValue: ArgProvider) extends Rule("ends", Seq(endsValue): } case class EmptyRule() extends Rule("empty") { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { - cellValue.isEmpty - } + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { + cellValue.isEmpty + } } case class NotEmptyRule() extends Rule("notEmpty") { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { !cellValue.isEmpty } } //case class UriRule() extends PatternRule("uri", UriRegex) case class UriRule() extends Rule("uri") { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { try { val uri = new URI(cellValue) true @@ -286,7 +323,7 @@ abstract class DateRangeRule(name: String, dateRegex: String, dateParser: DatePa lazy val fromDate = parse(from) lazy val toDate = parse(to) - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { cellValue matches dateRegex match { case true => { val inRange = for ( frmDt <- fromDate; toDt <- toDate; cellDt <- parse(cellValue)) yield { @@ -305,11 +342,11 @@ abstract class DateRangeRule(name: String, dateRegex: String, dateParser: DatePa } abstract class PatternRule(name: String, pattern: String) extends Rule(name) { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = cellValue matches pattern + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = cellValue matches pattern } abstract class DateRule(name: String, dateRegex: String, dateParser: DateParser) extends PatternRule(name, dateRegex) { - override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { super.valid(cellValue, columnDefinition, columnIndex, row, schema) match { case true => dateParser.parse(cellValue).isSuccess case _ => false @@ -344,27 +381,27 @@ case class UpperCaseRule() extends PatternRule("upperCase", UpperCaseRegex) case class LowerCaseRule() extends PatternRule("lowerCase", LowerCaseRegex) case class IdenticalRule() extends Rule("identical") { - + var lastValue: Option[String] = None - override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { - if (cellValue.isEmpty) false - else if (lastValue.isEmpty){ - lastValue = Some(cellValue) - true - } - else{ - val res = 
cellValue.equals(lastValue.getOrElse("")) - lastValue = Some(cellValue) - res - } + override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { + if (cellValue.isEmpty) false + else if (lastValue.isEmpty){ + lastValue = Some(cellValue) + true + } + else{ + val res = cellValue.equals(lastValue.getOrElse("")) + lastValue = Some(cellValue) + res + } } } case class UniqueRule() extends Rule("unique") { val distinctValues = mutable.HashMap[String, Int]() - override def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { + override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = { val columnDefinition = schema.columnDefinitions(columnIndex) def originalValue: Option[String] = { @@ -382,7 +419,7 @@ case class UniqueRule() extends Rule("unique") { } } - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { evaluate(columnIndex, row, schema) match { case FailureZ(_) => false case SuccessZ(_) => true @@ -394,7 +431,7 @@ case class UniqueMultiRule(columns: List[ColumnReference]) extends Rule("unique( val SEPARATOR:Char = 0x07 // BEL val distinctValues = mutable.HashMap[String, Int]() - override def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { + override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = { val columnDefinition = schema.columnDefinitions(columnIndex) def secondaryValues = columns.map(_.referenceValue(columnIndex, row, schema)).mkString(SEPARATOR.toString) @@ -416,7 +453,7 @@ case class UniqueMultiRule(columns: List[ColumnReference]) extends Rule("unique( } } - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { evaluate(columnIndex, row, schema) match { case FailureZ(_) => false case SuccessZ(_) => true @@ -429,7 +466,7 @@ case class ChecksumRule(rootPath: ArgProvider, file: ArgProvider, algorithm: Str def this(file: ArgProvider, algorithm: String, pathSubstitutions: List[(String,String)], enforceCaseSensitivePathChecks: Boolean) = this(Literal(None), file, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks) def this(file: ArgProvider, algorithm: String, enforceCaseSensitivePathChecks: Boolean) = this(Literal(None), file, algorithm, List.empty[(String,String)], enforceCaseSensitivePathChecks) - override def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { + override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = { val columnDefinition = schema.columnDefinitions(columnIndex) search(filename(columnIndex, row, schema)) match { @@ -439,7 +476,7 @@ case class ChecksumRule(rootPath: ArgProvider, file: ArgProvider, algorithm: Str } } - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, 
schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { evaluate(columnIndex, row, schema) match { case FailureZ(_) => false case SuccessZ(_) => true @@ -498,10 +535,10 @@ case class ChecksumRule(rootPath: ArgProvider, file: ArgProvider, algorithm: Str .map(_.toHex) .runLast .attemptRun - .validation - .leftMap(_.getMessage) - .rightMap(_.getOrElse("NO CHECKSUM")) - .toValidationNel + .validation + .leftMap(_.getMessage) + .rightMap(_.getOrElse("NO CHECKSUM")) + .toValidationNel } FileSystem.createFile(file) match { @@ -526,14 +563,14 @@ case class FileCountRule(rootPath: ArgProvider, file: ArgProvider, pathSubstitut def this(file: ArgProvider, pathSubstitutions: List[SubstitutePath]) = this(Literal(None), file, pathSubstitutions) def this(rootPath: Literal, file: Literal) = this(rootPath, file, List.empty) - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { evaluate(columnIndex, row, schema) match { case FailureZ(_) => false case SuccessZ(_) => true } } - override def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { + override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = { val columnDefinition = schema.columnDefinitions(columnIndex) Try(cellValue(columnIndex,row,schema).toInt) match { @@ -662,16 +699,17 @@ trait FileWildcardSearch[T] { } } + case class RangeRule(min: Option[BigDecimal], max: Option[BigDecimal]) extends Rule("range") { - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + override def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { Try[BigDecimal]( BigDecimal(cellValue)) match { - + case scala.util.Success(callDecimal) => min.map( callDecimal >= _).getOrElse(true) && max.map( callDecimal <= _).getOrElse(true) case _ => false - } + } } override def toError = s"""$ruleName(${min.getOrElse("*")},${max.getOrElse("*")})""" @@ -682,7 +720,7 @@ case class LengthRule(from: Option[String], to: String) extends Rule("length") { def toValue: Int = if (to == "*") Int.MaxValue else to.toInt def fromValue: Int = if (from.get == "*") 0 else from.get.toInt - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { val cellLen = cellValue.length from match { @@ -695,7 +733,7 @@ case class LengthRule(from: Option[String], to: String) extends Rule("length") { } case class AndRule(left: Rule, right: Rule) extends Rule("and") { - override def evaluate(columnIndex: Int, row: Row, schema: Schema): RuleValidation[Any] = { + override def evaluate(columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): RuleValidation[Any] = { left.evaluate(columnIndex, row, schema) match { case s @ FailureZ(_) => fail(columnIndex, row, schema) @@ -706,7 +744,7 @@ case class AndRule(left: Rule, right: Rule) extends Rule("and") { } } - def valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema): Boolean = { + def 
valid(cellValue: String, columnDefinition: ColumnDefinition, columnIndex: Int, row: Row, schema: Schema, mayBeLast: Option[Boolean] = None): Boolean = { evaluate(columnIndex, row, schema) match { case FailureZ(_) => false case SuccessZ(_) => true @@ -714,4 +752,4 @@ case class AndRule(left: Rule, right: Rule) extends Rule("and") { } override def toError = s"""${left.toError} $ruleName ${right.toError}""" -} +} \ No newline at end of file diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Schema.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Schema.scala index 236a0544..518388c8 100644 --- a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Schema.scala +++ b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/Schema.scala @@ -25,8 +25,6 @@ case class Quoted() extends GlobalDirective("quoted") case class TotalColumns(numberOfColumns: BigInt) extends GlobalDirective("totalColumns") -case class IntegrityCheck(filepathColumn: String, includeFolder: Boolean) extends GlobalDirective("integrityCheck") - case class PermitEmpty() extends GlobalDirective("permitEmpty") case class NoHeader() extends GlobalDirective("noHeader") @@ -99,4 +97,3 @@ case class Warning() extends ColumnDirective { case class IgnoreCase() extends ColumnDirective { override def toString = "ignoreCase" } - diff --git a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala index 8f8ad832..47c5920a 100644 --- a/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala +++ b/csv-validator-core/src/main/scala/uk/gov/nationalarchives/csv/validator/schema/SchemaParser.scala @@ -20,251 +20,242 @@ import uk.gov.nationalarchives.csv.validator.EOL import uk.gov.nationalarchives.csv.validator.{SchemaMessage, FailMessage} /** - * CSV Schema Parser - * - * Uses Scala Parser Combinators to parse the CSV Schema language defined in - * the specification document - * @see http://digital-preservation.github.io/csv-validator/csv-schema-1.0.html - */ + * CSV Schema Parser + * + * Uses Scala Parser Combinators to parse the CSV Schema language defined in + * the specification document + * @see http://digital-preservation.github.io/csv-validator/csv-schema-1.0.html + */ trait SchemaParser extends RegexParsers - with PackratParsers - with TraceableParsers { +with PackratParsers +with TraceableParsers { /** - * Any path substitutions needed when - * resolving file paths - */ + * Any path substitutions needed when + * resolving file paths + */ val pathSubstitutions: List[(String, String)] /** - * Whether to enforce case sensitivity - * in file path checks. Useful - * when working on operating systems / - * filesystems that ignore file path - * case sensitivity, e.g. Windows and NTFS - */ + * Whether to enforce case sensitivity + * in file path checks. Useful + * when working on operating systems / + * filesystems that ignore file path + * case sensitivity, e.g. 
Windows and NTFS + */ val enforceCaseSensitivePathChecks: Boolean // /** - * [1] Schema ::= Prolog Body - */ + * [1] Schema ::= Prolog Body + */ lazy val schema : PackratParser[Schema] = "Schema" ::= prolog ~ body ^^ { case version ~ globalDirectives ~ columnDefs => Schema(globalDirectives, columnDefs) } /** - * [2] Prolog ::= VersionDecl GlobalDirectives - */ + * [2] Prolog ::= VersionDecl GlobalDirectives + */ lazy val prolog = "Prolog" ::= versionDecl ~ globalDirectives /** - * [3] VersionDecl ::= "version 1.0" - */ + * [3] VersionDecl ::= "version 1.0" + */ lazy val versionDecl: PackratParser[String] = "VersionDecl" ::= ("version" ~> Schema.version <~ eol).withFailureMessage(s"Schema version declaration 'version ${Schema.version}' missing or incorrect") /** - * [4] GlobalDirectives ::= SeparatorDirective? QuotedDirective? TotalColumnsDirective? PermitEmptyDirective? (NoHeaderDirective | IgnoreColumnNameCaseDirective)? IntegrityCheckDirective /* expr: unordered */ - */ - lazy val globalDirectives: PackratParser[List[GlobalDirective]] = "GlobalDirectives" ::= opt(mingle(List(separatorDirective, quotedDirective, totalColumnsDirective, permitEmptyDirective, noHeaderDirective | ignoreColumnNameCaseDirective, integrityCheckDirective).map(positioned(_) <~ opt(eol))).withFailureMessage("Invalid global directive")) ^^ { - _.getOrElse(List.empty) + * [4] GlobalDirectives ::= SeparatorDirective? QuotedDirective? TotalColumnsDirective? PermitEmptyDirective? (NoHeaderDirective | IgnoreColumnNameCaseDirective)? /* expr: unordered */ + */ + lazy val globalDirectives: PackratParser[List[GlobalDirective]] = "GlobalDirectives" ::= opt(mingle(List(separatorDirective, quotedDirective, totalColumnsDirective, permitEmptyDirective, noHeaderDirective | ignoreColumnNameCaseDirective).map(positioned(_) <~ opt(eol))).withFailureMessage("Invalid global directive")) ^^ { + _.getOrElse(List.empty) } /** - * [5] DirectivePrefix ::= "@" - */ + * [5] DirectivePrefix ::= "@" + */ lazy val directivePrefix = "DirectivePrefix" ::= "@" /** - * [6] SeparatorDirective ::= DirectivePrefix "separator" (SeparatorTabExpr | SeparatorChar) - */ + * [6] SeparatorDirective ::= DirectivePrefix "separator" (SeparatorTabExpr | SeparatorChar) + */ lazy val separatorDirective: PackratParser[Separator] = "SeparatorDirective" ::= directivePrefix ~> "separator" ~> (separatorTabExpr | separatorChar) /** - * [7] SeparatorTabExpr ::= "TAB" | '\t' - */ + * [7] SeparatorTabExpr ::= "TAB" | '\t' + */ lazy val separatorTabExpr: PackratParser[Separator] = "SeparatorTabExpr" ::= ("TAB" | """'\t'""") ^^^ Separator('\t') /** - * [8] SeparatorChar ::= CharacterLiteral - */ + * [8] SeparatorChar ::= CharacterLiteral + */ lazy val separatorChar: PackratParser[Separator] = "SeparatorChar" ::= characterLiteral ^^ { Separator(_) } /** - * [9] QuotedDirective ::= DirectivePrefix "quoted" - */ + * [9] QuotedDirective ::= DirectivePrefix "quoted" + */ lazy val quotedDirective: PackratParser[Quoted] = "QuotedDirective" ::= directivePrefix ~> "quoted" ^^^ Quoted() /** - * [10] TotalColumnsDirective ::= DirectivePrefix "totalColumns" PositiveNonZeroIntegerLiteral - */ + * [10] TotalColumnsDirective ::= DirectivePrefix "totalColumns" PositiveNonZeroIntegerLiteral + */ lazy val totalColumnsDirective: PackratParser[TotalColumns] = "TotalColumnsDirective" ::= (directivePrefix ~> "totalColumns" ~> positiveNonZeroIntegerLiteral ^^ { TotalColumns(_) }).withFailureMessage("@totalColumns invalid") /** - * [11] NoHeaderDirective ::= DirectivePrefix "noHeader" - */ + * [11] 
NoHeaderDirective ::= DirectivePrefix "noHeader" + */ lazy val noHeaderDirective: PackratParser[NoHeader] = "NoHeaderDirective" ::= directivePrefix ~> "noHeader" ^^^ NoHeader() /** - * [12] PermitEmptyDirective ::= DirectivePrefix "permitEmpty" - */ + * [12] PermitEmptyDirective ::= DirectivePrefix "permitEmpty" + */ lazy val permitEmptyDirective: PackratParser[PermitEmpty] = "PermitEmptyDirective" ::= directivePrefix ~> "permitEmpty" ^^^ PermitEmpty() /** - * [13] IgnoreColumnNameCaseDirective ::= DirectivePrefix "ignoreColumnNameCase" - */ + * [13] IgnoreColumnNameCaseDirective ::= DirectivePrefix "ignoreColumnNameCase" + */ lazy val ignoreColumnNameCaseDirective: PackratParser[IgnoreColumnNameCase] = "IgnoreColumnNameCaseDirective" ::= directivePrefix ~> "ignoreColumnNameCase" ^^^ IgnoreColumnNameCase() /** - * IntegrityCheckDirective ::= DirectivePrefix "integrityCheck("StringLiteral, StringLiteral? ")" - */ - lazy val integrityCheckDirective: PackratParser[IntegrityCheck] = (directivePrefix ~> "integrityCheck(" ~> stringLiteral ~ (("," ~> stringLiteral)?) <~ ")" ^^ { - case filepathColumn ~ Some(includeFolder) if includeFolder == "includeFolder" => IntegrityCheck(filepathColumn, true) - case filepathColumn ~ Some(includeFolder) if includeFolder == "excludeFolder" => IntegrityCheck(filepathColumn, false) - case filepathColumn ~ None => IntegrityCheck(filepathColumn, false) - }).withFailureMessage("@integrityCheck invalid") - - /** - * [14] Body ::= BodyPart+ - */ + * [14] Body ::= BodyPart+ + */ lazy val body = "Body" ::= rep1(bodyPart) <~ rep(eol) /** - * [15] BodyPart ::= Comment* ColumnDefinition Comment* - */ + * [15] BodyPart ::= Comment* ColumnDefinition Comment* + */ lazy val bodyPart = "BodyPart" ::= (rep(comment) ~> columnDefinition) <~ rep(comment) /** - * [16] Comment ::= SingleLineComment | MultiLineComment - */ + * [16] Comment ::= SingleLineComment | MultiLineComment + */ lazy val comment: PackratParser[Any] = "Comment" ::= singleLineComment | multiLineComment /** - * [17] SingleLineComment ::= "//" NonBreakingChar* - */ + * [17] SingleLineComment ::= "//" NonBreakingChar* + */ lazy val singleLineComment: Parser[String] = "SingleLineComment" ::= """//[\S\t ]*(?:\r?\n)?""".r /** - * [18] MultiLineComment ::= "/*" Char* "*/" - */ + * [18] MultiLineComment ::= "/*" Char* "*/" + */ lazy val multiLineComment: Parser[String] = "MultiLineComment" ::= """\/\*(?:[^*\r\n]+|(?:\r?\n))*\*\/(?:\r?\n)?""".r /** - * [19] ColumnDefinition ::= (ColumnIdentifier | QuotedColumnIdentifier) ":" ColumnRule - */ + * [19] ColumnDefinition ::= (ColumnIdentifier | QuotedColumnIdentifier) ":" ColumnRule + */ lazy val columnDefinition: PackratParser[ColumnDefinition] = "ColumnDefinition" ::= positioned(( ((columnIdentifier | quotedColumnIdentifier) <~ ":") ~ columnRule <~ (endOfColumnDefinition | comment) ^^ { case id ~ (rules ~ columnDirectives) => ColumnDefinition(id, rules, columnDirectives) } - ).withFailureMessage("Invalid column definition")) + ).withFailureMessage("Invalid column definition")) /** - * [20] ColumnIdentifier ::= PositiveNonZeroIntegerLiteral | Ident - */ + * [20] ColumnIdentifier ::= PositiveNonZeroIntegerLiteral | Ident + */ lazy val columnIdentifier: PackratParser[ColumnIdentifier] = "ColumnIdentifier" ::= (positiveNonZeroIntegerLiteral | ident).withFailureMessage("Column identifier invalid") ^^ { case offset: BigInt => OffsetColumnIdentifier(offset) case ident: String => NamedColumnIdentifier(ident) } /** - * [21] QuotedColumnIdentifier ::= StringLiteral - */ + * [21] 
QuotedColumnIdentifier ::= StringLiteral + */ lazy val quotedColumnIdentifier = "QuotedColumnIdentifier" ::= stringLiteral.withFailureMessage("Quoted column identifier invalid") ^^ { NamedColumnIdentifier(_) } /** - * [22] ColumnRule ::= ColumnValidationExpr* ColumnDirectives - */ + * [22] ColumnRule ::= ColumnValidationExpr* ColumnDirectives + */ lazy val columnRule = "ColumnRule" ::= rep(columnValidationExpr) ~ columnDirectives /** - * [23] ColumnDirectives ::= OptionalDirective? MatchIsFalseDirective? IgnoreCaseDirective? WarningDirective? /* expr: unordered */ - */ + * [23] ColumnDirectives ::= OptionalDirective? MatchIsFalseDirective? IgnoreCaseDirective? WarningDirective? /* expr: unordered */ + */ lazy val columnDirectives: Parser[List[ColumnDirective]] = "ColumnDirectives" ::= opt(mingle(List(optionalDirective, matchIsFalseDirective, ignoreCaseDirective, warningDirective).map(positioned(_))).withFailureMessage("Invalid column directive")) ^^ { _.getOrElse(List.empty) } /** - * [24] OptionalDirective ::= DirectivePrefix "optional" - */ + * [24] OptionalDirective ::= DirectivePrefix "optional" + */ lazy val optionalDirective = "OptionalDirective" ::= directivePrefix ~> "optional" ^^^ Optional() /** - * [25] MatchIsFalseDirective ::= DirectivePrefix "matchIsFalse" - */ + * [25] MatchIsFalseDirective ::= DirectivePrefix "matchIsFalse" + */ //TODO implement workings of matchIsFalseDirective at present it does nothing! lazy val matchIsFalseDirective = "MatchIsFalseDirective" ::= directivePrefix ~> "matchIsFalse" ^^^ MatchIsFalse() /** - * [26] IgnoreCaseDirective ::= DirectivePrefix "ignoreCase" - */ + * [26] IgnoreCaseDirective ::= DirectivePrefix "ignoreCase" + */ lazy val ignoreCaseDirective = "IgnoreCaseDirective" ::= directivePrefix ~> "ignoreCase" ^^^ IgnoreCase() /** - * [27] WarningDirective ::= DirectivePrefix "warningDirective" - */ + * [27] WarningDirective ::= DirectivePrefix "warningDirective" + */ lazy val warningDirective = "WarningDirective" ::= directivePrefix ~> "warning" ^^^ Warning() /** - * [28] ColumnValidationExpr ::= CombinatorialExpr | NonCombinatorialExpr - */ + * [28] ColumnValidationExpr ::= CombinatorialExpr | NonCombinatorialExpr + */ lazy val columnValidationExpr: PackratParser[Rule] = "ColumnValidationExpr" ::= positioned(combinatorialExpr | nonCombinatorialExpr) /** - * [29] CombinatorialExpr ::= OrExpr | AndExpr - */ + * [29] CombinatorialExpr ::= OrExpr | AndExpr + */ lazy val combinatorialExpr = "CombinatorialExpr" ::= orExpr | andExpr /** - * [30] OrExpr ::= nonCombinatorialExpr "or" columnValidationExpr - * - * Uses nonCombinatorialExpr on the left-hand-side - * to avoid left recursive rule - */ + * [30] OrExpr ::= nonCombinatorialExpr "or" columnValidationExpr + * + * Uses nonCombinatorialExpr on the left-hand-side + * to avoid left recursive rule + */ lazy val orExpr: PackratParser[OrRule] = "OrExpr" ::= nonCombinatorialExpr ~ "or" ~ columnValidationExpr ^^ { case lhs ~ "or" ~ rhs => OrRule(lhs, rhs) } /** - * [31] AndExpr ::= nonCombinatorialExpr "and" ColumnValidationExpr - * - * Uses nonCombinatorialExpr on the left-hand-side - * to avoid left recursive rule - */ + * [31] AndExpr ::= nonCombinatorialExpr "and" ColumnValidationExpr + * + * Uses nonCombinatorialExpr on the left-hand-side + * to avoid left recursive rule + */ lazy val andExpr: PackratParser[AndRule] = "AndExpr" ::= nonCombinatorialExpr ~ "and" ~ columnValidationExpr ^^ { case lhs ~ "and" ~ rhs => AndRule(lhs, rhs) } /** - * [32] NonCombinatorialExpr ::= NonConditionalExpr | 
ConditionalExpr - */ + * [32] NonCombinatorialExpr ::= NonConditionalExpr | ConditionalExpr + */ lazy val nonCombinatorialExpr = "NonCombinatorialExpr" ::= nonConditionalExpr | conditionalExpr /** - * [33] NonConditionalExpr ::= SingleExpr | ExternalSingleExpr | ParenthesizedExpr - */ + * [33] NonConditionalExpr ::= SingleExpr | ExternalSingleExpr | ParenthesizedExpr + */ lazy val nonConditionalExpr: PackratParser[Rule] = "NonConditionalExpr" ::= singleExpr | externalSingleExpr | parenthesizedExpr /** - * [34] SingleExpr ::= ExplicitContextExpr? (IsExpr | NotExpr | InExpr | - * StartsWithExpr | EndsWithExpr | RegExpExpr | - * RangeExpr | LengthExpr | - * EmptyExpr | NotEmptyExpr | UniqueExpr | identicalExpr | - * UriExpr | - * XsdDateTimeExpr | XsdDateExpr | XsdTimeExpr | - * UkDateExpr | DateExpr | PartialUkDateExpr | PartialDateExpr | - * uuid4Expr | - * PositiveIntegerExpr) - */ + * [34] SingleExpr ::= ExplicitContextExpr? (IsExpr | NotExpr | InExpr | + * StartsWithExpr | EndsWithExpr | RegExpExpr | + * RangeExpr | LengthExpr | + * EmptyExpr | NotEmptyExpr | UniqueExpr | identicalExpr | + * UriExpr | + * XsdDateTimeExpr | XsdDateExpr | XsdTimeExpr | + * UkDateExpr | DateExpr | PartialUkDateExpr | PartialDateExpr | + * uuid4Expr | + * PositiveIntegerExpr | UpperCaseExpr | LowerCaseExpr) + */ //TODO need to implement and add DateExpr, PartialDateExpr lazy val singleExpr: PackratParser[Rule] = "SingleExpr" ::= opt(explicitContextExpr) ~ (isExpr | notExpr | inExpr | @@ -276,61 +267,61 @@ trait SchemaParser extends RegexParsers ukDateExpr | partialUkDateExpr | uuid4Expr | positiveIntegerExpr | upperCaseExpr | lowerCaseExpr) ^^ { - case explicitContext ~ rule => - rule.explicitColumn = explicitContext - rule - } + case explicitContext ~ rule => + rule.explicitColumn = explicitContext + rule + } /** - * [35] ExplicitContextExpr ::= ColumnRef "/" - */ + * [35] ExplicitContextExpr ::= ColumnRef "/" + */ lazy val explicitContextExpr = "ExplicitContextExpr" ::= columnRef <~ "/" /** - * [36] ColumnRef ::= "$" (ColumnIdentifier | QuotedColumnIdentifier) - */ + * [36] ColumnRef ::= "$" (ColumnIdentifier | QuotedColumnIdentifier) + */ lazy val columnRef: PackratParser[ColumnReference] = "ColumnRef" ::= "$" ~> (columnIdentifier | quotedColumnIdentifier) ^^ { ColumnReference(_) } /** - * [37] IsExpr ::= "is(" StringProvider ")" - */ + * [37] IsExpr ::= "is(" StringProvider ")" + */ lazy val isExpr: PackratParser[IsRule] = "IsExpr" ::= "is(" ~> stringProvider <~ ")" ^^ { IsRule } /** - * [38] NotExpr ::= "not(" StringProvider ")" - */ + * [38] NotExpr ::= "not(" StringProvider ")" + */ lazy val notExpr: PackratParser[NotRule] = "NotExpr" ::= "not(" ~> stringProvider <~ ")" ^^ { NotRule } /** - * [39] InExpr ::= "in(" StringProvider ")" - */ + * [39] InExpr ::= "in(" StringProvider ")" + */ lazy val inExpr: PackratParser[InRule] = "InExpr" ::= "in(" ~> stringProvider <~ ")" ^^ { InRule } /** - * [40] StartsWithExpr ::= "starts(" StringProvider ")" - */ + * [40] StartsWithExpr ::= "starts(" StringProvider ")" + */ lazy val startsWithExpr: PackratParser[StartsRule] = "StartsWithExpr" ::= "starts(" ~> stringProvider <~ ")" ^^ { StartsRule } /** - * [41] EndsWithExpr ::= "ends(" StringProvider ")" - */ + * [41] EndsWithExpr ::= "ends(" StringProvider ")" + */ lazy val endsWithExpr: PackratParser[EndsRule] = "EndsWithExpr" ::= "ends(" ~> stringProvider <~ ")" ^^ { EndsRule } /** - * [42] RegExpExpr ::= "regex(" StringLiteral ")" - */ + * [42] RegExpExpr ::= "regex(" StringLiteral ")" + */ //TODO could improve 
error or regex? //TODO How to escape quotes inside regex? lazy val regExpExpr: PackratParser[RegExpRule] = "RegExpExpr" ::= "regex" ~> """([(]")(.*?)("[)])""".r ^^ { @@ -339,23 +330,23 @@ trait SchemaParser extends RegexParsers } withFailureMessage("""regex not correctly delimited as ("your regex")""") /** - * [43] RangeExpr ::= "range(" NumericOrAny "," NumericOrAny ")" /* range is inclusive */ - */ + * [43] RangeExpr ::= "range(" NumericOrAny "," NumericOrAny ")" /* range is inclusive */ + */ lazy val rangeExpr: PackratParser[RangeRule] = "RangeExpr" ::= "range(" ~> numericOrAny ~ "," ~ numericOrAny <~ ")" ^^ { case a ~ "," ~ b => RangeRule(a,b) } /** - * [44] LengthExpr ::= "length(" (PositiveIntegerOrAny ",")? PositiveIntegerOrAny ")" - * - * /* - * length has 4 forms. - * 1) length(n) ensures the value is: the exact length n (absolute length) - * 2) length(n, *) ensures the value is: longer than or equal to n (minimum length) - * 3) length(*, n) ensures the value is: shorter than or equal to n (maximum length) - * 4) length(n1, n2) ensures the value is: longer than or equal to n1 AND shorter than or equal to n2 (minumum and maximum lengths) - * */ - */ + * [44] LengthExpr ::= "length(" (PositiveIntegerOrAny ",")? PositiveIntegerOrAny ")" + * + * /* + * length has 4 forms. + * 1) length(n) ensures the value is: the exact length n (absolute length) + * 2) length(n, *) ensures the value is: longer than or equal to n (minimum length) + * 3) length(*, n) ensures the value is: shorter than or equal to n (maximum length) + * 4) length(n1, n2) ensures the value is: longer than or equal to n1 AND shorter than or equal to n2 (minumum and maximum lengths) + * */ + */ lazy val lengthExpr: PackratParser[LengthRule] = "LengthExpr" ::= "length(" ~> opt(positiveIntegerOrAny <~ ",") ~ positiveIntegerOrAny <~ ")" ^^ { case from ~ to => def str(x: Option[BigInt]): String = x.map(_.toString).getOrElse(wildcard) @@ -363,8 +354,8 @@ trait SchemaParser extends RegexParsers } /** - * [45] PositiveIntegerOrAny ::= PositiveIntegerLiteral | WildcardLiteral - */ + * [45] PositiveIntegerOrAny ::= PositiveIntegerLiteral | WildcardLiteral + */ def positiveIntegerOrAny: Parser[Option[BigInt]] = "PositiveIntegerOrAny" ::= (positiveIntegerLiteral | wildcardLiteral) ^^ { case positiveInteger: BigInt => Option(positiveInteger) @@ -373,8 +364,8 @@ trait SchemaParser extends RegexParsers } /** - * [45.1] NumericOrAny ::= NumericLiteral | WildcardLiteral - */ + * [45.1] NumericOrAny ::= NumericLiteral | WildcardLiteral + */ def numericOrAny: Parser[Option[BigDecimal]] = "NumericOrAny" ::= (numericLiteral | wildcardLiteral) ^^ { case positiveInteger: BigDecimal => Option(positiveInteger) @@ -383,21 +374,21 @@ trait SchemaParser extends RegexParsers } /** - * [46] EmptyExpr ::= "empty" - */ + * [46] EmptyExpr ::= "empty" + */ lazy val emptyExpr = "EmptyExpr" ::= "empty" ^^^ EmptyRule() /** - * [47] NotEmptyExpr ::= "notEmpty" - */ + * [47] NotEmptyExpr ::= "notEmpty" + */ lazy val notEmptyExpr = "NotEmptyExpr" ::= "notEmpty" ^^^ NotEmptyRule() - + lazy val identicalExpr: PackratParser[IdenticalRule] = "IdenticalExpr" ::= "identical" ^^^ IdenticalRule() /** - * [48] UniqueExpr ::= "unique" ("(" ColumnRef ("," ColumnRef)* ")")? - */ + * [48] UniqueExpr ::= "unique" ("(" ColumnRef ("," ColumnRef)* ")")? 
+ */ lazy val uniqueExpr: PackratParser[Rule] = "UniqueExpr" ::= "unique" ~> opt("(" ~> columnRef ~ rep("," ~> columnRef) <~ ")") ^^ { case None => UniqueRule() @@ -406,13 +397,13 @@ trait SchemaParser extends RegexParsers } /** - * [49] UriExpr ::= "uri" - */ + * [49] UriExpr ::= "uri" + */ lazy val uriExpr = "UriExpr" ::= "uri" ^^^ UriRule() /** - * [50] XsdDateTimeExpr ::= "xDateTime" ("(" XsdDateTimeLiteral "," XsdDateTimeLiteral ")")? - */ + * [50] XsdDateTimeExpr ::= "xDateTime" ("(" XsdDateTimeLiteral "," XsdDateTimeLiteral ")")? + */ lazy val xsdDateTimeExpr = "XsdDateTimeExpr" ::= "xDateTime" ~> opt((("(" ~> xsdDateTimeLiteral) <~ ",") ~ (xsdDateTimeLiteral <~ ")")) ^^ { case None => XsdDateTimeRule() @@ -421,8 +412,8 @@ trait SchemaParser extends RegexParsers } /** - * [51] XsdDateExpr ::= "xDate" ("(" XsdDateLiteral "," XsdDateLiteral ")")? - */ + * [51] XsdDateExpr ::= "xDate" ("(" XsdDateLiteral "," XsdDateLiteral ")")? + */ lazy val xsdDateExpr = "XsdDateExpr" ::= "xDate" ~> opt((("(" ~> xsdDateLiteral) <~ ",") ~ (xsdDateLiteral <~ ")")) ^^ { case None => XsdDateRule() @@ -431,8 +422,8 @@ trait SchemaParser extends RegexParsers } /** - * [52] XsdTimeExpr ::= "xTime" ("(" XsdTimeLiteral "," XsdTimeLiteral ")")? - */ + * [52] XsdTimeExpr ::= "xTime" ("(" XsdTimeLiteral "," XsdTimeLiteral ")")? + */ lazy val xsdTimeExpr = "XsdTimeExpr" ::= "xTime" ~> opt((("(" ~> xsdTimeLiteral) <~ ",") ~ (xsdTimeLiteral <~ ")")) ^^ { case None => XsdTimeRule() @@ -441,8 +432,8 @@ trait SchemaParser extends RegexParsers } /** - * [53] UkDateExpr ::= "ukDate" ("(" UkDateLiteral "," UkDateLiteral ")")? - */ + * [53] UkDateExpr ::= "ukDate" ("(" UkDateLiteral "," UkDateLiteral ")")? + */ lazy val ukDateExpr = "UkDateExpr" ::= "ukDate" ~> opt((("(" ~> ukDateLiteral) <~ ",") ~ (ukDateLiteral <~ ")")) ^^ { case None => UkDateRule() @@ -451,53 +442,55 @@ trait SchemaParser extends RegexParsers } /** - * [54] DateExpr ::= "date(" StringProvider "," StringProvider "," StringProvider ("," XsdDateLiteral "," XsdDateLiteral)? ")" - */ + * [54] DateExpr ::= "date(" StringProvider "," StringProvider "," StringProvider ("," XsdDateLiteral "," XsdDateLiteral)? 
")" + */ //TODO implement DateExpr /** - * [55] PartialUkDateExpr ::= "partUkDate" - */ + * [55] PartialUkDateExpr ::= "partUkDate" + */ lazy val partialUkDateExpr: PackratParser[PartUkDateRule] = "PartialUkDateExpr" ::= "partUkDate" ^^^ PartUkDateRule() /** - * [56] PartialDateExpr ::= "partDate(" StringProvider "," StringProvider "," StringProvider ")" - */ + * [56] PartialDateExpr ::= "partDate(" StringProvider "," StringProvider "," StringProvider ")" + */ //TODO implement PartialDateExpr /** - * [57] Uuid4Expr ::= "uuid4" - */ + * [57] Uuid4Expr ::= "uuid4" + */ lazy val uuid4Expr: PackratParser[Uuid4Rule] = "Uuid4Expr" ::= "uuid4" ^^^ Uuid4Rule() /** - * [58] PositiveIntegerExpr ::= "positiveInteger" - */ + * [58] PositiveIntegerExpr ::= "positiveInteger" + */ lazy val positiveIntegerExpr: PackratParser[PositiveIntegerRule] = "PositiveIntegerExpr" ::= "positiveInteger" ^^^ PositiveIntegerRule() + lazy val upperCaseExpr: PackratParser[UpperCaseRule] = "UpperCaseExpr" ::= "upperCase" ^^^ UpperCaseRule() + lazy val lowerCaseExpr: PackratParser[LowerCaseRule] = "LowerCaseExpr" ::= "lowerCase" ^^^ LowerCaseRule() /** - * [59] StringProvider ::= ColumnRef | StringLiteral - */ + * [59] StringProvider ::= ColumnRef | StringLiteral + */ lazy val stringProvider: PackratParser[ArgProvider] = "StringProvider" ::= columnRef | stringLiteral ^^ { s => Literal(Some(s)) } /** - * [60] ExternalSingleExpr ::= ExplicitContextExpr? (FileExistsExpr | ChecksumExpr | FileCountExpr) - */ - lazy val externalSingleExpr: PackratParser[Rule] = "ExternalSingleExpr" ::= opt(explicitContextExpr) ~ (fileExistsExpr | checksumExpr | fileCountExpr) ^^ { + * [60] ExternalSingleExpr ::= ExplicitContextExpr? (FileExistsExpr | ChecksumExpr | FileCountExpr) + */ + lazy val externalSingleExpr: PackratParser[Rule] = "ExternalSingleExpr" ::= opt(explicitContextExpr) ~ (fileExistsExpr | integrityCheckExpr | checksumExpr | fileCountExpr) ^^ { case explicitContext ~ rule => rule.explicitColumn = explicitContext rule } /** - * [61] FileExistsExpr ::= "fileExists" ("(" StringProvider ")")? /* optional path to prepend to this cell with filename in */ - */ + * [61] FileExistsExpr ::= "fileExists" ("(" StringProvider ")")? 
/* optional path to prepend to this cell with filename in */ + */ lazy val fileExistsExpr = "FileExistsExpr" ::= ("fileExists" ~> opt("(" ~> stringProvider <~ ")")).withFailureMessage("Invalid fileExists rule") ^^ { case None => FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks) @@ -505,42 +498,53 @@ trait SchemaParser extends RegexParsers FileExistsRule(pathSubstitutions, enforceCaseSensitivePathChecks, s) } + + lazy val integrityCheckExpr = "IntegrityCheckExpr" ::= ("integrityCheck" ~> "(" ~> opt(stringProvider <~ ",") ~ stringLiteral <~ ")" ).withFailureMessage("Invalid integrityCheck rule") ^^ { + case rp ~ includeFolder if (includeFolder == "includeFolder") => + IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), true) + case rp ~ includeFolder if (includeFolder == "excludeFolder") => + IntegrityCheckRule(pathSubstitutions, enforceCaseSensitivePathChecks, rp.getOrElse(Literal(None)), false) + // case rp ~ includeFolder => failure(s"$includeFolder should be either includeFolder or excludeFolder") + + } + + /** - * [62] ChecksumExpr ::= "checksum(" FileExpr "," StringLiteral ")" /* first arg is file expr, second arg is algorithm to use for checksum */ - */ + * [62] ChecksumExpr ::= "checksum(" FileExpr "," StringLiteral ")" /* first arg is file expr, second arg is algorithm to use for checksum */ + */ lazy val checksumExpr = "ChecksumExpr" ::= ("checksum(" ~> fileExpr <~ ",") ~ stringLiteral <~ ")" ^^ { case files ~ algorithm => ChecksumRule(files._1.getOrElse(Literal(None)), files._2, algorithm, pathSubstitutions, enforceCaseSensitivePathChecks) } /** - * [63] FileExpr ::= "file(" (StringProvider ",")? StringProvider ")" /* first (optional) arg is path (or ColumnRef of path) to prepend to second arg, second arg is filename (or ColumnRef of filename) */ - */ + * [63] FileExpr ::= "file(" (StringProvider ",")? StringProvider ")" /* first (optional) arg is path (or ColumnRef of path) to prepend to second arg, second arg is filename (or ColumnRef of filename) */ + */ lazy val fileExpr = "FileExpr" ::= "file(" ~> opt(stringProvider <~ ",") ~ stringProvider <~ ")" /** - * [64] FileCountExpr ::= "fileCount(" FileExpr ")" - */ + * [64] FileCountExpr ::= "fileCount(" FileExpr ")" + */ lazy val fileCountExpr = "FileCountExpr" ::= "fileCount(" ~> fileExpr <~ ")" ^^ { case a => FileCountRule(a._1.getOrElse(Literal(None)), a._2, pathSubstitutions) } /** - * [65] ParenthesizedExpr ::= "(" ColumnValidationExpr+ ")" - */ + * [65] ParenthesizedExpr ::= "(" ColumnValidationExpr+ ")" + */ lazy val parenthesizedExpr: PackratParser[ParenthesesRule] = "ParenthesizedExpr" ::= "(" ~> rep1(columnValidationExpr) <~ ")" ^^ { ParenthesesRule } | failure("unmatched paren") /** - * [66] ConditionalExpr ::= IfExpr - */ + * [66] ConditionalExpr ::= IfExpr + */ lazy val conditionalExpr: PackratParser[Rule] = "ConditionalExpr" ::= ifExpr /** - * [67] IfExpr ::= "if(" (CombinatorialExpr | NonConditionalExpr) "," ColumnValidationExpr+ ("," ColumnValidationExpr+)? ")" /* if with optional else */ - */ + * [67] IfExpr ::= "if(" (CombinatorialExpr | NonConditionalExpr) "," ColumnValidationExpr+ ("," ColumnValidationExpr+)? 
")" /* if with optional else */ + */ lazy val ifExpr: PackratParser[IfRule] = "IfExpr" ::= (("if(" ~> (combinatorialExpr | nonConditionalExpr) <~ ",") ~ rep1(columnValidationExpr) ~ opt("," ~> rep1(columnValidationExpr)) <~ ")" ^^ { case condition ~ thenExpr ~ elseExpr => IfRule(condition, thenExpr, elseExpr) @@ -558,85 +562,85 @@ trait SchemaParser extends RegexParsers // /** - * [68] XsdDateTimeLiteral ::= XsdDateWithoutTimezoneComponent "T" XsdTimeLiteral - */ + * [68] XsdDateTimeLiteral ::= XsdDateWithoutTimezoneComponent "T" XsdTimeLiteral + */ lazy val xsdDateTimeLiteral: Parser[String] = "XsdDateTimeLiteral" ::= (xsdDateWithoutTimezoneComponent + "T" + xsdTimeWithoutTimezoneComponent + xsdTimezoneComponent).r /** - * [69] XsdDateLiteral ::= XsdDateWithoutTimezoneComponent XsdTimezoneComponent - */ + * [69] XsdDateLiteral ::= XsdDateWithoutTimezoneComponent XsdTimezoneComponent + */ lazy val xsdDateLiteral: Parser[String] = "XsdDateLiteral" ::= (xsdDateWithoutTimezoneComponent + xsdTimezoneComponent).r /** - * [70] XsdTimeLiteral ::= XsdTimeWithoutTimezoneComponent XsdTimezoneComponent - */ + * [70] XsdTimeLiteral ::= XsdTimeWithoutTimezoneComponent XsdTimezoneComponent + */ lazy val xsdTimeLiteral: Parser[String] = "XsdTimeLiteral" ::= (xsdTimeWithoutTimezoneComponent + xsdTimezoneComponent).r /** - * [71] XsdDateWithoutTimezoneComponent ::= -?[0-9]{4}-(((0(1|3|5|7|8)|1(0|2))-(0[1-9]|(1|2)[0-9]|3[0-1]))|((0(4|6|9)|11)-(0[1-9]|(1|2)[0-9]|30))|(02-(0[1-9]|(1|2)[0-9]))) /* xgc:regular-expression */ - */ + * [71] XsdDateWithoutTimezoneComponent ::= -?[0-9]{4}-(((0(1|3|5|7|8)|1(0|2))-(0[1-9]|(1|2)[0-9]|3[0-1]))|((0(4|6|9)|11)-(0[1-9]|(1|2)[0-9]|30))|(02-(0[1-9]|(1|2)[0-9]))) /* xgc:regular-expression */ + */ //NOTE - we use a more relaxed regexp here than the spec, as another validation parse is done by the relevant rule class (e.g. schema.XsdTimeRegex, schema.XsdDateRegex or schema.XsdDateTimeRegex) lazy val xsdDateWithoutTimezoneComponent = "[0-9]{4}-[0-9]{2}-[0-9]{2}" /** - * [72] XsdTimeWithoutTimezoneComponent ::= ([0-1][0-9]|2[0-4]):(0[0-9]|[1-5][0-9]):(0[0-9]|[1-5][0-9])(\.[0-9]{3})? /* xgc:regular-expression */ - */ + * [72] XsdTimeWithoutTimezoneComponent ::= ([0-1][0-9]|2[0-4]):(0[0-9]|[1-5][0-9]):(0[0-9]|[1-5][0-9])(\.[0-9]{3})? /* xgc:regular-expression */ + */ //NOTE - we use a more relaxed regexp here than the spec, as another validation parse is done by the relevant rule class (e.g. schema.XsdTimeRegex, schema.XsdDateRegex or schema.XsdDateTimeRegex) lazy val xsdTimeWithoutTimezoneComponent = """[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]{3})?""" /** - * [73] XsdTimezoneComponent ::= ((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|Z)? /* xgc:regular-expression */ - */ + * [73] XsdTimezoneComponent ::= ((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|Z)? /* xgc:regular-expression */ + */ //NOTE - we use a more relaxed regexp here than the spec, as another validation parse is done by the relevant rule class (e.g. schema.XsdTimeRegex, schema.XsdDateRegex or schema.XsdDateTimeRegex) lazy val xsdTimezoneComponent = "(([+-][0-9]{2}:[0-9]{2})|Z)?" 
/** - * [74] UkDateLiteral ::= (((0[1-9]|(1|2)[0-9]|3[0-1])\/(0(1|3|5|7|8)|1(0|2)))|((0[1-9]|(1|2)[0-9]|30)\/(0(4|6|9)|11))|((0[1-9]|(1|2)[0-9])\/02))\/[0-9]{4} - */ + * [74] UkDateLiteral ::= (((0[1-9]|(1|2)[0-9]|3[0-1])\/(0(1|3|5|7|8)|1(0|2)))|((0[1-9]|(1|2)[0-9]|30)\/(0(4|6|9)|11))|((0[1-9]|(1|2)[0-9])\/02))\/[0-9]{4} + */ //NOTE - we use a more relaxed parser here than the spec, as another validation parse is done by schema.UkDateRegex in the appropriate rule lazy val ukDateLiteral: Parser[String] = "UkDateLiteral" ::= "[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}".r /** - * [75] PositiveNonZeroIntegerLiteral ::= [1-9][0-9]* /* xgc:regular-expression */ /* A Natural Number, positive integer */ - */ + * [75] PositiveNonZeroIntegerLiteral ::= [1-9][0-9]* /* xgc:regular-expression */ /* A Natural Number, positive integer */ + */ lazy val positiveNonZeroIntegerLiteral: Parser[BigInt] = "PositiveNonZeroIntegerLiteral" ::= "[1-9][0-9]*".r ^^ { BigInt(_) } /** - * [76] PositiveIntegerLiteral ::= [0-9]+ /* xgc:regular-expression */ /* A Natural Number, non-negative integer */ - */ + * [76] PositiveIntegerLiteral ::= [0-9]+ /* xgc:regular-expression */ /* A Natural Number, non-negative integer */ + */ lazy val positiveIntegerLiteral: Parser[BigInt] = "PositiveIntegerLiteral" ::= "[0-9]+".r ^^ { BigInt(_) } /** - * [77] NumericLiteral ::= -?[0-9]+(\.[0-9]+)? /* xgc:regular-expression */ /* A Real Number, expressed as an integer or decimal */ - */ + * [77] NumericLiteral ::= -?[0-9]+(\.[0-9]+)? /* xgc:regular-expression */ /* A Real Number, expressed as an integer or decimal */ + */ lazy val numericLiteral: Parser[BigDecimal] = "NumericLiteral" ::= """-?[0-9]+(\.[0-9]+)?""".r ^^ { BigDecimal(_) } /** - * [78] StringLiteral ::= "\"" [^"]* "\"" /* xgc:regular-expression */ /* Any characters except: quotation mark */ - */ + * [78] StringLiteral ::= "\"" [^"]* "\"" /* xgc:regular-expression */ /* Any characters except: quotation mark */ + */ lazy val stringLiteral: Parser[String] = "StringLiteral" ::= "\"" ~> """[^"]*""".r <~ "\"" /** - * [79] CharacterLiteral ::= "'" [^\r\n\f'] "'" /* xgc:regular-expression */ /* Any characters except: carriage-return, line-break, form-feed and apostrophe */ - */ + * [79] CharacterLiteral ::= "'" [^\r\n\f'] "'" /* xgc:regular-expression */ /* Any characters except: carriage-return, line-break, form-feed and apostrophe */ + */ lazy val characterLiteral: Parser[Char] = "CharacterLiteral" ::= "'" ~> """[^\r\n\f']""".r <~ "'" ^^ { _.head } /** - * [80] WildcardLiteral ::= "*" - */ + * [80] WildcardLiteral ::= "*" + */ def wildcardLiteral : Parser[String] = "WildcardLiteral" ::= wildcard /** - * [81] Ident ::= [A-Za-z0-9\-_\.]+ /* xgc:regular-expression */ - */ + * [81] Ident ::= [A-Za-z0-9\-_\.]+ /* xgc:regular-expression */ + */ lazy val ident: Parser[String] = "Ident" ::= """[A-Za-z0-9\-_\.]+""".r private val wildcard = "*" @@ -655,36 +659,36 @@ trait SchemaParser extends RegexParsers /** - * Given 1 or more Parsers - * this function produces - * all permutations of - * all combinations. - * - * Put more simply if you have a List - * of Parsers, we create a Parser - * that matches n of those parsers - * in any order - * - * @param parsers A list of parsers to mingle - * @return A parser that represents all permutations of - * all combinations of the parsers - */ + * Given 1 or more Parsers + * this function produces + * all permutations of + * all combinations. 
+ * + * Put more simply if you have a List + * of Parsers, we create a Parser + * that matches n of those parsers + * in any order + * + * @param parsers A list of parsers to mingle + * @return A parser that represents all permutations of + * all combinations of the parsers + */ private def mingle[T, U](parsers : List[Parser[T]]): Parser[List[T]] = { /** - * All permutations of all combinations - * of a List - */ + * All permutations of all combinations + * of a List + */ def mingle[T](data: List[T]): List[List[T]] = { (for(i <- 1 to data.length) yield data.combinations(i).flatMap(_.permutations) - ).toList.flatten + ).toList.flatten } /** - * Combines n parsers together - * in the same manner as p1 ~ p2 ~ ... pN - */ + * Combines n parsers together + * in the same manner as p1 ~ p2 ~ ... pN + */ def combine[T](parsers: List[Parser[T]]): Parser[List[T]] = { parsers.foldRight(success(List.empty[T])) { case (p, acc) => for { @@ -729,8 +733,8 @@ trait SchemaParser extends RegexParsers def parse(reader: Reader) = parseAll(schema, reader) private def validate(g: List[GlobalDirective], c: List[ColumnDefinition]): String = { - globDirectivesValid(g) ::totalColumnsValid(g, c) :: integrityCheckValid(g, c) :: columnDirectivesValid(c) :: duplicateColumnsValid(c) :: crossColumnsValid(c) :: checksumAlgorithmValid(c) :: - rangeValid(c) :: lengthValid(c) :: regexValid(c) :: dateRangeValid(c) :: uniqueMultiValid(c) :: explicitColumnValid(c) :: Nil collect { case Some(s: String) => s } mkString(EOL) + globDirectivesValid(g) ::totalColumnsValid(g, c) :: columnDirectivesValid(c) :: duplicateColumnsValid(c) :: crossColumnsValid(c) :: checksumAlgorithmValid(c) :: + rangeValid(c) :: lengthValid(c) :: regexValid(c) :: dateRangeValid(c) :: uniqueMultiValid(c) :: explicitColumnValid(c) :: Nil collect { case Some(s: String) => s } mkString(EOL) } private def totalColumnsValid(g: List[GlobalDirective], c: List[ColumnDefinition]): Option[String] = { @@ -742,17 +746,6 @@ trait SchemaParser extends RegexParsers None } - private def integrityCheckValid(g: List[GlobalDirective], c: List[ColumnDefinition]): Option[String] = { - val maybeIntegrityCheck: Option[IntegrityCheck] = g.collectFirst { case i @ IntegrityCheck(_, _) => i} - - maybeIntegrityCheck.flatMap{ integrityCheck => - val filePathColumn = integrityCheck.filepathColumn - if (c.map(_.id).exists(_.value == filePathColumn)) - None - else - Some(s"[Integrity Check], Cannot find the colunm $filePathColumn") - } - } private def duplicateColumnsValid(columnDefinitions: List[ColumnDefinition]): Option[String] = { val duplicates = TreeMap(columnDefinitions.groupBy(_.id).toSeq:_*)(scala.math.Ordering.by[ColumnIdentifier, String](_.toString)).filter(_._2.length > 1) @@ -775,7 +768,7 @@ trait SchemaParser extends RegexParsers if (cd.directives.distinct.length != cd.directives.length) } yield { s"${cd.id}: Duplicated column directives: " + - cd.directives.groupBy(identity).filter { case (_, cds) => cds.size > 1}.map { case (cdId, _) => "@" + cdId + s" at line: ${cdId.pos.line}, column: ${cdId.pos.column}"}.mkString(", ") + cd.directives.groupBy(identity).filter { case (_, cds) => cds.size > 1}.map { case (cdId, _) => "@" + cdId + s" at line: ${cdId.pos.line}, column: ${cdId.pos.column}"}.mkString(", ") } if (v.isEmpty) None else Some(v.mkString(EOL)) @@ -993,4 +986,4 @@ trait SchemaParser extends RegexParsers if (result.isEmpty) None else Some(result.mkString(EOL)) } -} +} \ No newline at end of file diff --git 
a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/WO_95/tech_acq_metadata_v1_WO95Y14B000.csvs b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/WO_95/tech_acq_metadata_v1_WO95Y14B000.csvs index 213a2544..ad8176bb 100644 --- a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/WO_95/tech_acq_metadata_v1_WO95Y14B000.csvs +++ b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/WO_95/tech_acq_metadata_v1_WO95Y14B000.csvs @@ -1,5 +1,5 @@ version 1.0 -@totalColumns 33 @integrityCheck("file_path","excludeFolder") +@totalColumns 33 /*--------------------------------------------------------------------------------------------------------------- |This schema is for the validation of technical acquisition metadata | |csv files according to the specification given for digitised surrogates in | @@ -8,23 +8,23 @@ version 1.0 | 20140818 Version 1.0 DHU First release version for this project | | 20140910 Version 1.1 DHU Updated date regex to fix issues, allowed items up to 14, disallow fullstops | |at end of description as this causes search issues in Discovery. | -| 20141016 version 1.2 NW Updated regex to allow 20 items, 500 ordinals & addition of legal_status | +| 20141016 version 1.2 NW Updated regex to allow 20 items, 500 ordinals & addition of legal_status | |and held_by fields, changed date column to covering_date | | 20141110 version 1.3 NW fixed sub_sub_series rule | |from sub_sub_series: range(1,7) or is("115") or if($piece/is("5500"),is("")) | -|to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) | +|to sub_sub_series: if($piece/is("5500"),is(""),(range(1,7) or is("115"))) | ---------------------------------------------------------------------------------------------------------------*/ -/*The header of the schema file, ie the statements version 1.0 and @totalColumns 31, indicates that this schema - is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema), +/*The header of the schema file, ie the statements version 1.0 and @totalColumns 31, indicates that this schema + is using version 1.0 of the schema language (NB, not that that it is version 1.0 of this particular schema), and that there are 31 columns in total in the file.*/ batch_code: length(1,11) regex("^WO95Y14B([0-9]{3}|smp)$") - //1st part, batch_code must be between 1 and 11 characters long, and (implicitly multiple conditions are joined + //1st part, batch_code must be between 1 and 11 characters long, and (implicitly multiple conditions are joined //by a logical AND unless another boolean is provided). 2nd part restricts to form similar to WO95Y14B000 (last //three digits are running number for batches throughout the project. department: is("WO") and (in($file_path) and in($resource_uri)) //Parentheses control evaluation order of booleans as might be expected - //Department is fixed value of WO for this project. - //The grouped "in" statements say that the value found in this field must also be found as part of the fields + //Department is fixed value of WO for this project. 
+ //The grouped "in" statements say that the value found in this field must also be found as part of the fields //"file_path" and "resource_uri" division: is("13") //this field must be precisely 13 @@ -57,23 +57,23 @@ covering_date: regex("^19(14|15|16|17|18|19|20|21|22|23)( (Jan|Feb|Mar|Apr|May|J legal_status: is("Public Record") held_by: is("The National Archives, Kew") file_uuid: uuid4 unique - //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. -file_path: uri starts("file:///WO_95/") unique fileExists + //must be a version 4 uuid, and the value must be unique within the file. uuids must be lower case. +file_path: uri starts("file:///WO_95/") unique fileExists integrityCheck("excludeFolder") //fileExists checks that there is actually a file of the given name at the specified location on the file system. - //In practice, the validator will normally be run with the --path switch + //In practice, the validator will normally be run with the --path switch //(see http://digital-preservation.github.io/csv-validator/) //We also require that the path is a valid uri, and begins file:///WO_95/ as this is the top-level folder for each batch - //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the + //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the //content of this field) //must be unique within the file file_checksum: unique checksum(file($file_path),"SHA-256") - //Compare the value given in this field to the checksum calculated for the file found at the location given in + //Compare the value given in this field to the checksum calculated for the file found at the location given in //the "file_path" field (again path substitution may well be applied as described for the "file_path" field itself). //Use the specified checksum algorithm (must use lowercase hex characters). 
//unique within the file - an identical checksum would imply identical images resource_uri: uri starts("http://datagov.nationalarchives.gov.uk/66/WO/95/") unique //Must be a valid uri which starts with the specified string, the uri is constructed such that it must be unique in the file - //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the + //(Conditions specified on earlier columns say that the values of those columns must also appear as part of the //content of this field) scan_operator: length(1,12) regex("^[0-9a-zA-Z]{1,12}$") //12 alphanumeric characters representing the identity of the scanning operator (the ability to decode this is @@ -97,7 +97,7 @@ image_format: is("x-fmt/392") //must be string: x-fmt/392 (precisely) - ie a jp2 file as understood by PRONOM //(http://www.nationalarchives.gov.uk/PRONOM/x-fmt/392) image_compression: positiveInteger is("6") - //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm + //Always a positive (non-zero) integer, generally 6 to represent 6-fold compression with the lossy algorithm //available in the JPEG2000 specification image_colour_space: is("sRGB") //must be string: sRGB (precisely - case as shown) diff --git a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckDefaultIncludeFolderSchema.csvs b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckDefaultIncludeFolderSchema.csvs index cdfdd737..6c0a2c59 100644 --- a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckDefaultIncludeFolderSchema.csvs +++ b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckDefaultIncludeFolderSchema.csvs @@ -1,5 +1,5 @@ version 1.0 -@totalColumns 3 @integrityCheck("identifier") -identifier: fileExists +@totalColumns 3 +identifier: fileExists integrityCheck("excludeFolder") filename: size: \ No newline at end of file diff --git a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckSchema.csvs b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckSchema.csvs index 7311adac..20ef05df 100644 --- a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckSchema.csvs +++ b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/header/integrityCheckSchema.csvs @@ -1,5 +1,5 @@ version 1.0 -@totalColumns 3 @integrityCheck("identifier","includeFolder") -identifier: fileExists +@totalColumns 3 +identifier: fileExists integrityCheck("includeFolder") filename: size: \ No newline at end of file diff --git a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/noheader/badIntegrityCheckSchema.csvs b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/noheader/badIntegrityCheckSchema.csvs index 2149abb9..454216eb 100644 --- a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/noheader/badIntegrityCheckSchema.csvs +++ b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/noheader/badIntegrityCheckSchema.csvs @@ -1,5 +1,5 @@ version 1.0 -@totalColumns 3 @noHeader 
@integrityCheck("identifier","badincludeFolder") -identifier: fileExists +@totalColumns 3 @noHeader +identifier: fileExists integrityCheck("badincludeFolder") filename: size: \ No newline at end of file diff --git a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/noheader/integrityCheckSchema.csvs b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/noheader/integrityCheckSchema.csvs index 91d9c3d9..1766d344 100644 --- a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/noheader/integrityCheckSchema.csvs +++ b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/noheader/integrityCheckSchema.csvs @@ -1,5 +1,5 @@ version 1.0 -@totalColumns 3 @noHeader @integrityCheck("identifier") -identifier: fileExists +@totalColumns 3 @noHeader +identifier: fileExists integrityCheck("includeFolder") filename: size: \ No newline at end of file diff --git a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/schema/integrityCheck/folder1/content/file#2.txt b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/schema/integrityCheck/folder1/content/file#2.txt new file mode 100644 index 00000000..e69de29b diff --git a/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/schema/integrityCheck/folder1/content/file1.txt b/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/schema/integrityCheck/folder1/content/file1.txt new file mode 100644 index 00000000..e69de29b diff --git a/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala b/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala index 7eb8be28..36ff6c4f 100644 --- a/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala +++ b/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/MetaDataValidatorIntegrityCheckSpec.scala @@ -8,6 +8,7 @@ */ package uk.gov.nationalarchives.csv.validator +import java.io.File import java.lang.management.MemoryUsage import org.specs2.mutable.Specification @@ -20,11 +21,7 @@ import scalaz.{Failure, Success} class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResources { - //TODO find a more generic way to do so - val base = s"${System.getProperty("user.dir")}/csv-validator-core/src/test/resources/uk/gov/nationalarchives/csv/validator/integrityCheck/" - - - def buildValidator(substitutionPath: List[(String,String)], filenameColumn: Option[String], include: Boolean = false) : CsvValidator = new CsvValidator with AllErrorsMetaDataValidator { + def buildValidator(substitutionPath: List[(String,String)]) : CsvValidator = new CsvValidator with AllErrorsMetaDataValidator { val pathSubstitutions = substitutionPath val enforceCaseSensitivePathChecks = false val trace = false @@ -33,28 +30,24 @@ class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResourc def parse(filePath: String, validator: CsvValidator): Schema = validator.parseSchema(TextFile(Path.fromString(filePath))) fold (f => throw new IllegalArgumentException(f.toString()), s => s) "integrity Check" should { - val headerPath = base + "/header/" - val noHeaderPath = base + "/noheader/" - val WO95Path = base + "/WO_95/" - + val headerPath = integrityCheckPath + "/header/" + val noHeaderPath = 
integrityCheckPath + "/noheader/" + val WO95Path = integrityCheckPath + "/WO_95/" + "succeed with good values - header" in { val substitutionPaths = List(("file:///T:/WORK/RF_5/",headerPath)) - val validator = buildValidator(substitutionPaths, Some("filename")) - validator.validate(TextFile(Path.fromString(headerPath) / "integrityCheckMetaData.csv"), parse(headerPath + "/integrityCheckSchema.csvs",validator), None) must beLike { - case Success(_) => ok - } + val validator = buildValidator(substitutionPaths) + validator.validate(TextFile(Path.fromString(headerPath) / "integrityCheckMetaData.csv"), parse(headerPath + "/integrityCheckSchema.csvs",validator), None).isSuccess mustEqual true } - "succeed with good values and implicit include folder - header" in { + "succeed with good values and exclude folder paramter - header" in { val substitutionPaths = List(("file:///T:/WORK/RF_5/",headerPath)) - val validator = buildValidator(substitutionPaths, Some("filename")) - validator.validate(TextFile(Path.fromString(headerPath) / "integrityCheckMetaData.csv"), parse(headerPath + "/integrityCheckDefaultIncludeFolderSchema.csvs",validator), None) must beLike { - case Success(_) => ok - } + val validator = buildValidator(substitutionPaths) + validator.validate(TextFile(Path.fromString(headerPath) / "integrityCheckMetaData.csv"), parse(headerPath + "/integrityCheckDefaultIncludeFolderSchema.csvs",validator), None).isSuccess mustEqual true } @@ -62,38 +55,37 @@ class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResourc "fail for metadatafile missing files - header" in { val substitutionPaths = List(("file:///T:/WORK/RF_5/",headerPath)) - val validator = buildValidator(substitutionPaths, Some("filename")) + // val validator = buildValidator(substitutionPaths, Some("filename")) + val validator = buildValidator(substitutionPaths) val result = validator.validate(TextFile(Path.fromString(headerPath) / "integrityCheckMetaData-missing-files.csv"), parse(headerPath + "/integrityCheckSchema.csvs",validator), None) result.isFailure mustEqual true val Failure(message) = result //TODO perform test on nonEmptyList instead of using to string - message.toString must contain("[Integrity Check]") - message.toString must contain("file2 are not listed in ") + message.toString must contain("integrityCheck fails for") + // message.toString must contain("file2 are not listed in ") } "fail for metadatafile missing files - header" in { val substitutionPaths = List(("file:///T:/WORK/RF_5/",headerPath)) - val validator = buildValidator(substitutionPaths, Some("filename")) + val validator = buildValidator(substitutionPaths) val result = validator.validate(TextFile(Path.fromString(headerPath) / "integrityCheckMetaData-missing-files.csv"), parse(headerPath + "/integrityCheckSchema.csvs",validator), None) result.isFailure mustEqual true val Failure(message) = result //TODO perform test on nonEmptyList instead of using to string - message.toString must contain("[Integrity Check]") - message.toString must contain("file2 are not listed in ") + message.toString must contain("integrityCheck fails for") + // message.toString must contain("file2 are not listed in ") } "succeed with good values - no header" in { val substitutionPaths = List(("file:///T:/WORK/RF_5/",noHeaderPath)) - val validator = buildValidator(substitutionPaths, Some("identifier")) + val validator = buildValidator(substitutionPaths) val schema: Schema = parse(noHeaderPath + "/integrityCheckSchema.csvs", validator) - 
validator.validate(TextFile(Path.fromString(noHeaderPath) / "integrityCheckMetaData.csv"), schema, None) must beLike { - case Success(_) => ok - } + validator.validate(TextFile(Path.fromString(noHeaderPath) / "integrityCheckMetaData.csv"), schema, None).isSuccess mustEqual true } @@ -101,23 +93,21 @@ class MetaDataValidatorIntegrityCheckSpec extends Specification with TestResourc "fail with wrong includeFolder directive - no header" in { val substitutionPaths = List(("file:///T:/WORK/RF_5/",noHeaderPath)) - val validator = buildValidator(substitutionPaths, Some("filename")) + val validator = buildValidator(substitutionPaths) val result = validator.validate(TextFile(Path.fromString(noHeaderPath) / "integrityCheckMetaData.csv"), parse(noHeaderPath + "/badIntegrityCheckSchema.csvs",validator), None) result.isFailure mustEqual true val Failure(message) = result - println(message) + // println(message) ok }.pendingUntilFixed() "Validate WO 95" in { val substitutionPaths = List(("file:///WO_95",WO95Path)) - val validator = buildValidator(substitutionPaths, Some("file_path"), false) - validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000.csvs",validator), None) must beLike { - case Success(_) => ok - } + val validator = buildValidator(substitutionPaths) + validator.validate(TextFile(Path.fromString(WO95Path) / "tech_acq_metadata_v1_WO95Y14B003.csv"), parse(WO95Path + "/tech_acq_metadata_v1_WO95Y14B000.csvs",validator), None).isSuccess mustEqual true } } -} +} \ No newline at end of file diff --git a/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/schema/IntegrityCheckRuleSpec.scala b/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/schema/IntegrityCheckRuleSpec.scala new file mode 100644 index 00000000..7f8296c6 --- /dev/null +++ b/csv-validator-core/src/test/scala/uk/gov/nationalarchives/csv/validator/schema/IntegrityCheckRuleSpec.scala @@ -0,0 +1,106 @@ +/** + * Copyright (c) 2013, The National Archives + * http://www.nationalarchives.gov.uk + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
+ */ +package uk.gov.nationalarchives.csv.validator.schema + +import org.specs2.mutable.Specification +import uk.gov.nationalarchives.csv.validator._ +import uk.gov.nationalarchives.csv.validator.api.CsvValidator.SubstitutePath +import uk.gov.nationalarchives.csv.validator.metadata.{Cell, Row} + +import scalaz.{Failure, Success} + +class IntegrityCheckRuleSpec extends Specification with TestResources { + +// val relMustExistForRulePath = relResourcePath("mustExistForRule.csvs") + val relIntegrityCheckForRulePath = relResourcePath("integrityCheck" + FILE_SEPARATOR + "folder1" + FILE_SEPARATOR + "content" + FILE_SEPARATOR + "file1.txt") + val relIntegrityCheckForRulePath2 = relResourcePath("integrityCheck" + FILE_SEPARATOR + "folder1" + FILE_SEPARATOR + "content" + FILE_SEPARATOR + "file#2.txt") + +// val relPath = relativePath(relMustExistForRulePath) + + val relMustExistForHashRulePath = relResourcePath("mustExistFor#Rule.csvs") + val hashSegments = relMustExistForHashRulePath.split(FILE_SEPARATOR) + val hashRelPath = (hashSegments.slice(0, hashSegments.length - 3).reduceLeft(_ + FILE_SEPARATOR + _), hashSegments.slice(hashSegments.length - 3, hashSegments.length).reduceLeft(_ + FILE_SEPARATOR + _)) + + + val relPath2 = relativePath(relIntegrityCheckForRulePath) + val relPath3 = relativePath(relIntegrityCheckForRulePath2) + + def relativePath(path: String):(String,String) = { + val segments = path.split(FILE_SEPARATOR) + (segments.slice(0, segments.length - 3).reduceLeft(_ + FILE_SEPARATOR + _), segments.slice(segments.length - 3, segments.length).reduceLeft(_ + FILE_SEPARATOR + _)) + } + val emptyPathSubstitutions = List[SubstitutePath]() + + + + "IntegrityCheckRule" should { + + val globalDirsOne = List(TotalColumns(1)) + val globalDirsTwo = List(TotalColumns(2)) + + "fail for extra file" in { + val integrityCheckRule = IntegrityCheckRule(emptyPathSubstitutions,false) + + val schema: Schema = Schema(globalDirsOne, List(ColumnDefinition(NamedColumnIdentifier("column1")))) + val totalRows: Some[Boolean] = Some(false) + + integrityCheckRule.evaluate(0, Row(List(Cell(relIntegrityCheckForRulePath)), 1), schema, totalRows) must beLike { + case Failure(messages) => messages.head mustEqual "integrityCheck fails for line: 1, column: column1, files: \"csv-validator-core/target/test-classes/uk/gov/nationalarchives/csv/validator/schema/integrityCheck/folder1/content/file#2.txt\" are not listed in the metadata" + } + } + + "fail for empty file path" in { + val integrityCheckRule = IntegrityCheckRule(emptyPathSubstitutions,false) + val schema: Schema = Schema(globalDirsTwo, List(ColumnDefinition(NamedColumnIdentifier("column1")), ColumnDefinition(NamedColumnIdentifier("column2")))) + + + integrityCheckRule.evaluate(1, Row(List(Cell("abc"), Cell(relIntegrityCheckForRulePath)), 1), schema, Some(true)) mustEqual Success(true) + + integrityCheckRule.evaluate(1, Row(List(Cell("abc"), Cell("")), 2), schema, Some(false)) must beLike { + case Failure(messages) => messages.head mustEqual "integrityCheck fails for line: 2, column: column2, files: \"csv-validator-core/target/test-classes/uk/gov/nationalarchives/csv/validator/schema/integrityCheck/folder1/content/file#2.txt\" are not listed in the metadata" + } + + } + + "succeed for file that exists with no root file path" in { + + val integrityCheckRule = IntegrityCheckRule(emptyPathSubstitutions,false) + + val schema: Schema = Schema(globalDirsOne, List(ColumnDefinition(NamedColumnIdentifier("column1")))) + + integrityCheckRule.evaluate(0, 
Row(List(Cell(relIntegrityCheckForRulePath)), 1), schema, Some(true)) mustEqual Success(true) + integrityCheckRule.evaluate(0, Row(List(Cell(relIntegrityCheckForRulePath2)), 2), schema, Some(false)) mustEqual Success(true) + + } + + "succeed for file that exists with root file path" in { + val integrityCheckRule = IntegrityCheckRule(emptyPathSubstitutions, false, Literal(Some(relPath2._1 + FILE_SEPARATOR))) + + val schema: Schema = Schema(globalDirsOne, List(ColumnDefinition(NamedColumnIdentifier("column1")))) + + + integrityCheckRule.evaluate(0, Row(List(Cell(relPath2._2)), 1), schema, Some(true)) must be_==(Success(true)) + integrityCheckRule.evaluate(0, Row(List(Cell(relPath3._2)), 2), schema, Some(false)) must be_==(Success(true)) + + } + + "succeed for root file path without final file separator and file without initial file separator" in { + + val integrityCheckRule = IntegrityCheckRule(emptyPathSubstitutions, false, Literal(Some(relPath2._1))) + + val schema: Schema = Schema(globalDirsOne, List(ColumnDefinition(NamedColumnIdentifier("column1")))) + + integrityCheckRule.evaluate(0, Row(List(Cell(relPath2._2)), 1), schema, Some(true)) must be_==(Success(true)) + integrityCheckRule.evaluate(0, Row(List(Cell(relPath3._2)), 2), schema, Some(false)) must be_==(Success(true)) + + } + + } + +}
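
Note on the grammar change above: the patch removes the @integrityCheck global directive (and its integrityCheckValid schema check) and instead parses integrityCheck as an in-line column rule via the new integrityCheckExpr combinator — an optional StringProvider root argument followed by an "includeFolder"/"excludeFolder" string literal. The snippet below is only a minimal, self-contained sketch of that parsing shape using scala-parser-combinators; it is not the project's parser. MiniIntegrityCheckParser, IntegrityCheckArgs and the use of a plain string literal in place of StringProvider are simplifications assumed for the example.

    import scala.util.parsing.combinator.RegexParsers

    object MiniIntegrityCheckParser extends RegexParsers {
      // Parsed arguments: optional root path plus the include/exclude flag.
      case class IntegrityCheckArgs(rootPath: Option[String], includeFolder: Boolean)

      // "..." string literal: any characters except a double quote, between quotes.
      private val stringLiteral: Parser[String] = "\"" ~> """[^"]*""".r <~ "\""

      // integrityCheck( (<root literal> ",")? "includeFolder" | "excludeFolder" )
      val integrityCheckExpr: Parser[IntegrityCheckArgs] =
        "integrityCheck(" ~> opt(stringLiteral <~ ",") ~ stringLiteral <~ ")" ^? ({
          case root ~ "includeFolder" => IntegrityCheckArgs(root, includeFolder = true)
          case root ~ "excludeFolder" => IntegrityCheckArgs(root, includeFolder = false)
        }, { case _ ~ other => s""""$other" should be either includeFolder or excludeFolder""" })

      def main(args: Array[String]): Unit = {
        println(parseAll(integrityCheckExpr, """integrityCheck("excludeFolder")"""))
        println(parseAll(integrityCheckExpr, """integrityCheck("file:///WO_95/","includeFolder")"""))
        println(parseAll(integrityCheckExpr, """integrityCheck("somethingElse")""")) // reports the error message
      }
    }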
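
Note on the evaluation pattern exercised by IntegrityCheckRuleSpec: the tests call Rule.evaluate with an extra Option[Boolean] argument (Some(true) while further rows remain, Some(false) on the last row), which appears to be how the integrity check knows when to reconcile the file paths it has accumulated against the directory contents and raise the "... are not listed in the metadata" failure. The sketch below is a hedged, self-contained illustration of that accumulate-then-check-on-last-row pattern, not the project's IntegrityCheckRule; MiniIntegrityCheck, its error text and the simple listFiles comparison are assumptions for illustration only.

    import java.io.File
    import scala.collection.mutable

    // Toy stand-in for the real Rule/Schema machinery; names are illustrative only.
    final class MiniIntegrityCheck(rootFolder: File) {
      private val listedInMetadata = mutable.Set.empty[String]

      // Accumulate the file named on this row; when moreRows is Some(false) (last row),
      // check that every file actually present under rootFolder was listed.
      def evaluate(filePath: String, moreRows: Option[Boolean]): Either[String, Unit] = {
        listedInMetadata += new File(filePath).getName
        if (moreRows.contains(false)) {
          val onDisk = Option(rootFolder.listFiles()).getOrElse(Array.empty[File]).map(_.getName).toSet
          val unlisted = onDisk -- listedInMetadata
          if (unlisted.isEmpty) Right(())
          else Left(s"""integrityCheck fails, files: ${unlisted.mkString(", ")} are not listed in the metadata""")
        } else Right(())
      }
    }

    object MiniIntegrityCheckDemo {
      def main(args: Array[String]): Unit = {
        // Hypothetical folder and metadata rows, loosely mirroring folder1/content in the tests.
        val rule = new MiniIntegrityCheck(new File("folder1/content"))
        val rows = Iterator("folder1/content/file1.txt") // file#2.txt is on disk but not listed
        while (rows.hasNext) {
          val path = rows.next()
          println(rule.evaluate(path, Some(rows.hasNext))) // Some(false) once no rows remain
        }
      }
    }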