From 4015925ec25b8405341598a0b3f6f93f6ddbcb0c Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 16 Mar 2026 01:22:57 +0000 Subject: [PATCH 1/6] More involved ApParser --- .../lib/cleanup/SupplierProcessors.scala | 223 +++++++++++++++++- 1 file changed, 210 insertions(+), 13 deletions(-) diff --git a/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala b/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala index a62276b161..6d132ce4cf 100644 --- a/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala +++ b/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala @@ -1,5 +1,7 @@ package com.gu.mediaservice.lib.cleanup +import java.text.Normalizer + import com.gu.mediaservice.lib.config.UsageRightsConfigProvider import com.gu.mediaservice.lib.metadata.UsageRightsMetadataMapper import com.gu.mediaservice.model._ @@ -264,8 +266,19 @@ trait CanonicalisingImageProcessor extends ImageProcessor { object ApParser extends ImageProcessor { - val InvisionFor = "^invision for (.+)".r - val PersonInvisionAp = "(.+)\\s*/invision/ap$".r + val TrailingSlashAp = "(?i)^(.+)/ap$".r + val TrailingViaAp = "(?i)^(.+)\\s+via ap$".r + val ApImagesCredit = "(?i)^ap images.*".r + val BareInvision = "(?i)^invision(\\s+for\\s+.+)?$".r + + /** Normalise a string for fuzzy matching: strip diacritics, lowercase, collapse whitespace/dots */ + def normalise(s: String): String = + Normalizer.normalize(s, Normalizer.Form.NFD) + .replaceAll("\\p{M}", "") + .toLowerCase + .replaceAll("[.]+", " ") + .replaceAll("\\s+", " ") + .trim def getSuppliersReference(image: Image) = { image.fileMetadata.readXmpHeadStringProp("plus:ImageSupplierImageID").orElse(image.metadata.suppliersReference) @@ -274,18 +287,202 @@ object ApParser extends ImageProcessor { // But that field is json so let's not. } - def apply(image: Image): Image = image.metadata.credit.map(_.toLowerCase) match { - case Some("ap") | Some("associated press") => image.copy( - usageRights = Agency("AP"), - metadata = image.metadata.copy(credit = Some("AP"), suppliersReference = getSuppliersReference(image)) - ) - case Some("invision") | Some("invision/ap") | - Some(InvisionFor(_)) | Some(PersonInvisionAp(_)) => image.copy( - usageRights = Agency("AP", Some("Invision")), - metadata = image.metadata.copy(suppliersReference = getSuppliersReference(image)) - ) - case _ => image + // Source values that should never become intermediary in Credit + val sourceIgnoreList: Set[String] = Set( + "ap", "associated press", "ap files", "aptn", "wire", + "mlbpv ap", "file", "files", "str", + "print", "digital camera", "ho", "agoev" + ).map(normalise) + + // FR-pattern sources (e.g. "FR159526 AP", "FR172078") → treat as plain AP (no intermediary) + val FrSource = "(?i)^FR\\d{1,7}(\\s+AP)?$".r + + // Source rename map: maps raw Source values to the desired display name for Credit. + // Keys are normalised at creation time for case-insensitive lookup. + val sourceRenameMap: Map[String, String] = Map( + "CP" -> "The Canadian Press", + "DPA" -> "dpa", + "KEYSTONE" -> "Keystone", + "Pool Sputnik Kremlin" -> "Sputnik/Kremlin", + "Pool Sputnik Government" -> "Sputnik/Kremlin", + "Pool Presidential Press Service" -> "Presidential Press Service", + "KCNA via KNS" -> "KCNA/KNS", + "CHINATOPIX" -> "Chinatopix", + "AAPIMAGE" -> "AAP", + "AAP Image" -> "AAP", + "AAPImage" -> "AAP", + "YONHAP" -> "Yonhap", + "A24 Films" -> "A24", + "Twentieth Century Fox" -> "20th Century Fox", + "XINHUA" -> "Xinhua", + "KYODO NEWS" -> "Kyodo News", + "PRESSENS BILD" -> "Pressens Bild", + "TT NEWS AGENCY" -> "TT News Agency", + "COLOR CHINA PHOTO" -> "Color China Photo", + "U.S. Central Command" -> "US Central Command", + "U.S. Navy" -> "US Navy", + "U.S. Army" -> "US Army", + "U.S. Air Force" -> "US Air Force", + "U.S. Geological Survey" -> "US Geological Survey", + "U.S. Coast Guard" -> "US Coast Guard", + "U.S. Fish and Wildlife Service" -> "US Fish and Wildlife Service", + "U.S. Marine Corps" -> "US Marine Corps", + "NASA" -> "Nasa", + "NASA TV" -> "Nasa TV", + "THE DALLAS MORNING NEWS" -> "The Dallas Morning News", + "BERLINALE" -> "Berlinale", + "COLUMBIA PICTURES" -> "Columbia Pictures", + "Disney Plus" -> "Disney+", + "FOTOPRESS" -> "Fotopress", + "Getty" -> "Getty Images", + "Olympic Information Services OIS" -> "OIS/IOC", + "SHIYO" -> "Yomiuri Shimbun" + ).map { case (k, v) => normalise(k) -> v } + + /** Strip "Pool" from any position in a source string, cleaning up leftover delimiters */ + private def stripPool(source: String): String = + source.replaceAll("(?i)\\bpool\\b", "").replaceAll("^[\\s/]+|[\\s/]+$", "").trim + + /** Determine intermediary name from Source field */ + def getIntermediary(source: Option[String]): Option[String] = source.flatMap { src => + val srcTrimmed = src.trim + val srcNorm = normalise(srcTrimmed) + + if (sourceIgnoreList.contains(srcNorm)) None + else if (FrSource.findFirstMatchIn(srcTrimmed).isDefined) None + else sourceRenameMap.get(srcNorm) match { + // Full source found in rename map (handles Sputnik, Presidential Press Service, etc.) + case Some(renamed) => Some(renamed) + case None if srcNorm.contains("pool") => + // Strip "Pool" from source, use remaining agency as intermediary + // e.g. "Pool EPA" → "EPA", "AFP Pool" → "AFP", "POOL AP" → "AP" → ignored + val rest = stripPool(srcTrimmed) + val restNorm = normalise(rest) + if (restNorm.isEmpty || sourceIgnoreList.contains(restNorm)) None + else sourceRenameMap.get(restNorm).orElse(Some(rest)) + case None => + // Pass through original casing + Some(srcTrimmed) + } + } + + // Description patterns for AP credit trailers + val ApPhotoPattern = """(?s)(.*?)\s*\(AP Photo/([^)]+)\)(.*)""".r + val PhotoByViaApPattern = """(?s)(.*?)\s*\(Photo by ([^)]+?)\s+via AP([^)]*)\)(.*)""".r + val ViaApPattern = """(?s)(.*?)\s*\(([^)]+?)\s+via AP([^)]*)\)(.*)""".r + val PhotoByPattern = """(?s)(.*?)\s*\(Photo by ([^)]+)\)(.*)""".r + + /** Extract (before, tokens, after) from a description if it contains a recognised AP credit trailer */ + private def extractTrailer(description: String): Option[(String, String, String)] = + description match { + case ApPhotoPattern(before, tokens, after) => Some((before, tokens, after)) + case PhotoByViaApPattern(before, tokens, _, after) => Some((before, tokens, after)) + case ViaApPattern(before, tokens, _, after) => Some((before, tokens, after)) + case PhotoByPattern(before, tokens, after) => Some((before, tokens, after)) + case _ => None + } + + /** Clean AP description credit trailer after verifying tokens appear in byline/credit */ + def cleanDescription(image: Image, description: String): String = + extractTrailer(description) match { + case Some((before, tokens, after)) if descriptionTokensAccountedFor(image, tokens) => + (before.trim + " " + after.trim).trim + case _ => description + } + + /** Check if all meaningful tokens from a description credit trailer are accounted for in byline/credit fields */ + def descriptionTokensAccountedFor(image: Image, descTokens: String): Boolean = { + val bylineNorm = image.metadata.byline.map(normalise).getOrElse("") + val creditNorm = image.metadata.credit.map(normalise).getOrElse("") + val sourceNorm = image.metadata.source.map(normalise).getOrElse("") + val intermediaryNorm = getIntermediary(image.metadata.source).map(normalise).getOrElse("") + + // Split by / and , then strip noise words from within each token + val noiseWords = Set("file", "pool", "ap", "photo") + val meaningfulTokens = descTokens.split("[/,]").map(_.trim).filter(_.nonEmpty).map { t => + normalise(t).split("\\s+").filterNot(noiseWords.contains).mkString(" ").trim + }.filter(_.nonEmpty) + + if (meaningfulTokens.isEmpty) true + else { + meaningfulTokens.forall { tokenNorm => + bylineNorm.contains(tokenNorm) || + creditNorm.contains(tokenNorm) || + sourceNorm.contains(tokenNorm) || + intermediaryNorm.contains(tokenNorm) || + // Check if the token is a known alias (via sourceRenameMap) for the intermediary + // e.g. description says "AAP Image" but intermediary (from Source "AAP") is "AAP" + sourceRenameMap.get(tokenNorm).exists(renamed => normalise(renamed) == intermediaryNorm) + } + } + } + + def isApCredit(credit: String): Boolean = { + val lc = credit.toLowerCase.trim + lc == "ap" || lc == "associated press" + } + + def isBareInvisionCredit(credit: String): Boolean = + BareInvision.findFirstMatchIn(credit.trim).isDefined + + def isTrailingApCredit(credit: String): Boolean = + TrailingSlashAp.findFirstMatchIn(credit.trim).isDefined + + def isViaApCredit(credit: String): Boolean = + TrailingViaAp.findFirstMatchIn(credit.trim).isDefined + + def isApImagesCredit(credit: String): Boolean = + ApImagesCredit.findFirstMatchIn(credit.trim).isDefined + + def apply(image: Image): Image = { + val credit = image.metadata.credit.getOrElse("") + + if (isApCredit(credit) || isTrailingApCredit(credit) || isViaApCredit(credit) || isBareInvisionCredit(credit)) { + // Core AP image, intermediary/AP, intermediary via AP, or bare Invision + // Primary: derive intermediary from Source field + // Fallback: extract from credit pattern (e.g. "NurPhoto/AP" → "NurPhoto", "Invision" → "Invision") + val sourceIntermediary = getIntermediary(image.metadata.source).filterNot { i => + // Don't use Source as intermediary if it's just the photographer's byline + image.metadata.byline.exists(b => normalise(b) == normalise(i)) + } + val creditIntermediary = credit.trim match { + case TrailingSlashAp(before) => Some(before.trim) + case TrailingViaAp(before) => Some(before.trim) + case c if isBareInvisionCredit(c) => Some(c) + case _ => None + } + val intermediary = sourceIntermediary.orElse(creditIntermediary) + val newCredit = intermediary match { + case Some(i) => s"$i/AP" + case None => "AP" + } + + // Clean description + val newDescription = image.metadata.description.map(desc => cleanDescription(image, desc)) + + image.copy( + usageRights = Agency("AP", intermediary), + metadata = image.metadata.copy( + credit = Some(newCredit), + description = newDescription.orElse(image.metadata.description), + suppliersReference = getSuppliersReference(image) + ) + ) + } else if (isApImagesCredit(credit)) { + // AP Images — keep original credit (it's descriptive, e.g. "AP Images for Delta Air Lines") + val newDescription = image.metadata.description.map(desc => cleanDescription(image, desc)) + image.copy( + usageRights = Agency("AP", Some("AP Images")), + metadata = image.metadata.copy( + description = newDescription, + suppliersReference = getSuppliersReference(image) + ) + ) + } else { + image + } } + } object CorbisParser extends ImageProcessor { From 741f44a72e9475a5cd31bfa8103aa49c0dbff0c1 Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 16 Mar 2026 01:23:14 +0000 Subject: [PATCH 2/6] Tests for new ApParser --- .../lib/cleanup/SupplierProcessorsTest.scala | 207 +++++++++++++++++- 1 file changed, 197 insertions(+), 10 deletions(-) diff --git a/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala b/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala index 8a6da2cc22..4a0d72c20a 100644 --- a/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala +++ b/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala @@ -259,6 +259,7 @@ class SupplierProcessorsTest extends AnyFunSpec with Matchers with MetadataHelpe } describe("AP") { + // === Detection: existing credit matches === it("should match AP credit") { val image = createImageFromMetadata("credit" -> "AP") val processedImage = applyProcessors(image) @@ -273,33 +274,219 @@ class SupplierProcessorsTest extends AnyFunSpec with Matchers with MetadataHelpe processedImage.metadata.credit should be(Some("AP")) } - it("should match ASSOCIATED PRESS credit") { - val image = createImageFromMetadata("credit" -> "ASSOCIATED PRESS") - val processedImage = applyProcessors(image) - processedImage.usageRights should be(Agency("AP")) - processedImage.metadata.credit should be(Some("AP")) - } it("should match Invision credit") { val image = createImageFromMetadata("credit" -> "Invision") val processedImage = applyProcessors(image) processedImage.usageRights should be(Agency("AP", Some("Invision"))) - processedImage.metadata.credit should be(Some("Invision")) + processedImage.metadata.credit should be(Some("Invision/AP")) } it("should match Invision for ___ credit") { val image = createImageFromMetadata("credit" -> "Invision for Quaker") val processedImage = applyProcessors(image) - processedImage.usageRights should be(Agency("AP", Some("Invision"))) - processedImage.metadata.credit should be(Some("Invision for Quaker")) + processedImage.usageRights should be(Agency("AP", Some("Invision for Quaker"))) + processedImage.metadata.credit should be(Some("Invision for Quaker/AP")) } it("should match __/Invision/AP credit") { val image = createImageFromMetadata("credit" -> "Andy Kropa /Invision/AP") val processedImage = applyProcessors(image) - processedImage.usageRights should be(Agency("AP", Some("Invision"))) + processedImage.usageRights should be(Agency("AP", Some("Andy Kropa /Invision"))) processedImage.metadata.credit should be(Some("Andy Kropa /Invision/AP")) } + + // === Detection: NEW broadened credit matches === + it("should match credit ending with /AP (e.g. NurPhoto/AP)") { + val image = createImageFromMetadata("credit" -> "NurPhoto/AP") + val processedImage = applyProcessors(image) + processedImage.usageRights should be(Agency("AP", Some("NurPhoto"))) + processedImage.metadata.credit should be(Some("NurPhoto/AP")) + } + + + it("should match credit 'via AP' (e.g. Sputnik via AP)") { + val image = createImageFromMetadata("credit" -> "Sputnik via AP") + val processedImage = applyProcessors(image) + processedImage.usageRights should be(Agency("AP", Some("Sputnik"))) + processedImage.metadata.credit should be(Some("Sputnik/AP")) + } + + it("should match AP Images credit and keep original credit") { + val image = createImageFromMetadata("credit" -> "AP Images for Delta Air Lines") + val processedImage = applyProcessors(image) + processedImage.usageRights should be(Agency("AP", Some("AP Images"))) + processedImage.metadata.credit should be(Some("AP Images for Delta Air Lines")) + } + + // === Source-based intermediary in Credit === + it("should set intermediary from Source field and clean description") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "DPA", + "byline" -> "Kay Nietfeld", + "description" -> "Some event. (Kay Nietfeld/dpa via AP)") + val processedImage = applyProcessors(image) + processedImage.usageRights should be(Agency("AP", Some("dpa"))) + processedImage.metadata.credit should be(Some("dpa/AP")) + processedImage.metadata.description should be(Some("Some event.")) + } + + it("should rename Source 'CP' to 'The Canadian Press'") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "CP") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("The Canadian Press/AP")) + } + + // === Source ignore list === + it("should NOT set intermediary for ignored Source 'Wire'") { + val image = createImageFromMetadata("credit" -> "Associated Press", "source" -> "Wire") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("AP")) + } + + // === FR-pattern sources === + it("should NOT set intermediary for FR-pattern Source (e.g. FR159526 AP)") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "FR159526 AP") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("AP")) + } + + // === Pool handling === + it("should NOT set intermediary for Pool AP and should clean description") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "Pool AP", + "byline" -> "Hiro Komae", + "description" -> "PM speaks. (AP Photo/Hiro Komae, Pool)") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("AP")) + processedImage.metadata.description should be(Some("PM speaks.")) + } + + it("should set intermediary for Pool AFP source (strip Pool, keep AFP)") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "Pool AFP") + val processedImage = applyProcessors(image) + processedImage.usageRights should be(Agency("AP", Some("AFP"))) + processedImage.metadata.credit should be(Some("AFP/AP")) + } + + it("should NOT set intermediary for POOL alone") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "POOL") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("AP")) + } + + it("should handle Pool/WPA source without leading slash in credit") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "Pool/WPA") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("WPA/AP")) + } + + it("should rename Pool Getty to Getty Images/AP") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "Pool Getty") + val processedImage = applyProcessors(image) + processedImage.usageRights should be(Agency("AP", Some("Getty Images"))) + processedImage.metadata.credit should be(Some("Getty Images/AP")) + } + + // === Sputnik special case === + it("should handle Pool Sputnik Kremlin → Sputnik/Kremlin via rename map") { + val image = createImageFromMetadata("credit" -> "AP", "source" -> "Pool Sputnik Kremlin", + "byline" -> "Alexei Druzhinin", + "description" -> "Putin in Siberia. (Alexei Druzhinin, Sputnik, Kremlin Pool Photo via AP)") + val processedImage = applyProcessors(image) + processedImage.usageRights should be(Agency("AP", Some("Sputnik/Kremlin"))) + processedImage.metadata.credit should be(Some("Sputnik/Kremlin/AP")) + processedImage.metadata.description should be(Some("Putin in Siberia.")) + } + + // === Description cleanup === + it("should clean (AP Photo/Byline) from description when byline matches") { + val image = createImageFromMetadata( + "credit" -> "AP", + "byline" -> "Matt Dunham", + "description" -> "British PM speaks at 10 Downing Street. (AP Photo/Matt Dunham)") + val processedImage = applyProcessors(image) + processedImage.metadata.description should be(Some("British PM speaks at 10 Downing Street.")) + } + + it("should clean (Photo by Byline/Invision/AP, File) from description") { + val image = createImageFromMetadata( + "credit" -> "AP", + "byline" -> "Chris Pizzello", + "source" -> "Invision", + "description" -> "FILE - Filmmaker poses. (Photo by Chris Pizzello/Invision/AP, File)") + val processedImage = applyProcessors(image) + processedImage.usageRights should be(Agency("AP", Some("Invision"))) + processedImage.metadata.description should be(Some("FILE - Filmmaker poses.")) + } + + + it("should clean (Photo by Byline/Agency via AP) from description") { + val image = createImageFromMetadata( + "credit" -> "AP", "source" -> "LaPresse", + "byline" -> "Antonio Saia", + "description" -> "A match in Rome. (Photo by Antonio Saia/LaPresse via AP)") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("LaPresse/AP")) + processedImage.metadata.description should be(Some("A match in Rome.")) + } + + + it("should clean (Byline/Agency via AP) with trailing text preserved") { + val image = createImageFromMetadata( + "credit" -> "AP", "source" -> "LaPresse", + "byline" -> "Alessandro Garofalo", + "description" -> "A soccer match. (Alessandro Garofalo/LaPresse via AP) More text here.") + val processedImage = applyProcessors(image) + processedImage.metadata.description should be(Some("A soccer match. More text here.")) + } + + it("should NOT clean description with unaccounted tokens for any pattern") { + val descriptions = Seq( + "An event. (AP Photo/Unknown Person)", + "An event. (Unknown Person via AP)", + "An event. (Photo by Unknown Person)", + "An event. (Photo by Unknown Person/SomeAgency via AP)" + ) + descriptions.foreach { description => + val image = createImageFromMetadata( + "credit" -> "AP", + "byline" -> "Someone Else", + "description" -> description) + val processedImage = applyProcessors(image) + processedImage.metadata.description should be(Some(description)) + } + } + + it("should NOT use Source as intermediary when it matches the Byline") { + val image = createImageFromMetadata( + "credit" -> "AP", "source" -> "Athena Walsh", + "byline" -> "Athena Walsh", + "description" -> "A scene in Dublin. (Athena Walsh via AP)") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("AP")) + processedImage.metadata.description should be(Some("A scene in Dublin.")) + } + + + // === Diacritic/ASCII-folding in byline matching === + it("should match bylines with diacritics when description uses ASCII (e.g. José vs Jose)") { + val image = createImageFromMetadata( + "credit" -> "AP", + "byline" -> "José Luis Magaña", + "description" -> "Protesters march. (AP Photo/Jose Luis Magana)") + val processedImage = applyProcessors(image) + processedImage.metadata.description should be(Some("Protesters march.")) + } + + it("should clean description when token is a rename-map alias for the intermediary (AAP Image → AAP)") { + val image = createImageFromMetadata( + "credit" -> "AP", "source" -> "AAP", + "byline" -> "Mick Tsikas", + "description" -> "PM speaks at Parliament House. (Mick Tsikas/AAP Image via AP)") + val processedImage = applyProcessors(image) + processedImage.metadata.credit should be(Some("AAP/AP")) + processedImage.metadata.description should be(Some("PM speaks at Parliament House.")) + } + } From dbdc11e4305ad5416f4a38d7036492524e62c045 Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 16 Mar 2026 11:03:35 +0000 Subject: [PATCH 3/6] =?UTF-8?q?Remove=20old,=20redundant=20test=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …BylineCreditReorganise and others would prevent it ever reaching us like that. Plus, outcome is incorrect. --- .../mediaservice/lib/cleanup/SupplierProcessorsTest.scala | 7 ------- 1 file changed, 7 deletions(-) diff --git a/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala b/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala index 4a0d72c20a..2f1df85bad 100644 --- a/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala +++ b/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala @@ -289,13 +289,6 @@ class SupplierProcessorsTest extends AnyFunSpec with Matchers with MetadataHelpe processedImage.metadata.credit should be(Some("Invision for Quaker/AP")) } - it("should match __/Invision/AP credit") { - val image = createImageFromMetadata("credit" -> "Andy Kropa /Invision/AP") - val processedImage = applyProcessors(image) - processedImage.usageRights should be(Agency("AP", Some("Andy Kropa /Invision"))) - processedImage.metadata.credit should be(Some("Andy Kropa /Invision/AP")) - } - // === Detection: NEW broadened credit matches === it("should match credit ending with /AP (e.g. NurPhoto/AP)") { val image = createImageFromMetadata("credit" -> "NurPhoto/AP") From 82c3fd4cc944ac0addd904c0759baeb9412cccc2 Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 16 Mar 2026 21:37:49 +0000 Subject: [PATCH 4/6] Apparently redundant as never reached... --- .../com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala b/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala index 6d132ce4cf..d35a3a91da 100644 --- a/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala +++ b/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala @@ -464,7 +464,7 @@ object ApParser extends ImageProcessor { usageRights = Agency("AP", intermediary), metadata = image.metadata.copy( credit = Some(newCredit), - description = newDescription.orElse(image.metadata.description), + description = newDescription, suppliersReference = getSuppliersReference(image) ) ) From 97eadf6e80216e5855b9b3070c2cb60a6f43bed5 Mon Sep 17 00:00:00 2001 From: Mateusz Date: Mon, 16 Mar 2026 21:44:14 +0000 Subject: [PATCH 5/6] Minor cleanup --- .../com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala | 5 +++-- .../gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala | 5 ----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala b/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala index d35a3a91da..3111a571f3 100644 --- a/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala +++ b/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala @@ -294,6 +294,9 @@ object ApParser extends ImageProcessor { "print", "digital camera", "ho", "agoev" ).map(normalise) + // Words stripped from description tokens before matching (e.g. "Kremlin Pool Photo" → "Kremlin") + val noiseWords: Set[String] = Set("file", "pool", "ap", "photo") + // FR-pattern sources (e.g. "FR159526 AP", "FR172078") → treat as plain AP (no intermediary) val FrSource = "(?i)^FR\\d{1,7}(\\s+AP)?$".r @@ -310,7 +313,6 @@ object ApParser extends ImageProcessor { "CHINATOPIX" -> "Chinatopix", "AAPIMAGE" -> "AAP", "AAP Image" -> "AAP", - "AAPImage" -> "AAP", "YONHAP" -> "Yonhap", "A24 Films" -> "A24", "Twentieth Century Fox" -> "20th Century Fox", @@ -398,7 +400,6 @@ object ApParser extends ImageProcessor { val intermediaryNorm = getIntermediary(image.metadata.source).map(normalise).getOrElse("") // Split by / and , then strip noise words from within each token - val noiseWords = Set("file", "pool", "ap", "photo") val meaningfulTokens = descTokens.split("[/,]").map(_.trim).filter(_.nonEmpty).map { t => normalise(t).split("\\s+").filterNot(noiseWords.contains).mkString(" ").trim }.filter(_.nonEmpty) diff --git a/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala b/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala index 2f1df85bad..4b7fb0654b 100644 --- a/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala +++ b/common-lib/src/test/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessorsTest.scala @@ -274,7 +274,6 @@ class SupplierProcessorsTest extends AnyFunSpec with Matchers with MetadataHelpe processedImage.metadata.credit should be(Some("AP")) } - it("should match Invision credit") { val image = createImageFromMetadata("credit" -> "Invision") val processedImage = applyProcessors(image) @@ -297,7 +296,6 @@ class SupplierProcessorsTest extends AnyFunSpec with Matchers with MetadataHelpe processedImage.metadata.credit should be(Some("NurPhoto/AP")) } - it("should match credit 'via AP' (e.g. Sputnik via AP)") { val image = createImageFromMetadata("credit" -> "Sputnik via AP") val processedImage = applyProcessors(image) @@ -411,7 +409,6 @@ class SupplierProcessorsTest extends AnyFunSpec with Matchers with MetadataHelpe processedImage.metadata.description should be(Some("FILE - Filmmaker poses.")) } - it("should clean (Photo by Byline/Agency via AP) from description") { val image = createImageFromMetadata( "credit" -> "AP", "source" -> "LaPresse", @@ -422,7 +419,6 @@ class SupplierProcessorsTest extends AnyFunSpec with Matchers with MetadataHelpe processedImage.metadata.description should be(Some("A match in Rome.")) } - it("should clean (Byline/Agency via AP) with trailing text preserved") { val image = createImageFromMetadata( "credit" -> "AP", "source" -> "LaPresse", @@ -459,7 +455,6 @@ class SupplierProcessorsTest extends AnyFunSpec with Matchers with MetadataHelpe processedImage.metadata.description should be(Some("A scene in Dublin.")) } - // === Diacritic/ASCII-folding in byline matching === it("should match bylines with diacritics when description uses ASCII (e.g. José vs Jose)") { val image = createImageFromMetadata( From 66a7de8923e4776860ca0398350d04cbcbe77dd8 Mon Sep 17 00:00:00 2001 From: Mateusz Date: Thu, 14 May 2026 23:36:35 +0100 Subject: [PATCH 6/6] Account for new shape of 1st party credit being "AP Photo" --- .../com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala b/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala index 3111a571f3..0054a7b95e 100644 --- a/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala +++ b/common-lib/src/main/scala/com/gu/mediaservice/lib/cleanup/SupplierProcessors.scala @@ -420,7 +420,7 @@ object ApParser extends ImageProcessor { def isApCredit(credit: String): Boolean = { val lc = credit.toLowerCase.trim - lc == "ap" || lc == "associated press" + lc == "ap" || lc == "associated press" || lc == "ap photo" } def isBareInvisionCredit(credit: String): Boolean =