From 367c6fb54b14e28fa41a1569c00a5f316d2c2507 Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Thu, 23 Mar 2023 18:06:49 +0100 Subject: [PATCH] feat: remove non-GISAID strainnames in deduplicate scripts Thanks @AngieHinrichs for the suggestion in https://github.com/cov-lineages/pango-designation/commit/10404d2de710b85e89468d7080fd55e95c685319#commitcomment-105766629 --- deduplicate_keeping_first.py | 7 +++++++ deduplicate_keeping_last.py | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/deduplicate_keeping_first.py b/deduplicate_keeping_first.py index 8ab60d74..847b696e 100644 --- a/deduplicate_keeping_first.py +++ b/deduplicate_keeping_first.py @@ -27,6 +27,13 @@ d.remove(current) print(f"Removed duplicate strain: {line}") continue + # Exclude strain names that are not GISAIDy + # strain names must have at least 2 slashes + # This gets rid of things like `strain` or `OX0123456` + if split[0].count('/') < 2 and line != 'taxon,lineage\n': + d.remove(current) + print(f"Removed non-GISAID strain: {line}") + continue hashset.add(split[0]) diff --git a/deduplicate_keeping_last.py b/deduplicate_keeping_last.py index 39398908..40769710 100644 --- a/deduplicate_keeping_last.py +++ b/deduplicate_keeping_last.py @@ -27,6 +27,13 @@ d.remove(current) print(f"Removed duplicate strain: {line}") continue + # Exclude strain names that are not GISAIDy + # strain names must have at least 2 slashes + # This gets rid of things like `strain` or `OX0123456` + if split[0].count('/') < 2 and line != 'taxon,lineage\n': + d.remove(current) + print(f"Removed non-GISAID strain: {line}") + continue hashset.add(split[0])