Commit

add logic for dealing with weird paths in ncbi assemblies
luizirber committed Jul 21, 2020
1 parent 6aba794 commit 94e6b09
Showing 1 changed file with 49 additions and 5 deletions.
54 changes: 49 additions & 5 deletions machine/wort-web/add_dataset_info_genomes.py
@@ -13,6 +13,28 @@ def build_link(accession, asm_name):
    return f"{url}/{accession}_{asm_name}"


def crawl_link(accession, session):
    db, acc = accession.split("_")
    number, version = acc.split(".")
    number = "/".join([number[pos:pos + 3] for pos in range(0, len(number), 3)])
    url = f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{number}"

    asm_name = None
    req = session.get(url)
    if req.status_code == 200:
        # try to read the right accession name
        for line in req.text.splitlines():
            if line.startswith(f'<a href="{accession}_'):
                # href value is the directory name, minus the trailing slash
                asm_name = line.split('"')[1][:-1]
                break
    # TODO: check for 5xx

    if asm_name is None:
        raise ValueError(f"Couldn't find link for {accession}")

    return f"{url}/{asm_name}/{asm_name}_genomic.fna.gz"
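
# Illustrative note (not in the original file): for an accession like
# "GCF_000005845.2", db is "GCF" and the number "000005845" maps to the
# directory chain "000/005/845", so the listing fetched above is
# https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845 and the returned
# URL points at <accession>_<asm_name>/<accession>_<asm_name>_genomic.fna.gz
# inside it.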


n = 0
total = 0
with requests.Session() as s:
@@ -26,23 +48,45 @@ def build_link(accession, asm_name):

        # if dataset_in_db doesn't exist, create a new one
        if dataset_in_db is None:
            # Build the signature name
            name_parts = [row["assembly_accession"], " ", row['organism_name']]
            if row['infraspecific_name']:
                name_parts += [" ", row['infraspecific_name']]
            name_parts += [', ', row['asm_name']]
            name = "".join(name_parts)[:128]
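            # e.g. (illustrative): "<assembly_accession> <organism_name> <infraspecific_name>, <asm_name>",
            # truncated to 128 characters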

            # Let's find the right download path
            if row['ftp_path'] == "na":
                # check if 'gbrs_paired_asm' is available and
                # 'paired_asm_comp' is 'identical'
                if row['paired_asm_comp'] == 'identical':
                    row['ftp_path'] = build_link(row['gbrs_paired_asm'], row['asm_name'])
                else:  # need to rebuild path from this accession...
                    row['ftp_path'] = build_link(row['assembly_accession'], row['asm_name'])

            http_path = 'https' + row['ftp_path'][3:]
            filename = http_path.split('/')[-1]
            path = f"{http_path}/{filename}_genomic.fna.gz"
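            # e.g. (illustrative): an ftp://ftp.ncbi.nlm.nih.gov/... summary path becomes
            # https://ftp.ncbi.nlm.nih.gov/.../<accession>_<asm_name>/<accession>_<asm_name>_genomic.fna.gz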

            # Let's check if the path exists
            check_r = s.head(path)
            if check_r.status_code == 404:
                # Error with this path, let's try to crawl instead
                try:
                    path = crawl_link(row['assembly_accession'], s)
                except ValueError:
                    # Can't find this data, continue...
                    continue
            elif check_r.status_code >= 500:
                # Server error, try again
                time.sleep(2)
                check_r = s.head(path)
                if check_r.status_code >= 500:
                    # ¯\_(ツ)_/¯ let's retry it some other time...
                    continue

            # Assembly summary doesn't include size of dataset
            # Let's use a cheap head request to find the size
            size_r = s.head(path)
            if size_r.status_code == 404:
                print(f"Error 404 on {path}")
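
Below is a minimal, self-contained sketch (not part of the commit) of the fallback flow this diff adds: build the https path from the assembly summary's ftp_path, HEAD it, fall back to crawling the genomes/all listing on a 404, and retry once after a short sleep on a 5xx. The crawl helper mirrors crawl_link above, and the accession and ftp_path values at the bottom are illustrative.

import time

import requests


def crawl_for_genomic_fna(accession, session):
    # Same idea as crawl_link above: derive the genomes/all listing URL from
    # the accession and scrape it for the "<accession>_<asm_name>/" entry.
    db, acc = accession.split("_")
    number, _version = acc.split(".")
    chunks = "/".join(number[pos:pos + 3] for pos in range(0, len(number), 3))
    url = f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{chunks}"

    req = session.get(url)
    if req.status_code == 200:
        for line in req.text.splitlines():
            if line.startswith(f'<a href="{accession}_'):
                dirname = line.split('"')[1].rstrip("/")
                return f"{url}/{dirname}/{dirname}_genomic.fna.gz"
    raise ValueError(f"Couldn't find link for {accession}")


def resolve_path(ftp_path, accession, session):
    http_path = "https" + ftp_path[3:]  # ftp://... -> https://...
    filename = http_path.split("/")[-1]
    path = f"{http_path}/{filename}_genomic.fna.gz"

    check = session.head(path)
    if check.status_code == 404:
        # Path from the summary is wrong, crawl instead (may raise ValueError)
        path = crawl_for_genomic_fna(accession, session)
    elif check.status_code >= 500:
        # Server error: wait a bit and retry once
        time.sleep(2)
        if session.head(path).status_code >= 500:
            raise RuntimeError(f"Server error for {path}, try again later")
    return path


if __name__ == "__main__":
    with requests.Session() as s:
        # illustrative accession/ftp_path; any GCA/GCF assembly follows the same layout
        print(resolve_path(
            "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2",
            "GCF_000005845.2",
            s,
        ))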
