Commit

add logic for dealing with weird paths in ncbi assemblies
luizirber committed Jul 21, 2020
1 parent 6aba794 commit 94e6b09
Showing 1 changed file with 49 additions and 5 deletions.
54 changes: 49 additions & 5 deletions machine/wort-web/add_dataset_info_genomes.py
@@ -13,6 +13,28 @@ def build_link(accession, asm_name):
    return f"{url}/{accession}_{asm_name}"


def crawl_link(accession, session):
    db, acc = accession.split("_")
    number, version = acc.split(".")
    number = "/".join([number[pos:pos + 3] for pos in range(0, len(number), 3)])
    url = f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{number}"

    asm_name = None
    req = session.get(url)
    if req.status_code == 200:
        # try to read the right accession name
        for line in req.text.splitlines():
            if line.startswith(f'<a href="{accession}_'):
                # href value is the directory name, minus the trailing slash
                asm_name = line.split('"')[1][:-1]
                break
    # TODO: check for 5xx

    if asm_name is None:
        raise ValueError(f"Couldn't find link for {accession}")

    return f"{url}/{asm_name}/{asm_name}_genomic.fna.gz"
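
# Illustrative note (not in the original file): for an accession like
# "GCF_000005845.2", db is "GCF" and the number "000005845" maps to the
# directory chain "000/005/845", so the listing fetched above is
# https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845 and the returned
# URL points at <accession>_<asm_name>/<accession>_<asm_name>_genomic.fna.gz
# inside it.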


n = 0
total = 0
with requests.Session() as s:
@@ -26,23 +48,45 @@ def build_link(accession, asm_name):

        # if dataset_in_db doesn't exist, create a new one
        if dataset_in_db is None:
            # Build the signature name
            name_parts = [row["assembly_accession"], " ", row['organism_name']]
            if row['infraspecific_name']:
                name_parts += [" ", row['infraspecific_name']]
            name_parts += [', ', row['asm_name']]
            name = "".join(name_parts)[:128]
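            # e.g. (illustrative): "<assembly_accession> <organism_name> <infraspecific_name>, <asm_name>",
            # truncated to 128 characters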

            # Let's find the right download path
            if row['ftp_path'] == "na":
                # check if 'gbrs_paired_asm' is available and
                # 'paired_asm_comp' is 'identical'
                if row['paired_asm_comp'] == 'identical':
                    row['ftp_path'] = build_link(row['gbrs_paired_asm'], row['asm_name'])
                else:  # need to rebuild path from this accession...
                    row['ftp_path'] = build_link(row['assembly_accession'], row['asm_name'])

            http_path = 'https' + row['ftp_path'][3:]
            filename = http_path.split('/')[-1]
            path = f"{http_path}/{filename}_genomic.fna.gz"
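            # e.g. (illustrative): an ftp://ftp.ncbi.nlm.nih.gov/... summary path becomes
            # https://ftp.ncbi.nlm.nih.gov/.../<accession>_<asm_name>/<accession>_<asm_name>_genomic.fna.gz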

            # Let's check if the path exists
            check_r = s.head(path)
            if check_r.status_code == 404:
                # Error with this path, let's try to crawl instead
                try:
                    path = crawl_link(row['assembly_accession'], s)
                except ValueError:
                    # Can't find this data, continue...
                    continue
            elif check_r.status_code >= 500:
                # Server error, try again
                time.sleep(2)
                check_r = s.head(path)
                if check_r.status_code >= 500:
                    # ¯\_(ツ)_/¯ let's retry it some other time...
                    continue

            # Assembly summary doesn't include size of dataset
            # Let's use a cheap head request to find the size
            size_r = s.head(path)
            if size_r.status_code == 404:
                print(f"Error 404 on {path}")
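
Below is a minimal, self-contained sketch (not part of the commit) of the fallback flow this diff adds: build the https path from the assembly summary's ftp_path, HEAD it, fall back to crawling the genomes/all listing on a 404, and retry once after a short sleep on a 5xx. The crawl helper mirrors crawl_link above, and the accession and ftp_path values at the bottom are illustrative.

import time

import requests


def crawl_for_genomic_fna(accession, session):
    # Same idea as crawl_link above: derive the genomes/all listing URL from
    # the accession and scrape it for the "<accession>_<asm_name>/" entry.
    db, acc = accession.split("_")
    number, _version = acc.split(".")
    chunks = "/".join(number[pos:pos + 3] for pos in range(0, len(number), 3))
    url = f"https://ftp.ncbi.nlm.nih.gov/genomes/all/{db}/{chunks}"

    req = session.get(url)
    if req.status_code == 200:
        for line in req.text.splitlines():
            if line.startswith(f'<a href="{accession}_'):
                dirname = line.split('"')[1].rstrip("/")
                return f"{url}/{dirname}/{dirname}_genomic.fna.gz"
    raise ValueError(f"Couldn't find link for {accession}")


def resolve_path(ftp_path, accession, session):
    http_path = "https" + ftp_path[3:]  # ftp://... -> https://...
    filename = http_path.split("/")[-1]
    path = f"{http_path}/{filename}_genomic.fna.gz"

    check = session.head(path)
    if check.status_code == 404:
        # Path from the summary is wrong, crawl instead (may raise ValueError)
        path = crawl_for_genomic_fna(accession, session)
    elif check.status_code >= 500:
        # Server error: wait a bit and retry once
        time.sleep(2)
        if session.head(path).status_code >= 500:
            raise RuntimeError(f"Server error for {path}, try again later")
    return path


if __name__ == "__main__":
    with requests.Session() as s:
        # illustrative accession/ftp_path; any GCA/GCF assembly follows the same layout
        print(resolve_path(
            "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/005/845/GCF_000005845.2_ASM584v2",
            "GCF_000005845.2",
            s,
        ))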
