In [2]:
import polars as pl
import os
import glob

## SRP044740

Dataset with exomes of 13 FFPE breast tumor samples and 13 corresponding frozen samples.

In [8]:
sample_info = pl.read_csv("sample-info_stage0.tsv", separator="\t")
sample_info

run_accession,experiment_accession,sample_accession,scientific_name,instrument_model,study_title,fastq_md5,fastq_ftp,sra_ftp,sample_alias,sample_title
str,str,str,str,str,str,str,str,str,str,str
"""SRR1523240""","""SRX659776""","""SAMN02934178""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""1a53bc5402125072b95bace7108183…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FROZ3""","""Sample from Homo sapiens"""
"""SRR1523242""","""SRX659778""","""SAMN02934180""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""d312810b83c3c2cb991bfe9e7915b7…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FROZ6""","""Sample from Homo sapiens"""
"""SRR1523243""","""SRX659779""","""SAMN02934181""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""5cc183bc8031a1d2b146f338e76f8e…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FFPE1""","""Sample from Homo sapiens"""
"""SRR1523244""","""SRX659780""","""SAMN02934182""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""32793cef24679667017bf7a5d364e4…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FFPE2""","""Sample from Homo sapiens"""
"""SRR1523245""","""SRX659781""","""SAMN02934183""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""9fbe4b6582a6dc1d9b2173dd0ba56e…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FFPE8""","""Sample from Homo sapiens"""
…,…,…,…,…,…,…,…,…,…,…
"""SRR1523254""","""SRX659790""","""SAMN02934192""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""7f23adebc0b409b8fa8721ffb10df7…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FFPE6""","""Sample from Homo sapiens"""
"""SRR1523259""","""SRX659795""","""SAMN02934186""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""5e079acc63c38a3c4aa7d7dc32f4ef…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FFPE12""","""Sample from Homo sapiens"""
"""SRR1523260""","""SRX659796""","""SAMN02934184""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""d6ab9ad9b7fe337ec3c069b2846c8e…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FFPE10""","""Sample from Homo sapiens"""
"""SRR1523261""","""SRX659797""","""SAMN02934190""","""Homo sapiens""","""Illumina HiSeq 2000""","""WES of 13 FFPE breast tumor sa…","""ecb9c7416e7337cf8f3146fe032895…","""ftp.sra.ebi.ac.uk/vol1/fastq/S…","""ftp.sra.ebi.ac.uk/vol1/srr/SRR…","""BGI-FFPE5""","""Sample from Homo sapiens"""


In [25]:
new_sample_info = (
	sample_info
	.with_columns([
		pl.col("sample_alias").str.replace("BGI-FROZ", "").str.replace("BGI-FFPE", "").cast(int).alias("sample_number"),
		pl.col("sample_alias").str.replace("BGI-", "").str.replace(r"\d+$", "").alias("sample_type")
	])
	.sort("sample_number")
	.select(['study_title','sample_number','sample_type','run_accession','experiment_accession','sample_accession','scientific_name','sample_alias','instrument_model','fastq_md5','fastq_ftp'])
)

new_sample_info.write_csv("sample-info_stage1.tsv", separator="\t")

In [24]:
sample_count = (
	new_sample_info
	.group_by(["sample_number"])
	.agg([
		(pl.col("sample_type") == "FFPE").sum().alias("n_ffpe"),
		(pl.col("sample_type") == "FROZ").sum().alias("n_frozen")
	])
)

sample_count.write_csv("sample-count.tsv", separator="\t")

In [31]:
fastq_links = (
	new_sample_info
 	.with_columns(pl.col("fastq_ftp").str.split(";"))
	.select(["sample_alias", "fastq_ftp"])
  	.explode("fastq_ftp")
	.with_columns([
		(pl.lit("ftp://") + pl.col("fastq_ftp")).alias("fastq_ftp")
	])
)

fastq_links

sample_alias,fastq_ftp
str,str
"""BGI-FFPE1""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"
"""BGI-FFPE1""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"
"""BGI-FFPE1""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"
"""BGI-FFPE1""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"
"""BGI-FFPE1""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"
…,…
"""BGI-FROZ13""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"
"""BGI-FFPE13""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"
"""BGI-FFPE13""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"
"""BGI-FFPE13""","""ftp://ftp.sra.ebi.ac.uk/vol1/f…"


In [32]:
fastq_links.select("fastq_ftp").write_csv("fq_ftp_links.txt", include_header=False, separator="\t")
print("Saved fastq ftp links to: fq_ftp_links.txt")


Saved fastq ftp links to: fq_ftp_links.txt


In [43]:
# fastq_links

wget = ["#!/bin/bash"]

for i in range(fastq_links.shape[0]):
    
    link = fastq_links[i, "fastq_ftp"]
    sample_name = fastq_links[i, "sample_alias"]
    
    basename = os.path.basename(link)
    read_number = basename.split("_")[1].split(".")[0]
    dir_name = f"{link.split("/")[-2]}"
    
    wget.append(f"mkdir -p {sample_name} && wget {link} -O {sample_name}/{sample_name}_{basename}")

fastq_dir = "../data/fq"
os.makedirs(fastq_dir, exist_ok=True)

fastq_get_path = "../data/fq/fastq_ftp_download.sh"
with open(fastq_get_path, "w") as file:
    for line in wget:
        file.write(f"{line}\n")

print(f"Created a bash script to obtain fastqs from the EBI ftp at : {fastq_get_path}")

Created a bash script to obtain fastqs from the EBI ftp at : ../data/fq/fastq_ftp_download.sh


In [47]:
# parallel


script_dir = "../data/fq/parallel_downloads"
os.makedirs(script_dir, exist_ok=True)

for i in range(fastq_links.shape[0]):
    
    link = fastq_links[i, "fastq_ftp"]
    sample_name = fastq_links[i, "sample_alias"]
    basename = os.path.basename(link)
    read_number = basename.split("_")[1].split(".")[0]
    dir_name = f"{link.split("/")[-2]}"
    
    wget = ["#!/bin/bash"]
    wget.append(f"mkdir -p {sample_name} && wget --tries=50 --retry-connrefused -O {sample_name}/{sample_name}_{basename} {link}")
    
    script_path = f"{script_dir}/{sample_name}_{basename}.sh"
    with open(script_path, "w") as file: 
        for line in wget:
        	file.write(f"{line}\n")
        
