In [2]:
library(httr)
library(jsonlite)

Fetch data by project id: 
 - Visit https://www.ebi.ac.uk/ena/browser/search 
 - Search for the project you want to analyze

In [3]:
# Study accession to query; replace it with the study you are interested in.
# This value is passed to fetch_ena_data() below.
accession <- "PRJNA352076"  


#' Fetch run-level metadata for a study from the ENA Portal API.
#'
#' Sends a `read_run` search for `study_accession="<accession>"` and asks for
#' a fixed set of metadata fields in JSON format.
#'
#' @param accession A single, non-empty study accession string
#'   (e.g. "PRJNA352076").
#' @return Whatever `jsonlite::fromJSON()` parses from the response body
#'   (a data frame when the API returns an array of records).
#' @details Stops with an error when `accession` is invalid, when the HTTP
#'   status is not 200, or when the response body is empty.
fetch_ena_data <- function(accession) {
  # Fail early instead of sending a malformed query to the API.
  if (!is.character(accession) || length(accession) != 1 ||
      is.na(accession) || !nzchar(trimws(accession))) {
    stop("`accession` must be a single non-empty study accession string.",
         call. = FALSE)
  }

  base_url <- "https://www.ebi.ac.uk/ena/portal/api/search"

  # Requested columns, kept as a vector so fields are easy to add or remove.
  fields <- c(
    "experiment_title", "experiment_accession", "experiment_alias",
    "bam_bytes", "bam_ftp", "bam_md5", "base_count", "broker_name",
    "center_name",
    "fastq_aspera", "fastq_bytes", "fastq_ftp", "fastq_galaxy", "fastq_md5",
    "first_created", "first_public", "instrument_model",
    "instrument_platform",
    "last_updated", "library_layout", "library_name", "library_selection",
    "library_source", "library_strategy", "nominal_length", "nominal_sdev",
    "read_count", "run_accession", "run_alias", "sample_accession",
    "sample_alias", "sample_title", "scientific_name",
    "secondary_sample_accession",
    "secondary_study_accession", "sra_aspera", "sra_bytes", "sra_ftp",
    "sra_galaxy", "sra_md5", "study_accession", "study_alias", "study_title",
    "submission_accession", "submitted_aspera", "submitted_bytes",
    "submitted_format", "submitted_ftp", "submitted_galaxy", "submitted_md5",
    "tax_id"
  )

  query <- list(
    result = "read_run",
    query = paste0("study_accession=\"", accession, "\""),
    fields = paste(fields, collapse = ","),
    format = "json"
  )

  response <- GET(base_url, query = query)

  status <- status_code(response)
  if (status != 200) {
    stop("Failed to fetch data from ENA. Status code: ", status)
  }

  body <- content(response, as = "text", encoding = "UTF-8")

  # An empty body would make fromJSON() fail with a cryptic parse error.
  if (is.null(body) || is.na(body) || !nzchar(trimws(body))) {
    stop("ENA returned an empty response for accession '", accession, "'.",
         call. = FALSE)
  }

  fromJSON(body)
}

# Fetch the metadata for the study. The query in fetch_ena_data() uses
# result = "read_run" and requests fields such as run_accession, sample_title,
# instrument_model, library_layout and fastq_ftp.
ena_data <- fetch_ena_data(accession)

# Save the metadata to a CSV file in the working directory
write.csv(ena_data, "./accession_data_to_file.csv", row.names = FALSE)


In [3]:
# Display the fetched metadata table
print(ena_data)

    run_accession experiment_accession experiment_alias bam_bytes bam_ftp
1      SRR4785816           SRX2316047       GSM2370974                  
2      SRR4785819           SRX2316050       GSM2370977                  
3      SRR4785823           SRX2316054       GSM2370981                  
4      SRR4785826           SRX2316057       GSM2370984                  
5      SRR4785834           SRX2316065       GSM2370992                  
6      SRR4785837           SRX2316068       GSM2370995                  
7      SRR4785838           SRX2316069       GSM2370996                  
8      SRR4785839           SRX2316070       GSM2370997                  
9      SRR4785840           SRX2316071       GSM2370998                  
10     SRR4785845           SRX2316076       GSM2371003                  
11     SRR4785848           SRX2316079       GSM2371006                  
12     SRR4785851           SRX2316082       GSM2371009                  
13     SRR4785855           SRX2316086

# Download files and prepare the CSV samplesheet for nf-core/rnaseq
You can also download only part of each file.

In [10]:
# Load necessary libraries
library(dplyr)
library(readr)

# Define the input CSV file (replace with your actual file path)
# NOTE(review): the earlier cell writes "./accession_data_to_file.csv"; the name
# below differs, so confirm that this file exists before running.
input_file <- "accession_data_to_file_6.csv" # Change to your input file if different
# Path of the samplesheet written by prepare_nextflow_csv()
output_csv <- "nextflow_samples.csv"  # Output file for Nextflow

#' Download the FASTQ file(s) of one sample, fully or only the first reads.
#'
#' Requires a Unix-like shell with `curl` (and, for partial downloads,
#' `gunzip`, `head` and `gzip`) on the PATH.
#'
#' @param fastq_ftp A single string with one or more FASTQ links separated by
#'   ';'. Links without a URL scheme get "ftp://" prepended.
#' @param sample_name Prefix for the output files
#'   (`<sample_name>_1.fastq.gz`, `<sample_name>_2.fastq.gz`).
#' @param download_dir Directory to download into; created if missing.
#' @param num_reads `NULL` to download the whole files, or a positive number
#'   of reads to keep from the start of each file.
#' @return A list of two strings: the path of the first FASTQ file and the
#'   path of the second one, or "" as second element when only one link is
#'   given (single-end data).
#' @details Stops when no link is available or `num_reads` is invalid, and
#'   warns when a download command fails or produces no reads.
download_fastq <- function(fastq_ftp, sample_name, download_dir = "fastq_files", num_reads = NULL) {
  # Validate inputs before touching the file system or the network.
  # as.character() also covers all-NA columns that readr parses as logical.
  fastq_ftp <- as.character(fastq_ftp)
  if (length(fastq_ftp) != 1 || is.na(fastq_ftp) || !nzchar(trimws(fastq_ftp))) {
    stop("No FASTQ FTP link available for sample '", sample_name, "'.",
         call. = FALSE)
  }
  if (!is.null(num_reads) &&
      (!is.numeric(num_reads) || length(num_reads) != 1 ||
       is.na(num_reads) || num_reads < 1)) {
    stop("`num_reads` must be NULL or a single positive number.", call. = FALSE)
  }

  # Split the links (separated by ';') and drop empty entries
  fastq_links <- trimws(strsplit(fastq_ftp, ";", fixed = TRUE)[[1]])
  fastq_links <- fastq_links[nzchar(fastq_links)]
  if (length(fastq_links) == 0) {
    stop("No FASTQ FTP link available for sample '", sample_name, "'.",
         call. = FALSE)
  }

  # NOTE(review): some runs list an extra unpaired file next to the _1/_2
  # mates; when more than two links are present prefer the _1/_2 pair and
  # otherwise fall back to the first two links. Confirm against your data.
  if (length(fastq_links) > 2) {
    mates <- c(
      grep("_1\\.f(ast)?q\\.gz$", fastq_links, value = TRUE)[1],
      grep("_2\\.f(ast)?q\\.gz$", fastq_links, value = TRUE)[1]
    )
    fastq_links <- if (anyNA(mates)) fastq_links[1:2] else mates
  }

  if (!dir.exists(download_dir)) {
    dir.create(download_dir, recursive = TRUE)
  }

  # Output paths; the second one stays "" for single-end data
  fastq_1_file <- file.path(download_dir, paste0(sample_name, "_1.fastq.gz"))
  fastq_2_file <- if (length(fastq_links) >= 2) {
    file.path(download_dir, paste0(sample_name, "_2.fastq.gz"))
  } else {
    ""
  }

  # Download one link to `dest`, either completely or only the first reads.
  fetch_one <- function(link, dest) {
    has_scheme <- grepl("^[A-Za-z][A-Za-z0-9+.-]*://", link)
    url <- shQuote(if (has_scheme) link else paste0("ftp://", link))
    out <- shQuote(dest)

    if (is.null(num_reads)) {
      cmd <- paste("curl -o", out, url)
    } else {
      # FASTQ stores 4 lines per read. The remote file is gzip-compressed, so
      # stream it, decompress, keep the first lines and re-compress. A byte
      # range would cut the compressed stream instead of selecting reads.
      # gunzip complains once head closes the pipe; that is expected.
      n_lines <- format(floor(num_reads) * 4, scientific = FALSE, trim = TRUE)
      cmd <- paste(
        "curl -s", url,
        "| gunzip -c 2>/dev/null | head -n", n_lines,
        "| gzip >", out
      )
    }

    status <- system(cmd)
    if (status != 0) {
      warning("Download command exited with status ", status, " for ", link,
              call. = FALSE)
    } else if (!is.null(num_reads)) {
      # The pipeline status only reflects gzip, so check that reads arrived.
      first_line <- tryCatch(
        readLines(gzfile(dest), n = 1, warn = FALSE),
        error = function(e) character(0)
      )
      if (length(first_line) == 0) {
        warning("No reads were downloaded from ", link, call. = FALSE)
      }
    }
    invisible(status)
  }

  fetch_one(fastq_links[1], fastq_1_file)
  if (nzchar(fastq_2_file)) {
    fetch_one(fastq_links[2], fastq_2_file)
  }

  list(fastq_1_file, fastq_2_file)
}

#' Download FASTQ files for every row and write a Nextflow samplesheet CSV.
#'
#' The samplesheet has the columns `sample`, `fastq_1`, `fastq_2` and
#' `strandedness` (always "auto").
#'
#' @param data Data frame with at least the columns `sample_title` and
#'   `fastq_ftp`, one row per run.
#' @param download_part `TRUE` to download only the first `num_reads` reads
#'   of each file, `FALSE` to download the full files.
#' @param num_reads Number of reads to fetch when `download_part` is `TRUE`.
#' @param output_file Path of the CSV to write. When `NULL` (default) the
#'   global variable `output_csv` is used, as before.
#' @return The samplesheet data frame, invisibly.
prepare_nextflow_csv <- function(data, download_part = TRUE, num_reads = 100, output_file = NULL) {
  # Keep the historical behaviour of writing to the global `output_csv`
  if (is.null(output_file)) {
    output_file <- output_csv
  }

  missing_cols <- setdiff(c("sample_title", "fastq_ftp"), names(data))
  if (length(missing_cols) > 0) {
    stop("Input data is missing required column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  n_samples <- nrow(data)

  # Convert 'sample_title' to a valid sample name (spaces -> underscores)
  sample_names <- gsub(" ", "_", as.character(data$sample_title))
  if (anyNA(sample_names) || any(!nzchar(sample_names))) {
    stop("Every row needs a non-empty `sample_title`.", call. = FALSE)
  }

  # Several runs can share one sample title. Give each run its own file
  # prefix so downloads do not overwrite each other; the sample column keeps
  # the shared name. Unique titles are left unchanged.
  file_prefixes <- make.unique(sample_names, sep = "_run")

  # NULL tells download_fastq() to fetch the complete files
  reads_to_fetch <- if (download_part) num_reads else NULL

  fastq_1 <- character(n_samples)
  fastq_2 <- character(n_samples)

  # seq_len() makes the loop a no-op for zero-row input
  for (i in seq_len(n_samples)) {
    fastq_files <- download_fastq(
      data$fastq_ftp[i],
      file_prefixes[i],
      num_reads = reads_to_fetch
    )
    fastq_1[i] <- fastq_files[[1]]
    fastq_2[i] <- fastq_files[[2]]
  }

  nextflow_df <- data.frame(
    sample = sample_names,
    fastq_1 = fastq_1,
    fastq_2 = fastq_2,
    strandedness = rep("auto", n_samples),
    stringsAsFactors = FALSE
  )

  # Write the data frame to a CSV file
  write_csv(nextflow_df, output_file)
  message("Nextflow CSV has been saved to: ", output_file)

  invisible(nextflow_df)
}

# Read the input CSV file
data <- read_csv(input_file)

# Choose the download mode passed to prepare_nextflow_csv() as `download_part`
download_part_of_file <- FALSE  # Set to TRUE for partial download, FALSE for full file download

# Download the FASTQ files and write the samplesheet
prepare_nextflow_csv(data, download_part = download_part_of_file)


[1mRows: [22m[34m1[39m [1mColumns: [22m[34m51[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (30): run_accession, experiment_accession, experiment_alias, fastq_aspe...
[32mdbl[39m   (4): base_count, read_count, sra_bytes, tax_id
[33mlgl[39m  (14): bam_bytes, bam_ftp, bam_md5, broker_name, center_name, library_na...
[34mdate[39m  (3): first_created, first_public, last_updated

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Nextflow CSV has been saved to: nextflow_samples.csv

