In [4]:
import requests
from bs4 import BeautifulSoup
import wget
import os
import missingno as msno
import matplotlib.pyplot as plt
import pandas as pd

pd.set_option('display.max_rows', None)

In [2]:
##### Fetch and download amplicon data #####

# URL of the page with the .fastq.gz files
url = 'https://ibdmdb.org/downloads/html/rawfiles_HMP2_pilot.html'

# Send a GET request to fetch the page content
response = requests.get(url)
response.raise_for_status()  # Check for successful request

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Create a directory to save downloaded files
os.makedirs("fastq_files", exist_ok=True)

# Find all links to .fastq.gz files and download them
for link in soup.find_all('a', href=True):
    file_url = link['href']
    if file_url.endswith('.fastq.tgz'):
        # Full URL of the file
        print(f"Downloading {file_url}")
        # Download and save the file
        wget.download(file_url, out='fastq_files')

print("Download complete.")

Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/HSM5MD8J.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/ESM5GEZ1.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/MSM5LLGN.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/MSM5LLHA.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/ESM5ME9D.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/ESM5MEE6.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/MSM5LLEZ.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/MSM5LLGR.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/HSM5MD4U.fastq.tgz
Downloading https://g-227ca.190ebd.75bc.data.globus.org/ibdmdb/raw/HMP2_Pilot/16S/HSM5MD3L.

In [6]:
import os
import tarfile
import gzip
from tqdm import tqdm  # For progress bar

def process_tgz_files(input_dir, output_dir):
    """
    Decompress all .tgz files in input_dir and recompress them to .gz in output_dir.

    Parameters:
        input_dir (str): Directory containing .tgz files.
        output_dir (str): Directory to save .gz files.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # List all .tgz files in the input directory
    tgz_files = [f for f in os.listdir(input_dir) if f.endswith('.tgz')]

    if not tgz_files:
        print(f"No .tgz files found in {input_dir}")
        return

    print(f"Found {len(tgz_files)} .tgz files to process.")

    # Loop through each .tgz file
    for tgz_file in tqdm(tgz_files, desc="Processing files"):
        input_path = os.path.join(input_dir, tgz_file)
        output_file = tgz_file.replace('.tgz', '.gz')
        output_path = os.path.join(output_dir, output_file)

        # Temporary directory to hold extracted contents
        temp_dir = os.path.join(input_dir, "temp_extracted")
        os.makedirs(temp_dir, exist_ok=True)

        try:
            # Step 1: Decompress .tgz (tar.gz)
            with tarfile.open(input_path, mode='r:gz') as tar:
                tar.extractall(path=temp_dir)

            # Step 2: Find the extracted file(s)
            extracted_files = [os.path.join(temp_dir, f) for f in os.listdir(temp_dir)]
            if len(extracted_files) != 1:
                raise ValueError(f"Expected one file in the archive, found {len(extracted_files)} in {tgz_file}.")

            extracted_file = extracted_files[0]

            # Step 3: Compress the extracted file into .gz (gzip)
            with open(extracted_file, 'rb') as f_in, gzip.open(output_path, 'wb') as f_out:
                f_out.writelines(f_in)

        except Exception as e:
            print(f"Error processing {tgz_file}: {e}")

        finally:
            # Cleanup: Remove temporary files
            for file in extracted_files:
                os.remove(file)
            os.rmdir(temp_dir)

    print(f"All files processed. Converted .tgz files are saved in {output_dir}.")

# Define input and output directories
input_directory = "./fastq_files"
output_directory = "./fastq_files_converted"

# Process the files
process_tgz_files(input_directory, output_directory)

Found 180 .tgz files to process.


Processing files:   0%|          | 0/180 [00:00<?, ?it/s]

Processing files: 100%|██████████| 180/180 [48:16<00:00, 16.09s/it] 

All files processed. Converted .tgz files are saved in ./fastq_files_converted.





In [7]:
###### Create manifest ######

# Directory containing the downloaded fastq files
directory = './fastq_files_converted'

# Prepare data for the manifest
data = []
for filename in os.listdir(directory):
    if filename.endswith('.fastq.gz'):
        sample_id = filename.split('.')[0]  # Extract sample ID from filename
        absolute_path = os.path.join('$PWD', directory, filename)  # Create absolute path with $PWD
        data.append({'sample-id': sample_id, 'absolute-filepath': absolute_path}) # for colab
        # data.append({'sample-id': sample_id, 'forward-absolute-filepath': absolute_path}) # debug

# Create a DataFrame
manifest_df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
manifest_df.to_csv('manifest.tsv',sep='\t', index=None)

# Show the manifest DataFrame
manifest_df.head()

Unnamed: 0,sample-id,absolute-filepath
0,CSM5MCVX,$PWD/./fastq_files_converted/CSM5MCVX.fastq.gz
1,CSM5MCUS,$PWD/./fastq_files_converted/CSM5MCUS.fastq.gz
2,ESM5ME9H,$PWD/./fastq_files_converted/ESM5ME9H.fastq.gz
3,HSM5MD8P,$PWD/./fastq_files_converted/HSM5MD8P.fastq.gz
4,MSM5LLGR,$PWD/./fastq_files_converted/MSM5LLGR.fastq.gz
