Skip to content
Switch branches/tags
Go to file
Cannot retrieve contributors at this time
executable file 18 lines (16 sloc) 1.36 KB
#!/bin/bash
#
# Subsample large metagenome FASTQ files to a manageable size.
#
# For each raw (gzipped) metagenome listed in Whole_FastQ_FileNames.txt:
#   1. gunzip it in /data/MG_ZIPPED/
#   2. split it into 4,000,000-line chunks (4 lines per FASTQ record,
#      so 1,000,000 reads per chunk) so subsampling stays efficient
#   3. subsample 65,750 reads from each chunk with seqtk
#   4. concatenate the per-chunk subsamples into one per-sample file
#   5. re-gzip the raw read
# Finally, concatenate all per-sample files into one combined FASTQ.
#
# Requires: seqtk on PATH, sudo access to /data/MG_ZIPPED/.

set -euo pipefail

# Whole_FastQ_FileNames.txt lists the unzipped metagenome file names, one per line.
while IFS= read -r file; do
  # Unzip the raw metagenome read located in /data/MG_ZIPPED/.
  sudo gunzip "/data/MG_ZIPPED/${file}.gz"

  # Raw files are too large to subsample directly; cut into chunks of
  # 4 million lines (= 1 million reads) named Sample_aa, Sample_ab, ...
  split -l 4000000 "/data/MG_ZIPPED/${file}" Sample_

  # Record the names of the split chunks.
  ls Sample_* > SplitNames.txt

  # Subsample 65,750 reads from each chunk, then delete the chunk to free space.
  while IFS= read -r item; do
    seqtk sample "${item}" 65750 > "${item}_65750.fastq"
    rm -- "${item}"
  done < SplitNames.txt

  # Combine the separately subsampled chunks into a single file for this
  # sample, then remove the per-chunk files now that they are concatenated.
  cat ./*_65750.fastq > "${file}_2GB.fastq"
  rm -- ./*_65750.fastq

  # Re-zip the raw sequence read to reclaim space.
  gzip "/data/MG_ZIPPED/${file}"
done < Whole_FastQ_FileNames.txt

# Combine the per-sample subsampled files into one large FASTQ.
cat ./*_2GB.fastq > CentraliaMG_7GB.fastq