## Remember, each FASTQ record is exactly four lines long


## Sample 10% of reads (every tenth record)

In [None]:
# Deterministic 10% sample: copy records 0, 10, 20, ... to the output file.
# The handles are deliberately not called `input`, so the builtin input()
# stays usable in later cells.
with open("test.fastq") as fastq_in, open("sample.fastq", "w") as fastq_out:
    for record_number, line1 in enumerate(fastq_in):
        # The for loop consumed the first line of the record; read the
        # remaining three so the next iteration starts on the next record.
        record = [
            line1,
            fastq_in.readline(),
            fastq_in.readline(),
            fastq_in.readline(),
        ]
        if record_number % 10 == 0:
            fastq_out.writelines(record)

## Randomly sample 10% of reads (more or less)

In [None]:
import random

# Percentage of records to keep; each record is kept independently, so the
# size of the sample is only approximately this share of the input.
percentage = 10

with open("test.fastq") as fastq_in, open("sample.fastq", "w") as fastq_out:
    for line1 in fastq_in:
        # The for loop consumed the first line of the record; read the
        # remaining three so the next iteration starts on the next record.
        record = [
            line1,
            fastq_in.readline(),
            fastq_in.readline(),
            fastq_in.readline(),
        ]
        # random.random() is uniform on [0, 1), so this test succeeds with
        # probability percentage / 100 for any percentage, not only for 10.
        if random.random() * 100 < percentage:
            fastq_out.writelines(record)

## Sample a given number of reads

In [None]:
import random

records_to_sample = 100

# First pass: count the lines to find out how many four-line records exist.
with open("test.fastq") as fastq_in:
    num_lines = sum(1 for _ in fastq_in)
total_records = num_lines // 4
print("sampling {} out of {} records".format(records_to_sample, total_records))

percentage = (records_to_sample / total_records) * 100
print("sampling {p} % of records".format(p=percentage))

# Record indices to keep, stored as a set so that the membership test in the
# loop below does not scan a list for every record.
records_to_keep = set(random.sample(range(total_records), records_to_sample))

# Second pass: copy the chosen records.
with open("test.fastq") as fastq_in, open("sample.fastq", "w") as fastq_out:
    for record_number, line1 in enumerate(fastq_in):
        # The for loop consumed the first line of the record; read the
        # remaining three so the next iteration starts on the next record.
        record = [
            line1,
            fastq_in.readline(),
            fastq_in.readline(),
            fastq_in.readline(),
        ]
        if record_number in records_to_keep:
            fastq_out.writelines(record)

## Create multiple samples of records from a single file

In [None]:
import random

input_dataset = "test.fastq"
records_to_sample = 100
number_of_replicates = 10

# First pass: count the lines to find out how many four-line records exist.
with open(input_dataset) as fastq_in:
    num_lines = sum(1 for _ in fastq_in)
total_records = num_lines // 4
print("sampling {} out of {} records, replicated {} times".format(records_to_sample, total_records, number_of_replicates))

# One (open file, set of record indices to keep) pair per replicate.
outputs = []

try:
    for i in range(number_of_replicates):
        # Draw the sample before opening the file, so a failed draw does not
        # leave behind an empty output file with an open handle.
        keep = set(random.sample(range(total_records), records_to_sample))
        outputs.append((open("sample.{}.fastq".format(i), "w"), keep))

    # Second pass: read the input once and copy each record to every
    # replicate whose sample contains it.
    with open(input_dataset) as fastq_in:
        for record_number, line1 in enumerate(fastq_in):
            record = [
                line1,
                fastq_in.readline(),
                fastq_in.readline(),
                fastq_in.readline(),
            ]
            for output, keep in outputs:
                if record_number in keep:
                    output.writelines(record)
finally:
    # Close every file that was opened, even if something above failed.
    for output, keep in outputs:
        output.close()

## Put it all together with a minimal user interface

In [None]:
import argparse
import os
import random
import sys


def make_parser():
    """Build the command-line parser for the FASTQ sampling script.

    The parser takes two positional arguments, ``input`` and ``output``, and
    three integer options: ``-n/--number``, ``-p/--percentage`` and
    ``-r/--replicates`` (the last one defaults to 1).
    """
    cli = argparse.ArgumentParser(description='Randomly sampling a FASTQ file')

    positionals = (
        ("input", "input FASTQ filename"),
        ("output", "output FASTQ filename"),
    )
    for dest, text in positionals:
        cli.add_argument(dest, help=text)

    # Every option is an integer; only the help text and default differ.
    integer_options = (
        ("-n", "--number", {"help": "number of reads to sample"}),
        ("-p", "--percentage", {"help": "percentage of reads to sample"}),
        ("-r", "--replicates", {"help": "number of output files to write", "default": 1}),
    )
    for short_flag, long_flag, extra in integer_options:
        cli.add_argument(short_flag, long_flag, type=int, **extra)

    return cli


def count_records(filename, record_length=4):
    """Return the number of whole records in *filename*.

    The file is read line by line and the line count is divided by
    *record_length* (lines per record, 4 by default); any fractional part is
    dropped. A progress message is printed before counting starts.
    """
    print("counting records....")
    line_total = 0
    with open(filename) as handle:
        for _ in handle:
            line_total += 1
    return int(line_total / record_length)
    
    
def main():
    """Command-line entry point: write random samples of a FASTQ file.

    Parses the command line, counts the records in the input file, then
    reads the input once and copies each selected four-line record to every
    replicate whose sample contains it. Replicate ``i`` is written next to
    the requested output path, with ``"{i}_"`` prefixed to the file name.

    Exits through ``sys.exit`` with a message when both or neither of
    ``--number`` / ``--percentage`` are given, or when the requested sample
    size is negative or larger than the number of records in the input.
    """
    parser = make_parser()
    args = parser.parse_args()

    # Compare against None rather than relying on truthiness, so that an
    # explicit 0 still counts as "the option was given".
    if args.percentage is not None and args.number is not None:
        sys.exit("give either a percentage or a number of reads to sample, not both")

    if args.percentage is None and args.number is None:
        sys.exit("you must give either a percentage or a number of reads to sample")

    total_records = count_records(args.input)
    if args.number is not None:
        records_to_sample = args.number
    else:
        records_to_sample = (total_records * args.percentage) // 100
    number_of_replicates = args.replicates

    # random.sample() rejects these sizes with a ValueError; report the
    # problem as a normal command-line error instead of a traceback.
    if not 0 <= records_to_sample <= total_records:
        sys.exit("cannot sample {} out of {} records".format(records_to_sample, total_records))

    input_filename = args.input
    output_filename = args.output

    print("sampling {} out of {} records, replicated {} times".format(records_to_sample, total_records, number_of_replicates))

    # The replicate prefix belongs on the file name, not on the whole path:
    # "dir/out.fastq" must become "dir/0_out.fastq", not "0_dir/out.fastq".
    output_dir, output_name = os.path.split(output_filename)

    # Report progress roughly every 10% of the records. Integer arithmetic
    # keeps the modulo test exact, and max() avoids a zero step on inputs
    # with fewer than ten records.
    progress_step = max(1, total_records // 10)

    # One (open file, set of record indices to keep) pair per replicate.
    outputs = []

    try:
        for i in range(number_of_replicates):
            # A set makes the per-record membership test below constant time.
            keep = set(random.sample(range(total_records), records_to_sample))
            replicate_path = os.path.join(output_dir, "{}_{}".format(i, output_name))
            outputs.append((open(replicate_path, "w"), keep))

        with open(input_filename) as fastq_in:
            for record_number, line1 in enumerate(fastq_in):
                # The for loop consumed the first line of the record; read
                # the remaining three lines here.
                record = [
                    line1,
                    fastq_in.readline(),
                    fastq_in.readline(),
                    fastq_in.readline(),
                ]
                for output, keep in outputs:
                    if record_number in keep:
                        output.writelines(record)

                records_done = record_number + 1
                # total_records is 0 for an input shorter than one record,
                # and a trailing partial record pushes records_done past
                # total_records, hence the guard and the cap at 100.
                if total_records and records_done % progress_step == 0:
                    percent_done = min(100, (records_done * 100) // total_records)
                    print("{} % done".format(percent_done))
    finally:
        # Close every file that was opened, even if sampling or copying failed.
        for output, keep in outputs:
            output.close()

    print("All done!")

    
if __name__ == '__main__':
    # Call main() only when this file is run as a script; importing it as a
    # module defines the functions above without running anything.
    main()
