In [7]:
# Required modules
import gzip
import os
import sys
import shutil

In [8]:
# Parsing arguments

In [9]:
# Making a notebook to standardize sequencing data and reference genomes. Will later be switched to a script.

In [22]:
def decompress_fastq(fastq_input, sample_name):
    """
    Write an uncompressed copy of a fastq file to a temporary file.

    The input is recognised as gzip by its first two bytes (the gzip magic
    number), not by its file extension. Gzipped input is decompressed into
    the temporary file; anything else is assumed to be plain text and is
    copied unchanged.

    Parameters:
        fastq_input: path to the input fastq, gzipped or not (str or
            os.PathLike).
        sample_name: string prefix for the temporary file, which is named
            "<sample_name>_unc_tmp.fastq".

    Returns:
        The temporary file name. Errors are reported on stderr instead of
        being raised, so the name is returned even when the file could not
        be written; callers should confirm the file exists before using it.
    """
    temp_uncompressed = sample_name + '_unc_tmp.fastq' #temp_file_name

    try:
        # Sniff the first two bytes to determine if the file is in gzip format
        with open(fastq_input, 'rb') as f:
            magic_number = f.read(2)

        if magic_number == b'\x1f\x8b':
            # Stream the decompressed bytes in chunks instead of line by line
            with gzip.open(fastq_input, 'rb') as f_in, open(temp_uncompressed, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            # f-strings (not str + path) so path-like inputs are reported correctly
            print(f'File {fastq_input} was unzipped and written to temporary file {temp_uncompressed}')

        # assume the file is plain text if not gzipped
        else:
            shutil.copy(fastq_input, temp_uncompressed)
            print(f'File {fastq_input} was already unzipped and written to temporary file {temp_uncompressed}')

    except Exception as e:
        print(f"Error processing {fastq_input}: {e}", file=sys.stderr)

    return temp_uncompressed


def check_fastq(fastq):
    """
    Check that an uncompressed fastq file follows the four-line record layout.

    Every record must consist of: a header line starting with '@', a
    sequence line containing only A, C, G, T or N (upper case), a separator
    line starting with '+', and a quality line containing only Phred+33
    characters ('!' through '~'). The total number of lines must be a
    multiple of 4. Leading and trailing whitespace on each line is ignored.
    A file with no lines at all contains no malformed record and passes.

    The first problem found (including the file being unreadable) is
    reported on stderr and makes the check fail; nothing is raised to the
    caller.

    Parameters:
        fastq: path to an uncompressed fastq file.

    Returns:
        True if the file passed every check, otherwise False.
    """
    valid_bases = frozenset('ACGTN')
    # Phred+33 encoding: scores 0-93 map to ASCII 33 ('!') through 126 ('~')
    valid_quals = frozenset(chr(score + 33) for score in range(94))

    # Start pessimistic so every failure path, including open() failing,
    # ends with a defined False result.
    valid_format = False

    try:
        line_num = 0
        with open(fastq, 'r') as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                position = line_num % 4

                if position == 1:
                    if not line.startswith('@'):
                        raise ValueError("File Format Error Line " + str(line_num) + ' Does Not Start with a @')

                elif position == 2:
                    # sequence line, check for valid characters
                    if not valid_bases.issuperset(line):
                        raise ValueError("File Format Error Line " + str(line_num) + ' Contains Charcters other than ATGCN')

                elif position == 3:
                    if not line.startswith('+'):
                        raise ValueError("File Format Error Line " + str(line_num) + ' Does Not Start with a +')

                else:
                    # quality line, check for valid Phred+33 characters
                    if not valid_quals.issuperset(line):
                        raise ValueError("File Format Error Line " + str(line_num) + ' Contains a non-standard phred score')

        # A truncated final record leaves a line count that is not a multiple of 4
        if line_num % 4 != 0:
            raise ValueError("File Format Error: FastQ had " + str(line_num) + ' lines, not a multiple of 4')

        valid_format = True

    except Exception as e:
        print(f"Error processing {fastq}: {e}", file=sys.stderr)

    if valid_format:
        print(fastq + ' is a proper fastq file')
        return True
    else:
        print(fastq + ' is NOT a proper fastq file')
        return False



def rename_fastqs(in_fq, out_fq, prefix):
    """
    Rename every read in an uncompressed fastq file.

    This is used to replace the complex sequence names typical of ONT
    sequencing. Each header line is rewritten as "@{prefix}_{n}", where n is
    the 1-based position of the read in the file. Sequence and quality lines
    are copied with surrounding whitespace stripped. A '+' separator line is
    written as a bare '+', because the fastq format only allows it to repeat
    the read's own identifier and the old name would no longer match.

    The input is read as consecutive four-line records; it is not validated
    here.

    Parameters:
        in_fq: path to the uncompressed fastq to read.
        out_fq: path of the renamed fastq to write (overwritten if present).
        prefix: string used as the start of every new read name.

    Returns:
        out_fq on success. On any error a message is written to stderr and
        None is returned.
    """
    try:
        with open(in_fq, 'r') as infile, open(out_fq, 'w') as outfile:
            read_count = 0
            for line_num, line in enumerate(infile):
                line = line.strip()
                position = line_num % 4
                if position == 0:
                    # Sequence identifier line (starts with '@')
                    read_count += 1
                    outfile.write(f"@{prefix}_{read_count}\n")
                elif position == 2 and line.startswith('+'):
                    # Drop any repeated (now stale) read name after the '+'
                    outfile.write("+\n")
                else:
                    # Sequence and quality lines are passed through
                    outfile.write(f"{line}\n")
        print(f"Successfully renamed sequences, written to {out_fq}")
        return out_fq
    except Exception as e:
        print(f"Error reading or writing files: {e}", file=sys.stderr)
        return None
    

    
def gzip_fq(in_fq, out_fq):
    """
    Compress an uncompressed fastq into a gzip file.

    in_fq is read as raw bytes and streamed into a new gzip file at out_fq.
    A confirmation is printed on success; any error is reported on stderr
    instead of being raised. Nothing is returned.
    """
    try:
        # Open the source first so a missing input never creates an empty archive
        with open(in_fq, 'rb') as plain_handle:
            with gzip.open(out_fq, 'wb') as gz_handle:
                shutil.copyfileobj(plain_handle, gz_handle)

        print(f"Successfully compressed {in_fq} to {out_fq}")

    except Exception as err:
        print(f"Error compressing file: {err}", file=sys.stderr)

def clean_files(file_list):
    """
    Delete temporary files.

    Each file is removed independently: a file that cannot be deleted
    (missing, not a valid path, no permission) is reported on stderr and
    the remaining files are still processed. Nothing is raised and nothing
    is returned.

    Parameters:
        file_list: iterable of file paths to delete.
    """
    try:
        files = list(file_list)
    except TypeError as e:
        # e.g. None was passed instead of a list; report it like any other failure
        print(f"Error deleting file: {e}", file=sys.stderr)
        return

    for file in files:
        # One try per file so a single failure does not abort the rest of the cleanup
        try:
            os.remove(file)
            print(f"Successfully deleted {file}")
        except Exception as e:
            print(f"Error deleting file: {e}", file=sys.stderr)

In [20]:
# Temporary hard-coded inputs for testing
sample_name = "sample_1"  # prefix used for the temp and output file names
fastq_input = "test_notebook/barcode01_subset.fastq.gz"  # fastq to standardize (gzipped or plain)
rename = True  # when True, reads are renamed before the final compression

In [21]:
temp_files = [] # Temp files to delete once processing is finished

# Decompress the input (or copy it if it is already uncompressed) to a temp file
decompressed = decompress_fastq(fastq_input, sample_name)

# decompress_fastq reports errors without raising, so confirm the temp file was actually written
decompressed_ok = os.path.isfile(decompressed)
if decompressed_ok:
    temp_files.append(decompressed) # adding temp file for later deletion

if decompressed_ok and check_fastq(decompressed): # Only a standard fastq is processed further
    final_fastq = decompressed
    if rename:
        renamed = 'renammed.fastq'
        final_fastq = rename_fastqs(decompressed, renamed, sample_name) # None if renaming failed
        if os.path.isfile(renamed):
            temp_files.append(renamed) # also removes a partially written file after a failed rename

    if final_fastq is not None:
        output_sample = sample_name + '.fastq.gz' # final file name based on sample name

        # NOTE: gzip_fq reports failures on stderr without raising, so the
        # message below does not by itself confirm the archive was written.
        gzip_fq(final_fastq, output_sample) # gzipping final file

        clean_files(temp_files) # cleaning temp files

        print('fastq for ' + sample_name + ' was checked and compressed. It can be found at ' + output_sample)

    else:
        clean_files(temp_files)
        print('Sample ' + sample_name + ' could not be renamed. See errors above')

else:
    clean_files(temp_files) # do not leave the decompressed copy behind when the check fails
    print('Sample ' + sample_name + ' did not pass fastq check. See errors above')

File test_notebook/barcode01_subset.fastq.gz was unzipped and written to temporary file sample_1_unc_tmp.fastq
sample_1_unc_tmp.fastq is a proper fasta file
Successfully renamed sequences, written to output_uncompressed
Successfully compressed renammed.fastq to sample_1.fastq.gz
Successfully deleted sample_1_unc_tmp.fastq
Successfully deleted renammed.fastq
fastq for sample_1 was checked and compressed. It can be found at sample_1.fastq.gz
