In [3]:
# Required modules
import gzip
import os
import sys
import shutil

In [4]:
# Parsing arguments

In [5]:
# Making a notebook to standardize sequencing data and reference genomes. Will later be switched to a script.

In [None]:
# Temporary hard-coded inputs for testing
# Path of the fastq file used as test input.
fastq_input = 'test_notebook/barcode01_subset.fastq'

# Prefix used to build the temp and output file names (e.g. sample_name + '.fastq').
sample_name = 'sample_1'

# NOTE(review): presumably a misspelling of `rename` (a switch for the optional
# read-renaming step) -- this name is not read anywhere in the visible code; confirm.
remane = True

In [13]:
def decompress_fastq(fastq_input, sample_name):
    """
    Write an uncompressed copy of a fastq file to a temporary file.

    The input is treated as gzipped when its first two bytes are the gzip
    magic number, in which case it is decompressed into the temp file.
    Otherwise it is assumed to be plain text and is copied unchanged.

    Parameters
    ----------
    fastq_input : str
        Path to the fastq file (gzipped or not gzipped).
    sample_name : str
        Prefix of the temp file, which is named <sample_name>_unc_tmp.fastq

    Returns
    -------
    str
        The file name of the uncompressed fastq. The name is returned even
        when processing fails; in that case the error is printed to stderr
        and the temp file is removed, so the returned path does not exist.
    """
    temp_uncompressed = sample_name + '_unc_tmp.fastq' #temp_file_name

    try:
        with open(fastq_input, 'rb') as f:
            # Read the first two bytes
            magic_number = f.read(2)

        # Check if the magic number indicates gzip format
        if magic_number == b'\x1f\x8b':
            # Stream the decompressed bytes into the temp file
            with gzip.open(fastq_input, 'rb') as f_in, open(temp_uncompressed, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
            print('File ' + fastq_input + ' was unzipped and written to temporary file ' + temp_uncompressed)

        # assume the file is plain text if not gzipped
        else:
            shutil.copy(fastq_input, temp_uncompressed)
            print('File ' + fastq_input + ' was already unzipped and written to temporary file ' + temp_uncompressed)

    except Exception as e:
        print(f"Error processing {fastq_input}: {e}", file=sys.stderr)

        # Do not leave a partial (or stale) temp file behind: an empty or
        # truncated file could be mistaken for real data by later steps.
        try:
            # Never delete the input itself if it happens to share the temp path.
            if os.path.abspath(fastq_input) != os.path.abspath(temp_uncompressed):
                os.remove(temp_uncompressed)
        except (OSError, TypeError, ValueError):
            # Best-effort clean-up: there may simply be nothing to remove.
            pass

    return temp_uncompressed


def check_fastq(fastq):
    """
    This function checks to make sure a fastq file seems to be in the correct format.

    This function expects fastqs to be uncompressed.

    Each record is read as four lines (surrounding whitespace is stripped):
      1. a header starting with '@'
      2. a sequence made only of the characters A, C, G, T, N
      3. a separator starting with '+'
      4. a quality string made only of characters '!' through '~'
    The total number of lines must be a multiple of 4.

    Parameters
    ----------
    fastq : str
        Path to the uncompressed fastq file.

    Returns
    -------
    bool
        True if every check passes, False otherwise. Any problem (including a
        file that cannot be opened) is reported on stderr rather than raised.
    """
    valid_bases = frozenset('ACGTN')
    # Quality characters chr(33) '!' through chr(126) '~' (scores 0-93 offset by 33)
    valid_quality_chars = frozenset(chr(score + 33) for score in range(94))

    # Only flipped to True once the whole file has been read without a problem,
    # so any error (format or I/O) leaves the result as False.
    valid_format = False

    # Determining if the file is in standard fastq format
    try:
        with open(fastq, 'r') as f:
            line_num = 0
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                position = line_num % 4

                if position == 1:
                    if not line.startswith('@'):
                        raise ValueError("File Format Error Line " + str(line_num) + ' Does Not Start with a @')
                elif position == 2:
                    # sequence line, check for valid characters
                    if not valid_bases.issuperset(line):
                        raise ValueError("File Format Error Line " + str(line_num) + ' Contains Charcters other than ATGCN')
                elif position == 3:
                    if not line.startswith('+'):
                        raise ValueError("File Format Error Line " + str(line_num) + ' Does Not Start with a +')
                else:
                    # quality line, check for valid characters
                    if not valid_quality_chars.issuperset(line):
                        raise ValueError("File Format Error Line " + str(line_num) + ' Contains a non-standard phred score')

            # Check if the number of lines is a multiple of 4 (valid FASTQ format)
            if line_num % 4 != 0:
                raise ValueError("File Format Error: FastQ had " + str(line_num) + ' lines, not a multiple of 4')

        valid_format = True

    except Exception as e:
        print(f"Error processing {fastq}: {e}", file=sys.stderr)

    if valid_format:
        print(fastq + ' is a proper fastq file')
        return True
    else:
        print(fastq + ' is NOT a proper fastq file')
        return False

# Standardizing the input
# Decompress (or copy) the input to a temp file and validate it. This defines
# `temp_uncompressed`, which the renaming and clean-up steps below read.

temp_uncompressed = decompress_fastq(fastq_input, sample_name)

# Validation is skipped when decompression left no temp file behind.
fastq_is_valid = os.path.isfile(temp_uncompressed) and check_fastq(temp_uncompressed)

# Renaming sequence files
# Some ONT sequencing reads come with very long headers that do not work with certain programs.
# Every header line is replaced with @<sample_name>_<read number>; the other
# lines are written back with surrounding whitespace stripped.
# NOTE(review): this step is described as optional but always runs -- confirm
# whether a switch should control it.

output_uncompressed = sample_name + '.fastq'
renamed = False

if fastq_is_valid:
    try:
        with open(temp_uncompressed, 'r') as infile, open(output_uncompressed, 'w') as outfile:
            read_count = 0
            for line_num, line in enumerate(infile):
                if line_num % 4 == 0:
                    # Sequence identifier line (starts with '@')
                    read_count += 1
                    sequence_name = f"{sample_name}_{read_count}"
                    outfile.write(f"@{sequence_name}\n")
                else:
                    # Other lines (sequence, separator, quality scores)
                    outfile.write(f"{line.strip()}\n")

        # Only report success when the whole file was rewritten without error
        print(f"Successfully renamed sequences")
        renamed = True

    except Exception as e:
        print(f"Error reading or writing files: {e}", file=sys.stderr)
else:
    print(f"Skipping renaming: {temp_uncompressed} is missing or not a valid fastq file", file=sys.stderr)

# Writing gzipping

output_compressed = sample_name + '.fastq.gz'

# Compress only when the renamed file was actually produced
if renamed:
    try:
        with open(output_uncompressed, 'rb') as f_in, gzip.open(output_compressed, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        print(f"Successfully compressed {output_uncompressed} to {output_compressed}")

    except Exception as e:
        print(f"Error compressing file: {e}", file=sys.stderr)

# Removing temp files
# Each file is removed independently so one failure does not leave the other behind.

for temp_path in (output_uncompressed, temp_uncompressed):
    if not os.path.exists(temp_path):
        continue
    try:
        os.remove(temp_path)
        print(f"Successfully deleted {temp_path}")
    except Exception as e:
        print(f"Error deleting file: {e}", file=sys.stderr)

File test_notebook/barcode01_subset.fastq was already unzipped and written to temporary file sample_1_unc_tmp.fastq
sample_1_unc_tmp.fastq is a proper fasta file
Successfully renamed sequences
Successfully compressed sample_1.fastq to sample_1.fastq.gz
Successfully deleted sample_1.fastq
Successfully deleted sample_1_unc_tmp.fastq


In [14]:
# NOTE(review): leftover inspection cell -- no module-level `phred` is assigned in
# the visible code, so this relies on existing kernel state; confirm or remove.
phred.values()

dict_values(['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~'])