In [5]:
def load_fasta(file_path):
    """Read a FASTA file into a dictionary mapping header to sequence.

    A line starting with ">" opens a new record; its header is the rest of
    that line (whitespace-stripped, without the ">"). Every following line up
    to the next header is stripped and concatenated to form the sequence.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        dict: ``{header: sequence}``. Lines that appear before the first
        header are ignored. If the same header occurs more than once, the
        last record with that header overwrites the earlier ones.
    """
    fasta_dict = {}
    header = None  # None means "no record opened yet"
    sequence = []

    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()  # Remove any leading/trailing whitespace
            if line.startswith(">"):
                # Compare with None instead of relying on truthiness so that a
                # record whose header is empty (a bare ">") is not dropped.
                if header is not None:
                    fasta_dict[header] = ''.join(sequence)
                header = line[1:]  # Remove the ">" from the header
                sequence = []  # Start a new sequence
            else:
                sequence.append(line)  # Add the sequence part to the list

    # Save the last record, which has no following header to trigger the save
    if header is not None:
        fasta_dict[header] = ''.join(sequence)

    return fasta_dict

In [13]:
def break_into_chunks(fasta_dict, chunk_size=1000):
    """Split every sequence into consecutive, non-overlapping chunks.

    Args:
        fasta_dict: Mapping of scaffold name to sequence string.
        chunk_size: Maximum length of each chunk; must be positive. The last
            chunk of a sequence is shorter when the sequence length is not a
            multiple of ``chunk_size``.

    Returns:
        dict: ``{"<scaffold>_<n>": chunk}`` where ``n`` starts at 1 for each
        scaffold. A scaffold whose sequence is empty contributes no entries.

    Raises:
        ValueError: If ``chunk_size`` is not positive. Without this check a
            zero size fails with an opaque division error and a negative size
            silently produces empty or meaningless chunks.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    new_fasta_dict = {}
    for scaffold, sequence in fasta_dict.items():
        # Stepping through the start offsets yields exactly
        # ceil(len(sequence) / chunk_size) chunks.
        chunk_starts = range(0, len(sequence), chunk_size)
        for chunk_number, start in enumerate(chunk_starts, start=1):
            new_fasta_dict[f"{scaffold}_{chunk_number}"] = sequence[start:start + chunk_size]

    return new_fasta_dict

In [23]:
def write_fasta(fasta_dict, output_file, line_width=80):
    """Write a ``{header: sequence}`` mapping to a FASTA file.

    Each record is written as a ">header" line followed by the sequence
    wrapped onto lines of at most ``line_width`` characters. A record with an
    empty sequence produces only its header line.

    Args:
        fasta_dict: Mapping of header to sequence string.
        output_file: Path of the file to create or overwrite.
        line_width: Maximum number of sequence characters per line
            (default 80); must be positive.

    Raises:
        ValueError: If ``line_width`` is not positive.
    """
    # Validate before opening so a bad argument does not truncate an existing file
    if line_width <= 0:
        raise ValueError(f"line_width must be a positive integer, got {line_width!r}")

    with open(output_file, 'w') as f:
        for header, sequence in fasta_dict.items():
            # Write the header line, starting with '>'
            f.write(f">{header}\n")
            # Wrap the sequence onto fixed-width lines for readability
            for start in range(0, len(sequence), line_width):
                f.write(sequence[start:start + line_width] + "\n")

In [14]:

# Example usage: parse the input FASTA into a {header: sequence} dict.
# NOTE(review): the path is relative, so the file is looked up in the
# notebook's working directory — confirm it is present there.
fasta_file = "idCulSono.KS.ABADRU.1.0.fasta"

genome_full = load_fasta(fasta_file)

In [15]:
# Number of records parsed from the FASTA file
len(genome_full)

10

In [16]:
# Break every scaffold into consecutive chunks of at most 1000 characters;
# the resulting keys are "<scaffold>_<chunk number>" with numbering from 1.
chunked_fasta_dict = break_into_chunks(genome_full, chunk_size=1000)

In [18]:
# Total number of chunks produced across all scaffolds
len(chunked_fasta_dict)

136870

In [22]:
# Spot-check one chunk: show its length rather than dumping the raw sequence
len(chunked_fasta_dict['PGA_scaffold_1__1_contigs__length_48100000_1'])

1000

In [24]:
# Write the chunked records to a new FASTA file (an existing file with this
# name is overwritten, since write_fasta opens it in 'w' mode).
write_fasta(chunked_fasta_dict, 'AK_Genome_1kb_chunks.fasta')

In [28]:
# Upper bound on the chunked genome size, in billions of bases:
# number of chunks x 1000 bp per chunk (the last chunk of each scaffold may
# be shorter). The chunk count is derived instead of copied from an output.
len(chunked_fasta_dict) * 1000 / 1_000_000_000

0.13687

In [29]:
# Concatenate all original scaffolds with one join (avoids rebuilding the
# string on every += iteration) and report the total number of bases.
test_all_dna = ''.join(genome_full.values())

len(test_all_dna)

136865895

In [30]:
# Concatenate all chunks with one join and report the total number of bases.
test_all_dna = ''.join(chunked_fasta_dict.values())

# Chunking must neither drop nor duplicate bases relative to the input genome.
assert len(test_all_dna) == sum(len(seq) for seq in genome_full.values())

len(test_all_dna)

136865895