In [None]:
# Collecting up to 100k sequences from the fasta corpus (the first ones, in file order, that pass the ambiguous-nucleotide filter) to pretrain the model
# Importing necessary modules
import random
from Bio import SeqIO

# Path to the input fasta file containing SARS-CoV-2 whole genome sequences
db_path = "your/path/sequences.fasta"  # GISAID's SARS-CoV-2 WGS corpus fasta file

# Path to output file to store 100k selected sequences
subset_path = "your/path/100k_sequences.fasta"  # Output fasta file with 100k SARS-CoV-2 WGS files

# Target number of sequences to collect
total_viruses = 100000

# A record is kept only if its fraction of 'N' bases is strictly below this
# value (0.001 == 0.1%)
max_n_fraction = 0.001

# Records that passed the filter, in input order.
# NOTE: every kept record stays in memory; the list is only used below to
# report how many records were written.
selected_sequences = []

# Number of records read from the input file. When the target is reached and
# the input still has records, this includes the one record that was read
# just before the loop stopped.
count = 0

# Stream the input file record by record and write accepted records straight
# to the output file
with open(subset_path, "w") as subset, open(db_path, "r", encoding="latin-1") as handle:

    for record in SeqIO.parse(handle, "fasta"):

        count += 1

        # Stop as soon as the target number of sequences has been collected
        if len(selected_sequences) >= total_viruses:
            break

        sequence = str(record.seq)

        # An empty record has no bases to evaluate; skip it instead of
        # dividing by zero below
        if not sequence:
            continue

        # Count ambiguous 'N' bases regardless of letter case, so that
        # lower-case sequences are filtered the same way as upper-case ones
        n_count = sequence.count('N') + sequence.count('n')
        n_fraction = n_count / len(sequence)

        # Reject sequences whose share of 'N' bases reaches the threshold
        if n_fraction >= max_n_fraction:
            continue

        # Write the accepted record to the output file in fasta format
        SeqIO.write(record, subset, "fasta")
        selected_sequences.append(record)

# Report the number of sequences that met the filtering criteria
num_sequences = len(selected_sequences)
print("Total number of sequences after N% filter: ", num_sequences)

In [None]:
# Splitting each sequence in subset_path (the 100k-sequence corpus) into segments of random length (1-512 nt) and writing them to a csv file for pretraining
import random
import csv
from Bio import SeqIO

# Path to the fasta file containing the 100k selected sequences
subset_path = "your/path/100k_sequences.fasta"

# Path to the output CSV file where segmented sequences will be saved
csv_file = "your/path/100k_seq_segmented.csv"

# Upper bound (inclusive) on the length of a single segment, in nucleotides
max_segment_length = 512

# Number of segment rows written to the CSV file (header not included)
num_segments = 0

# Open the CSV file for writing segmented sequences
with open(csv_file, "w", newline='') as csvfile:

    writer = csv.writer(csvfile)

    # Header row: a single column named 'Segment'
    writer.writerow(['Segment'])

    # Iterate over each sequence record in the input fasta file
    for record in SeqIO.parse(subset_path, "fasta"):

        sequence = str(record.seq)
        seq_length = len(sequence)

        # Walk through the sequence, cutting it into back-to-back segments
        # whose lengths are drawn uniformly from 1..max_segment_length.
        # The last segment is simply whatever remains, so it may be shorter
        # than the drawn length.
        start = 0
        while start < seq_length:

            end = start + random.randint(1, max_segment_length)

            # Slicing excludes index `end`, and clamps at the sequence end
            writer.writerow([sequence[start:end]])
            num_segments += 1

            # The next segment begins exactly at `end`; because the slice
            # above is end-exclusive, starting at `end + 1` would silently
            # drop one nucleotide between every pair of segments.
            start = end

# Total number of rows in the CSV file: one header row plus one row per
# segment (so the number of segments is num_rows - 1)
num_rows = num_segments + 1

# Print the total number of rows in the CSV file (header included)
print("Number of rows in the CSV file: ", num_rows)