In [2]:
# Take one complete SARS-CoV-2 genome and split it into several non-overlapping pieces of random lengths (1-512 bases),
# then use the resulting reads to pretrain a transformer-based model

import random
import csv
from Bio import SeqIO

def _split_into_random_pieces(sequence, max_length):
    """Cut *sequence* into back-to-back pieces of random length 1..max_length."""
    pieces = []
    total = len(sequence)
    start = 0
    while start < total:
        # Clamp to the sequence end so the final piece keeps the leftover
        # bases instead of being dropped.
        end = min(start + random.randint(1, max_length), total)
        pieces.append(str(sequence[start:end]))
        # Slice ends are exclusive, so the next piece starts exactly at
        # ``end``; using ``end + 1`` would silently skip one base per piece.
        start = end
    return pieces


def generate_random_sequences(input_file, output_file, max_length=512):
    """Split every FASTA record into random-length pieces and write them to CSV.

    Each record in *input_file* is cut into consecutive, non-overlapping
    pieces whose lengths are drawn uniformly from ``1..max_length``. The
    pieces of one record tile it completely: concatenating them in order
    gives back the original sequence, and the last piece is shortened as
    needed to end at the final base. Records shorter than ``max_length``
    are split the same way rather than being discarded.

    The output CSV has a single ``Sequence`` header row followed by one
    piece per row. A one-line summary is printed when done.

    Args:
        input_file: Path to the input file, parsed as FASTA.
        output_file: Path of the CSV file to create or overwrite.
        max_length: Upper bound (inclusive) on the length of a piece.
            Defaults to 512.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")

    sequences = []
    with open(input_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            sequences.extend(_split_into_random_pieces(record.seq, max_length))

    # newline="" lets the csv module control line endings itself.
    with open(output_file, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Sequence"])
        writer.writerows([seq] for seq in sequences)

    print(f"Generated {len(sequences)} sequences and saved them to {output_file}.")


# Example usage: replace both placeholder paths with real locations,
# then run the splitter on them.
input_file, output_file = (
    "your/path/cov_sequence.fasta",
    "your/path/cov_sequence.csv",
)
generate_random_sequences(input_file=input_file, output_file=output_file)


Generated 114 sequences and saved them to /Users/DB/LineageEvolution/cov_sequence.csv.
