In [1]:
### Open Reading Frame for ncbi reference sequence RBD from https://www.ncbi.nlm.nih.gov/gene/1489668:

################################################
# Start sequence of RBD in NCBI: AGGGTTGTTCCCTCA
# End sequence of RBD in NCBI: CCAGTGTGTCAATTTT

# FROM MSA between GISAID ref genome and NCBI ref genome 
# Start sequence of RBD in GISAID: AGAGTCCAACCAACAG
# End sequence of RBD in GISAID: CAAATGTGTCAATTTC
################################################

# NOTE: Since ORF indices for RBD belongs to NCBI ref genome, it needs to be converted to that for 
# GISAID ref genome since MSA file is based on that. So, to do that, the two ref genomes were aligned 
# using MUSCLE: https://www.ebi.ac.uk/Tools/msa/muscle/
# With this MSA, and start and end sites for RBD being 22407 and 23072 in NCBI reference genome, 
# the position of these two sites were obtained from the MSA file below:

def find_positions_RBD_in_GISAID_msa(msa_fasta_file):
    with open(msa_fasta_file, "r") as file:
        lines = file.readlines()

    # Combine the lines to form a single sequence.
    sequence = "".join([line.strip() for line in lines[1:]])  # Skip the header line.

    # Initialize counters for non-gap and total characters, and positions.
    non_gap_count = 0
    total_count = 0
    position_RBD_start = None
    position_RBD_end = None

    # Iterate through the sequence to find the positions and count characters.
    for i, char in enumerate(sequence):
        total_count += 1
        if char != "-":
            non_gap_count += 1
            if non_gap_count == 22407:
                position_RBD_start = i + 1  # Adjust for 1-based indexing.
            if non_gap_count == 23072:
                position_RBD_end = i + 1

    if position_RBD_start is not None and position_RBD_end is not None:
        print("Starting position of RBD in GISAID ref genome (with gaps):", position_RBD_start)
        print("Ending position of RBD in GISAID ref genome (with gaps):", position_RBD_end)

def main():
    msa_fasta_file = "your/path/MUSCLE_MSA_refGenomes.fasta"  # Change this to the actual file path.
    find_positions_RBD_in_GISAID_msa(msa_fasta_file)

if __name__ == "__main__":
    main()

Starting position of RBD in GISAID ref genome (with gaps): 22567
Ending position of RBD in GISAID ref genome (with gaps): 23235


In [6]:
## After the start and stop sites for RBD in MSA between the two ref genomes were obtained, the gap characters 
## were removed from the GISAID ref genome to identify the position of the start and stop sites of RBD below:

from Bio import SeqIO

def count_and_find_positions(msa_file):
    # Read the MSA FASTA file
    records = list(SeqIO.parse(msa_file, "fasta"))

    # Check if the file has at least two sequences
    if len(records) < 2:
        print("Error: MSA file should contain at least two sequences.")
        return

    # Extract the second sequence
    sequence = str(records[1].seq)

    # Count characters at positions 22567 and 23235 (including gaps)
    count_at_22567 = sequence.count('-', 0, 22566) + sequence[0:22567].count('A') + sequence[0:22567].count('T') + sequence[0:22567].count('G') + sequence[0:22567].count('C')
    count_at_23235 = sequence.count('-', 0, 23234) + sequence[0:23235].count('A') + sequence[0:23235].count('T') + sequence[0:23235].count('G') + sequence[0:23235].count('C')

    print(f"Position of RBD start site in MSA between NCBI and GISAID (including gaps): {count_at_22567}")
    print(f"Position of RBD stop site in MSA between NCBI and GISAID (including gaps): {count_at_23235}")

    # Find the positions of characters without gaps
    pos_22567_no_gaps = count_at_22567 - sequence[0:22567].count('-')
    pos_23235_no_gaps = count_at_23235 - sequence[0:23235].count('-')

    print(f"Position of RBD start site in GISAID  without gaps: {pos_22567_no_gaps}")
    print(f"Position of RBD stop site in GISAID without gaps: {pos_23235_no_gaps}")

if __name__ == "__main__":
    # Provide the path to your MSA FASTA file
    msa_file_path = "your/path/MUSCLE_MSA_refGenomes.fasta"
    
    count_and_find_positions(msa_file_path)


Position of RBD start site in MSA between NCBI and GISAID (including gaps): 22567
Position of RBD stop site in MSA between NCBI and GISAID (including gaps): 23235
Position of RBD start site in GISAID  without gaps: 22517
Position of RBD stop site in GISAID without gaps: 23185


In [None]:
## RBD extraction from GISAID MSA file here onwards

In [7]:
# Python script identifies the start and end sites for RBD in reference genome when gaps are added for the
# GISAID MSA sequences by counting nucleotides (A,T,G or C) only and ignoring gap character in "-" and then 
# counting these positions after gaps are added to get the actual location for these two 
# positions when gap is present

### Open Reading Frame for RBD from https://www.ncbi.nlm.nih.gov/gene/1489668:

##########################################################

## Start position of RBD in GISAID reference genome: 22517
## End position of RBD in GISAID reference genome: 23187 (2 NT ADDED TO MAKE MULTIPLE OF 3)

def find_positions_RBD_in_msa_fasta(msa_fasta_file):
    with open(msa_fasta_file, "r") as file:
        lines = file.readlines()

    # Combine the lines to form a single sequence.
    sequence = "".join([line.strip() for line in lines[1:]])  # Skip the header line.

    # Initialize counters for non-gap and total characters, and positions.
    non_gap_count = 0
    total_count = 0
    position_RBD_start = None
    position_RBD_end = None

    # Iterate through the sequence to find the positions and count characters.
    for i, char in enumerate(sequence):
        total_count += 1
        if char != "-":
            non_gap_count += 1
            if non_gap_count == 22517:
                position_RBD_start = i + 1  # Adjust for 1-based indexing.
            if non_gap_count == 23187:
                position_RBD_end = i + 1

    if position_RBD_start is not None and position_RBD_end is not None:
        print("Starting position of RBD in GISAID ref genome (with gaps):", position_RBD_start)
        print("Ending position of RBD in GISAID ref genome (with gaps):", position_RBD_end)

def main():
    msa_fasta_file = "your/path/ref_genome_MSA.fasta"  # Change this to the actual file path.
    find_positions_RBD_in_msa_fasta(msa_fasta_file)

if __name__ == "__main__":
    main()


Starting position of RBD in GISAID ref genome (with gaps): 35011
Ending position of RBD in GISAID ref genome (with gaps): 36610


In [2]:
# Python script to cleave the rest of the MSA sequences at the position corresponding 
# to spike RBD sequence in reference sequence after adding the gaps:

########################################################################
# RBD Start Position (22517 in ref genome) in MSA with gaps: 35011           
# RBD End Ending Position (23187 in ref genome) in MSA with gaps: 36610

######################### WORKING SCRIPT ###############################

import torch
from tqdm import tqdm

def trim_and_write_sequences(input_fasta, output_fasta, start_position, end_position):
    # Count the number of sequences for progress tracking
    with open(input_fasta, "r") as infile:
        num_sequences = sum(1 for line in infile if line.startswith(">"))

    with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
        current_sequence = ""
        current_header = ""

        for line in tqdm(infile, total=num_sequences, desc="Processing sequences"):
            if line.startswith(">"):
                # Write the previous sequence if any
                if current_sequence:
                    # Convert the sequence to a tensor
                    sequence_tensor = torch.tensor([ord(c) for c in current_sequence], dtype=torch.int32, device='cuda')
                    # Trim and replace "-" with ""
                    trimmed_sequence_tensor = sequence_tensor[start_position - 1:end_position]
                    trimmed_sequence = ''.join(chr(c) for c in trimmed_sequence_tensor.cpu().numpy() if chr(c) != "-")
                    outfile.write(f"{current_header}\n{trimmed_sequence}\n")

                # Update current header
                current_header = line.strip()
                current_sequence = ""
            else:
                current_sequence += line.strip()

        # Write the last sequence
        if current_sequence:
            # Convert the sequence to a tensor
            sequence_tensor = torch.tensor([ord(c) for c in current_sequence], dtype=torch.int32, device='cuda')
            # Trim and replace "-" with ""
            trimmed_sequence_tensor = sequence_tensor[start_position - 1:end_position]
            trimmed_sequence = ''.join(chr(c) for c in trimmed_sequence_tensor.cpu().numpy() if chr(c) != "-")
            outfile.write(f"{current_header}\n{trimmed_sequence}\n")

    print(f"Trimmed sequences between positions {start_position} and {end_position} to '{output_fasta}'.")

def main():
    input_fasta = "/mmfs1/projects/changhui.yan/DeewanB/gisaid_data/main_MSA_files/msaCodon_1024_filtered.fasta"
    output_fasta = "/mmfs1/projects/changhui.yan/DeewanB/gisaid_data/main_MSA_files/msaCodon_1024_trimmed_RBD_new.fasta"
    start_position = 35011
    end_position = 36610

    trim_and_write_sequences(input_fasta, output_fasta, start_position, end_position)

if __name__ == "__main__":
    main()


Processing sequences: 3321710it [2:15:18, 409.14it/s]                             

Trimmed sequences between positions 35011 and 36610 to '/mmfs1/projects/changhui.yan/DeewanB/gisaid_data/main_MSA_files/trimmed_RBD.fasta'.



