In [1]:
### Open Reading Frame for ncbi reference sequence RBD from https://www.ncbi.nlm.nih.gov/gene/1489668:

################################################
# Start sequence of RBD in NCBI: AGGGTTGTTCCCTCA
# End sequence of RBD in NCBI: CCAGTGTGTCAATTTT

# FROM MSA between GISAID ref genome and NCBI ref genome 
# Start sequence of RBD in GISAID: AGAGTCCAACCAACAG
# End sequence of RBD in GISAID: CAAATGTGTCAATTTC
################################################

# NOTE: Since ORF indices for RBD belongs to NCBI ref genome, it needs to be converted to that for 
# GISAID ref genome since MSA file is based on that. So, to do that, the two ref genomes were aligned 
# using MUSCLE: https://www.ebi.ac.uk/Tools/msa/muscle/
# With this MSA, and start and end sites for RBD being 22407 and 23072 in NCBI reference genome, 
# the position of these two sites were obtained from the MSA file below:

def find_positions_RBD_in_GISAID_msa(msa_fasta_file):
    with open(msa_fasta_file, "r") as file:
        lines = file.readlines()

    # Combine the lines to form a single sequence.
    sequence = "".join([line.strip() for line in lines[1:]])  # Skip the header line.

    # Initialize counters for non-gap and total characters, and positions.
    non_gap_count = 0
    total_count = 0
    position_RBD_start = None
    position_RBD_end = None

    # Iterate through the sequence to find the positions and count characters.
    for i, char in enumerate(sequence):
        total_count += 1
        if char != "-":
            non_gap_count += 1
            if non_gap_count == 22407:
                position_RBD_start = i + 1  # Adjust for 1-based indexing.
            if non_gap_count == 23072:
                position_RBD_end = i + 1

    if position_RBD_start is not None and position_RBD_end is not None:
        print("Starting position of RBD in GISAID ref genome (with gaps):", position_RBD_start)
        print("Ending position of RBD in GISAID ref genome (with gaps):", position_RBD_end)

def main():
    msa_fasta_file = "your/path/MUSCLE_MSA_refGenomes.fasta"  # Change this to the actual file path.
    find_positions_RBD_in_GISAID_msa(msa_fasta_file)

if __name__ == "__main__":
    main()

Starting position of RBD in GISAID ref genome (with gaps): 22567
Ending position of RBD in GISAID ref genome (with gaps): 23235


In [6]:
## After the start and stop sites for RBD in MSA between the two ref genomes were obtained, the gap characters 
## were removed from the GISAID ref genome to identify the position of the start and stop sites of RBD below:

from Bio import SeqIO

def count_and_find_positions(msa_file):
    # Read the MSA FASTA file
    records = list(SeqIO.parse(msa_file, "fasta"))

    # Check if the file has at least two sequences
    if len(records) < 2:
        print("Error: MSA file should contain at least two sequences.")
        return

    # Extract the second sequence
    sequence = str(records[1].seq)

    # Count characters at positions 22567 and 23235 (including gaps)
    count_at_22567 = sequence.count('-', 0, 22566) + sequence[0:22567].count('A') + sequence[0:22567].count('T') + sequence[0:22567].count('G') + sequence[0:22567].count('C')
    count_at_23235 = sequence.count('-', 0, 23234) + sequence[0:23235].count('A') + sequence[0:23235].count('T') + sequence[0:23235].count('G') + sequence[0:23235].count('C')

    print(f"Position of RBD start site in MSA between NCBI and GISAID (including gaps): {count_at_22567}")
    print(f"Position of RBD stop site in MSA between NCBI and GISAID (including gaps): {count_at_23235}")

    # Find the positions of characters without gaps
    pos_22567_no_gaps = count_at_22567 - sequence[0:22567].count('-')
    pos_23235_no_gaps = count_at_23235 - sequence[0:23235].count('-')

    print(f"Position of RBD start site in GISAID  without gaps: {pos_22567_no_gaps}")
    print(f"Position of RBD stop site in GISAID without gaps: {pos_23235_no_gaps}")

if __name__ == "__main__":
    # Provide the path to your MSA FASTA file
    msa_file_path = "your/path/MUSCLE_MSA_refGenomes.fasta"
    
    count_and_find_positions(msa_file_path)


Position of RBD start site in MSA between NCBI and GISAID (including gaps): 22567
Position of RBD stop site in MSA between NCBI and GISAID (including gaps): 23235
Position of RBD start site in GISAID  without gaps: 22517
Position of RBD stop site in GISAID without gaps: 23185


In [None]:
## RBD extraction from GISAID MSA file here onwards

In [7]:
# Python script identifies the start and end sites for RBD in reference genome when gaps are added for the
# GISAID MSA sequences by counting nucleotides (A,T,G or C) only and ignoring gap character in "-" and then 
# counting these positions after gaps are added to get the actual location for these two 
# positions when gap is present

### Open Reading Frame for RBD from https://www.ncbi.nlm.nih.gov/gene/1489668:

##########################################################

## Start position of RBD in GISAID reference genome: 22517
## End position of RBD in GISAID reference genome: 23187 (2 NT ADDED TO MAKE MULTIPLE OF 3)

def find_positions_RBD_in_msa_fasta(msa_fasta_file):
    with open(msa_fasta_file, "r") as file:
        lines = file.readlines()

    # Combine the lines to form a single sequence.
    sequence = "".join([line.strip() for line in lines[1:]])  # Skip the header line.

    # Initialize counters for non-gap and total characters, and positions.
    non_gap_count = 0
    total_count = 0
    position_RBD_start = None
    position_RBD_end = None

    # Iterate through the sequence to find the positions and count characters.
    for i, char in enumerate(sequence):
        total_count += 1
        if char != "-":
            non_gap_count += 1
            if non_gap_count == 22517:
                position_RBD_start = i + 1  # Adjust for 1-based indexing.
            if non_gap_count == 23187:
                position_RBD_end = i + 1

    if position_RBD_start is not None and position_RBD_end is not None:
        print("Starting position of RBD in GISAID ref genome (with gaps):", position_RBD_start)
        print("Ending position of RBD in GISAID ref genome (with gaps):", position_RBD_end)

def main():
    msa_fasta_file = "your/path/ref_genome_MSA.fasta"  # Change this to the actual file path.
    find_positions_RBD_in_msa_fasta(msa_fasta_file)

if __name__ == "__main__":
    main()


Starting position of RBD in GISAID ref genome (with gaps): 35011
Ending position of RBD in GISAID ref genome (with gaps): 36610


In [None]:
# Python script to cleave the rest of the MSA sequences at the position corresponding 
# to spike RBD sequence in reference sequence after adding the gaps:

########################################################################
# RBD Start Position (22517 in ref genome) in MSA with gaps: 35011           
# RBD End Ending Position (23187 in ref genome) in MSA with gaps: 36610

######################### WORKING SCRIPT ###############################

import torch
from tqdm import tqdm

def trim_and_write_sequences(input_fasta, output_fasta, start_position, end_position):
    # Count the number of sequences for progress tracking
    with open(input_fasta, "r") as infile:
        num_sequences = sum(1 for line in infile if line.startswith(">"))

    with open(input_fasta, "r") as infile, open(output_fasta, "w") as outfile:
        current_sequence = ""
        current_header = ""

        for line in tqdm(infile, total=num_sequences, desc="Processing sequences"):
            if line.startswith(">"):
                # Write the previous sequence immediately after trimming
                if current_sequence:
                    # Convert the sequence to a tensor
                    sequence_tensor = torch.tensor([ord(c) for c in current_sequence], dtype=torch.int32, device='cuda')
                    # Trim and replace "-" with ""
                    trimmed_sequence_tensor = sequence_tensor[start_position - 1:end_position]
                    trimmed_sequence = ''.join(chr(c) for c in trimmed_sequence_tensor.cpu().numpy() if chr(c) != "-")
                    outfile.write(f"{current_header}\n{trimmed_sequence}\n")

                # Update current header for the next sequence
                current_header = line.strip()
                current_sequence = ""
            else:
                # Accumulate the sequence
                current_sequence += line.strip()

        # Trim and write the last sequence
        if current_sequence:
            sequence_tensor = torch.tensor([ord(c) for c in current_sequence], dtype=torch.int32, device='cuda')
            trimmed_sequence_tensor = sequence_tensor[start_position - 1:end_position]
            trimmed_sequence = ''.join(chr(c) for c in trimmed_sequence_tensor.cpu().numpy() if chr(c) != "-")
            outfile.write(f"{current_header}\n{trimmed_sequence}\n")

    print(f"Trimmed sequences between positions {start_position} and {end_position} to '{output_fasta}'.")

def main():
    input_fasta = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/msaCodon_0522.fasta"
    output_fasta = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/msaCodon_0522_trimmed_RBD.fasta"
    start_position = 35011
    end_position = 36610

    trim_and_write_sequences(input_fasta, output_fasta, start_position, end_position)

if __name__ == "__main__":
    main()

Processing sequences:  68%|██████▊   | 10352067/15129074 [7:18:14<3:21:20, 395.44it/s]

In [3]:
# Counting sequence statistics for trimmed RBD fasta file to find out if outliers exist

from Bio import SeqIO

def generate_sequence_statistics(fasta_file):
    sequence_lengths = []
    total_sequences = 0
    total_bases = 0

    with open(fasta_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            total_sequences += 1
            sequence_length = len(record.seq)
            sequence_lengths.append(sequence_length)
            total_bases += sequence_length

    # Calculate statistics
    min_length = min(sequence_lengths)
    max_length = max(sequence_lengths)
    average_length = total_bases / total_sequences

    # Print the statistics
    print("Sequence Statistics:")
    print(f"Total Sequences: {total_sequences}")
    print(f"Total Bases: {total_bases}")
    print(f"Minimum Length: {min_length} bases")
    print(f"Maximum Length: {max_length} bases")
    print(f"Average Length: {average_length:.2f} bases")

if __name__ == "__main__":
    fasta_file = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/msaCodon_0522_trimmed_RBD.fasta"  # Replace with your FASTA file
    generate_sequence_statistics(fasta_file)


Sequence Statistics:
Total Sequences: 15129074
Total Bases: 9590532564
Minimum Length: 0 bases
Maximum Length: 714 bases
Average Length: 633.91 bases


In [1]:
# Calculate first quartile (Q1), third quartile (Q3), and the interquartile range (IQR) and 
# use the IQR to determine the lower and upper bounds for outlier detection and remove outlier readlengths.

import numpy as np
from collections import defaultdict

def count_sequence_lengths(fasta_file):
    sequence_lengths = []
    sequences = []

    with open(fasta_file, "r") as file:
        sequence = ""
        header = ""

        for line in file:
            if line.startswith(">"):
                if sequence:
                    sequence_lengths.append(len(sequence))
                    sequences.append((header, sequence))
                header = line.strip()
                sequence = ""
            else:
                sequence += line.strip()

        # Don't forget to count the last sequence
        if sequence:
            sequence_lengths.append(len(sequence))
            sequences.append((header, sequence))

    return sequence_lengths, sequences

def calculate_iqr_outliers(sequence_lengths):
    q1 = np.percentile(sequence_lengths, 25)
    q3 = np.percentile(sequence_lengths, 75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return lower_bound, upper_bound

def filter_outliers_and_write(sequences, lower_bound, upper_bound, output_file):
    with open(output_file, "w") as outfile:
        for header, sequence in sequences:
            if lower_bound <= len(sequence) <= upper_bound:
                outfile.write(f"{header}\n{sequence}\n")

def main():
    fasta_file = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/msaCodon_0522_trimmed_RBD.fasta"
    output_file = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_0522_trimmed_RBD.fasta"

    sequence_lengths, sequences = count_sequence_lengths(fasta_file)
    lower_bound, upper_bound = calculate_iqr_outliers(sequence_lengths)
    filter_outliers_and_write(sequences, lower_bound, upper_bound, output_file)

    print(f"Sequences between lengths {int(lower_bound)} and {int(upper_bound)} written to '{output_file}'.")

if __name__ == "__main__":
    main()


Sequences between lengths 634 and 634 written to '/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_0522_trimmed_RBD.fasta'.


In [2]:
# Counting sequence statistics for filtered trimmed RBD fasta file to find out if outliers exist

from Bio import SeqIO

def generate_sequence_statistics(fasta_file):
    sequence_lengths = []
    total_sequences = 0
    total_bases = 0

    with open(fasta_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            total_sequences += 1
            sequence_length = len(record.seq)
            sequence_lengths.append(sequence_length)
            total_bases += sequence_length

    # Calculate statistics
    min_length = min(sequence_lengths)
    max_length = max(sequence_lengths)
    average_length = total_bases / total_sequences

    # Print the statistics
    print("Sequence Statistics:")
    print(f"Total Sequences: {total_sequences}")
    print(f"Total Bases: {total_bases}")
    print(f"Minimum Length: {min_length} bases")
    print(f"Maximum Length: {max_length} bases")
    print(f"Average Length: {average_length:.2f} bases")

if __name__ == "__main__":
    fasta_file = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_0522_trimmed_RBD.fasta"  # Replace with your FASTA file
    generate_sequence_statistics(fasta_file)


Sequence Statistics:
Total Sequences: 14931769
Total Bases: 9466741546
Minimum Length: 634 bases
Maximum Length: 634 bases
Average Length: 634.00 bases


In [4]:
## Python script to make a subset of the filtered_msaCodon_1024_trimmed_RBD.fasta without N and  
## also make sure length of each sequence is multiple of 3 

from Bio import SeqIO
import random

# Input FASTA file and output file
input_fasta = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_0522_trimmed_RBD.fasta"
output_fasta = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.fasta"

# Read all sequences from the input FASTA file
all_sequences = list(SeqIO.parse(input_fasta, "fasta"))

# Randomly select 1 million sequences
num_sequences_to_select = 14900000
subset_sequences = random.sample(all_sequences, min(num_sequences_to_select, len(all_sequences)))

# Filter sequences to contain only A, T, G, and C, and have lengths as multiples of 3
valid_sequences = [seq for seq in subset_sequences if set(str(seq.seq)).issubset("ATGC")]

# Ensure each selected sequence length is a multiple of 3 or remove 1 or 2 nucleotides
for i in range(len(valid_sequences)):
    seq_len = len(valid_sequences[i])
    if seq_len % 3 != 0:
        trim_length = seq_len % 3
        valid_sequences[i] = valid_sequences[i][:-trim_length]

# Write the selected sequences to the output FASTA file
with open(output_fasta, "w") as output_handle:
    SeqIO.write(valid_sequences, output_handle, "fasta")

print(f"Randomly selected {num_sequences_to_select} valid sequences and saved to {output_fasta}")


Randomly selected 14900000 valid sequences and saved to /mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.fasta


In [1]:
# Script to write out matching fasta files with Accession ID values in metadata file to csv files with Variant and Lineage values

from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
from tqdm import tqdm
from multiprocessing import Pool, cpu_count, Manager

# Load your metadata DataFrame and prepare it for lookups
csv_file = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/metadata_2024_09_05.tsv"
df_metadata = pd.read_csv(csv_file, sep='\t', dtype=str, low_memory=False, encoding='latin-1')

# Convert metadata DataFrame to a dictionary for fast access by Accession ID
metadata_dict = df_metadata.set_index('Accession ID')[['Pango lineage', 'Variant']].to_dict(orient='index')

# Define the output CSV file
output_csv_file = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.csv"

# Function to process each sequence
def process_sequence(record):
    accession_id_fasta = record.description.split("|")[1]
    sequence = str(record.seq)

    # Check if the Accession ID is present in the metadata dictionary
    if accession_id_fasta in metadata_dict:
        metadata = metadata_dict[accession_id_fasta]
        lineage = metadata['Pango lineage']
        variant = metadata['Variant']

        # Create a dictionary for the current record
        return {
            'Accession ID': accession_id_fasta,
            'Lineage': lineage,
            'RBD nucleotide': sequence,
            'Variant': variant
        }
    return None

# Function to handle batch writing to CSV
def write_results_to_csv(output_csv_file, queue):
    with open(output_csv_file, 'w', newline='') as f:
        writer = pd.DataFrame(columns=['Accession ID', 'Lineage', 'RBD nucleotide', 'Variant'])
        writer.to_csv(f, index=False)

    # Batch size for writing to CSV
    batch_size = 1000
    batch = []
    
    while True:
        result = queue.get()
        if result == 'DONE':
            break
        batch.append(result)
        
        if len(batch) >= batch_size:
            df = pd.DataFrame(batch)
            df.to_csv(output_csv_file, mode='a', header=False, index=False)
            batch.clear()

    # Write any remaining records
    if batch:
        df = pd.DataFrame(batch)
        df.to_csv(output_csv_file, mode='a', header=False, index=False)

# Main function to handle multiprocessing
def main():
    fasta_file = "/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.fasta"
    
    # Estimate the total number of sequences for tqdm
    with open(fasta_file, "r") as fasta_file_handle:
        total_sequences_estimated = sum(1 for line in fasta_file_handle if line.startswith(">"))

    # Manager to handle a shared queue for multiprocessing
    manager = Manager()
    result_queue = manager.Queue()

    # Start the writer process
    writer_process = Pool(processes=1)
    writer_process.apply_async(write_results_to_csv, (output_csv_file, result_queue))

    # Use multiprocessing pool to process sequences in parallel
    with Pool(processes=cpu_count()) as pool, tqdm(total=total_sequences_estimated, desc="Processing sequences") as pbar:
        for result in pool.imap(process_sequence, SeqIO.parse(fasta_file, "fasta")):
            pbar.update()
            if result:
                result_queue.put(result)

    # Signal the writer process to stop
    result_queue.put('DONE')
    writer_process.close()
    writer_process.join()

    print(f"Output CSV file saved as {output_csv_file}")

# Execute the script
if __name__ == "__main__":
    main()

Processing sequences: 100%|██████████| 12027544/12027544 [33:04<00:00, 6059.97it/s] 


Output CSV file saved as /mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.csv


In [11]:
import pandas as pd
rbd_csv = pd.read_csv("/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.csv")


In [21]:
rbd_csv.head()

Unnamed: 0,Accession ID,Lineage,RBD nucleotide,Variant
0,EPI_ISL_10839200,BA.1.15,ACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACT...,Former VOC Omicron GRA (B.1.1.529+BA.*)
1,EPI_ISL_5652980,P.1.11,ACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACT...,Former VOC Gamma GR/501Y.V3 (P.1+P.1.*)
2,EPI_ISL_17397934,XBB.1.9.2,ACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACT...,VUM GRA (XBB.1.9.2+XBB1.9.2.*)
3,EPI_ISL_9443030,BA.1,ACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACT...,Former VOC Omicron GRA (B.1.1.529+BA.*)
4,EPI_ISL_4353158,AY.4,ACTTGACCCTCTCTCAGAAACAAAGTGTACGTTGAAATCCTTCACT...,Former VOC Delta GK (B.1.617.2+AY.*)


In [16]:
# Remove everything after the closing parenthesis ")" in each value of the 'Variant' column
rbd_csv['Variant'] = rbd_csv['Variant'].str.split(')').str[0] + ')'

# Access the 'Variant' column
unique_Variant = rbd_csv['Variant'].unique()

# Print the unique values in the 'Variant' column
print(unique_Variant)

['Former VOC Omicron GRA (B.1.1.529+BA.*)'
 'Former VOC Gamma GR/501Y.V3 (P.1+P.1.*)'
 'VUM GRA (XBB.1.9.2+XBB1.9.2.*)' 'Former VOC Delta GK (B.1.617.2+AY.*)'
 'Former VOC Alpha GRY (B.1.1.7+Q.*)' nan
 'VUM GRA (XBB.1.9.1+XBB.1.9.1.*)' 'VOI GRA (XBB.1.5+XBB.1.5.*)'
 'VOI GRA (EG.5+EG.5.*)' 'VUM GRA (CH.1.1+CH.1.1.*)'
 'VOI GRA (XBB.1.16+XBB.1.16.*)'
 'Former VOI Lambda GR/452Q.V1 (C.37+C.37.1)'
 'VUM GRA (BA.2.75+BA.2.75.*)' 'Former VOI Zeta GR/484K.V2 (P.2)'
 'VUM GRA (XBB.2.3+XBB.2.3.*)'
 'Former VOI Epsilon GH/452R.V1 (B.1.429+B.1.427)'
 'Former VOI Kappa G/452R.V3 (B.1.617.1)'
 'VUM GRA (XBB+XBB.* excluding XBB.1.5, XBB.1.16, XBB.1.9.1, XBB.1.9.2, XBB.2.3)'
 'Former VOC Beta GH/501Y.V2 (B.1.351+B.1.351.2+B.1.351.3)'
 'Former VOI Mu GH (B.1.621+B.1.621.1)'
 'Former VOI Eta G/484K.V3 (B.1.525)' 'VOI GRA (JN.1+JN.1.*)'
 'Former VOI Iota GH/253G.V1 (B.1.526)'
 'Former VUM GH/490R (B.1.640+B.1.640.*)'
 'VOI GRA (BA.2.86+BA.2.86.* excluding JN.1, JN.1.*)'
 'Former VOI Theta GR/1092K.V1 (

In [17]:
# Get the value counts for the 'Variant' column
variant_counts = rbd_csv['Variant'].value_counts()

# Print the counts of each unique 'Variant'
print(variant_counts)

Variant
Former VOC Omicron GRA (B.1.1.529+BA.*)                                           4979165
Former VOC Delta GK (B.1.617.2+AY.*)                                              3888578
Former VOC Alpha GRY (B.1.1.7+Q.*)                                                1043401
VOI GRA (XBB.1.5+XBB.1.5.*)                                                        292598
VOI GRA (EG.5+EG.5.*)                                                              167378
Former VOC Gamma GR/501Y.V3 (P.1+P.1.*)                                            113831
VOI GRA (XBB.1.16+XBB.1.16.*)                                                       92813
VUM GRA (BA.2.75+BA.2.75.*)                                                         86212
VUM GRA (XBB.1.9.1+XBB.1.9.1.*)                                                     77582
VUM GRA (XBB+XBB.* excluding XBB.1.5, XBB.1.16, XBB.1.9.1, XBB.1.9.2, XBB.2.3)      70754
Former VOI Epsilon GH/452R.V1 (B.1.429+B.1.427)                                     58365
VU

In [27]:
import pandas as pd

# Load the CSV file
#rbd_csv = pd.read_csv("/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.csv")

# Dictionary to map old Variant values to new Variant values
variant_replacements = {
    "VOI GRA (XBB.1.5+XBB.1.5.*)": "VOI Omicron-subtype GRA (XBB.1.5+XBB.1.5.*)",
    "VOI GRA (EG.5+EG.5.*)": "VOI Eris GRA (EG.5+EG.5.*)",
    "VOI GRA (XBB.1.16+XBB.1.16.*)": "VOI Omicron-subtype GRA (XBB.1.16+XBB.1.16.*)",
    "VUM GRA (BA.2.75+BA.2.75.*)": "VUM Omicron-subtype GRA (BA.2.75+BA.2.75.*)",
    "VOI GRA (JN.1+JN.1.*)": "VOI FLiRT GRA (JN.1+JN.1.*)"
}

# Replace the values in the 'Variant' column based on the dictionary
rbd_csv['Variant'] = rbd_csv['Variant'].replace(variant_replacements)

# Get the value counts for the 'Variant' column
variant_counts = rbd_csv['Variant'].value_counts()

# Print the counts of each unique 'Variant'
print(variant_counts)



Variant
Former VOC Omicron GRA (B.1.1.529+BA.*)                                           4979165
Former VOC Delta GK (B.1.617.2+AY.*)                                              3888578
Former VOC Alpha GRY (B.1.1.7+Q.*)                                                1043401
VOI Omicron-subtype GRA (XBB.1.5+XBB.1.5.*)                                        292598
VOI Eris GRA (EG.5+EG.5.*)                                                         167378
Former VOC Gamma GR/501Y.V3 (P.1+P.1.*)                                            113831
VOI Omicron-subtype GRA (XBB.1.16+XBB.1.16.*)                                       92813
VUM Omicron GRA (BA.2.75+BA.2.75.*)                                                 86212
VUM GRA (XBB.1.9.1+XBB.1.9.1.*)                                                     77582
VUM GRA (XBB+XBB.* excluding XBB.1.5, XBB.1.16, XBB.1.9.1, XBB.1.9.2, XBB.2.3)      70754
Former VOI Epsilon GH/452R.V1 (B.1.429+B.1.427)                                     58365
VU

In [29]:
import pandas as pd

# Load the CSV file
#rbd_csv = pd.read_csv("/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.csv")

# Define the mapping of keywords to VOC values
variant_to_voc = {
    "Former VOC Omicron GRA": "Omicron",
    "Former VOC Delta GK": "Delta",
    "Former VOC Alpha GRY": "Alpha",
    "VOI Omicron-subtype GRA": "Omicron",
    "VUM GRA": "VUM_GRA",
    "VOI Eris GRA": "Eris",
    "Former VOC Gamma GR/501Y.V3": "Gamma",
    "VUM Omicron GRA": "Omicron",
    "Former VOI Epsilon GH/452R.V1": "Epsilon",
    "Former VOI Iota GH/253G.V1": "Iota",
    "Former VOC Beta GH/501Y.V2": "Beta",
    "Former VOI Mu GH": "Mu",
    "Former VOI Lambda GR/452Q.V1": "Lambda",
    "Former VOI Eta G/484K.V3": "Eta",
    "Former VOI Zeta GR/484K.V2": "Zeta",
    "VOI FLiRT GRA": "FLiRT",
    "Former VOI Kappa G/452R.V3": "Kappa",
    "Former VUM GH/490R": "VUM_GH",
    "VOI GRA": "VOI_GRA",
    "Former VOI Theta GR/1092K.V1": "Theta"
}

# Function to map the Variant to VOC
def map_variant_to_voc(variant):
    if pd.isna(variant):
        return "nonVOC"
    for key in variant_to_voc:
        if key in variant:
            return variant_to_voc[key]
    return "nonVOC"

# Ensure the Variant column is treated as a string
rbd_csv['Variant'] = rbd_csv['Variant'].astype(str)

# Apply the function to the Variant column and create the VOC column
rbd_csv['VOC'] = rbd_csv['Variant'].apply(map_variant_to_voc)

# Get the value counts for the 'VOC' column
voc_counts = rbd_csv['VOC'].value_counts()

# Print the counts of each unique VOC
print(voc_counts)

# Save the updated DataFrame to a new CSV file if needed
rbd_csv.to_csv("/mmfs1/projects/changhui.yan/DeewanB/recent_hcov19_MSA/msaCodon_0522/filtered_msaCodon_1024_trimmed_RBD_15mil.csv", index=False)


VOC
Omicron    5450788
Delta      3888578
Alpha      1043401
nonVOC      940219
VUM_GRA     269553
Eris        167378
Gamma       113831
Epsilon      58365
Iota         37878
Beta         14825
Mu           11307
Lambda        9169
Eta           7079
Zeta          6188
FLiRT         4277
Kappa         3594
VUM_GH         805
VOI_GRA        207
Theta           85
Name: count, dtype: int64
