In [9]:
# Path1) Use Lineage value from metadata to get Variant Label from voc.json file 
# voc.json file contains all the Lineage values that NCBI has classified under each VOC/VOI/VUM category and non VOC Lineage values

import csv
import json
from tqdm import tqdm

# Paths for input CSV and JSON files and output CSV file
input_csv_path = '/your/path/to/filtered_msaCodon_1024_trimmed_RBD_3mil.csv'
voc_json_path = '/your/path/to/voc.json'
output_csv_path = '/your/path/to/filtered_msaCodon_1024_trimmed_RBD_3mil_with_LineageMappedVOC.csv'

# Load the VOC data from the JSON file. The loops below iterate it as a
# mapping of {VOC label: iterable of lineage names}.
with open(voc_json_path, 'r') as voc_file:
    voc_data = json.load(voc_file)

# Invert it into {lineage: VOC label}. If a lineage is listed under more than
# one label, the label that comes last in the JSON wins.
lineage_to_voc = {}
for voc_label, lineages in voc_data.items():
    for lineage in lineages:
        lineage_to_voc[lineage] = voc_label

# The csv module documentation asks for newline='' on files handed to
# csv.reader / csv.writer so that newlines embedded in quoted fields survive.
with open(input_csv_path, 'r', newline='') as input_file:
    reader = csv.DictReader(input_file)

    # Validate the header before the output file is created, so that a bad
    # input fails fast instead of leaving a truncated output file behind.
    if reader.fieldnames is None:
        raise ValueError(f"Input CSV has no header row: {input_csv_path}")
    if 'Lineage' not in reader.fieldnames:
        raise KeyError(f"Input CSV is missing the 'Lineage' column: {input_csv_path}")

    with open(output_csv_path, 'w', newline='') as output_file:
        # Output keeps every input column and appends 'VOC_label' at the end
        fieldnames = list(reader.fieldnames) + ['VOC_label']
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        writer.writeheader()

        # Stream row by row; tqdm only adds a progress bar around the reader
        for row in tqdm(reader, desc='Processing rows'):
            # Lineages absent from the mapping are labelled 'Unknown'
            row['VOC_label'] = lineage_to_voc.get(row['Lineage'], 'Unknown')
            writer.writerow(row)


Processing rows: 2999815it [01:15, 39969.37it/s]


In [12]:
# Path1) Use Lineage value from metadata to get Variant Label from voc.json file 
# Creating uniform variant label values from trimmed RBD dataset and mapping labels to numeric values
import pandas as pd
import csv

def transform_csv(input_csv, output_csv):
    """Build the labelled training CSV from the lineage-mapped RBD table.

    Reads ``input_csv``, derives ``label_name`` by prefixing ``'sars_cov_2_'``
    to the ``VOC_label`` column, drops rows labelled ``sars_cov_2_nonVOC`` or
    ``sars_cov_2_Unknown``, numbers the remaining labels in order of first
    appearance, and writes the columns ``EPI_ID``, ``sequence``,
    ``label_name`` and ``label_number`` to ``output_csv``. The label-to-number
    mapping is printed to stdout.

    Args:
        input_csv: Path of a CSV containing at least the columns
            ``Accession ID``, ``RBD nucleotide`` and ``VOC_label``.
        output_csv: Path of the CSV to write (overwritten if it exists).

    Returns:
        None.
    """
    # Read the CSV file
    df = pd.read_csv(input_csv)

    # Create the 'label_name' column by prefixing 'sars_cov_2_' to 'VOC_label'
    df['label_name'] = 'sars_cov_2_' + df['VOC_label']

    # Drop the labels that must not reach the training set. The explicit
    # .copy() makes the result an independent frame, so the column assignment
    # below never writes into a view of the unfiltered data
    # (the SettingWithCopyWarning situation).
    excluded_labels = ['sars_cov_2_nonVOC', 'sars_cov_2_Unknown']
    df = df[~df['label_name'].isin(excluded_labels)].copy()

    # Number the labels in order of first appearance in the filtered data
    label_mapping = {label: idx for idx, label in enumerate(df['label_name'].unique())}

    # Create the 'label_number' column from the mapping
    df['label_number'] = df['label_name'].map(label_mapping)

    # Rename columns to the names used by the output file
    df = df.rename(columns={'Accession ID': 'EPI_ID', 'RBD nucleotide': 'sequence'})

    # Select relevant columns and write them to the output CSV file
    df[['EPI_ID', 'sequence', 'label_name', 'label_number']].to_csv(output_csv, index=False)

    # Print the mapping of label names to numbers
    print("Mapping of label names to numeric values:")
    for label, number in label_mapping.items():
        print(f"{label}: {number}")

if __name__ == "__main__":
    # Source: lineage-mapped table; destination: labelled table without nonVOC/Unknown rows
    input_csv_path, output_csv_path = (
        "/your/path/to/filtered_msaCodon_1024_trimmed_RBD_3mil_with_LineageMappedVOC.csv",
        "/your/path/to/RBD_nucleotides_3mil_LineageMappedVOC_wo_nonvoc.csv",
    )
    transform_csv(input_csv=input_csv_path, output_csv=output_csv_path)


Mapping of label names to numeric values:
sars_cov_2_delta: 0
sars_cov_2_alpha: 1
sars_cov_2_omicron: 2
sars_cov_2_WuhanHu1: 3
sars_cov_2_gamma: 4
sars_cov_2_iota: 5
sars_cov_2_mu: 6
sars_cov_2_kappa: 7
sars_cov_2_zeta: 8
sars_cov_2_beta: 9
sars_cov_2_epsilon: 10
sars_cov_2_lambda: 11
sars_cov_2_eta: 12
sars_cov_2_theta: 13


In [2]:
# Path2) Use Variant column value from metadata to extract Variant label (done in extract_RBD_from_MSA.ipynb script)
# Creating uniform variant label values from trimmed RBD dataset and mapping labels to numeric values
import pandas as pd
import csv

def transform_csv(input_csv, output_csv, exclude_labels=()):
    """Build the labelled training CSV from the ``Variant`` column.

    Reads ``input_csv``, derives ``label_name`` by prefixing ``'sars_cov_2_'``
    to the ``Variant`` column, optionally drops some labels, numbers the
    remaining labels in order of first appearance, and writes the columns
    ``EPI_ID``, ``sequence``, ``label_name`` and ``label_number`` to
    ``output_csv``. The label-to-number mapping is printed to stdout.

    Args:
        input_csv: Path of a CSV containing at least the columns
            ``Accession ID``, ``RBD nucleotide`` and ``Variant``.
        output_csv: Path of the CSV to write (overwritten if it exists).
        exclude_labels: Full ``label_name`` values to drop before numbering,
            given as a single string or an iterable of strings. Pass
            ``['sars_cov_2_nonVOC']`` to exclude non-VOC sequences from the
            training dataset. The default keeps every row.

    Returns:
        None.
    """
    # Read the CSV file
    df = pd.read_csv(input_csv)

    # Create the 'label_name' column by prefixing 'sars_cov_2_' to 'Variant'
    df['label_name'] = 'sars_cov_2_' + df['Variant']

    # Optional label exclusion (replaces editing the code to toggle a filter)
    if isinstance(exclude_labels, str):
        # A bare string means one label, not an iterable of characters
        exclude_labels = [exclude_labels]
    exclude_labels = list(exclude_labels)
    if exclude_labels:
        # .copy() keeps the later column assignment off a view of the full frame
        df = df[~df['label_name'].isin(exclude_labels)].copy()

    # Number the labels in order of first appearance
    label_mapping = {label: idx for idx, label in enumerate(df['label_name'].unique())}

    # Create the 'label_number' column from the mapping
    df['label_number'] = df['label_name'].map(label_mapping)

    # Rename columns to the names used by the output file
    df = df.rename(columns={'Accession ID': 'EPI_ID', 'RBD nucleotide': 'sequence'})

    # Select relevant columns and write them to the output CSV file
    df[['EPI_ID', 'sequence', 'label_name', 'label_number']].to_csv(output_csv, index=False)

    # Print the mapping of label names to numbers
    print("Mapping of label names to numeric values:")
    for label, number in label_mapping.items():
        print(f"{label}: {number}")

if __name__ == "__main__":
    # Source: table carrying the 'Variant' column; destination: labelled table
    input_csv_path, output_csv_path = (
        "/your/path/to/filtered_msaCodon_1024_trimmed_RBD_3mil_with_VOC.csv",
        "/your/path/to/RBD_nucleotides_3mil_wt_nonvoc.csv",
    )
    transform_csv(input_csv=input_csv_path, output_csv=output_csv_path)


Mapping of label names to numeric values:
sars_cov_2_delta: 0
sars_cov_2_alpha: 1
sars_cov_2_omicron: 2
sars_cov_2_gamma: 3
sars_cov_2_iota: 4
sars_cov_2_mu: 5
sars_cov_2_kappa: 6
sars_cov_2_eta: 7
sars_cov_2_beta: 8
sars_cov_2_epsilon: 9
sars_cov_2_lambda: 10


In [20]:
# Generating counts for each variant label and total number of RBD sequences
import pandas as pd
rbd_df = pd.read_csv('/your/path/to/RBD_nucleotides_3mil_wt_nonvoc.csv')

# Rows per distinct 'label_name' value (most frequent first), and the number
# of rows whose 'label_name' is not missing
variant_counts = rbd_df['label_name'].value_counts()
total_variants = rbd_df['label_name'].count()

# Report the per-label counts, one label per line
print("Counts for each Variant value:")
for label, n_rows in variant_counts.items():
    print(f"{label}: {n_rows}")

# Report the overall total
print(f"\nTotal number of sequences: {total_variants}")

Counts for each Variant value:
sars_cov_2_omicron: 1286237
sars_cov_2_delta: 837496
sars_cov_2_nonVOC: 456912
sars_cov_2_alpha: 355994
sars_cov_2_gamma: 28391
sars_cov_2_iota: 11602
sars_cov_2_epsilon: 10211
sars_cov_2_beta: 4175
sars_cov_2_eta: 3673
sars_cov_2_mu: 3151
sars_cov_2_lambda: 1302
sars_cov_2_kappa: 671

Total number of sequences: 2999815


In [2]:
# Checking for ambiguous nucleotides
import pandas as pd
rbd_df = pd.read_csv('/your/path/to/RBD_nucleotides_3mil_wt_nonvoc.csv')

# True when any value in the 'sequence' column contains the ambiguous
# nucleotide N (the match is case-sensitive, so only uppercase 'N' counts)
has_ambiguous_n = rbd_df['sequence'].str.contains('N').any()

print(
    "There is at least one 'N' in the 'RBD nucleotide' column."
    if has_ambiguous_n
    else "No 'N' found in the 'RBD nucleotide' column."
)

No 'N' found in the 'RBD nucleotide' column.


In [21]:
# Creating subset of above output csv with desired number of unique EPI_IDs for each hCOV19 variant label
import pandas as pd
import csv
from collections import defaultdict

def filter_and_write_csv(input_csv, output_csv, min_rows_per_variant=100000, max_rows_per_variant=100000):
    """Write a per-variant capped subset of the rows that have a clean sequence.

    A row is *valid* when its ``sequence`` (compared case-insensitively) is
    non-empty and consists only of A, T, G and C. Variants (``label_name``
    values) with fewer than ``min_rows_per_variant`` valid rows are dropped
    entirely; for every remaining variant the first ``max_rows_per_variant``
    valid rows, in input order, are written to ``output_csv``. A summary of
    the written row counts is printed to stdout.

    Args:
        input_csv: Path of a CSV with the columns ``EPI_ID``, ``label_name``,
            ``label_number`` and ``sequence``.
        output_csv: Path of the CSV to write (overwritten if it exists).
        min_rows_per_variant: Minimum number of valid rows a variant needs in
            order to be written at all.
        max_rows_per_variant: Maximum number of rows written per variant.

    Returns:
        None.
    """
    valid_bases = frozenset('ATGC')

    # Total valid rows seen per variant (needed for the minimum check) ...
    valid_row_totals = defaultdict(int)
    # ... and the rows retained for writing, in order of first appearance.
    retained_rows = defaultdict(list)

    # Only the first max_rows_per_variant rows of a variant are ever written,
    # so there is no need to keep more than that in memory. For None or a
    # negative cap the slice applied at write time needs the complete list.
    cap_is_bounded = max_rows_per_variant is not None and max_rows_per_variant >= 0

    # newline='' is what the csv module documentation asks for on reader files
    with open(input_csv, 'r', newline='') as infile:
        for row in csv.DictReader(infile):
            sequence = row['sequence'].upper()

            # An empty sequence would pass the subset test vacuously, so it is
            # rejected explicitly along with anything outside A/T/G/C.
            if not sequence or not set(sequence) <= valid_bases:
                continue

            variant = row['label_name']
            valid_row_totals[variant] += 1
            rows_for_variant = retained_rows[variant]
            if not cap_is_bounded or len(rows_for_variant) < max_rows_per_variant:
                rows_for_variant.append(row)

    # Rows actually written, per variant and overall
    variant_counts = defaultdict(int)
    total_rows = 0

    # Write to output CSV
    with open(output_csv, 'w', newline='') as outfile:
        fieldnames = ['EPI_ID', 'label_name', 'label_number', 'sequence']
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()

        for variant, rows in retained_rows.items():
            # Skip variants that do not reach the minimum number of valid rows
            if valid_row_totals[variant] < min_rows_per_variant:
                continue
            # Write only max_rows_per_variant rows for each variant
            for row in rows[:max_rows_per_variant]:
                writer.writerow(row)
                variant_counts[variant] += 1
                total_rows += 1

    print(f"Total number of rows in the output CSV: {total_rows}")
    for variant, count in variant_counts.items():
        print(f"Variant: {variant}, Number of rows: {count}")

if __name__ == "__main__":
    # Source: labelled RBD table; destination: the per-variant capped subset
    input_csv_path, output_csv_path = (
        "/your/path/to/RBD_nucleotides_3mil_wt_nonvoc.csv",
        "/your/path/to/RBD_nucleotides_3mil_wt_nonvoc_100k_epi.csv",
    )
    filter_and_write_csv(input_csv_path, output_csv=output_csv_path)


Total number of rows in the output CSV: 428391
Variant: sars_cov_2_delta, Number of rows: 100000
Variant: sars_cov_2_alpha, Number of rows: 100000
Variant: sars_cov_2_omicron, Number of rows: 100000
Variant: sars_cov_2_nonVOC, Number of rows: 100000
Variant: sars_cov_2_gamma, Number of rows: 28391


In [22]:
# Creating 250 bp fragments from the RBD segments generated above, using a 50 bp sliding step
# (consecutive fragments therefore overlap by 200 bp), and generating the reverse complement of each fragment
 
import os
import pandas as pd
import csv
from tqdm import tqdm
from Bio.Seq import Seq  # Import Seq class for reverse complement

# Fragment geometry: every fragment is kmer_length bases long, and
# sliding_window is the distance between the start positions of consecutive fragments
kmer_length = 250
sliding_window = 50

# Directory that receives the fragment CSV
main_dir = "/path/to/virus/finetune/csv/"

# Input: per-variant subset of RBD sequences; output: fragments plus reverse complements
input_csv_path = "/your/path/to/RBD_nucleotides_3mil_wt_nonvoc_100k_epi.csv"
output_csv_path = os.path.join(
    main_dir,
    "RBD_nucleotides_3mil_wt_nonvoc_100k_epi_250bp_50overlap_complementary.csv",
)

# Load the whole input table into memory
data = pd.read_csv(input_csv_path, low_memory=False)

def extract_kmers(sequence, kmer_length, sliding_window):
    """Return every full-length window of ``sequence``.

    Windows are ``kmer_length`` long and start at 0, ``sliding_window``,
    ``2 * sliding_window``, ... A trailing remainder shorter than
    ``kmer_length`` is not returned, and a sequence shorter than
    ``kmer_length`` yields an empty list.
    """
    last_start = len(sequence) - kmer_length
    return [
        sequence[offset:offset + kmer_length]
        for offset in range(0, last_start + 1, sliding_window)
    ]

# Open the output CSV file for writing
with open(output_csv_path, mode='w', newline='') as csv_file:
    # Same columns as the input, with 'sequence' moved to the last position
    fieldnames = [column for column in data.columns.tolist() if column != 'sequence']
    fieldnames.append('sequence')

    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    # Process each row in the input data with a progress bar
    for _, row in tqdm(data.iterrows(), total=data.shape[0], desc='Processing Rows'):
        # Convert the row once per input row; only the 'sequence' entry
        # changes between the records written below.
        row_data = row.to_dict()

        # The fragment list is built from the full sequence before that entry is overwritten
        for kmer in extract_kmers(row_data['sequence'], kmer_length, sliding_window):
            # Forward fragment
            row_data['sequence'] = kmer
            writer.writerow(row_data)

            # Reverse complement of the same fragment, written directly after it
            row_data['sequence'] = str(Seq(kmer).reverse_complement())
            writer.writerow(row_data)

print(f"KMers and their reverse complements extracted and written to {output_csv_path}")

Processing Rows: 100%|██████████| 428391/428391 [01:41<00:00, 4230.68it/s]

KMers and their reverse complements extracted and written to /mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/RBD_nucleotides_3mil_wt_nonvoc_100k_epi_250bp_50overlap_complementary.csv





In [23]:
# Generating counts for each variant label and the total number of fragment rows (250 bp fragments plus their reverse complements)
import pandas as pd
rbd_df = pd.read_csv('/your/path/to/RBD_nucleotides_3mil_wt_nonvoc_100k_epi_250bp_50overlap_complementary.csv')

# Rows per distinct 'label_name' value (most frequent first), and the number
# of rows whose 'label_name' is not missing
variant_counts = rbd_df['label_name'].value_counts()
total_variants = rbd_df['label_name'].count()

# Report the per-label counts, one label per line
print("Counts for each Variant value:")
for label, n_rows in variant_counts.items():
    print(f"{label}: {n_rows}")

# Report the overall total
print(f"\nTotal number of sequences: {total_variants}")

Counts for each Variant value:
sars_cov_2_delta: 1800000
sars_cov_2_alpha: 1800000
sars_cov_2_omicron: 1800000
sars_cov_2_nonVOC: 1800000
sars_cov_2_gamma: 511038

Total number of sequences: 7711038


In [1]:
# Checking to see if label_names are correctly mapped to label_numbers
import pandas as pd

def check_label_numbers_and_count_epi_ids(input_csv):
    """
    Print, for every label_name in the CSV, the distinct label_number values
    that occur with it and how many distinct EPI_ID values it covers.
    """
    table = pd.read_csv(input_csv)
    divider = "-" * 40

    # One report block per label_name, in the order groupby yields the groups
    for name, members in table.groupby('label_name'):
        distinct_numbers = members['label_number'].unique()
        distinct_ids = members['EPI_ID'].nunique()

        report = [
            f"Label Name: {name}",
            f"  - Unique Label Numbers: {distinct_numbers}",
            f"  - Number of Unique EPI_IDs: {distinct_ids}",
            divider,
        ]
        print("\n".join(report))

# Fragment-level CSV whose label_name / label_number pairing is verified
input_csv = "/your/path/to/RBD_nucleotides_3mil_wo_nonvoc_100k_epi_250bp_50overlap_complementary.csv"
check_label_numbers_and_count_epi_ids(input_csv=input_csv)

Label Name: sars_cov_2_alpha
  - Unique Label Numbers: [1]
  - Number of Unique EPI_IDs: 100000
----------------------------------------
Label Name: sars_cov_2_delta
  - Unique Label Numbers: [0]
  - Number of Unique EPI_IDs: 100000
----------------------------------------
Label Name: sars_cov_2_gamma
  - Unique Label Numbers: [3]
  - Number of Unique EPI_IDs: 28391
----------------------------------------
Label Name: sars_cov_2_omicron
  - Unique Label Numbers: [2]
  - Number of Unique EPI_IDs: 100000
----------------------------------------
