In [17]:
# Trimming genome msa files to only extract RBD segment and remove gaps to get true RBD nucleotide sequences

def extract_sequences_in_range(msa_fasta_file, start_position, end_position, output_file):
    with open(msa_fasta_file, "r") as file:
        header = ""
        sequence = ""

        # Open the output file for writing
        with open(output_file, "w") as output_file:

            for line in file:
                line = line.strip()

                if line.startswith(">"):
                    # Start a new sequence
                    if header and sequence:
                        # Process the previous sequence if it's complete
                        sequence = sequence.replace("*", "")
                        extracted_sequence = sequence[start_position - 1:end_position]

                        # Remove gaps from the extracted sequence.
                        extracted_sequence = extracted_sequence.replace("-", "")

                        # Write the extracted sequence to the output file.
                        output_file.write(f">{header}\n{extracted_sequence}\n")

                    header = line
                    sequence = ""
                else:
                    sequence += line

            # Process the last sequence if it's complete
            if header and sequence:
                sequence = sequence.replace("*", "")
                extracted_sequence = sequence[start_position - 1:end_position]
                extracted_sequence = extracted_sequence.replace("-", "")
                output_file.write(f">{header}\n{extracted_sequence}\n")

    print(f"Extracted sequences between positions {start_position} and {end_position} to '{output_file}'.")

def main():
    msa_fasta_file = "your/path/ref_genome_MSA.fasta"
    start_position = 34732
    end_position = 36397
    output_file = "your/path/ref_genome_RBD_nucleotides.fasta"
    extract_sequences_in_range(msa_fasta_file, start_position, end_position, output_file)

if __name__ == "__main__":
    main()


Extracted sequences between positions 34732 and 36397 to '<_io.TextIOWrapper name='ref_genome_RBD_nucleotides.fasta' mode='w' encoding='UTF-8'>'.


In [1]:
from Bio import SeqIO

def generate_sequence_statistics(fasta_file):
    sequence_lengths = []
    total_sequences = 0
    total_bases = 0

    with open(fasta_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            total_sequences += 1
            sequence_length = len(record.seq)
            sequence_lengths.append(sequence_length)
            total_bases += sequence_length

    # Calculate statistics
    min_length = min(sequence_lengths)
    max_length = max(sequence_lengths)
    average_length = total_bases / total_sequences

    # Print the statistics
    print("Sequence Statistics:")
    print(f"Total Sequences: {total_sequences}")
    print(f"Total Bases: {total_bases}")
    print(f"Minimum Length: {min_length} bases")
    print(f"Maximum Length: {max_length} bases")
    print(f"Average Length: {average_length:.2f} bases")

if __name__ == "__main__":
    fasta_file = "your/path/RBD_nucleotides.fasta"  # Replace with your FASTA file
    generate_sequence_statistics(fasta_file)


Sequence Statistics:
Total Sequences: 3816508
Total Bases: 2615413945
Minimum Length: 0 bases
Maximum Length: 54657965 bases
Average Length: 685.29 bases


In [2]:
# Biopython library to filter sequences longer than 650 nucleotides from a FASTA file and write them to a new file

from Bio import SeqIO

def filter_sequences_by_length(input_file, output_file, min_length):
    with open(input_file, "r") as handle:
        records = [record for record in SeqIO.parse(handle, "fasta") if len(record.seq) > min_length]

    with open(output_file, "w") as output_handle:
        SeqIO.write(records, output_handle, "fasta")

if __name__ == "__main__":
    input_file = "your/path/RBD_nucleotides.fasta"  # Replace with your input FASTA file
    output_file = "your/path/RBD_valid_nucleotides.fasta"
    min_length = 600
    filter_sequences_by_length(input_file, output_file, min_length)


In [3]:
## Python script to make a subset of the RBD_valid_NT.fasta MSA file down to manageable 
## size and also make sure length of each sequence is multiple of 3 

from Bio import SeqIO
import random

# Input FASTA file and output file
input_fasta = "your/path/RBD_valid_nucleotides.fasta"
output_fasta = "your/path/RBD_valid_nucleotides_million.fasta"

# Read all sequences from the input FASTA file
all_sequences = list(SeqIO.parse(input_fasta, "fasta"))

# Randomly select 1 million sequences
num_sequences_to_select = 1000000
subset_sequences = random.sample(all_sequences, min(num_sequences_to_select, len(all_sequences)))

# Filter sequences to contain only A, T, G, and C, and have lengths as multiples of 3
valid_sequences = [seq for seq in subset_sequences if set(str(seq.seq)).issubset("ATGC")]

# Ensure each selected sequence length is a multiple of 3 or remove 1 or 2 nucleotides
for i in range(len(valid_sequences)):
    seq_len = len(valid_sequences[i])
    if seq_len % 3 != 0:
        trim_length = seq_len % 3
        valid_sequences[i] = valid_sequences[i][:-trim_length]

# Write the selected sequences to the output FASTA file
with open(output_fasta, "w") as output_handle:
    SeqIO.write(valid_sequences, output_handle, "fasta")

print(f"Randomly selected 1,000,000 valid sequences and saved to {output_fasta}")


Randomly selected 1,000,000 valid sequences and saved to RBD_valid_nucleotides_million.fasta


In [4]:
from Bio import SeqIO

def generate_sequence_statistics(fasta_file):
    sequence_lengths = []
    total_sequences = 0
    total_bases = 0

    with open(fasta_file, "r") as handle:
        for record in SeqIO.parse(handle, "fasta"):
            total_sequences += 1
            sequence_length = len(record.seq)
            sequence_lengths.append(sequence_length)
            total_bases += sequence_length

    # Calculate statistics
    min_length = min(sequence_lengths)
    max_length = max(sequence_lengths)
    average_length = total_bases / total_sequences

    # Print the statistics
    print("Sequence Statistics:")
    print(f"Total Sequences: {total_sequences}")
    print(f"Total Bases: {total_bases}")
    print(f"Minimum Length: {min_length} bases")
    print(f"Maximum Length: {max_length} bases")
    print(f"Average Length: {average_length:.2f} bases")

if __name__ == "__main__":
    fasta_file = "your/path/RBD_valid_nucleotides_million.fasta"  # Replace with your FASTA file
    generate_sequence_statistics(fasta_file)

Sequence Statistics:
Total Sequences: 1000000
Total Bases: 668994048
Minimum Length: 600 bases
Maximum Length: 708 bases
Average Length: 668.99 bases


In [8]:
## Maching the fasta RBD sequences with metadata 

from Bio import SeqIO
import pandas as pd

# Load your metadata DataFrame (df_metadata) and FASTA file here.

import pandas as pd
csv_file = "metadata.tsv"
df_metadata = pd.read_csv(csv_file, sep='\t', dtype=str, low_memory=False, encoding='latin-1')
df_metadata.head()
df_metadata.columns


# Initialize a count for matches
match_count = 0
total_sequences = 0

# Iterate through the sequences in your FASTA file
for record in SeqIO.parse("your/path/RBD_valid_nucleotides_million.fasta", "fasta"):
    total_sequences += 1
    accession_id_fasta = record.description.split("|")[1]
    
    # Check if the Accession ID is present in the metadata DataFrame
    if accession_id_fasta in df_metadata['Accession ID'].values:
        match_count += 1

# Print the results
print(f"Matches found: {match_count}")
print(f"Total sequences: {total_sequences}")


Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/DB/miniconda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/gl/qdqcbbk10hs36kcxf_06nkch0000gn/T/ipykernel_12776/344364984.py", line -1, in <module>
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/DB/miniconda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2057, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/Users/DB/miniconda/lib/python3.10/site-packages/IPython/core/ultratb.py", line 1288, in structured_traceback
    return FormattedTB.structured_traceback(
  File "/Users/DB/miniconda/lib/python3.10/site-packages/IPython/core/ultratb.py", line 1177, in structured_traceback
    return VerboseTB.structured_traceback(
  File "/Users/DB/miniconda/lib/python3.10/site-packages/IPython/core

In [1]:
# SAMPLE run to test if the Accession ID values in the MSA files are present in metadata.tsv
df_metadata.loc[df_metadata["Accession ID"]=="EPI_ISL_402124"]

NameError: name 'df_metadata' is not defined

In [1]:
# Script to write out matching fasta files with Accession ID values in metadata file to csv 
# files with Variant values and convert each nucleotide sequence to protein sequences

from Bio import SeqIO
from Bio.Seq import Seq
import pandas as pd
import json

# Load your metadata DataFrame (df_metadata) and provide the correct path to your FASTA file and "RBD.voc.json" file.
csv_file = "your/path/metadata.tsv"
df_metadata = pd.read_csv(csv_file, sep='\t', dtype=str, low_memory=False, encoding='latin-1')

# Load the voc.json file
with open('voc.json', 'r') as json_file:
    voc_data = json.load(json_file)

# Define a function to determine the VOC value based on Pango Lineage
def determine_voc(lineage):
    for voc, lineages in voc_data.items():
        if lineage in lineages:
            return voc
    return "nonVOC"

# Define the translation function
def translate_nucleotides_to_protein(nucleotide_sequence):
    return str(Seq(nucleotide_sequence).translate())

# Define the output CSV file
output_csv_file = "your/path/RBD_valid_nucleotides_million.csv"

# Initialize a count for matches
match_count = 0
total_sequences = 0

# Iterate through the sequences in your FASTA file
for record in SeqIO.parse("your/path/RBD_valid_nucleotides_million.fasta", "fasta"):
    total_sequences += 1
    accession_id_fasta = record.description.split("|")[1]
    sequence = str(record.seq)

    # Check if the Accession ID is present in the metadata DataFrame
    match_row = df_metadata[df_metadata['Accession ID'] == accession_id_fasta]
    if not match_row.empty:
        lineage = match_row['Pango lineage'].values[0]

        # Create a DataFrame for the current record
        output_data = {
            'Accession ID': [accession_id_fasta],
            'Lineage': [lineage],
            'RBD nucleotide': [sequence],
            'Variant': [determine_voc(lineage)],
            'RBD protein': [translate_nucleotides_to_protein(sequence)],
        }
        output_df = pd.DataFrame(output_data)

        # Save the DataFrame to the output CSV file
        with open(output_csv_file, 'a', newline='') as f:
            output_df.to_csv(f, mode='a', header=not total_sequences, index=False)

# Print the results
print(f"Matches found: {match_count}")
print(f"Total sequences: {total_sequences}")
print(f"Output CSV file saved as {output_csv_file}")






KeyboardInterrupt: 