In [4]:
# Extract whole-genome sequences (WGS) from FASTA files for hCoV-19, IAV, IBV, rhinovirus and RSV
# Remove sequences containing ambiguous nucleotides (anything other than A, T, G, C)
# Map virus labels to numerical labels
# Write sequences, EPI ID/Accession ID, label name and label number to the output CSV

import glob
import os
from multiprocessing import Pool, cpu_count

import pandas as pd

def get_label_from_filename(filename):
    """Return the virus label implied by a FASTA file name, or None.

    The name is searched for known, case-sensitive substrings in a fixed
    priority order; the first substring found decides the label. When none
    of them occurs, None is returned.
    """
    # Order matters: earlier markers win when several occur in the same name.
    marker_to_label = (
        ("hcov-19", "sars_cov_2"),
        ("influenzaA", "influenza_a"),
        ("influenzaB", "influenza_b"),
        ("rsv", "rsv"),
        ("rhino", "rhinovirus"),
    )
    for marker, virus_label in marker_to_label:
        if marker in filename:
            return virus_label
    return None

def process_fasta_file(fasta_file):
    """Parse one FASTA file into a list of (sequence, epi_id, label) tuples.

    The label comes from get_label_from_filename(fasta_file); a file with no
    recognised label yields an empty list without being opened. A record is
    kept only when its sequence is non-empty and, once upper-cased, consists
    solely of A, T, G and C. The ID is the second '|'-separated field of the
    header line, or '' when the header has no such field.
    """
    label = get_label_from_filename(fasta_file)
    if label is None:
        return []

    allowed = frozenset('ATGC')
    records = []

    def keep_if_clean(chunks, record_id):
        # Store the finished record unless it is empty or contains non-ATGC symbols.
        joined = ''.join(chunks).upper()
        if joined and allowed.issuperset(joined):
            records.append((joined, record_id, label))

    chunks = []
    record_id = ''
    with open(fasta_file, 'r') as handle:
        for raw_line in handle:
            if raw_line.startswith('>'):
                # A new header closes the record collected so far.
                keep_if_clean(chunks, record_id)
                fields = raw_line.strip().split('|')
                record_id = fields[1] if len(fields) > 1 else ''
                chunks = []
            else:
                chunks.append(raw_line.strip())

    # The last record has no following header to trigger the flush.
    keep_if_clean(chunks, record_id)
    return records

def label_to_number(label):
    """Translate a virus label name into its integer class ID (0 if unknown)."""
    # IDs are assigned 1..5 in this fixed order.
    known_labels = ("sars_cov_2", "influenza_a", "influenza_b", "rsv", "rhinovirus")
    numbering = {name: position for position, name in enumerate(known_labels, start=1)}
    return numbering.get(label, 0)

def process_fasta_files(directory, output_csv):
    """Parse every ``*.fasta`` file in ``directory`` and write the kept records to a CSV.

    Each file is parsed by ``process_fasta_file`` in a worker process; the
    numeric label of every record is obtained from ``label_to_number``.

    Args:
        directory: Folder searched (non-recursively) for files ending in ``.fasta``.
        output_csv: Path of the CSV to create. Its columns are ``sequence``,
            ``EPI_ID``, ``label_name`` and ``label_number``.

    Side effects:
        Overwrites ``output_csv`` and prints the number of sequences per label.
        When no file matches, a header-only CSV is written.
    """
    # Use glob rather than shelling out to `ls`: the path is never interpreted by
    # a shell and directories containing spaces work. The directory part is
    # escaped so only the `*.fasta` suffix acts as a pattern; sorting keeps the
    # processing order deterministic.
    pattern = os.path.join(glob.escape(os.fspath(directory)), "*.fasta")
    fasta_files = sorted(glob.glob(pattern))

    with Pool(cpu_count()) as pool:
        results = pool.map(process_fasta_file, fasta_files)

    fieldnames = ['sequence', 'EPI_ID', 'label_name', 'label_number']

    # Build the table once and write it in a single call instead of creating a
    # one-row DataFrame (and a to_csv call) per sequence.
    rows = [
        (sequence, epi_id, label, label_to_number(label))
        for result in results
        for sequence, epi_id, label in result
    ]
    data = pd.DataFrame(rows, columns=fieldnames)
    data.to_csv(output_csv, index=False)

    # Count straight from the in-memory table; no need to re-read the CSV.
    label_counts = data['label_name'].value_counts()

    print("Number of sequences for each label:")
    for label, count in label_counts.items():
        print(f"Label {label}: {count} sequences")

# Directory containing the per-virus FASTA files (hard-coded; edit before running)
main_dir = "/path/to/virus/fasta/files/"

# The output CSV is written relative to the current working directory.
process_fasta_files(directory=main_dir, output_csv='WGS_by_virus_finetune.csv')

Number of sequences for each label:
Label influenza_b: 75303 sequences
Label influenza_a: 71620 sequences
Label sars_cov_2: 18636 sequences
Label rsv: 6060 sequences
Label rhinovirus: 1343 sequences


In [2]:
## Path 1: create non-overlapping 250 bp fragments (with labels) from the CSV written above

import pandas as pd
import csv
from tqdm import tqdm

# Function to generate fixed-length (default 250 bp) non-overlapping fragments
def generate_non_overlapping_fragments(sequence, fragment_len=250):
    """Cut ``sequence`` into consecutive, non-overlapping pieces of ``fragment_len``.

    Pieces are taken from the start of the sequence; a trailing remainder
    shorter than ``fragment_len`` is discarded.
    """
    pieces = (
        sequence[offset:offset + fragment_len]
        for offset in range(0, len(sequence), fragment_len)
    )
    # Only the final slice can be short; drop it when it is.
    return [piece for piece in pieces if len(piece) == fragment_len]


# `os` is needed for the path joins below; import it here so this cell does not
# depend on an earlier cell having imported it.
import os

# Directory holding the whole-genome CSV (hard-coded; edit before running).
# NOTE(review): the extraction cell writes WGS_by_virus_finetune.csv relative to
# the working directory — make sure main_dir points at that location.
main_dir = "/path/to/virus/finetune/csv/"

# Path to the input and output CSV files
input_csv_path = os.path.join(main_dir, "WGS_by_virus_finetune.csv")
output_csv_path = os.path.join(main_dir, "WGS_by_virus_finetune1_250bp_fragments.csv")

# Read the input CSV file in chunks
chunksize = 1000  # Adjust the chunk size as needed
input_columns = ["EPI_ID", "label_name", "label_number", "sequence"]

# Open the output CSV file for writing
with open(output_csv_path, mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(input_columns)  # Header; same column order as the data rows below

    for chunk in tqdm(pd.read_csv(input_csv_path, usecols=input_columns, chunksize=chunksize)):
        # Walk the four columns in lockstep by name. This avoids
        # DataFrame.iterrows(), which builds a Series for every row, and does not
        # depend on the column order of the input file.
        for epi_id, label_name, label_number, sequence in zip(
            chunk["EPI_ID"], chunk["label_name"], chunk["label_number"], chunk["sequence"]
        ):
            # Every fragment inherits the ID and labels of its parent genome.
            for fragment in generate_non_overlapping_fragments(sequence):
                writer.writerow([epi_id, label_name, label_number, fragment])


172it [00:27,  6.25it/s]


In [2]:
# Load the whole-genome CSV; the relative path is resolved against the current
# working directory. low_memory=False asks pandas not to parse the file in
# internal chunks (see the pandas read_csv documentation).
input_csv_path = "WGS_by_virus_finetune.csv"
data = pd.read_csv(input_csv_path,low_memory=False)

In [3]:
# Preview the first rows of the loaded table
data.head()

Unnamed: 0,sequence,EPI_ID,label_name,label_number
0,GATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTG...,EPI_ISL_8801366,sars_cov_2,1
1,GATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTG...,EPI_ISL_8801365,sars_cov_2,1
2,ACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATC...,EPI_ISL_9404688,sars_cov_2,1
3,GATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGTGTG...,EPI_ISL_8801370,sars_cov_2,1
4,TACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGAT...,EPI_ISL_9404748,sars_cov_2,1


In [8]:
# Create 250 bp fragments with 50 bp overlaps from the WGS CSV generated above
# Generate the reverse-complement sequence of each fragment

import os
import pandas as pd
import csv
from tqdm import tqdm
from Bio.Seq import Seq

# Function to generate overlapping 250 bp fragments
def generate_overlapping_fragments(sequence, fragment_len=250, overlap=50):
    """Slide a window of ``fragment_len`` over ``sequence`` and return every full window.

    Consecutive windows start ``fragment_len - overlap`` positions apart, so
    neighbouring fragments share ``overlap`` characters. Windows that would run
    past the end of the sequence are not produced.

    Args:
        sequence: The sequence to cut (anything supporting ``len`` and slicing).
        fragment_len: Length of every returned fragment; must be positive.
        overlap: Characters shared by consecutive fragments; must be smaller
            than ``fragment_len``. A negative value leaves gaps between fragments.

    Returns:
        List of fragments in order of their start position (empty when the
        sequence is shorter than ``fragment_len``).

    Raises:
        ValueError: If ``fragment_len`` is not positive or ``overlap`` is not
            smaller than ``fragment_len``.
    """
    # Reject parameters for which the window cannot advance. Previously these
    # either surfaced as a cryptic range() error or silently produced no fragments.
    if fragment_len <= 0:
        raise ValueError(f"fragment_len must be positive, got {fragment_len}")
    if overlap >= fragment_len:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than fragment_len ({fragment_len})"
        )

    step = fragment_len - overlap
    last_start = len(sequence) - fragment_len
    # Every start <= last_start yields a full-length slice, so no length check is needed.
    return [sequence[start:start + fragment_len] for start in range(0, last_start + 1, step)]

# Input/output locations (hard-coded; edit before running).
# NOTE(review): input_dir is assigned but not used below — the input CSV is opened
# relative to the current working directory. Confirm which location is intended.
input_dir = "/path/to/virus/finetune/csv/"
main_dir = "/path/to/virus/finetune/csv/intermediate/files/"

# Path to the input and output CSV files
# (the input path used to be wrapped in a single-argument os.path.join, which is a no-op)
input_csv_path = "WGS_by_virus_finetune.csv"
output_csv_path = os.path.join(main_dir, "WGS_by_virus_5labels_250bp_50overlap_complementary.csv")

# Read the input CSV file in chunks
chunksize = 1000  # Adjust the chunk size as needed
input_columns = ["EPI_ID",
                 "label_name", "label_number",
                 "sequence"]

# Open the output CSV file for writing
with open(output_csv_path, mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(input_columns)  # Header; same column order as the data rows below

    for chunk in tqdm(pd.read_csv(input_csv_path, usecols=input_columns, chunksize=chunksize)):
        # Walk the four columns in lockstep by name. This avoids
        # DataFrame.iterrows(), which builds a Series for every row, and does not
        # depend on the column order of the input file.
        for epi_id, variant_label, variant_label_number, sequence in zip(
            chunk["EPI_ID"], chunk["label_name"], chunk["label_number"], chunk["sequence"]
        ):
            for fragment in generate_overlapping_fragments(sequence):
                # Write the original fragment
                writer.writerow([epi_id, variant_label, variant_label_number, fragment])

                # Generate and write the reverse complementary fragment
                # under the same ID and labels
                reverse_complement = str(Seq(fragment).reverse_complement())
                writer.writerow([epi_id, variant_label, variant_label_number, reverse_complement])

print(f"Fragments with reverse complements have been written to {output_csv_path}")

173it [01:10,  2.45it/s]

Fragments with reverse complements have been written to /mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/intermediate_csvs/WGS_by_virus_5labels_250bp_50overlap_complementary.csv





In [9]:
# Calculating the number of unique EPI IDs for each unique value of virus label_name

import pandas as pd

# Fragment CSV to summarise (hard-coded path; presumably the file written by the
# overlapping-fragment cell, which uses the same file name — confirm)
csv_file_path = '/path/to/virus/finetune/csv/intermediate/files/WGS_by_virus_5labels_250bp_50overlap_complementary.csv'

# Load the CSV file
# NOTE(review): only label_name and EPI_ID are used below; passing
# usecols=['label_name', 'EPI_ID'] would avoid loading the sequence column.
data = pd.read_csv(csv_file_path)

# Group by label_name and count unique EPI_ID values
label_counts = data.groupby('label_name')['EPI_ID'].nunique()

# Print out the counts
for label, count in label_counts.items():
    print(f"Label: {label}, Unique EPI_ID Count: {count}")


Label: influenza_a, Unique EPI_ID Count: 9101
Label: influenza_b, Unique EPI_ID Count: 9977
Label: rhinovirus, Unique EPI_ID Count: 1343
Label: rsv, Unique EPI_ID Count: 6058
Label: sars_cov_2, Unique EPI_ID Count: 18636


In [10]:
# Creating subset of above output csv with desired number of unique EPI_IDs for each virus label_name
import os
import pandas as pd

def filter_sequences(input_csv, output_csv, n_epi_ids=1300, random_state=1):
    """Subsample a fragment CSV to a fixed number of EPI IDs per label.

    Labels with fewer than ``n_epi_ids`` unique ``EPI_ID`` values are dropped.
    For every remaining label, ``n_epi_ids`` IDs are drawn at random (all of
    them when the label has exactly that many) and every row belonging to a
    drawn ID is kept.

    Args:
        input_csv: CSV with at least the columns ``label_name``, ``EPI_ID``
            and ``sequence``.
        output_csv: Path the filtered rows are written to (overwritten).
        n_epi_ids: Number of unique EPI IDs to keep per label; also the minimum
            a label needs in order to be kept. Must be at least 1.
        random_state: Seed passed to pandas' sampler so the selection is
            reproducible.

    Raises:
        ValueError: If ``n_epi_ids`` is smaller than 1.

    Side effects:
        Writes ``output_csv`` and prints, per label, the number of rows and
        unique EPI IDs that were kept. When no label qualifies, a header-only
        CSV is written.
    """
    if n_epi_ids < 1:
        raise ValueError(f"n_epi_ids must be at least 1, got {n_epi_ids}")

    # Load the data from the CSV file
    data = pd.read_csv(input_csv, low_memory=False)

    # Count unique EPI IDs per label_name
    unique_epi_counts = data.groupby('label_name')['EPI_ID'].nunique()

    # Keep only labels with at least n_epi_ids unique EPI IDs
    valid_labels = unique_epi_counts[unique_epi_counts >= n_epi_ids].index

    selected_frames = []
    for label in valid_labels:
        label_data = data[data['label_name'] == label]
        unique_epi_ids = label_data['EPI_ID'].unique()

        # Randomly select n_epi_ids unique EPI IDs (or take all when there is
        # nothing to drop)
        if len(unique_epi_ids) > n_epi_ids:
            selected_epi_ids = (
                pd.Series(unique_epi_ids)
                .sample(n=n_epi_ids, random_state=random_state)
                .tolist()
            )
        else:
            selected_epi_ids = unique_epi_ids

        # Keep every fragment that belongs to one of the selected IDs
        selected_frames.append(label_data[label_data['EPI_ID'].isin(selected_epi_ids)])

    if selected_frames:
        result_df = pd.concat(selected_frames)
    else:
        # pd.concat raises on an empty list; fall back to an empty frame that
        # still carries the input's columns.
        result_df = data.iloc[0:0]

    # Save the filtered data to a new CSV file
    result_df.to_csv(output_csv, index=False)

    # Print number of sequences and unique EPI ID values for each label_name
    print("Number of sequences and unique EPI ID values for each label:")
    for label, group in result_df.groupby('label_name'):
        num_sequences = group['sequence'].count()
        num_unique_epi_ids = group['EPI_ID'].nunique()
        print(f"Label {label}: {num_sequences} sequences, {num_unique_epi_ids} unique EPI IDs")


# Input/output directories (hard-coded; edit before running)
input_dir ="/path/to/virus/finetune/csv/intermediate/files/"
main_dir = "/path/to/virus/finetune/csv/"

# Full fragment CSV to read, and the subsampled CSV to write
input_csv = os.path.join(input_dir, "WGS_by_virus_5labels_250bp_50overlap_complementary.csv")
output_csv = os.path.join(main_dir, "WGS_by_virus_5labels_250bp_50overlap_complementary_1300epi.csv")

# Run the subsampling and print the per-label summary
filter_sequences(input_csv=input_csv, output_csv=output_csv)


Number of sequences and unique EPI ID values for each label:
Label influenza_a: 154868 sequences, 1300 unique EPI IDs
Label influenza_b: 159818 sequences, 1300 unique EPI IDs
Label rhinovirus: 92938 sequences, 1343 unique EPI IDs
Label rsv: 178824 sequences, 1300 unique EPI IDs
Label sars_cov_2: 384960 sequences, 1300 unique EPI IDs
