In [18]:
import os
import pandas as pd
from multiprocessing import Pool, cpu_count

def get_label_from_filename(filename):
    """Map a FASTA file name to its class label.

    The name is searched for a fixed list of substrings, in priority order;
    the label belonging to the first substring found is returned.

    Args:
        filename: File name (or path) to inspect.

    Returns:
        The label string, or None when no known substring occurs in
        ``filename``.
    """
    # Order matters: earlier entries win when several substrings are present.
    substring_to_label = (
        ("alpha", "sars_cov_2_alpha"),
        ("beta", "sars_cov_2_beta"),
        ("gamma", "sars_cov_2_gamma"),
        ("delta", "sars_cov_2_delta"),
        ("omicron", "sars_cov_2_omicron"),
        ("influenzaA_H1N1", "influenza_a_H1N1"),
        ("influenzaA_H3N2", "influenza_a_H3N2"),
    )
    for needle, label in substring_to_label:
        if needle in filename:
            return label
    return None

def process_fasta_file(fasta_file):
    """Read one FASTA file and return its unambiguous sequences.

    The class label is derived from the file name via
    ``get_label_from_filename``. A record is kept only when its sequence is
    non-empty and, once upper-cased, consists solely of A, T, G and C.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``(sequence, epi_id, label)`` tuples with the sequence
        upper-cased; an empty list when the file name maps to no label.
    """
    label = get_label_from_filename(fasta_file)
    if label is None:
        return []

    # Position of the record ID among the '|'-separated header fields; it is
    # chosen from the file name, not from the header itself.
    if "hcov-19" in fasta_file:
        id_field = 3
    elif "influenzaA" in fasta_file:
        id_field = 1
    else:
        id_field = None

    allowed = frozenset("ATGC")
    records = []

    def keep_if_valid(parts, record_id):
        # Store the record collected so far if it passes the nucleotide check.
        bases = "".join(parts).upper()
        if bases and allowed.issuperset(bases):
            records.append((bases, record_id, label))

    with open(fasta_file, 'r') as handle:
        parts = []
        record_id = ''
        for raw in handle:
            if not raw.startswith('>'):
                parts.append(raw.strip())
                continue

            # A header line closes the previous record and opens a new one.
            keep_if_valid(parts, record_id)
            parts = []
            if id_field is not None:
                fields = raw.strip().split('|')
                record_id = fields[id_field] if len(fields) > id_field else ''

        # The last record has no following header to trigger the flush.
        keep_if_valid(parts, record_id)

    return records

def label_to_number(label):
    """Translate a label name into its integer class id.

    Args:
        label: Label name such as ``"sars_cov_2_alpha"``.

    Returns:
        An integer from 1 to 7 for the known labels, or 0 for anything else.
    """
    # Class ids follow the position in this tuple, starting at 1.
    ordered_labels = (
        "sars_cov_2_alpha",
        "sars_cov_2_beta",
        "sars_cov_2_gamma",
        "sars_cov_2_delta",
        "sars_cov_2_omicron",
        "influenza_a_H1N1",
        "influenza_a_H3N2",
    )
    numbering = {name: number for number, name in enumerate(ordered_labels, start=1)}
    return numbering.get(label, 0)

def process_fasta_files(directory, output_csv):
    """Parse every ``*.fasta`` file in a directory and write one labelled CSV.

    Files are parsed in parallel with ``process_fasta_file`` (one task per
    file). All kept sequences are written to ``output_csv`` with the columns
    ``sequence``, ``EPI_ID``, ``label_name`` and ``label_number``, and the
    number of sequences per label is printed.

    Args:
        directory: Directory that directly contains the ``*.fasta`` files.
        output_csv: Path of the CSV file to create (overwritten if present).
    """
    import glob  # local import: only needed for the file listing below

    # List the files in Python instead of shelling out to `ls`: this copes
    # with spaces and glob metacharacters in `directory` and cannot be abused
    # to run shell commands through the user-supplied path. Sorting keeps the
    # processing order deterministic.
    pattern = os.path.join(glob.escape(directory), "*.fasta")
    fasta_files = sorted(glob.glob(pattern))

    with Pool(cpu_count()) as pool:
        results = pool.map(process_fasta_file, fasta_files)

    fieldnames = ['sequence', 'EPI_ID', 'label_name', 'label_number']

    # Flatten the per-file results and build the frame once; creating and
    # serialising a one-row DataFrame per sequence is needlessly slow.
    rows = [
        (sequence, epi_id, label, label_to_number(label))
        for result in results
        for sequence, epi_id, label in result
    ]
    data = pd.DataFrame(rows, columns=fieldnames)
    data.to_csv(output_csv, index=False)

    # Count from the in-memory frame rather than re-reading the file.
    label_counts = data['label_name'].value_counts()

    print("Number of sequences for each label:")
    for label, count in label_counts.items():
        print(f"Label {label}: {count} sequences")

# Prompt the user for the directory holding the FASTA files and build the CSV.
# The guard keeps the prompt from running again if this module is re-imported
# by a multiprocessing worker (it is still true when run as a notebook cell
# or a script).
if __name__ == "__main__":
    # Strip stray whitespace around the typed path before using it.
    main_dir = input("Enter the main directory path to fasta files for variants/strains: ").strip()
    process_fasta_files(main_dir, 'WGS_by_VOC_finetune.csv')


Enter the main directory path:  /mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/variant_data


Number of sequences for each label:
Label influenza_a_H3N2: 37081 sequences
Label influenza_a_H1N1: 36966 sequences
Label sars_cov_2_alpha: 10000 sequences
Label sars_cov_2_beta: 10000 sequences
Label sars_cov_2_delta: 10000 sequences
Label sars_cov_2_gamma: 10000 sequences
Label sars_cov_2_omicron: 10000 sequences


In [7]:
## Separating voc csv file into two files based on virus name to train a model for each virus separately:

import pandas as pd

# Define input and output paths
input_csv_path = "/mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/WGS_by_VOC_finetune.csv"
output_csv_sars_cov2_path = "/mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/WGS_by_VOC_hcov19_finetune.csv"
output_csv_influenza_a_path = "/mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/WGS_by_VOC_IAV_finetune.csv"

# Rows per chunk; adjust based on memory constraints
chunk_size = 10000

# Read the header line first, so that a missing or unreadable input fails
# before the output files are opened for writing (and thereby truncated).
with open(input_csv_path, 'r') as file:
    header = file.readline().strip()

# newline='' lets the CSV writer control line endings itself, which is what
# pandas expects when it is handed an already-open file object.
with open(output_csv_sars_cov2_path, 'w', newline='') as sars_cov2_file, \
        open(output_csv_influenza_a_path, 'w', newline='') as influenza_a_file:
    # Both outputs start with the input's header line
    sars_cov2_file.write(header + '\n')
    influenza_a_file.write(header + '\n')

    # Stream the input in chunks and route each row by its label prefix
    for chunk in pd.read_csv(input_csv_path, chunksize=chunk_size, low_memory=False):
        # Cast to str so a chunk whose labels are all missing does not break
        # the .str accessor; missing labels then simply match neither pattern.
        label_names = chunk['label_name'].astype(str)
        sars_cov2_rows = chunk[label_names.str.contains('sars_cov_2', na=False, regex=False)]
        influenza_a_rows = chunk[label_names.str.contains('influenza_a', na=False, regex=False)]

        # Append the rows to the respective (already open) output files
        sars_cov2_rows.to_csv(sars_cov2_file, header=False, index=False)
        influenza_a_rows.to_csv(influenza_a_file, header=False, index=False)

print(f"Splitting completed. Files saved to {output_csv_sars_cov2_path} and {output_csv_influenza_a_path}.")


Splitting completed. Files saved to /mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/WGS_by_VOC_hcov19_finetune.csv and /mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/WGS_by_VOC_IAV_finetune.csv.


In [28]:
# Subsample the sequences of each variant/subtype of a virus down to 5k randomly chosen unique EPI_IDs
import os
import pandas as pd

def filter_sequences(input_csv, output_csv, n_epi_ids=5000, random_state=1):
    """Subsample each label to a fixed number of randomly chosen EPI IDs.

    Labels with fewer than ``n_epi_ids`` unique ``EPI_ID`` values are dropped.
    For every remaining label, ``n_epi_ids`` unique IDs are drawn at random
    and all rows carrying one of those IDs are kept. The result is written to
    ``output_csv`` and a per-label summary is printed.

    Args:
        input_csv: CSV with at least the columns ``sequence``, ``EPI_ID`` and
            ``label_name``.
        output_csv: Path of the CSV file to write (overwritten if present).
        n_epi_ids: Minimum number of unique EPI IDs a label needs, and the
            number of IDs sampled per label. Defaults to 5000.
        random_state: Seed passed to the sampler so the selection is
            reproducible. Defaults to 1.

    Returns:
        None. When no label qualifies, a header-only CSV is written instead
        of failing.
    """
    # Load the data from the CSV file
    data = pd.read_csv(input_csv, low_memory=False)

    # Count unique EPI IDs per label_name
    unique_epi_counts = data.groupby('label_name')['EPI_ID'].nunique()

    # Keep only labels with at least `n_epi_ids` unique EPI IDs
    valid_labels = unique_epi_counts[unique_epi_counts >= n_epi_ids].index
    filtered_data = data[data['label_name'].isin(valid_labels)]

    # One frame of selected rows per valid label
    selected_frames = []
    for label in valid_labels:
        label_data = filtered_data[filtered_data['label_name'] == label]
        unique_ep_ids = label_data['EPI_ID'].unique()

        # Randomly select `n_epi_ids` unique EPI IDs (or keep all of them
        # when there are no more than that)
        if len(unique_ep_ids) > n_epi_ids:
            selected_ep_ids = (
                pd.Series(unique_ep_ids)
                .sample(n=n_epi_ids, random_state=random_state)
                .tolist()
            )
        else:
            selected_ep_ids = unique_ep_ids

        selected_frames.append(label_data[label_data['EPI_ID'].isin(selected_ep_ids)])

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # still carries the input's columns when no label qualified.
    if selected_frames:
        result_df = pd.concat(selected_frames)
    else:
        result_df = data.iloc[0:0]

    # Save the filtered data to a new CSV file
    result_df.to_csv(output_csv, index=False)

    print("Number of sequences and unique EPI ID values for each label:")
    if result_df.empty:
        print(f"No label has at least {n_epi_ids} unique EPI IDs; wrote a header-only file.")
        return

    # Number of sequences and unique EPI ID values for each label_name
    filtered_label_counts = result_df.groupby('label_name').agg(
        num_sequences=('sequence', 'count'),
        num_unique_epi_ids=('EPI_ID', 'nunique')
    )
    for label, row in filtered_label_counts.iterrows():
        print(f"Label {label}: {row['num_sequences']} sequences, {row['num_unique_epi_ids']} unique EPI IDs")


# Directory holding the per-virus CSV files (hard-coded, not prompted for)
main_dir = "/mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes"

# Input and output CSV files, both located in main_dir. Swap the commented
# lines to process the SARS-CoV-2 (hcov19) file instead of the influenza A
# (IAV) one. Only file names are joined here: passing an absolute path as the
# second argument of os.path.join would silently discard main_dir.
#input_csv = os.path.join(main_dir, "WGS_by_VOC_hcov19_finetune.csv")
input_csv = os.path.join(main_dir, "WGS_by_VOC_IAV_finetune.csv")

#output_csv = os.path.join(main_dir, "WGS_by_VOC_hcov19_finetune_5k_epi.csv")
output_csv = os.path.join(main_dir, "WGS_by_VOC_IAV_finetune_5k_epi.csv")

# Execute the filtering and reporting
filter_sequences(input_csv, output_csv)


Number of sequences and unique EPI ID values for each label:
Label sars_cov_2_alpha: 2000 sequences, 2000 unique EPI IDs
Label sars_cov_2_beta: 2000 sequences, 2000 unique EPI IDs
Label sars_cov_2_delta: 2000 sequences, 2000 unique EPI IDs
Label sars_cov_2_gamma: 2000 sequences, 2000 unique EPI IDs
Label sars_cov_2_omicron: 2000 sequences, 2000 unique EPI IDs


In [7]:
# Create labelled, overlapping 250 bp fragments from each of the two subsampled datasets (hCoV-19 5k sequences and IAV 5k sequences)

import os
import pandas as pd
import csv
from tqdm import tqdm

# Function to generate overlapping 250 bp fragments
def generate_overlapping_fragments(sequence, fragment_len=250, overlap=200):
    """Cut a sequence into fixed-length windows that overlap their neighbours.

    Windows start at offset 0 and advance by ``fragment_len - overlap``
    positions. Only full-length windows are produced, so a trailing remainder
    shorter than ``fragment_len`` is dropped.

    Args:
        sequence: Sliceable sequence (e.g. a string) to fragment.
        fragment_len: Length of every fragment. Must be positive.
        overlap: Number of positions shared by consecutive fragments. Must be
            smaller than ``fragment_len``.

    Returns:
        A list of fragments, empty when ``sequence`` is shorter than
        ``fragment_len``.

    Raises:
        ValueError: If ``fragment_len`` is not positive or ``overlap`` is not
            smaller than ``fragment_len`` (the window would never advance).
    """
    if fragment_len <= 0:
        raise ValueError("fragment_len must be a positive integer")
    step = fragment_len - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than fragment_len")

    # The range bound guarantees every slice is exactly fragment_len long,
    # so no per-fragment length check is needed.
    last_start = len(sequence) - fragment_len
    return [
        sequence[start:start + fragment_len]
        for start in range(0, last_start + 1, step)
    ]

# Directory holding the input CSV and receiving the fragment CSV (hard-coded)
main_dir = "/mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/"

# Input and output files; the commented lines are the influenza A counterparts
input_csv_path = os.path.join(main_dir, "WGS_by_VOC_hcov19_finetune_2k_epi.csv")
#input_csv_path = os.path.join(main_dir, "WGS_by_VOC_IAV_finetune_5k_epi.csv")

output_csv_path = os.path.join(main_dir, "WGS_by_VOC_hcov19_finetune_2k_epi_250bp_200overlap.csv")
#output_csv_path = os.path.join(main_dir, "WGS_by_VOC_IAV_finetune_5k_epi_250bp_fragments.csv")

# Number of input rows held in memory at a time; adjust as needed
chunksize = 1000
# Columns read from the input
input_columns = ["EPI_ID", "label_name", "label_number", "sequence"]

# Stream the input and write one output row per fragment, repeating the
# source row's ID and labels on each of its fragments
with open(output_csv_path, mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["EPI_ID", "label_name", "label_number", "sequence"])  # header

    for chunk in tqdm(pd.read_csv(input_csv_path, usecols=input_columns, chunksize=chunksize)):
        row_values = zip(
            chunk["EPI_ID"],
            chunk["label_name"],
            chunk["label_number"],
            chunk["sequence"],
        )
        for epi_id, variant_label, variant_label_number, sequence in row_values:
            writer.writerows(
                [epi_id, variant_label, variant_label_number, fragment]
                for fragment in generate_overlapping_fragments(sequence)
            )


10it [00:03,  2.69it/s]


In [33]:
# Calculate the number of unique EPI IDs for each unique value of label_name
import pandas as pd

# Fragment-level CSV files, one per virus
cov_csv_file_path = '/mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/WGS_by_VOC_hcov19_finetune_2k_epi_250bp_fragments.csv'
iav_csv_file_path = '/mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/WGS_by_VOC_IAV_finetune_2k_epi_250bp_fragments.csv'


def _unique_epi_ids_per_label(frame):
    """Return the number of distinct EPI_ID values for each label_name."""
    return frame.groupby('label_name')['EPI_ID'].nunique()


# Load both files, then summarise each one
data_cov = pd.read_csv(cov_csv_file_path)
data_iav = pd.read_csv(iav_csv_file_path)

label_counts_cov = _unique_epi_ids_per_label(data_cov)
label_counts_iav = _unique_epi_ids_per_label(data_iav)

# Report the SARS-CoV-2 labels first, followed by the influenza A labels
for label, count in label_counts_cov.items():
    print(f"SARS-CoV-2 labels: {label}, Unique EPI_ID Count: {count}")
for label, count in label_counts_iav.items():
    print(f"IAV labels: {label}, Unique EPI_ID Count: {count}")


SARS-CoV-2 labels: sars_cov_2_alpha, Unique EPI_ID Count: 2000
SARS-CoV-2 labels: sars_cov_2_beta, Unique EPI_ID Count: 2000
SARS-CoV-2 labels: sars_cov_2_delta, Unique EPI_ID Count: 2000
SARS-CoV-2 labels: sars_cov_2_gamma, Unique EPI_ID Count: 2000
SARS-CoV-2 labels: sars_cov_2_omicron, Unique EPI_ID Count: 2000
IAV labels: influenza_a_H1N1, Unique EPI_ID Count: 2000
IAV labels: influenza_a_H3N2, Unique EPI_ID Count: 2000
