In [7]:
# Convert the variant values in the input CSV into label_name (string) and label_number (integer) values:

import pandas as pd

# Define function to get variant label
def get_variant_label(variant):
    """Return the label name for a variant: 'sars_cov_2_' plus the lowercased variant."""
    lowered_variant = variant.lower()
    return 'sars_cov_2_{}'.format(lowered_variant)

# Load the input CSV file
input_csv_path = "RBD_valid_nucleotides_500k_VOC.csv"
df = pd.read_csv(input_csv_path)

# Derive the label name from the 'variant' column
df['label_name'] = df['variant'].apply(get_variant_label)

# Assign each row a unique, 1-based number
df['EPI_ID'] = range(1, len(df) + 1)

# Convert label names to numerical labels, numbered in order of first appearance
label_name_to_number = {name: idx for idx, name in enumerate(df['label_name'].unique())}
df['label_number'] = df['label_name'].map(label_name_to_number)

# Save the updated DataFrame to a new CSV file.
# The file name must match the path the later cells read as their input
# ("RBD_valid_nucleotides_500k_VOC_labeled.csv"); the previous name
# ("spike_sequences_0709_VOC_labeled.csv") was not read by any later cell.
output_csv_path = "RBD_valid_nucleotides_500k_VOC_labeled.csv"
df.to_csv(output_csv_path, index=False)

# Print out the number of sequences belonging to each unique label_name value
label_counts = df['label_name'].value_counts()
for label_name, count in label_counts.items():
    print(f"{label_name}: {count} sequences")


sars_cov_2_omicron: 177091 sequences
sars_cov_2_delta: 173300 sequences
sars_cov_2_nonvoc: 77063 sequences
sars_cov_2_alpha: 49234 sequences
sars_cov_2_wuhanhu1: 19990 sequences
sars_cov_2_gamma: 5692 sequences
sars_cov_2_epsilon: 2327 sequences
sars_cov_2_iota: 2318 sequences
sars_cov_2_beta: 952 sequences
sars_cov_2_mu: 626 sequences
sars_cov_2_lambda: 340 sequences
sars_cov_2_zeta: 301 sequences
sars_cov_2_eta: 283 sequences
sars_cov_2_kappa: 59 sequences
sars_cov_2_theta: 3 sequences


In [3]:
# Create a subset that contains 5000 sequences for each label_name value in the labeled CSV from above (labels with fewer than 5000 sequences are dropped)

import pandas as pd

# Define the paths
input_csv_path = "RBD_valid_nucleotides_500k_VOC_labeled.csv"
output_csv_path = "RBD_valid_nucleotides_500k_VOC_labeled_5000_epi.csv"

# Number of unique EPI_ID values kept per label. Labels with fewer rows than
# this are dropped entirely, so every remaining label is equally represented.
SEQUENCES_PER_LABEL = 5000

# Read the input CSV file
data = pd.read_csv(input_csv_path, low_memory=False)

# Print the number of EPI_ID values for each label_name
label_counts = data['label_name'].value_counts()
print("Number of EPI_ID values for each label_name before filtering:")
print(label_counts)

# Drop rows whose label_name has fewer than SEQUENCES_PER_LABEL rows
# (reuses the counts computed above instead of counting a second time)
filtered_data = data[data['label_name'].map(label_counts) >= SEQUENCES_PER_LABEL]

# Optional: uncomment to remove rows where 'label_name' is 'sars_cov_2_nonvoc'
#filtered_data = filtered_data[filtered_data['label_name'] != 'sars_cov_2_nonvoc']

# Collect the per-label selections in a list and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop copies all previously
# collected rows on every iteration.
selected_groups = []
for label_name, group in filtered_data.groupby('label_name'):
    # Take the first SEQUENCES_PER_LABEL unique EPI_ID values for this label
    unique_epi_ids = group['EPI_ID'].unique()[:SEQUENCES_PER_LABEL]

    # Keep only the rows of this label that carry one of the selected EPI_ID values
    selected_groups.append(group[group['EPI_ID'].isin(unique_epi_ids)])

if selected_groups:
    final_filtered_data = pd.concat(selected_groups)
else:
    # No label reached the threshold: keep an empty frame that still has the
    # input columns so the CSV header and the summary below remain valid.
    final_filtered_data = filtered_data.iloc[0:0]

# Write the filtered data to the new CSV file
final_filtered_data.to_csv(output_csv_path, index=False)

# Print the number of EPI_ID values for each label_name after filtering
filtered_label_counts = final_filtered_data['label_name'].value_counts()
print("Number of EPI_ID values for each label_name after filtering:")
print(filtered_label_counts)

print(f"Filtered data written to {output_csv_path}")


Number of EPI_ID values for each label_name before filtering:
label_name
sars_cov_2_omicron     177091
sars_cov_2_delta       173300
sars_cov_2_nonvoc       77063
sars_cov_2_alpha        49234
sars_cov_2_wuhanhu1     19990
sars_cov_2_gamma         5692
sars_cov_2_epsilon       2327
sars_cov_2_iota          2318
sars_cov_2_beta           952
sars_cov_2_mu             626
sars_cov_2_lambda         340
sars_cov_2_zeta           301
sars_cov_2_eta            283
sars_cov_2_kappa           59
sars_cov_2_theta            3
Name: count, dtype: int64
Number of EPI_ID values for each label_name after filtering:
label_name
sars_cov_2_alpha       5000
sars_cov_2_delta       5000
sars_cov_2_gamma       5000
sars_cov_2_nonvoc      5000
sars_cov_2_omicron     5000
sars_cov_2_wuhanhu1    5000
Name: count, dtype: int64
Filtered data written to RBD_valid_nucleotides_500k_VOC_labeled_5000_epi.csv


In [1]:
# Create overlapping 250 bp fragments (sliding window of 1 bp) with labels from the SARS-CoV-2 RBD sequences

import os
import pandas as pd
import csv
from tqdm import tqdm

# Fragment length and the step between consecutive fragment start positions
kmer_length = 250
sliding_window = 1

# Directory that receives the fragment CSV (hard-coded, not prompted for)
main_dir = "/mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/"

# The labeled input CSV is read from the working directory; the output goes under main_dir
input_csv_path = "RBD_valid_nucleotides_500k_VOC_labeled.csv"
output_csv_path = os.path.join(
    main_dir,
    "RBD_valid_nucleotides_500k_VOC_labeled_250bp_1bp_overlap.csv",
)

# Load every labeled row into memory
data = pd.read_csv(input_csv_path, low_memory=False)

def extract_kmers(sequence, kmer_length, sliding_window):
    """Return the kmer_length-long slices of sequence, taken every sliding_window positions.

    Slices start at offset 0 and only full-length slices are returned, so the
    result is an empty list when the sequence is shorter than kmer_length.
    """
    last_start = len(sequence) - kmer_length
    return [
        sequence[offset:offset + kmer_length]
        for offset in range(0, last_start + 1, sliding_window)
    ]

# Open the output CSV file for writing
with open(output_csv_path, mode='w', newline='') as csv_file:
    # Output columns: every input column, with 'sequence' moved to the end
    fieldnames = data.columns.tolist()
    if 'sequence' in fieldnames:
        fieldnames.remove('sequence')  # remove 'sequence' so it can be re-added last
    fieldnames.append('sequence')

    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    # Process each row in the input data with a progress bar
    for _, row in tqdm(data.iterrows(), total=data.shape[0], desc='Processing Rows'):
        kmers = extract_kmers(row['sequence'], kmer_length, sliding_window)

        # Convert the row to a dictionary once per row rather than once per
        # kmer: only the 'sequence' field differs between the output rows.
        row_data = row.to_dict()

        # Write one output row per kmer, carrying over the other row fields
        for kmer in kmers:
            row_data['sequence'] = kmer
            writer.writerow(row_data)

print(f"KMers extracted and written to {output_csv_path}")


Processing Rows: 100%|██████████| 509579/509579 [46:25<00:00, 182.95it/s]  

KMers extracted and written to /mmfs1/projects/changhui.yan/DeewanB/DNABert2_rnaseq/genome_files/unfiltered_multiple_genomes/RBD_valid_nucleotides_500k_VOC_labeled_250bp_1bp_overlap.csv



