In [3]:
from Bio import SeqIO
import os

# Split a multi-record FASTA file into one FASTA file per record. Each output
# file is named after the `name:<value>` token found in the record's header.

# Path to the main FASTA file
fasta_file = 'fasta.homo_sapiens.0000'

# Directory to save the individual FASTA files
output_dir = 'fasta_files'
os.makedirs(output_dir, exist_ok=True)

# Iterate through the sequences in the main FASTA file
for record in SeqIO.parse(fasta_file, "fasta"):
    # Take the value of the first `name:<value>` token in the header.
    # next(..., None) replaces indexing a list with [0], which raised a bare
    # IndexError as soon as one header had no `name:` token.
    sequence_name = next(
        (part.split(":")[1] for part in record.description.split() if part.startswith("name:")),
        None,
    )

    # A missing or empty name cannot be turned into a usable file name
    # (an empty one would produce a file literally called ".fasta").
    if not sequence_name:
        print(f"Ignoring record {record.id} in {fasta_file}: no name attribute in header")
        continue

    # The name becomes a file name: neutralise path separators so a value such
    # as "a/b" cannot point outside (or into a missing subfolder of) output_dir.
    for separator in filter(None, (os.sep, os.altsep)):
        sequence_name = sequence_name.replace(separator, '_')

    # Construct the output file path
    output_file_path = os.path.join(output_dir, sequence_name + '.fasta')

    # Write the sequence to the individual FASTA file.
    # Mode 'w' means a later record with the same name replaces an earlier one.
    with open(output_file_path, 'w') as output_file:
        SeqIO.write(record, output_file, "fasta")

In [7]:
import torch
import pandas as pd
from Bio import SeqIO
import os

# Build the model inputs: for every chromatogram CSV in `traces_dir`, load the
# trace, normalise it by its peak value, one-hot encode the sequence stored in
# the FASTA file of the same name, and finally group everything into batches.
#
# Results left in the namespace:
#   normalized_chromatogram_data - tuple of batched trace tensors
#   one_hot_sequences            - tuple of batched one-hot sequence tensors

# Hyperparameters
batch_size = 16
seq_length = 300  # traces and sequences are truncated to this many rows/bases

# Directories containing the CSV files and corresponding FASTA files
traces_dir = 'traces'
fasta_dir = 'fasta_files'

# Lists to store the data
chromatogram_list = []
sequence_list = []

# One-hot encoding mapping
nucleotide_mapping = {'A': [1, 0, 0, 0, 0], 'T': [0, 1, 0, 0, 0], 'C': [0, 0, 1, 0, 0], 'G': [0, 0, 0, 1, 0], 'N': [0, 0, 0, 0, 1]}
# Encoding used for any symbol that is not a key of nucleotide_mapping
# (e.g. IUPAC ambiguity codes), so such symbols no longer raise KeyError.
unknown_encoding = nucleotide_mapping['N']


def _pad_rows(tensor, target_rows):
    """Return `tensor` with all-zero rows appended until it has `target_rows` rows.

    Tensors that already have at least `target_rows` rows are returned unchanged.
    """
    missing_rows = target_rows - tensor.shape[0]
    if missing_rows <= 0:
        return tensor
    padding = torch.zeros((missing_rows, *tensor.shape[1:]), dtype=tensor.dtype)
    return torch.cat([tensor, padding])


# Iterate through the CSV files in the traces folder.
# sorted() makes the sample order (and so the batch contents) reproducible;
# os.listdir returns entries in arbitrary order.
for chromatogram_file in sorted(os.listdir(traces_dir)):
    # Skip anything that is not a CSV file (e.g. .ipynb_checkpoints, .DS_Store)
    if not chromatogram_file.lower().endswith('.csv'):
        continue

    chromatogram_path = os.path.join(traces_dir, chromatogram_file)
    try:
        chromatogram_data = pd.read_csv(chromatogram_path).values[:seq_length]
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header for pandas to parse; treat it like
        # a file that has a header but no rows.
        print(f"Ignoring empty chromatogram file {chromatogram_path}")
        continue

    # Check if chromatogram data is empty and continue to the next file if so
    if chromatogram_data.size == 0:
        print(f"Ignoring empty chromatogram file {chromatogram_path}")
        continue

    # Dividing by a zero (or NaN) peak would fill the tensor with NaN/inf
    peak_value = chromatogram_data.max()
    if not peak_value > 0:
        print(f"Ignoring chromatogram file {chromatogram_path}: no positive signal to normalise by")
        continue

    normalized_trace = torch.tensor(chromatogram_data / peak_value).float()

    # Construct the corresponding FASTA file path
    sequence_name = os.path.splitext(chromatogram_file)[0]
    fasta_path = os.path.join(fasta_dir, sequence_name + '.fasta')

    if not os.path.isfile(fasta_path):
        print(f"Ignoring chromatogram file {chromatogram_path}: no matching FASTA file {fasta_path}")
        continue

    # Read the first sequence from the corresponding FASTA file.
    # next(..., None) lets a file without any record fall through to the
    # empty-sequence check below instead of raising IndexError.
    with open(fasta_path) as fasta_handle:
        first_record = next(SeqIO.parse(fasta_handle, "fasta"), None)
    matching_sequence = '' if first_record is None else str(first_record.seq)[:seq_length]

    # Check if sequence is empty and continue to the next file if so
    if not matching_sequence:
        print(f"Ignoring empty sequence in file {fasta_path}")
        continue

    # One-hot encode the sequence; upper() lets lowercase bases share the
    # encoding of their uppercase counterparts.
    one_hot_sequence = torch.tensor(
        [nucleotide_mapping.get(base, unknown_encoding) for base in matching_sequence.upper()]
    ).float()

    # Add data to lists
    chromatogram_list.append(normalized_trace)
    sequence_list.append(one_hot_sequence)

if not chromatogram_list:
    raise RuntimeError(
        f"No usable chromatogram/sequence pairs found in {traces_dir!r} and {fasta_dir!r}"
    )

# torch.stack needs equally sized tensors. Samples shorter than the longest one
# are padded with all-zero rows; when every sample already has the same length
# nothing is padded and the result matches a plain stack.
longest_trace = max(trace.shape[0] for trace in chromatogram_list)
longest_sequence = max(encoded.shape[0] for encoded in sequence_list)

# Concatenate the data and split into batches
normalized_chromatogram_data = torch.stack(
    [_pad_rows(trace, longest_trace) for trace in chromatogram_list]
).split(batch_size)
one_hot_sequences = torch.stack(
    [_pad_rows(encoded, longest_sequence) for encoded in sequence_list]
).split(batch_size)

Ignoring empty chromatogram file traces/X09140N00-01.csv


In [None]:
from Bio import SeqIO
import os

# Split every FASTA file found in `fasta_dir` into one FASTA file per record.
# Each output file is named after the `name:<value>` token in the record header.

# Directory containing the FASTA files
fasta_dir = 'extracted_fasta'

# Directory to save the individual FASTA files
output_dir = 'fasta_files'
os.makedirs(output_dir, exist_ok=True)

# Iterate through all FASTA files in the directory.
# sorted() fixes the processing order, so which record wins when two records
# share a name no longer depends on the filesystem's listing order.
for fasta_filename in sorted(os.listdir(fasta_dir)):
    if not fasta_filename.endswith(('.fasta', '.fa')):
        continue

    fasta_file = os.path.join(fasta_dir, fasta_filename)

    # Iterate through the sequences in each FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        # Take the value of the first `name:<value>` token in the header.
        # next(..., None) replaces indexing a list with [0], which raised a
        # bare IndexError as soon as one header had no `name:` token.
        sequence_name = next(
            (part.split(":")[1] for part in record.description.split() if part.startswith("name:")),
            None,
        )

        # A missing or empty name cannot be turned into a usable file name
        # (an empty one would produce a file literally called ".fasta").
        if not sequence_name:
            print(f"Ignoring record {record.id} in {fasta_file}: no name attribute in header")
            continue

        # The name becomes a file name: neutralise path separators so a value
        # such as "a/b" cannot point outside (or into a missing subfolder of)
        # output_dir.
        for separator in filter(None, (os.sep, os.altsep)):
            sequence_name = sequence_name.replace(separator, '_')

        # Construct the output file path
        output_file_path = os.path.join(output_dir, sequence_name + '.fasta')

        # Write the sequence to the individual FASTA file.
        # Mode 'w' means a later record with the same name replaces an earlier one.
        with open(output_file_path, 'w') as output_file:
            SeqIO.write(record, output_file, "fasta")
