## Preprocess
### Process toxin-antitoxin (TA) genomic sequence to prepare dataset for EVO fine-tuning
- Dataset: TADB 3.0 (https://bioinfo-mml.sjtu.edu.cn/TADB3/index.php)
- Toxin-Antitoxin type: Type II, Experimentally validated + in silico predicted

Evo: Fine-tuning requirements:
- MAX_CONTEXT_LENGTH = 1024 # Context length for fine-tuning (including special tokens)
- Special characters:
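    - '`': Toxin gene delimiter (placed immediately before and after the toxin gene)
    - '!': Antitoxin gene delimiter (placed immediately before and after the antitoxin gene)
    - '@': Type II TA pair marker (placed at the start of forward-strand operons and at the end of reverse-strand operons)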

### Basic set up

Hyperparameters for Evo (8k)

In [2]:
# Target length, in characters, of one training string. Shorter strings are padded
# with spaces up to this length; longer strings are dropped when the JSON is exported.
MAX_CONTEXT_LENGTH = 1024 # Context length for finetuning (including special tokens)

# Single-character markers inserted into the training text.
# Keys are the gene kind ('T' / 'AT') or the TA system type as stored in the 'type' column.
TA_SPECIAL_TOKEN = {
    'T': '`', # Toxin gene
    'AT': '!', # Antitoxin gene
    '2' : '@' # Type II Toxin-Antitoxin
}

File directories and other information

In [1]:
# Local import: this cell sits above the notebook's import cell, so `os` is not
# guaranteed to be in scope yet on a fresh "Run All".
import os

# Define file paths.
# Paths are relative to the notebook's working directory and are built with
# os.path.join so they resolve on POSIX as well as on Windows (on Windows the
# resulting strings are identical to the previous hard-coded '..\\dir\\file' form).
_RAW_DATA_DIR = os.path.join('..', 'raw_data')
_DATA_DIR = os.path.join('..', 'data')

at_exp_file_path = os.path.join(_RAW_DATA_DIR, 'type_II_AT_exp_nucl.fas')
at_pre_file_path = os.path.join(_RAW_DATA_DIR, 'type_II_AT_pre_nucl.fas')
t_exp_file_path = os.path.join(_RAW_DATA_DIR, 'type_II_T_exp_nucl.fas')
t_pre_file_path = os.path.join(_RAW_DATA_DIR, 'type_II_T_pre_nucl.fas')

csv_output = os.path.join(_DATA_DIR, 'processed_data.csv')
genome_csv_output = os.path.join(_DATA_DIR, 'NCBI_genome_data.csv')
json_output = os.path.join(_DATA_DIR, 'training_data_2.json')

# NCBI Log in info
# The contact e-mail sent to NCBI Entrez can be overridden with the NCBI_EMAIL
# environment variable so other users do not have to edit the notebook; the
# previous hard-coded address remains the fallback.
email_id = os.environ.get('NCBI_EMAIL', 'chang.m.yun@stanford.edu')

Import modules

In [2]:
import argparse
from Bio import SeqIO, Entrez
import gzip
import json
import os
import pandas as pd
import re

### Read FASTA files

Open, parse, and store as pandas dataFrames

In [5]:
# Functions
def parse_description(description=str):
    '''
    Split a FASTA description line into its five fields.

    Expected layout: "<code> <accession>:[c]<start>-<end> [<species>]",
    where the optional "c" before the coordinates is accepted and discarded.

    Note: `description=str` is a default value (the `str` type itself), not a
    type annotation; callers are expected to always pass a string.

    Returns:
        Tuple of strings (code, accession, start, end, species) when the line
        matches the layout, otherwise None.
    '''
    header_layout = r'^(\w+)\s+(\S+):c?(\d+)-(\d+)\s+\[(.*?)\]$'

    found = re.match(header_layout, description)

    # Guard clause: anything that does not follow the layout yields None
    if found is None:
        return None

    # All five capture groups are mandatory, so groups() is exactly
    # (code, accession, start, end, species)
    return found.groups()

def fasta_to_df(fasta_file_path=str, A_T=str, source=str, tat_type=str):
    '''
    Convert a TADB FASTA file into a pandas DataFrame, one row per record.

    Args:
        fasta_file_path: Path of the FASTA file to read.
        A_T: 'AT' for an antitoxin file; any other value is treated as a toxin
            file. Decides how many leading characters of the record code are
            stripped to obtain the pair number (2 for 'AT', otherwise 1).
        source: Label stored verbatim in the 'source' column (e.g. 'exp').
        tat_type: Label stored verbatim in the 'type' column (e.g. '2').

    Returns:
        DataFrame with columns
        ['type', 'source', 'pair_no', 'accession', 'start', 'end', 'seq'];
        'start' and 'end' hold integers, every other column holds strings.

    Raises:
        ValueError: If a record's description does not match the layout that
            parse_description understands (previously this surfaced as an
            opaque "cannot unpack non-iterable NoneType" TypeError).
    '''
    columns = ['type', 'source', 'pair_no', 'accession', 'start', 'end', 'seq']

    # Collect plain dicts and build the frame once at the end; growing a
    # DataFrame with pd.concat inside the loop is quadratic in the record count.
    records = []

    with open(fasta_file_path, "r") as fasta_handle:
        for fasta in SeqIO.parse(fasta_handle, 'fasta'):
            description = str(fasta.description)
            if not description:
                continue

            parsed = parse_description(description)
            if parsed is None:
                raise ValueError(
                    f"Unrecognised FASTA description in {fasta_file_path!r}: {description!r}"
                )
            code, accession, start, end, _species = parsed  # species name is not kept

            # Record codes carry a gene-kind prefix ('AT…' or 'T…'); the rest is the pair number
            pair_no = code[2:] if A_T == 'AT' else code[1:]

            records.append({
                'type': str(tat_type),
                'source': str(source),
                'pair_no': pair_no,
                'accession': str(accession),
                'start': int(start),
                'end': int(end),
                'seq': str(fasta.seq),
            })

    # Passing `columns` keeps the schema stable even when the file has no records
    return pd.DataFrame(records, columns=columns)

# Open each experimentally validated FASTA file and parse it into a pandas DataFrame
at_exp_df = fasta_to_df(at_exp_file_path, 'AT', 'exp', '2')
t_exp_df = fasta_to_df(t_exp_file_path, 'T', 'exp', '2')

# In silico predicted pairs are not loaded yet.
# Kept as real comments rather than a bare string literal: a string as the last
# expression of a cell is echoed as cell output. The calls are also updated to the
# four-argument signature of fasta_to_df (the 'pre' source label is a suggestion —
# TODO confirm before enabling).
# at_pre_df = fasta_to_df(at_pre_file_path, 'AT', 'pre', '2')
# t_pre_df = fasta_to_df(t_pre_file_path, 'T', 'pre', '2')

'\nat_pre_df = fasta_to_df(at_pre_file_path)\nt_pre_df = fasta_to_df(t_pre_file_path)\n'

Pair Toxin-Antitoxin (TAT) pairs together

In [6]:
# Join each antitoxin record with its toxin partner.
# A pair is identified by (type, source, pair_no, accession); the remaining columns
# (start, end, seq) exist on both sides and receive the '_at' / '_t' suffixes.
exp_paired_df = at_exp_df.merge(
    t_exp_df,
    how='inner',
    on=['type', 'source', 'pair_no', 'accession'],
    suffixes=('_at','_t'),
)
'''
pre_paired_df = pd.merge(at_pre_DF, t_pre_DF, on=['type', 'source', 'pair_no', 'accession'], how='inner', suffixes=('_at','_t'))

# Combine experimental + in silico predicted pairs
tat_paired_df = pd.concat([exp_paired_df, pre_paired_df], ignore_index=True)
'''
# Temporary until the predicted pairs above are enabled: use the experimental pairs only
tat_paired_df = exp_paired_df

Identify operon strand in genome: Forward strand, Reverse strand

In [7]:
# Function
def strand_direction(row=pd.Series, A_T=str):
    '''
    Classify the strand of one gene of a toxin-antitoxin pair from its coordinates.

    Args:
        row: Mapping (DataFrame row or dict) holding 'start_at'/'end_at' for the
            antitoxin and 'start_t'/'end_t' for the toxin.
        A_T: 'AT' to classify the antitoxin gene, 'T' to classify the toxin gene.

    Returns:
        'F' when start < end, 'R' when start > end.

    Raises:
        ValueError: If A_T is neither 'AT' nor 'T', or if start and end cannot be
            ordered (equal or missing values). These cases previously failed with
            an UnboundLocalError.
    '''
    column_suffix = {'AT': 'at', 'T': 't'}
    if A_T not in column_suffix:
        raise ValueError(f"A_T must be 'AT' or 'T', got {A_T!r}")

    suffix = column_suffix[A_T]
    start, end = row['start_' + suffix], row['end_' + suffix]

    if start < end:
        return 'F'
    if start > end:
        return 'R'

    # Equal coordinates (or NaN, for which both comparisons are False) carry no strand information
    raise ValueError(
        f"Cannot determine strand for {A_T} gene: start={start!r}, end={end!r}"
    )

# Label every pair with the strand of each of its genes ('F' forward / 'R' reverse),
# antitoxin first, then toxin
for gene_kind, dir_column in (('AT', 'dir_at'), ('T', 'dir_t')):
    tat_paired_df[dir_column] = tat_paired_df.apply(strand_direction, axis=1, args=(gene_kind,))

Find operon position in genome

In [8]:
# Sanity check: list the index labels of pairs whose two genes were assigned
# different strands. An empty list means every pair lies on a single strand.
mask = tat_paired_df['dir_at'].ne(tat_paired_df['dir_t'])
print(tat_paired_df[mask].index.tolist())

# The operon direction is taken from the toxin gene
tat_paired_df['dir'] = tat_paired_df['dir_t']

[]


In [9]:
# Operon span on the genome: the smallest and largest of the four gene coordinates.
# All four are inspected because start/end swap roles between forward and reverse genes.
tat_paired_df['start_operon'] = tat_paired_df.apply(
    lambda pair: min(pair[col] for col in ('start_at', 'start_t', 'end_at', 'end_t')),
    axis=1,
)
tat_paired_df['end_operon'] = tat_paired_df.apply(
    lambda pair: max(pair[col] for col in ('start_at', 'start_t', 'end_at', 'end_t')),
    axis=1,
)
# Both ends are counted, hence the + 1
tat_paired_df['len_operon'] = tat_paired_df['end_operon'].sub(tat_paired_df['start_operon']).add(1)

'''
# Select max operon length (= context length for all operons)
max_len_operon = tat_paired_df['len_operon'].max()
if max_len_operon < MAX_SEQ_LENGTH:
    max_len = max_len_operon
else:
    max_len = MAX_SEQ_LENGTH
'''

"\n# Select max operon length (= context length for all operons)\nmax_len_operon = tat_paired_df['len_operon'].max()\nif max_len_operon < MAX_SEQ_LENGTH:\n    max_len = max_len_operon\nelse:\n    max_len = MAX_SEQ_LENGTH\n"

### Find genomic regions

Identify and fetch relevant genomes

In [64]:
# Function
def NCBI_genome(accession=str):
    '''
    Download one nucleotide record from NCBI Entrez and return its sequence.

    Args:
        accession: Identifier passed to Entrez.efetch as `id` (nucleotide database).

    Returns:
        The record's sequence as a plain string.

    Side effects:
        Sets Entrez.email to the notebook-level `email_id` and performs a
        network request.
    '''
    Entrez.email = email_id
    handle = Entrez.efetch(db="nucleotide", id=accession, rettype="fasta", retmode="text")
    try:
        # SeqIO.read raises if the response does not contain exactly one record;
        # the finally clause makes sure the connection is released in that case too.
        record = SeqIO.read(handle, "fasta")
    finally:
        handle.close()
    return str(record.seq)

# One row per distinct accession number found among the paired records
genome_df = tat_paired_df['accession'].drop_duplicates().to_frame()

# Download the genome sequence of every accession (one NCBI request per row)
genome_df['seq'] = genome_df['accession'].map(NCBI_genome)

Export NCBI genome sequences as CSV file

In [181]:
# index=False: the row index carries no information, and writing it makes the CSV
# come back with a spurious 'Unnamed: 0' column when it is read again.
genome_df.to_csv(genome_csv_output, index=False)

Import NCBI genome sequences from CSV file

In [10]:
# Skip the unnamed index column that CSVs exported with the row index contain,
# so only the real data columns are loaded.
genome_df = pd.read_csv(
    genome_csv_output,
    usecols=lambda column: not str(column).startswith('Unnamed:'),
)

Identify context length regions (+special token)

In [11]:
# Assign full genome to each TAT pair.
#
# The genome sequence column is renamed to 'seq_genome' explicitly. Merge suffixes are
# only applied to column names present in BOTH frames; the pair table holds 'seq_at' /
# 'seq_t' but no plain 'seq', so relying on suffixes leaves the genome column named
# 'seq' and the later row['seq_genome'] lookup fails on a fresh run.
#
# how='left' keeps exactly one row per pair (in the pair table's order); an outer join
# would add all-NaN rows for any genome in the CSV that has no pair.
# Dropping a pre-existing 'seq_genome' column makes the cell safe to re-run.
tat_paired_df = pd.merge(
    tat_paired_df.drop(columns=['seq_genome'], errors='ignore'),
    genome_df.rename(columns={'seq': 'seq_genome'})[['accession', 'seq_genome']],
    on=['accession'],
    how='left',
)

In [12]:
# Function
def assemble_seq_context(row=pd.Series):
    '''
    Build the token-annotated training string for one toxin-antitoxin pair.

    Layout (text is taken from the genome string as stored; the gene with the
    lower genome coordinate always comes first):

        dir == 'F':  <type token> <gene 1> <spacer> <gene 2>
        dir == 'R':  <gene 1> <spacer> <gene 2> <type token>

    Each gene is wrapped in its own token on both sides (toxin token or
    antitoxin token), and <spacer> is the genome text strictly between the
    two genes (empty when the genes are adjacent or overlap).

    Coordinates are treated as 1-based and inclusive, consistent with
    len_operon = end_operon - start_operon + 1. The previous slices used them
    as 0-based indices, which dropped the first base of each gene and one base
    at each end of the spacer.

    NOTE(review): reverse-strand pairs are emitted as stored genome text, i.e.
    they are not reverse-complemented (unchanged from the previous
    implementation) — confirm this is intended for training.

    Args:
        row: Mapping (DataFrame row or dict) with keys 'type', 'dir',
            'start_t', 'end_t', 'start_at', 'end_at' and 'seq_genome'.

    Returns:
        The assembled string, right-padded with spaces to MAX_CONTEXT_LENGTH.
        Strings already at least that long are returned unpadded.

    Raises:
        ValueError: If 'dir' is not 'F'/'R', the genome sequence is missing,
            a coordinate is below 1, or both genes start at the same position.
            (The last case and an unknown 'dir' previously raised
            UnboundLocalError.)
    '''
    # Tokens
    token_type = TA_SPECIAL_TOKEN[row['type']]
    token_t = TA_SPECIAL_TOKEN['T']
    token_at = TA_SPECIAL_TOKEN['AT']

    pair_label = row.get('pair_no')

    direction = row['dir']
    if direction not in ('F', 'R'):
        raise ValueError(f"Pair {pair_label!r}: unknown operon direction {direction!r}")

    genome = row['seq_genome']
    if not isinstance(genome, str):
        raise ValueError(f"Pair {pair_label!r}: no genome sequence available")

    # Normalise each gene to (low, high) so both strands share one code path;
    # int() also accepts coordinates that were upcast to float by a merge.
    t_lo, t_hi = sorted((int(row['start_t']), int(row['end_t'])))
    at_lo, at_hi = sorted((int(row['start_at']), int(row['end_at'])))

    if min(t_lo, at_lo) < 1:
        # A coordinate of 0 would turn into index -1 and silently slice from the genome's end
        raise ValueError(f"Pair {pair_label!r}: coordinates must be 1-based (>= 1)")
    if t_lo == at_lo:
        raise ValueError(f"Pair {pair_label!r}: toxin and antitoxin start at the same position")

    # 1-based inclusive [lo, hi]  ->  Python slice [lo - 1 : hi]
    tagged_t = token_t + genome[t_lo - 1:t_hi] + token_t
    tagged_at = token_at + genome[at_lo - 1:at_hi] + token_at

    # Spacer = bases strictly between the two genes: positions hi1 + 1 .. lo2 - 1,
    # i.e. slice [hi1 : lo2 - 1]; an inverted slice (overlapping genes) is ''.
    if t_lo < at_lo:
        operon = tagged_t + genome[t_hi:at_lo - 1] + tagged_at
    else:
        operon = tagged_at + genome[at_hi:t_lo - 1] + tagged_t

    # The type token sits before the operon for forward pairs and after it for reverse pairs
    seq_context = token_type + operon if direction == 'F' else operon + token_type

    # Pad with spaces up to the context length; longer strings are left as they are
    return seq_context.ljust(MAX_CONTEXT_LENGTH)

# Build the tagged, space-padded training string for every pair (row-wise apply)
tat_paired_df['seq_context'] = tat_paired_df.apply(assemble_seq_context, axis='columns')

Export as CSV file

In [92]:
# tat_paired_df.to_csv(csv_output)

### Export as JSON file

In [13]:
# Functions
def write_json(data, OUTPUT_PATH):
    '''
    Serialise `data` to OUTPUT_PATH as a single JSON document.

    The file is overwritten on every call. It was previously opened in append
    mode, so re-running the export cell concatenated a second JSON document
    onto the first and left a file that no JSON parser accepts.

    NOTE(review): the whole list is written as one JSON array. If the downstream
    tokenisation script expects one JSON object per line (JSONL), this needs to
    change — confirm against the fine-tuning pipeline.

    Args:
        data: Any json-serialisable object (here: a list of record dicts).
        OUTPUT_PATH: Destination file path.
    '''
    with open(OUTPUT_PATH, 'w') as file:
        json.dump(data, file)

def convert_df_to_list(df=pd.DataFrame):
    '''
    Turn the pair table into a list of {'record', 'text'} dicts for export.

    Rows whose 'seq_context' is longer than MAX_CONTEXT_LENGTH are left out;
    'record' is the row's 'pair_no' and 'text' its 'seq_context'.
    '''
    return [
        {'record': pair['pair_no'], 'text': pair['seq_context']}
        for _, pair in df.iterrows()
        if len(pair['seq_context']) <= MAX_CONTEXT_LENGTH
    ]

# Keep only the pairs that fit the context length, as a list of record dicts
data_list = convert_df_to_list(df=tat_paired_df)

# Serialise that list to the JSON output path
write_json(data=data_list, OUTPUT_PATH=json_output)

### Miscellaneous

In [202]:
'''
# From Evo CRISPR training
def parse_fasta_with_biopython(fname, id_to_cas_id):
    records = []

    if fname.endswith('.gz'):
        file_ = gzip.open(fname, 'rt')
    else:
        file_ = open(fname, 'r')

    for record in SeqIO.parse(file_, 'fasta'):
        seq = str(record.seq)

        if 'NNN' in seq:
            continue

        if len(seq) >= MAX_seq_LENGTH - 2: # Minus start and end tokens.
            extra = len(seq) - (MAX_seq_LENGTH - 2)
            seq = seq[:-extra]

        cas_id = id_to_cas_id[record.id]

        seq = CAS_ID_TO_START_TOKEN[cas_id] + seq # Encode start token.
        # Stop token (EOD) is appended in downstream preprocess_data.py script.

        records.append({
            'record': record.id,
            'text': seq,
        })

    file_.close()

    return records


def write_json(fname, data, output_path):
    with open(output_path, 'at') as f:
        for record in data:
            f.write(json.dumps(record) + '\n')


def process_file(fname, id_to_cas_id, output_file):
    parsed_data = parse_fasta_with_biopython(fname, id_to_cas_id)
    write_json(fname, parsed_data, output_file)


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Parse Cas sequences and output JSONL.")
    parser.add_argument("fasta_path", type=str,
                        help="Path to directory containing input FASTA file.")
    parser.add_argument("metadata_path", type=str,
                        help="Path to Cas metadata file.")
    parser.add_argument("output_path", type=str,
                        help="Path to output JSON file.")
    args = parser.parse_args()

    id_to_cas_id = load_metadata(args.metadata_path)

    process_file(args.fasta_path, id_to_cas_id, args.output_path)
'''

usage: ipykernel_launcher.py [-h] fasta_path metadata_path output_path
ipykernel_launcher.py: error: the following arguments are required: fasta_path, metadata_path, output_path


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
