## Retain Dataset
- Select sequences from species other than Salmonella enterica from the 16S rRNA and plasmid datasets.
- Focus on the bacterial species with the most frequently observed plasmids (i.e., the most common host species in the IMG/PR dataset).
- Filter out very long plasmid sequences with length > 100,000 bp.

| Species name   | Description                        |
|----------------|------------------------------------|
| Staphylococcus aureus     | Causative agent of staph infections           |
| Klebsiella pneumoniae     |  Usually harmless in intestines but can cause serious infections in the lungs and kidneys           |
| Escherichia coli      | Mostly harmless but some strains can cause food poisoning           |
| Pseudomonas aeruginosa      | An opportunistic pathogen that causes a wide range of infections in humans and animals           |
| Yersinia pestis      |  Responsible for plague          |
| Bifidobacterium longum      | Non-pathogenic probiotic bacteria           |
| Lactococcus lactis      | Lactic acid bacteria widely used in the dairy industry           |

In [None]:
# Directories holding the per-species FASTA inputs. Both end with "/" because
# file paths are built by plain string concatenation.
ssu_fasta_dir = "/temp/evo-circuit-breaker/data/ssu_fasta/"
plasmid_fasta_dir = "/temp/evo-circuit-breaker/data/plasmid_fasta/"

# Abbreviated species name -> semicolon-separated lineage string
# (d__ domain through s__ species).
species_to_prefix = {
    'S.aureus': 'd__Bacteria;p__Firmicutes;c__Bacilli;o__Staphylococcales;f__Staphylococcaceae;g__Staphylococcus;s__Staphylococcus aureus',
    'E.coli': 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli',
    'B.longum': 'd__Bacteria;p__Actinobacteriota;c__Actinomycetia;o__Actinomycetales;f__Bifidobacteriaceae;g__Bifidobacterium;s__Bifidobacterium longum',
    'P.aeruginosa': 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Pseudomonadales;f__Pseudomonadaceae;g__Pseudomonas;s__Pseudomonas aeruginosa',
    'K.pneumoniae': 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Klebsiella;s__Klebsiella pneumoniae',
    'Y.pestis': 'd__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Yersinia;s__Yersinia pestis',
    'L.lactis': 'd__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Lactococcus;s__Lactococcus lactis',
}

# Species eligible for sampling: the keys of the lineage table, in insertion order.
retain_species = list(species_to_prefix)


In [None]:
import random
import json
from Bio import SeqIO

def get_random_species():
    """Return one entry of the module-level ``retain_species`` list, chosen at random."""
    chosen_species = random.choice(retain_species)
    return chosen_species

def get_random_sequence(fasta_file):
    """Return one randomly chosen record from a FASTA file.

    The whole file is parsed into memory before a record is picked.

    Args:
        fasta_file: Path (or handle) of the FASTA file, passed to ``SeqIO.parse``.

    Returns:
        A single record from the file.
    """
    record_iterator = SeqIO.parse(fasta_file, "fasta")
    all_records = list(record_iterator)
    picked_record = random.choice(all_records)
    return picked_record

def generate_random_sequences(num_sequences, output_json_file="salmonella_retain_dataset.json"):
    """Build random 16S + plasmid prompt/completion samples and save them as JSON.

    For each sample a species is drawn with ``get_random_species()``, one
    record is drawn from that species' 16S FASTA file and one from its plasmid
    FASTA file, and the two sequences are concatenated (16S first). The
    combined sequence is then cut into a prompt and a completion at a random
    point between 50% and 80% of its length.

    Args:
        num_sequences: Number of samples to generate.
        output_json_file: Path of the JSON file to write. The default is the
            file name this function has always written.

    Returns:
        None. The samples are written to ``output_json_file`` as a JSON list
        of objects with the keys ``species_prefix``, ``16S_strain_taxonomy``,
        ``plasmid_id``, ``sequence_prompt`` and ``sequence_completion``.
    """
    # Parse each FASTA file at most once and reuse the records for later
    # draws; re-reading both files for every sample repeats identical work.
    records_by_path = {}

    def load_records(fasta_file):
        # Return the cached record list for this path, parsing on first use.
        if fasta_file not in records_by_path:
            records_by_path[fasta_file] = list(SeqIO.parse(fasta_file, "fasta"))
        return records_by_path[fasta_file]

    sampled_data = []
    for _ in range(num_sequences):
        species = get_random_species()

        # One random record from each of the species' two files.
        ssu_seq = random.choice(load_records(ssu_fasta_dir + species + ".ssu.fasta"))
        plasmid_seq = random.choice(load_records(plasmid_fasta_dir + species + ".plasmid.fasta"))

        combined_seq = ssu_seq.seq + plasmid_seq.seq

        # Arbitrary split point: the prompt gets 50-80% of the combined sequence.
        split_point = int(random.uniform(0.5, 0.8) * len(combined_seq))

        sampled_data.append({
            "species_prefix": species_to_prefix[species],
            # Text after the first space of the FASTA header; an empty string
            # (rather than an IndexError) when the header has nothing after the ID.
            "16S_strain_taxonomy": ssu_seq.description.partition(" ")[2],
            # Keep only the part of the plasmid ID before the first '|'.
            "plasmid_id": plasmid_seq.id.split('|')[0],
            "sequence_prompt": str(combined_seq[:split_point]),
            "sequence_completion": str(combined_seq[split_point:]),
        })

    with open(output_json_file, "w") as json_file:
        json.dump(sampled_data, json_file, indent=4)

# Number of synthetic retain samples to generate and write to disk.
num_synthetic_seqs = 1000
generate_random_sequences(num_sequences=num_synthetic_seqs)

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Circuit-breaker samples: JSON file -> DataFrame -> Dataset.
with open('salmonella_cb_dataset.json', 'r') as f:
    circuit_breaker_data = json.load(f)
circuit_breaker_df = pd.DataFrame(circuit_breaker_data)
circuit_breaker_dataset = Dataset.from_pandas(circuit_breaker_df)

# Retain samples: same three steps.
with open('salmonella_retain_dataset.json', 'r') as f:
    retain_data = json.load(f)
retain_df = pd.DataFrame(retain_data)
retain_dataset = Dataset.from_pandas(retain_df)

# Bundle both datasets into one DatasetDict, keyed by split name.
datasets = DatasetDict({
    "circuit_breaker": circuit_breaker_dataset,
    "retain": retain_dataset,
})

# Push the datasets to the Hugging Face Hub
datasets.push_to_hub("onuralp/evo-circuit-breaker")