In [None]:
from wit import Wit
import json
import random
import requests
from Bio import Entrez
import string

# Read the Wit.ai token from a local credentials file so that it never
# appears in the notebook itself.
with open("creds.json", "r") as file:
    credentials = json.load(file)
access_token = credentials["wit_access_token"]

In [None]:
# Build the Wit.ai client and probe it with a single sample utterance; the
# parsed response is the value displayed by this cell.
client = Wit(access_token=access_token)
sample_utterance = "Please get a plasmid from AddGene: 32515"
client.message(sample_utterance)

{'entities': {'syc_database:syc_database': [{'body': 'AddGene',
    'confidence': 0.9995,
    'end': 33,
    'entities': [],
    'id': '1362456991854582',
    'name': 'syc_database',
    'role': 'syc_database',
    'start': 26,
    'type': 'value',
    'value': 'AddGene'}],
  'wit$number:number': [{'body': '32515',
    'confidence': 0.9984034184705346,
    'end': 40,
    'entities': [],
    'id': '1323047038868057',
    'name': 'wit$number',
    'role': 'number',
    'start': 35,
    'type': 'value',
    'value': 32515}]},
 'intents': [{'confidence': 0.9999985694905718,
   'id': '609306308290239',
   'name': 'RepositoryIdSource'}],
 'text': 'Please get a plasmid from AddGene: 32515',
 'traits': {}}

In [None]:
def save_template(template: str):
    """Append one utterance, followed by a newline, to the utterance dataset.

    Args:
        template: The (already formatted) utterance text to store.
    """
    record = template + "\n"
    with open("datasets/database_utterances.tsv", "a") as dataset:
        dataset.write(record)

def save_ids(ids: list, db: str):
    """Append identifiers, one per line, to ``datasets/{db}_ids.tsv``.

    Args:
        ids: Identifiers to store; each one is rendered with ``str.format``
            semantics via an f-string.
        db: Database label used as the file-name prefix.
    """
    with open(f"datasets/{db}_ids.tsv", "a") as id_file:
        id_file.writelines(f"{identifier}\n" for identifier in ids)

def random_addgene_id(timeout: float = 10):
    """Return a random integer for which the AddGene page answers HTTP 200.

    Candidates are drawn uniformly from 1..999999 and probed with a GET
    request to ``https://www.addgene.org/{candidate}`` until one returns
    status code 200.

    Args:
        timeout: Seconds to wait for each HTTP request. Without a timeout a
            single stalled connection would block the notebook indefinitely.

    Returns:
        The first candidate integer whose page responded with status 200.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example on a timeout or a connection failure).
    """
    while True:
        candidate = random.randint(1, 999999)
        response = requests.get(f"https://www.addgene.org/{candidate}", timeout=timeout)
        if response.status_code == 200:
            return candidate

def get_random_genbank_accession(validate=True):
    """Generate a random GenBank-style accession, optionally checked via Entrez.

    The accession is built from two random uppercase letters, six random
    digits and the fixed version suffix ``.1`` (for example ``AB123456.1``).

    Args:
        validate: When true, keep generating candidates until
            ``Entrez.efetch`` on the ``nucleotide`` database returns a
            non-empty record for one of them. When false, return the first
            candidate without contacting Entrez.

    Returns:
        The accession string.
    """
    Entrez.email = "dgruano.code@gmail.com"  # Replace with your email

    while True:
        # Generate a random accession number
        prefix = ''.join(random.choices(string.ascii_uppercase, k=2))
        digits = ''.join(random.choices(string.digits, k=6))
        version = 1
        accession = f"{prefix}{digits}.{version}"

        if not validate:
            return accession

        # Validate the accession number
        try:
            handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text")
            try:
                record = handle.read()
            finally:
                # Close the handle even when reading fails.
                handle.close()
        except Exception:
            # Lookup failed: try another candidate. `Exception` (rather than a
            # bare `except`) keeps Ctrl-C / kernel interrupt able to stop the loop.
            continue

        if record:
            return accession

# Keyboard-neighbour map used to simulate mistyped keys: each lowercase letter
# maps to the characters that may be produced in its place.
querty_typos = {
    'q': ['1', '2', 'w', 's', 'a'],
    'w': ['2', '3', 'e', 's', 'a', 'q'],
    'e': ['3', '4', 'r', 'd', 's', 'w'],
    'r': ['4', '5', 't', 'f', 'd', 'e'],
    't': ['5', '6', 'y', 'g', 'f', 'r'],
    'y': ['6', '7', 'u', 'h', 'g', 't'],
    'u': ['7', '8', 'i', 'j', 'h', 'y'],
    'i': ['8', '9', 'o', 'k', 'j', 'u'],
    'o': ['9', '0', 'p', 'l', 'k', 'i'],
    'p': ['0', 'l', 'o'],
    'a': ['q', 'w', 's', 'x', 'z'],
    's': ['w', 'e', 'd', 'x', 'z', 'a'],
    'd': ['e', 'r', 'f', 'c', 'x', 's'],
    'f': ['r', 't', 'g', 'v', 'c', 'd'],
    'g': ['t', 'y', 'h', 'b', 'v', 'f'],
    'h': ['y', 'u', 'j', 'n', 'b', 'g'],
    'j': ['u', 'i', 'k', 'm', 'n', 'h'],
    'k': ['i', 'o', 'l', 'm', 'j'],
    'l': ['o', 'p', 'k'],
    'z': ['a', 's', 'x'],
    'x': ['z', 's', 'd', 'c'],
    'c': ['x', 'd', 'f', 'v'],
    'v': ['c', 'f', 'g', 'b'],
    'b': ['v', 'g', 'h', 'n'],
    'n': ['b', 'h', 'j', 'm'],
    'm': ['n', 'j', 'k']
}


def introduce_typos(utterance, typo_rate=0.1):
    """Return a copy of ``utterance`` with random keyboard-style typos.

    Every position of the input is visited once. With probability
    ``typo_rate`` one of four equally likely typos is applied there:

    * ``replace`` - the character is replaced by a neighbour from
      ``querty_typos``;
    * ``swap``    - the character is exchanged with the following one
      (no-op on the last position);
    * ``remove``  - the character is dropped;
    * ``insert``  - a neighbouring character is inserted before it.

    Characters without an entry in ``querty_typos`` (uppercase letters,
    digits, punctuation, spaces) use themselves as their only "neighbour".

    Args:
        utterance: Text to corrupt.
        typo_rate: Per-position probability of introducing a typo.

    Returns:
        The corrupted string.
    """
    typo_types = ('replace', 'swap', 'remove', 'insert')
    characters = list(utterance)
    # Build the result separately: inserting into `characters` while indexing
    # over it would shift later positions and leave the tail unprocessed.
    result = []
    last_index = len(characters) - 1
    for i in range(len(characters)):
        if random.random() >= typo_rate:
            result.append(characters[i])
            continue

        # The four typo kinds are equally likely, so a plain uniform choice is
        # enough (stdlib `random.choice` accepts no weights argument).
        typo_type = random.choice(typo_types)
        if typo_type == 'swap':
            if i < last_index:
                characters[i], characters[i + 1] = characters[i + 1], characters[i]
            result.append(characters[i])
        elif typo_type == 'replace':
            result.append(random.choice(querty_typos.get(characters[i], [characters[i]])))
        elif typo_type == 'remove':
            continue
        else:
            # Insert a neighbouring character in front of the current one.
            result.append(random.choice(querty_typos.get(characters[i], [characters[i]])))
            result.append(characters[i])

    return ''.join(result)

In [None]:
# Define templates
# Database-agnostic phrasings: both {seqid} and {db} are substituted with
# str.format when the utterances are generated.
common_templates = [
    "Can you get sequence {seqid} from {db}?",
    "Please get sequence {seqid} from {db}.",
    "Download sequence {seqid} from {db}.",
    "Get {seqid} from {db}",
    "Retrieve {seqid} from {db}",
    "Download from {db} sequence {seqid}",
    "{seqid} from {db}",
    "Import sequence {seqid} from {db}",
]

# AddGene-only phrasings: the repository name is baked into the text, so only
# {seqid} is substituted.
addgene_templates = (
    # Full-sentence requests
    [
        "Can you get plasmid {seqid} from AddGene?",
        "Please get plasmid {seqid} from AddGene.",
        "Download plasmid {seqid} from AddGene.",
    ]
    # Terse commands and fragments
    + [
        "Get {seqid} from AddGene",
        "Retrieve {seqid} from AddGene",
        "Download from AddGene plasmid {seqid}",
        "{seqid} from AddGene",
        "Import plasmid {seqid} from AddGene",
    ]
)

# GenBank-only phrasings, grouped by the kind of molecule they mention; only
# {seqid} is substituted.
GenBank_templates = (
    # Gene requests
    [
        "Get gene {seqid} from GenBank",
        "Retrieve gene {seqid} from GenBank",
        "Download gene {seqid} from GenBank",
    ]
    # mRNA requests
    + [
        "Can you get the mRNA {seqid} from GenBank?",
        "Please get the mRNA {seqid} from GenBank.",
        "Download the mRNA {seqid} from GenBank.",
    ]
    # cDNA requests
    + [
        "Import the cDNA with id {seqid} from GenBank",
        "Go to GenBank and get the cDNA with id {seqid}",
    ]
)

# Spelling variants (including common misspellings) used in place of each
# database's canonical name when filling the {db} slot.
synonyms = dict(
    AddGene=[
        "AddGene",
        "Add Gene",
        "Addgene",
        "addgene",
        "Adgene",
        "adgene",
        "addgen",
    ],
    GenBank=[
        "GenBank",
        "Gen Bank",
        "Genbank",
        "NCBI",
        "Gene Bank",
        "gene bank",
        "genbank",
        "GeneBank",
    ],
)

In [None]:
# db = "AddGene"
# Draw one AddGene id (checked by random_addgene_id) per generic template and
# append the ids to the AddGene id file.
addgene_ids = [random_addgene_id() for _ in range(len(common_templates))]
save_ids(addgene_ids, "AddGene")

# Fill every generic template with its id and a randomly chosen spelling of
# the database name, store the utterance and send it to Wit.ai.
for c, template in enumerate(common_templates):
    seqid = addgene_ids[c]
    db = random.choice(synonyms["AddGene"])
    t = template.format(seqid=seqid, db=db)
    save_template(t)
    client.message(t)  # Send the template to the Wit.ai

# Fresh batch of ids for the AddGene-specific phrasings.
addgene_ids = [random_addgene_id() for _ in range(len(addgene_templates))]
save_ids(addgene_ids, "AddGene")

# pop() takes ids from the end of the list, so they are used in reverse of the
# order in which they were saved; the list is empty once the loop finishes.
for template in addgene_templates:
    t = template.format(seqid=addgene_ids.pop())
    save_template(t)
    client.message(t)  # Send the template to the Wit.ai


# db = "GenBank"
# Same procedure for GenBank: one validated accession per generic template.
genbank_ids = [get_random_genbank_accession() for _ in range(len(common_templates))]
save_ids(genbank_ids, "GenBank")

# NOTE: `seqid`, `db` and `t` are module-level names, so they keep their last
# values after this loop ends.
for c, template in enumerate(common_templates):
    seqid = genbank_ids[c]
    db = random.choice(synonyms["GenBank"])
    t = template.format(seqid=seqid, db=db)
    save_template(t)
    client.message(t)  # Send the template to the Wit.ai


In [None]:
# One validated accession per GenBank-specific phrasing; the ids are recorded
# on disk before the utterances are written.
genbank_ids = [get_random_genbank_accession() for _ in range(len(GenBank_templates))]
save_ids(genbank_ids, "GenBank")

# Pair every template with its own freshly generated accession. Formatting
# with a leftover module-level `seqid` would stamp the same id on all of them.
for template, seqid in zip(GenBank_templates, genbank_ids):
    save_template(template.format(seqid=seqid))

In [None]:
# Replay every stored utterance through the Wit.ai client, echoing each one
# before it is sent.
# NOTE(review): the original heading called this "Load and train the model";
# confirm that `client.message` is the intended call for training.
with open("datasets/database_utterances.tsv", "r") as file:
    for line in map(str.strip, file):
        print(line)
        client.message(line)

Can you get sequence 149455 from AddGene?
Please get sequence 12519 from AddGene.
Download sequence 129652 from AddGene.
Get 203080 from AddGene
Retrieve 102284 from AddGene
Download from AddGene sequence 111200
137135 from AddGene
Import sequence 22161 from AddGene
Can you get sequence 187851 from AddGene?
Please get sequence 230305 from AddGene.
Download sequence 198978 from AddGene.
Get 111548 from AddGene
Retrieve 177777 from AddGene
Download from AddGene sequence 78182
186654 from AddGene
Import sequence 145799 from AddGene
Can you get plasmid 145799 from AddGene?
Please get plasmid 145799 from AddGene.
Download plasmid 145799 from AddGene.
Get 145799 from AddGene
Retrieve 145799 from AddGene
Download from AddGene plasmid 145799
145799 from AddGene
Import plasmid 145799 from AddGene
Can you get sequence GL075182.1 from GenBank?
Please get sequence JB142456.1 from GenBank.
Download sequence CT826467.1 from GenBank.
Get GO564332.1 from GenBank
Retrieve CL357457.1 from GenBank
Downlo