# 1. Creating the SQNce-db file with all tables 
<a class="anchor" id="section1"></a>

In [None]:
import os
import gzip
import sqlite3
import zlib
from sqlite3 import Error
import pandas as pd
from collections import OrderedDict

from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [2]:
# Current implementation requires re-parsing of all the input files to create SQNce
# TODO SQNce update functions to parse input data only if not previously included 

# Establish connection with SQNce.db, generating a new SQLite3 database if needed
def sql_connection():
    """Open (or create) the ``SQNce.db`` SQLite database in the working directory.

    Returns:
        The open ``sqlite3.Connection`` on success, or ``None`` when the
        connection fails (the error is printed rather than raised).
    """
    try:
        con = sqlite3.connect('SQNce.db')
    except Error as err:
        # Print the caught exception instance; printing the ``Error`` class
        # itself would hide the actual reason for the failure.
        print(err)
        return None
    print("Connection established.")
    return con

# Uncomment the next line to delete an existing database file and rebuild it from scratch
# if os.path.exists("SQNce.db"): os.remove("SQNce.db")
# Module-level connection reused by the cells below
con = sql_connection()

Connection established.


# 2. Populate SQNce-db with the required data
* SQNce database is initiated using predefined TSV files
* TSV files either contain the input data or reference input files to parse
* TODO add documentation to SQNce data input parsing

### Table of Contents
<a class="anchor" id="section2"></a>
* [1. Creating the SQNce-db file with all tables](#section1)
* [2. Populate SQNce-db with the required data](#section2)
* [Insert species and genotype IDs](#genotypes)
* [Insert gene genomic coordinates](#coordinates)
* [Insert protein sequences](#proteins)
* [Insert gene family annotations](#families)
* [Insert best blast hits (BBHs)](#BBHs)
* [Insert promoter sequences](#promoters)
* [Gene annotation insert](#annotations)
* [Insert gene symbols](#symbols)
* [Insert GO Terms](#GO)
* [Insert RNA-seq Files](#RNAseq)

# Insert species and genotype IDs 
<a class="anchor" id="genotypes"></a>
* [Go back to section 2](#section2)

In [37]:
# Load the species table. The per-row insert is currently disabled, so the
# loop only walks the rows and writes nothing to the database.
df = pd.read_csv("init/species.tsv", sep="\t")
for index, row in df.iterrows():
    # print(list(row))
    # species_insert(con, entities=list(row))
    pass

# Insert GO Terms
<a class="anchor" id="GO"></a>
* [Go back to section 2](#section2)

In [3]:
# Make sure the genomes table exists before any rows are loaded into it
create_genomes_sql = """CREATE TABLE IF NOT EXISTS genomes(
                     genome_id text,
                     species_name text,
                     genotype_id text,
                     source_id text,
                     file_name text,
                     gene_number integer)"""
con.cursor().execute(create_genomes_sql)
con.commit()

insert_genome_sql = """INSERT INTO genomes(
                             genome_id, species_name, genotype_id,
                             source_id, file_name, gene_number) 
                             VALUES(?, ?, ?, ?, ?, ?)"""

df = pd.read_csv("init/genomes.tsv", sep="\t")
for index, row in df.iterrows():
    row = list(row)
    # Only rows whose fourth field is "Phytozome" are inserted; every other
    # row is skipped without writing anything.
    if row[3] != "Phytozome":
        continue
    # The gene count is the number of distinct locusName values in the
    # annotation table named by the fifth field.
    annot = pd.read_csv("inputs/annotations/"+row[4], sep="\t")
    annot = annot.drop_duplicates(subset="locusName")
    gene_count = annot.shape[0]

    con.cursor().execute(insert_genome_sql, row+[gene_count])
    con.commit()

In [17]:
# Add the GO database OBO file to keep the GO annotations
df = pd.read_csv("inputs/GO/go-basic.csv").drop("Unnamed: 0", axis=1)
df.columns = ["GO_id", "GO_short", "process", "GO_long"]
df.to_sql('GO_basic', con, if_exists='replace', index=False)

con.cursor().execute("CREATE INDEX GO_basic_id ON GO_basic(GO_id)")
con.commit()

In [None]:
# NOTE(review): this cell repeats the genomes cell that appears earlier in the
# notebook; running both inserts every Phytozome genome row twice.
create_genomes_sql = """CREATE TABLE IF NOT EXISTS genomes(
                     genome_id text,
                     species_name text,
                     genotype_id text,
                     source_id text,
                     file_name text,
                     gene_number integer)"""
con.cursor().execute(create_genomes_sql)
con.commit()

insert_genome_sql = """INSERT INTO genomes(
                             genome_id, species_name, genotype_id,
                             source_id, file_name, gene_number) 
                             VALUES(?, ?, ?, ?, ?, ?)"""

df = pd.read_csv("init/genomes.tsv", sep="\t")
for index, row in df.iterrows():
    row = list(row)
    # Rows from any source other than Phytozome are skipped
    if row[3] != "Phytozome":
        continue
    # Count distinct locusName values in the referenced annotation table
    annot = pd.read_csv("inputs/annotations/"+row[4], sep="\t")
    annot = annot.drop_duplicates(subset="locusName")
    gene_count = annot.shape[0]

    con.cursor().execute(insert_genome_sql, row+[gene_count])
    con.commit()

In [8]:
# Table with one row per gene-GO term pair, plus how many genes carry that
# GO term and which genome/source the pair came from.
con.cursor().execute("""CREATE TABLE IF NOT EXISTS gene_GOs(
                     gene_id text, 
                     GO_id text, 
                     GO_count integer,
                     genome_id text,
                     source_id text)""")

# Each row of the init table is read positionally: row[0] is stored as
# genome_id, row[2] as source_id, and row[3] names the annotation file.
df = pd.read_csv("init/GO_files.tsv", sep="\t")
for row in df.values:
    # Check annotation column
    long_list = [] # have a row for each gene-GO combination (NOTE(review): never used below)
    if row[2] == "Phytozome": # currently only support phytozome GO annotations
        annot = pd.read_csv("inputs/annotations/"+row[3], sep="\t")[['locusName', 'GO']]
        # I am assuming that all variants have the same GO term but should double check
        annot = annot.drop_duplicates(subset="locusName")
        # Split the comma-separated GO column into one row per gene-GO pair
        # https://stackoverflow.com/questions/41244981/how-to-extract-comma-separated-values-to-individual-rows-in-pandas
        annot = annot.set_index('locusName').GO.str.split(',', expand=True).stack().reset_index('locusName')
        annot.columns = ["gene_id", "GO"]
        # Count the rows per GO term and merge the count back in; the merged
        # frame's column order (gene, GO, count) is what the INSERT relies on.
        annot = annot.merge(annot.groupby("GO").count().reset_index(), on="GO", how="left")
        annot["genome_id"] = row[0]
        annot["source_id"] = row[2]

        con.cursor().executemany("""INSERT INTO gene_GOs(
                                 gene_id, GO_id, GO_count, genome_id, source_id)  
                                 VALUES(?,?,?,?,?)""", annot.values.tolist())
con.commit()

# Insert gene genomic coordinates
<a class="anchor" id="coordinates"></a>
* [Go back to section 2](#section2)

In [36]:
# Table with one row per gene: its genome, chromosome, start/end and strand
con.cursor().execute("""CREATE TABLE IF NOT EXISTS gene_coordinates(
                     gene_id text,
                     genome_id text,
                     gene_chr text,
                     gene_start integer,
                     gene_end integer,
                     gene_orientation text)""")

# Each row of the init table names one gzipped GFF3 file (4th column) and the
# value stored as genome_id for every gene parsed from it (2nd column).
df = pd.read_csv("init/coordinates.tsv", sep="\t")
for index, row in df.iterrows():
    # Use .iloc for positional access: plain row[3] on a label-indexed Series
    # relies on a deprecated fallback that newer pandas versions reject.
    genome_id = row.iloc[1]
    gff3_name = row.iloc[3]
    gene_coordinate_list = []
    # The context manager closes the gzip handle even if parsing fails
    with gzip.open("inputs/gff3/"+gff3_name, mode='rt') as gff3_file:
        for gene in gff3_file:
            gene = gene.split("\t")
            # A GFF3 feature line has nine tab-separated columns; header,
            # comment and truncated lines are skipped.
            if len(gene) < 9:
                continue
            if gene[2] != "gene":
                continue
            # The gene ID is the Name= entry of the attributes column
            attributes = gene[-1].split(";")
            gene_id = [i for i in attributes if i.startswith('Name=')][0]
            gene_id = gene_id.replace('Name=', '').replace('\n', '')  # delete \n if exists
            # Append list of: gene ID, genome, chromosome, start, end, orientation
            gene_coordinate_list.append([gene_id, genome_id, gene[0], gene[3], gene[4], gene[6]])
    con.cursor().executemany("""INSERT INTO gene_coordinates(
                         gene_id, genome_id, gene_chr, 
                         gene_start, gene_end, gene_orientation) 
                         VALUES(?, ?, ?, ?, ?, ?)""", gene_coordinate_list)

# Index genome/chromosome/position for range lookups. IF NOT EXISTS keeps the
# cell re-runnable, matching the CREATE TABLE statement above.
cursorObj = con.cursor()
cursorObj.execute("CREATE INDEX IF NOT EXISTS coordinate_index_start ON gene_coordinates(genome_id, gene_chr, gene_start)")
cursorObj.execute("CREATE INDEX IF NOT EXISTS coordinate_index_end ON gene_coordinates(genome_id, gene_chr, gene_end)")
con.commit()

# Insert protein sequences
<a class="anchor" id="proteins"></a>
* [Go back to section 2](#section2)

In [35]:
# https://stackoverflow.com/questions/18219779/bulk-insert-huge-data-into-sqlite-using-python
# Table with one row per protein; the sequence is stored zlib-compressed
con.cursor().execute("""CREATE TABLE IF NOT EXISTS protein_seqs(
                     protein_id text,
                     species_id text,
                     genome_id text,
                     protein_length text,
                     protein_sequence blob)""")

# Each row of the init table supplies the species (1st column), the genome
# (2nd column) and the gzipped protein FASTA file to parse (3rd column).
df = pd.read_csv("init/proteins.tsv", sep="\t")
for index, row in df.iterrows():
    # Use .iloc for positional access: plain row[n] on a label-indexed Series
    # relies on a deprecated fallback that newer pandas versions reject.
    species_id, genome_id, fasta_name = row.iloc[0], row.iloc[1], row.iloc[2]
    protein_seq_list = []
    # The context manager closes the gzip handle even if parsing fails
    with gzip.open("inputs/proteins/"+fasta_name, mode='rt') as fasta_file:
        for seq in SeqIO.parse(fasta_file, "fasta"):
            # Protein sequences are saved as a binary data type for compression
            compressed = sqlite3.Binary(zlib.compress(str(seq.seq).encode('utf-8')))
            protein_seq_list.append([seq.id, species_id, genome_id, len(seq.seq), compressed])
    con.cursor().executemany("""INSERT INTO protein_seqs(
                     protein_id, species_id, genome_id, 
                     protein_length, protein_sequence) 
                     VALUES(?, ?, ?, ?, ?)""", protein_seq_list)

# Index the protein ID column. IF NOT EXISTS keeps the cell re-runnable,
# matching the CREATE TABLE statement above.
con.cursor().execute("CREATE INDEX IF NOT EXISTS proteins_ids_index ON protein_seqs(protein_id)")
con.commit()

# Insert gene family annotations
<a class="anchor" id="families"></a>
* [Go back to section 2](#section2)

In [28]:
# Table linking each protein ID to a gene family (ID and name), together with
# the species, genome and source columns recorded for that assignment.
con.cursor().execute("""CREATE TABLE IF NOT EXISTS gene_families(
                     protein_id text,
                     species_id text,
                     genome_id text,
                     source_id text,
                     family_id text,
                     family_name text)""")


def family_annotation_insert(con, entity_list):
    """Placeholder for a family insert helper; it writes no rows.

    It only opens a cursor on ``con`` and commits the current transaction.
    ``entity_list`` is accepted but not used.
    """
    con.cursor()
    con.commit()

# Each row of the init table supplies, in order: species, genome, source and
# the family annotation file to load.
df = pd.read_csv("init/families.tsv", sep="\t")
for index, row in df.iterrows():
    # Use .iloc for positional access: plain row[n] on a label-indexed Series
    # relies on a deprecated fallback that newer pandas versions reject.
    species_id, genome_id, source_id, file_name = row.iloc[:4]
    annot = pd.read_csv("inputs/families/"+file_name, sep="\t")
    # Insert the three constant columns right after the first column so the
    # frame's column order matches the INSERT column list below.
    annot.insert(1, 'species', species_id)
    annot.insert(2, 'genome', genome_id)
    annot.insert(3, 'source', source_id)
    con.cursor().executemany("""INSERT INTO gene_families(
                         protein_id, species_id, genome_id,
                         source_id, family_id, family_name) 
                         VALUES(?, ?, ?, ?, ?, ?)""", annot.values.tolist())
# Index the lookup columns. IF NOT EXISTS keeps the cell re-runnable, in line
# with the CREATE TABLE IF NOT EXISTS used for the table itself.
con.cursor().execute("CREATE INDEX IF NOT EXISTS family_gene_index ON gene_families(protein_id)")
#cursorObj.execute("CREATE INDEX family_species_index ON gene_families(species_id)")
#cursorObj.execute("CREATE INDEX family_genotype_index ON gene_families(genotype_id)")
#cursorObj.execute("CREATE INDEX family_id_index ON gene_families(family_id)")
con.cursor().execute("CREATE INDEX IF NOT EXISTS family_name_index ON gene_families(family_name)")
con.cursor().execute("CREATE INDEX IF NOT EXISTS family_genotype_name_index ON gene_families(genome_id, family_name)")
con.commit()

# Insert best blast hits (BBHs)
<a class="anchor" id="BBHs"></a>
* [Go back to section 2](#section2)

In [25]:
# Table of best blast hits: subject/query gene IDs, the bit score, and the
# genome of each side.
con.cursor().execute("""CREATE TABLE IF NOT EXISTS BBHs(
                     subject_id text,
                     query_id text,
                     bit_score integer,
                     subject_genome text,
                     query_genome text)""")

# The third column of the init table names the BBH file to load; each file is
# inserted as-is, so its column order must match the INSERT column list.
df = pd.read_csv("init/BBHs_files.tsv", sep="\t")
for index, row in df.iterrows():
    # Use .iloc for positional access: plain row[2] on a label-indexed Series
    # relies on a deprecated fallback that newer pandas versions reject.
    bbh_df = pd.read_csv("inputs/BBHs/"+row.iloc[2], sep="\t")
    con.cursor().executemany("""INSERT INTO BBHs(
                     subject_id, query_id, bit_score,
                     subject_genome, query_genome) 
                     VALUES(?, ?, ?, ?, ?)""", bbh_df.values.tolist())

# Index subject and query gene IDs. IF NOT EXISTS keeps the cell re-runnable,
# matching the CREATE TABLE statement above.
con.cursor().execute("CREATE INDEX IF NOT EXISTS BBHs_subject_index ON BBHs(subject_id)")
con.cursor().execute("CREATE INDEX IF NOT EXISTS BBHs_query_index ON BBHs(query_id)")
con.commit()

# Insert promoter sequences
<a class="anchor" id="promoters"></a>
* [Go back to section 2](#section2)

In [23]:
# Table with one row per promoter sequence; sequences are zlib-compressed and
# promoter_kind records which FASTA file ("ATG" or "TSS") the row came from.
con.cursor().execute("""CREATE TABLE IF NOT EXISTS promoter_seqs(
                     protein_id text,
                     genome_id text,
                     promoter_kind text,
                     promoter_length text,
                     promoter_sequence blob)""")

df = pd.read_csv("init/promoters.tsv", sep="\t")
for index, row in df.iterrows():
    # Use .iloc for positional access: plain row[n] on a label-indexed Series
    # relies on a deprecated fallback that newer pandas versions reject.
    genome_id = row.iloc[0]
    promoter_length = row.iloc[4]
    file_prefix = row.iloc[5]
    # Both promoter kinds are loaded the same way from sibling files named
    # <prefix>_1kb_ATG.fasta.gz and <prefix>_1kb_TSS.fasta.gz.
    for promoter_kind in ("ATG", "TSS"):
        fasta_path = "inputs/promoters/"+file_prefix+"_1kb_"+promoter_kind+".fasta.gz"
        promoter_seq_list = []
        # The context manager closes the gzip handle even if parsing fails
        with gzip.open(fasta_path, mode='rt') as fasta_file:
            for seq in SeqIO.parse(fasta_file, "fasta"):
                # TODO change db to number key to avoid duplicated name problems
                if genome_id == "ZmB73v3":
                    seq.id = seq.id + "v3"
                # Promoter sequences are saved as a binary data type for compression
                compressed = sqlite3.Binary(zlib.compress(str(seq.seq).encode('utf-8')))
                promoter_seq_list.append([seq.id, genome_id, promoter_kind,
                                          promoter_length, compressed])
        con.cursor().executemany("""INSERT INTO promoter_seqs(
                     protein_id, genome_id, promoter_kind,
                     promoter_length, promoter_sequence) 
                     VALUES(?, ?, ?, ?, ?)""", promoter_seq_list)

# IF NOT EXISTS keeps the cell re-runnable, matching the CREATE TABLE above
con.cursor().execute("CREATE INDEX IF NOT EXISTS promoter_index ON promoter_seqs(protein_id, promoter_kind)")
con.commit()

# Gene annotation insert
<a class="anchor" id="annotations"></a>
* [Go back to section 2](#section2)

In [20]:
# Table with one annotation line per gene. gene_id is the primary key, so
# inserting a gene ID that is already present raises sqlite3.IntegrityError.
con.cursor().execute("""CREATE TABLE IF NOT EXISTS gene_annotations(
                     gene_id text PRIMARY KEY,
                     genome_id text,
                     gene_genotype text,
                     annotation_source text,
                     gene_annotation text)""")

# Each row of the init table supplies three descriptive values followed by
# the annotation file name; they are written to the genome_id, gene_genotype
# and annotation_source columns in that order.
df = pd.read_csv("init/annotation_list.tsv", sep="\t")
for index, row in df.iterrows():
    # Use .iloc for positional access: plain row[n] on a label-indexed Series
    # relies on a deprecated fallback that newer pandas versions reject.
    species, genotype, source, file_name = row.iloc[:4]
    annot = pd.read_csv("inputs/annotations/"+file_name, sep="\t")
    annot = annot.drop_duplicates(subset="locusName")

    # The column holding the description line depends on the species
    if species == "Arabidopsis thaliana":
        defline_column = "rice-defline"
    elif species in ["Panicum hallii", "Pharus latifolius", "Solanum lycopersicum", "Vigna unguiculata"]:
        defline_column = "Best-hit-arabi-defline"
    else:
        defline_column = "arabi-defline"
    annot = annot[["locusName", defline_column]]

    # Place the constant columns between the gene ID and the description so
    # the frame's column order matches the INSERT column list below.
    annot.insert(1, 'species', species)
    annot.insert(2, 'genotype', genotype)
    annot.insert(3, 'source', source)
    con.cursor().executemany("""INSERT INTO gene_annotations(
                         gene_id, genome_id, gene_genotype,
                         annotation_source, gene_annotation) 
                         VALUES(?, ?, ?, ?, ?)""", annot.values.tolist())
con.commit()

# Insert gene symbols
<a class="anchor" id="symbols"></a>
* [Go back to section 2](#section2)

In [19]:
# Table mapping gene IDs to gene symbols within a genome
con.cursor().execute("""CREATE TABLE IF NOT EXISTS gene_symbols(
                     gene_id text,
                     genome_id text,
                     gene_symbol text)""")

# The first column of the init table is written to genome_id for every row of
# the symbols file named in the third column.
df = pd.read_csv("init/symbols_list.tsv", sep="\t")
for index, row in df.iterrows():
    # Use .iloc for positional access: plain row[n] on a label-indexed Series
    # relies on a deprecated fallback that newer pandas versions reject.
    annot = pd.read_csv("inputs/symbols/"+row.iloc[2], sep="\t")
    # Name the helper column after its target column; the previous name,
    # 'gene_id', raises ValueError when the symbols file already has a
    # column called gene_id.
    annot.insert(1, 'genome_id', row.iloc[0])
    con.cursor().executemany("""INSERT INTO gene_symbols(
                             gene_id, genome_id, gene_symbol) 
                             VALUES(?, ?, ?)""", annot.values.tolist())
con.commit()

# Insert RNA-seq Files
<a class="anchor" id="RNAseq"></a>
* [Go back to section 2](#section2)

(Note: This part will need more work and is not currently in use)

In [None]:
# Create the four RNA-seq bookkeeping tables (packages, studies, fastq, bam)
# if they are not present yet.
schema_cursor = con.cursor()

schema_cursor.execute("""CREATE TABLE IF NOT EXISTS packages(
                     rowid integer,
                     name text,
                     version text,
                     settings text)""")

schema_cursor.execute("""CREATE TABLE IF NOT EXISTS studies(
                     study_accession text, 
                     tax_id integer,
                     scientific_name text,
                     instrument_model text,
                     library_strategy text,
                     description text)""")

schema_cursor.execute("""CREATE TABLE IF NOT EXISTS fastq(
                     run_accession text,
                     study_accession text,
                     read_count integer,
                     sample_alias text,
                     fastq_ftp text,
                     fastq_md5 text,
                     compression integer)""")

schema_cursor.execute("""CREATE TABLE IF NOT EXISTS bam(
                     run_accession text,
                     study_accession text,
                     sample_alias text,
                     compression integer,
                     filter integer,
                     align integer,
                     sort integer)""")

def packages_insert(con, entities):
    """Insert one row into the ``packages`` table and commit.

    ``entities`` supplies the four values in column order:
    rowid, name, version, settings.
    """
    statement = """INSERT INTO packages(
                         rowid, 
                         name,
                         version,
                         settings) 
                         VALUES(?, ?, ?, ?)"""
    con.cursor().execute(statement, entities)
    con.commit()
    
def studies_insert(con, entities):
    """Insert one row into the ``studies`` table and commit.

    ``entities`` supplies the six values in column order: study_accession,
    tax_id, scientific_name, instrument_model, library_strategy, description.
    """
    statement = """INSERT INTO studies(
                         study_accession, 
                         tax_id, 
                         scientific_name,
                         instrument_model,
                         library_strategy,
                         description) 
                         VALUES(?, ?, ?, ?, ?, ?)"""
    con.cursor().execute(statement, entities)
    con.commit()

# https://stackoverflow.com/questions/18219779/bulk-insert-huge-data-into-sqlite-using-python
def fastq_insert(con, entities):
    """Insert one row into the ``fastq`` table and commit.

    ``entities`` supplies the seven values in column order: run_accession,
    study_accession, read_count, sample_alias, fastq_ftp, fastq_md5,
    compression.
    """
    statement = """INSERT INTO fastq(
                         run_accession, 
                         study_accession, 
                         read_count, 
                         sample_alias,  
                         fastq_ftp,
                         fastq_md5,
                         compression) 
                         VALUES(?, ?, ?, ?, ?, ?, ?)"""
    con.cursor().execute(statement, entities)
    con.commit()
    
def bam_insert(con, entities):
    """Insert one row into the ``bam`` table and commit.

    ``entities`` supplies the seven values in column order: run_accession,
    study_accession, sample_alias, compression, filter, align, sort.
    """
    statement = """INSERT INTO bam(
                         run_accession,
                         study_accession,
                         sample_alias,
                         compression,
                         filter,
                         align,
                         sort) 
                         VALUES(?, ?, ?, ?, ?, ?, ?)"""
    con.cursor().execute(statement, entities)
    con.commit()

# Load each RNA-seq TSV file row by row through its matching insert helper,
# in the order: packages, studies, fastq, bam.
omics_loaders = (
    ("inputs/omics/packages.tsv", packages_insert),
    ("inputs/omics/studies.tsv", studies_insert),
    ("inputs/omics/fastq.tsv", fastq_insert),
    ("inputs/omics/bam.tsv", bam_insert),
)
for tsv_path, insert_row in omics_loaders:
    df = pd.read_csv(tsv_path, sep="\t")
    for index, row in df.iterrows():
        insert_row(con, entities=list(row))

# SQNce Query Functions

In [None]:
# Parse tab-separated "chromosome<TAB>position" lines into a list of
# [chromosome, position] string pairs.
rows = """Chr2\t12345\nChr3\t54354\nChr2\t5234354\n""".split("\n")
coordinate_list = [row.split("\t") for row in rows]
# A trailing newline leaves one empty entry at the end; drop it
if coordinate_list[-1]==[""]:
    coordinate_list = coordinate_list[:-1]
for row in coordinate_list:
    if len(row) != 2:
        # The original `return(html.P(...))` is a SyntaxError outside a
        # function and `html` is not defined in this notebook; raise instead.
        raise ValueError("Number of columns is not 2. Use tab-seperated values.")


In [None]:
# Query to find neighboring genes
def get_SNP_neighbors(genotype, chromsome, coordinate, distance):
    """Return the genes that start or end within ``distance`` of a position.

    Args:
        genotype: value matched against the ``genome_id`` column of
            ``gene_coordinates`` (the column name used when the table is
            created in this notebook).
        chromsome: value matched against ``gene_chr``.
        coordinate: position on the chromosome (integer).
        distance: half-width of the search window around ``coordinate``.

    Returns:
        DataFrame of the matching ``gene_coordinates`` rows with a leading
        ``Query`` column holding ``"<chromosome>_<coordinate>"``.
    """
    window_start = coordinate - distance
    window_end = coordinate + distance
    # One SELECT with OR covers both gene ends and cannot return a row twice.
    # Values are bound as parameters instead of being formatted into the SQL.
    query = '''SELECT *
               FROM gene_coordinates
               WHERE genome_id = ?
               AND gene_chr = ?
               AND (gene_start BETWEEN ? AND ?
                    OR gene_end BETWEEN ? AND ?)'''
    con = sqlite3.connect('SQNce.db')
    try:
        df = pd.read_sql_query(
            query, con,
            params=(genotype, chromsome,
                    window_start, window_end, window_start, window_end))
    finally:
        # Always release the connection opened by this function
        con.close()
    df = df.drop_duplicates()
    # A scalar is broadcast to every row, independent of the frame's index
    df.insert(0, 'Query', "_".join([chromsome, str(coordinate)]))
    return df

# Collect the neighbouring genes of several example positions into one frame
df = pd.DataFrame()
for entity in [["Chr2", 19681637], ["Chr2", 1234564], ["Chr4", 1234564], ["Chr2", 19681638]]:
    chromosome_name, position = entity
    neighbors = get_SNP_neighbors("Arabidopsis", chromosome_name, position, 10000)
    df = pd.concat([df, neighbors])
#df.drop_duplicates(subset=["gene_id"]).reset_index()
#df1 = get_SNP_neighbors("Arabidopsis", "Chr2", 19681637, 10000)
#df1
#pd.concat([df1, df1])

In [None]:
# Query to find neighboring genes and annotate them

# Look up the annotation of every gene in df and attach it as a new column
neighbor_gene_ids = df["gene_id"].to_list()
df["annotation"] = annotation_select(con, neighbor_gene_ids)
df

In [None]:
def show_available_species(con):
    """Return every row of the ``species`` table as a DataFrame.

    Args:
        con: open SQLite connection to query. When ``None``, a connection to
            ``SQNce.db`` is opened for the call and closed afterwards.

    NOTE(review): no cell visible in this notebook creates a ``species``
    table; confirm it exists in the database before relying on this.
    """
    if con is not None:
        # Use the caller's connection instead of shadowing it with a new,
        # never-closed one.
        return pd.read_sql_query("SELECT * FROM species", con)
    own_con = sqlite3.connect('SQNce.db')
    try:
        return pd.read_sql_query("SELECT * FROM species", own_con)
    finally:
        own_con.close()

# Display the contents of the species table
show_available_species(con)

In [None]:
# Scratch lookup of a single gene's annotation.
# NOTE(review): `entity` and `od` are not defined in this cell; `entity` is
# only available as the leftover loop variable of an earlier cell and `od` is
# not defined at module level anywhere in this notebook. Confirm before running.
cursorObj = con.cursor()
cursorObj.execute('''SELECT gene_id, gene_annotation 
                     FROM gene_annotations 
                     WHERE gene_id =  ?  ''', (entity,))
# (name,) - need the comma to treat it as a single item and not list of letters
# fetchall()[0] raises IndexError when the gene ID is not in the table
selected = cursorObj.fetchall()[0]
od[selected[0]] = selected[1]

In [None]:
# TODO add documentation to SQNce queries

def protein_seq_select(con, entity_list):
    """Write the protein sequences of the given IDs to ``selected.fasta``.

    Args:
        con: open SQLite connection holding the ``protein_seqs`` table.
        entity_list: protein IDs to look up; IDs that are not in the table
            are skipped.

    The file is written once, after all lookups, and only when at least one
    sequence was found. Nothing is returned.
    """
    od = OrderedDict()
    cursorObj = con.cursor()
    for entity in entity_list:
        # Column names follow the protein_seqs schema created in this notebook
        # (protein_id, protein_sequence).
        cursorObj.execute('''SELECT protein_id, protein_sequence
                             FROM protein_seqs
                             WHERE protein_id = ?''', (entity,))
        # (entity,) - the trailing comma binds the ID as one parameter
        selected = cursorObj.fetchone()
        if selected is None:
            continue
        # Sequences are stored zlib-compressed; restore the plain text
        sequence = zlib.decompress(selected[1]).decode(encoding='UTF-8')
        od[selected[0]] = SeqRecord(Seq(sequence), id=selected[0], name="", description="")
    if od:
        with open("selected.fasta", 'w') as handle:
            SeqIO.write(od.values(), handle, 'fasta')

In [None]:
def annotation_select(con, entity_list):
    """Return one annotation string per gene ID, in input order.

    Each ID is looked up in ``gene_annotations``; an ID with no matching row
    yields the string "Gene not found".
    """
    query = '''SELECT gene_id, gene_annotation 
                             FROM gene_annotations 
                             WHERE gene_id =  ?  '''
    annotations = []
    for gene in entity_list:
        # One-element tuple so the ID binds as a single parameter
        matches = con.cursor().execute(query, (gene,)).fetchall()
        annotations.append(matches[0][1] if matches else "Gene not found")
    return annotations

# Example: annotate a list of gene IDs (some of which may be absent from the
# database) and show the result as a two-column table.
con = sqlite3.connect('SQNce.db')
gene_list = ["Zm00001d010294", "dsa", "Sobic.002G128101", "AT2G34360", "Sobic.002G128101", "002G128101"]
found_annotations = annotation_select(con, gene_list)
df = pd.DataFrame({"name": gene_list, "annotation": found_annotations})
df = df.rename(columns={"name": "GeneID"})
df
#pd.DataFrame.from_dict(annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"], "dict"), 
#                       orient="index", columns=["annotation"])


In [None]:
# Example call: returns the list of annotations for three gene IDs
annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"])

In [None]:
# annotation_select returns a plain list, which DataFrame.from_dict cannot
# consume; pair the IDs with their annotations explicitly instead.
test_gene_ids = ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"]
test = pd.DataFrame({"GeneID": test_gene_ids,
                     "annotation": annotation_select(con, test_gene_ids)})
test

# Example Queries

In [None]:
# Template cell: open the database, export the protein sequences of a gene
# list to selected.fasta, then close the connection.
con = sqlite3.connect('SQNce.db')
# Use the above query functions to parse SQNce with your gene lists 
# NOTE(review): `your_gene_list` is a placeholder that is not defined in this
# notebook; replace it with your own IDs before running.
input_value = [your_gene_list]
protein_seq_select(con, input_value)
con.close()

In [None]:
# Scratch example: turn [label, value] pairs into a list of label/value dicts
[{ 'label': label, 'value': val} for label, val in [[1,2], [2,3]]]

In [None]:
def distinct_db_vals(db, table, column, custom_vals=None, return_ls=False):
    """Return the distinct values of one column of a database table.

    Args:
        db: path of the SQLite database file.
        table: table to read from.
        column: column whose distinct values are returned.
        custom_vals: optional nested list of ``[label, value]`` pairs placed
            in front of the database values (ignored when ``return_ls`` is
            true).
        return_ls: when true, return the bare list of values instead of
            label/value dictionaries.

    Returns:
        A list of ``{'label': ..., 'value': ...}`` dicts, or a plain list of
        values when ``return_ls`` is true.

    NOTE: ``table`` and ``column`` are formatted into the SQL text because
    identifiers cannot be bound as parameters; pass trusted names only.
    """
    # None instead of a mutable [] default; an absent argument means "no extras"
    ls = [{'label': label, 'value': val} for label, val in (custom_vals or [])]
    con = sqlite3.connect(db)
    try:
        distinct_df = pd.read_sql_query('''SELECT DISTINCT {0} 
                                       FROM {1}'''.format(column, table), con)
    finally:
        # Always release the connection opened by this function
        con.close()
    values = distinct_df[column].to_list()
    if return_ls:
        return values
    ls.extend({'label': name, 'value': name} for name in values)
    return ls
# Example call: with return_ls=True the custom values are not included.
# NOTE(review): gene_coordinates is created in this notebook with a genome_id
# column, not genotype_id; confirm the column name.
distinct_db_vals("SQNce.db", "gene_coordinates", "genotype_id",[[1,2], [2,3]], True)

In [None]:
def family_gene_select(gene_list):
    """Return the gene family name of each gene, in input order.

    Each ID in ``gene_list`` is looked up as ``protein_id`` in the
    ``gene_families`` table of ``SQNce.db``. An ID with no matching row
    yields the string "Gene not found"; when several rows match, the first
    one returned by the database is used.
    """
    con = sqlite3.connect("SQNce.db")
    try:
        cursorObj = con.cursor()
        ls = []
        for gene in gene_list:
            cursorObj.execute('''SELECT protein_id, family_name 
                                FROM gene_families
                                WHERE protein_id =  ? ''', (gene,))
            selected = cursorObj.fetchall()
            ls.append(selected[0][1] if selected else "Gene not found")
        return ls
    finally:
        # Always release the connection opened by this function
        con.close()
# Example call with three IDs; IDs without a family row give "Gene not found"
family_gene_select(["Seita.5G010100", "test", "Zm00001d011673"])

In [None]:
# Select the proteins of the listed species that belong to the listed families
con = sqlite3.connect("SQNce.db") # deploy with this
species_names = ['Zea mays', "dsa", 'Setaria italica']
family_names = ['Terpenoid synthases', 'Cytochrome P450']
# Build one "?" placeholder per value and bind the values as parameters, so
# names containing quotes cannot break (or alter) the SQL statement.
family_query = """SELECT protein_id, family_name 
                  FROM gene_families
                  WHERE species_id IN ({0}) AND family_name IN ({1})""".format(
    ",".join("?" * len(species_names)),
    ",".join("?" * len(family_names)))
df = pd.read_sql_query(family_query, con, params=species_names + family_names)
df

In [None]:
# Select all best blast hits whose `selected` column matches one of the IDs
selected = "subject_id"
# Three separate IDs: a misplaced quote previously fused the last two into
# the single string "Zm00001d014134, Zm00001d014136".
entity_list = ["Zm00001d014121", "Zm00001d014134", "Zm00001d014136"]
con = sqlite3.connect("SQNce.db") # deploy with this
# The column name is formatted in (identifiers cannot be bound); the IDs are
# bound as parameters through one "?" placeholder each.
bbh_query = """SELECT * 
               FROM BBHs
               WHERE {0} IN ({1})""".format(selected, ",".join("?" * len(entity_list)))
df = pd.read_sql_query(bbh_query, con, params=entity_list)
df

In [None]:
# This is a stupid function but it seems to work correctly.
# Keep the highest-scoring hit per subject gene and query genome: sort by bit
# score (descending) and take the first row of every group. The grouping
# column is query_genome, as defined by the BBHs table schema.
(df.sort_values(['bit_score'], ascending=False)
   .groupby(["subject_id", "query_genome"])
   .agg({"bit_score": "first", "query_id": "first"})
   .reset_index())

In [None]:
# Keep only the hits whose query genome is in the given list; the column is
# named query_genome in the BBHs table schema.
df[df["query_genome"].isin(["B97", "dsa"])]

In [None]:
# Load the table of BBH genome combinations
bbh_combs_path = os.path.join("init/BBHs_combs.tsv")
test = pd.read_csv(bbh_combs_path, sep="\t")
