In [1]:
import os
import gzip
import sqlite3
import zlib
from sqlite3 import Error
import pandas as pd
from collections import OrderedDict

from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# SQNce Creation Functions

In [2]:
# TODO add documentation to all SQNce creation functions

# Establish connection with SQNce.db, generating a new SQLite3 database if needed
def sql_connection():
    try:
        con = sqlite3.connect('SQNce.db')
        print("Connection established.")
        return(con)
    except Error:
        print(Error)

# After establishing connection with SQNce create the specified tables
def sql_table(con):
    cursorObj = con.cursor()
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS species(
                         species_name text PRIMARY KEY, 
                         common_name text) 
                         WITHOUT ROWID""")
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS genotypes(
                         genotype_id text PRIMARY KEY, 
                         species_name text, 
                         genotype_name text,
                         FOREIGN KEY (species_name) REFERENCES species(species_name)) 
                         WITHOUT ROWID""")
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS gene_coordinates(
                         gene_id text,
                         genotype_id text,
                         gene_chr text,
                         gene_start integer,
                         gene_end integer,
                         gene_orientation text,
                         FOREIGN KEY (genotype_id) REFERENCES genotypes(genotype_id)) 
                         """)

    cursorObj.execute("""CREATE TABLE IF NOT EXISTS protein_seqs(
                         protein_isoform text,
                         protein_id text,
                         species_id text,
                         genotype_id text,
                         protein_length text,
                         protein_sequence blob,
                         FOREIGN KEY (genotype_id) REFERENCES genotypes(genotype_id)) 
                         """)
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS gene_families(
                         protein_id text,
                         species_id text,
                         genotype_id text,
                         source_id text,
                         family_id text,
                         family_name text,
                         FOREIGN KEY (genotype_id) REFERENCES genotypes(genotype_id)) 
    """)
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS promoter_seqs(
                         protein_id text,
                         genotype_id text,
                         promoter_kind text,
                         promoter_length text,
                         promoter_sequence blob,
                         FOREIGN KEY (genotype_id) REFERENCES genotypes(genotype_id)) 
                         """)
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS gene_annotations(
                         gene_id text PRIMARY KEY,
                         gene_species text,
                         gene_genotype text,
                         annotation_source text,
                         gene_annotation text,
                         FOREIGN KEY (gene_genotype) REFERENCES genotypes(genotype_id)) 
                         WITHOUT ROWID""")
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS packages(
                         rowid integer PRIMARY KEY,
                         name text,
                         version text,
                         settings text) 
                         WITHOUT ROWID""")
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS studies(
                         study_accession text PRIMARY KEY, 
                         tax_id integer,
                         scientific_name text,
                         instrument_model text,
                         library_strategy text,
                         description text)
                         WITHOUT ROWID""")

    cursorObj.execute("""CREATE TABLE IF NOT EXISTS fastq(
                         run_accession text PRIMARY KEY,
                         study_accession text,
                         read_count integer,
                         sample_alias text,
                         fastq_ftp text,
                         fastq_md5 text,
                         compression integer,
                         FOREIGN KEY (study_accession) REFERENCES studies(study_accession)) 
                         WITHOUT ROWID""")
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS bam(
                         run_accession text,
                         study_accession text,
                         sample_alias text,
                         compression integer,
                         filter integer,
                         align integer,
                         sort integer,
                         FOREIGN KEY (run_accession) REFERENCES fastq(run_accession)) 
                         """)

    con.commit()

In [3]:
# Current implementation requires re-parsing of all the input files to create SQNce
# TODO SQNce update functions to parse input data only if not previously included 
if os.path.exists("SQNce.db"): os.remove("SQNce.db")
con = sql_connection()
sql_table(con)

Connection established.


# SQNce Data Input Functions

In [4]:
# TODO add documentation to all SQNce Data Input Functions

def species_insert(con, entities):
    cursorObj = con.cursor()
    cursorObj.execute("""INSERT INTO species(
                         species_name, common_name) 
                         VALUES(?, ?)""", entities)
    con.commit()
    
def genotype_insert(con, entities):
    cursorObj = con.cursor()
    cursorObj.execute("""INSERT INTO genotypes(
                         genotype_id, 
                         species_name, 
                         genotype_name) 
                         VALUES(?, ?, ?)""", entities)
    con.commit()

# https://stackoverflow.com/questions/18219779/bulk-insert-huge-data-into-sqlite-using-python
def gene_coordinates_insert(con, entity_list):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO gene_coordinates(
                         gene_id,
                         genotype_id,
                         gene_chr, 
                         gene_start,  
                         gene_end,
                         gene_orientation) 
                         VALUES(?, ?, ?, ?, ?, ?)""", entity_list)
    con.commit()

    
    
# https://stackoverflow.com/questions/18219779/bulk-insert-huge-data-into-sqlite-using-python
def protein_seq_insert(con, entity_list):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO protein_seqs(
                         protein_isoform,
                         protein_id,
                         species_id,
                         genotype_id, 
                         protein_length,  
                         protein_sequence) 
                         VALUES(?, ?, ?, ?, ?, ?)""", entity_list)
    con.commit()
    
def family_annotation_insert(con, entity_list):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO gene_families(
                         protein_id,
                         species_id,
                         genotype_id,
                         source_id,
                         family_id,  
                         family_name) 
                         VALUES(?, ?, ?, ?, ?, ?)""", entity_list)
    con.commit()
    
# https://stackoverflow.com/questions/18219779/bulk-insert-huge-data-into-sqlite-using-python
def promoter_seq_insert(con, entity_list):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO promoter_seqs(
                         protein_id,
                         genotype_id,
                         promoter_kind,
                         promoter_length,  
                         promoter_sequence) 
                         VALUES(?, ?, ?, ?, ?)""", entity_list)
    con.commit()

    
def gene_annotation_insert(con, entity_list):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO gene_annotations(
                         gene_id,
                         gene_species,
                         gene_genotype,
                         annotation_source,
                         gene_annotation) 
                         VALUES(?, ?, ?, ?, ?)""", entity_list)
    con.commit()

def packages_insert(con, entities):
    cursorObj = con.cursor()
    cursorObj.execute("""INSERT INTO packages(
                         rowid, 
                         name,
                         version,
                         settings) 
                         VALUES(?, ?, ?, ?)""", entities)
    con.commit()
    
def studies_insert(con, entities):
    cursorObj = con.cursor()
    cursorObj.execute("""INSERT INTO studies(
                         study_accession, 
                         tax_id, 
                         scientific_name,
                         instrument_model,
                         library_strategy,
                         description) 
                         VALUES(?, ?, ?, ?, ?, ?)""", entities)
    con.commit()

# https://stackoverflow.com/questions/18219779/bulk-insert-huge-data-into-sqlite-using-python
def fastq_insert(con, entities):
    cursorObj = con.cursor()
    cursorObj.execute("""INSERT INTO fastq(
                         run_accession, 
                         study_accession, 
                         read_count, 
                         sample_alias,  
                         fastq_ftp,
                         fastq_md5,
                         compression) 
                         VALUES(?, ?, ?, ?, ?, ?, ?)""", entities)
    con.commit()
    
def bam_insert(con, entities):
    cursorObj = con.cursor()
    cursorObj.execute("""INSERT INTO bam(
                         run_accession,
                         study_accession,
                         sample_alias,
                         compression,
                         filter,
                         align,
                         sort) 
                         VALUES(?, ?, ?, ?, ?, ?, ?)""", entities)
    con.commit()

In [5]:
# SQNce database is initiated using predifined TSV files
# TSV either contain the input data or reference input files to parse
# TODO add documentation to SQNce data input parsing

df = pd.read_csv("init/species.tsv", sep="\t")
for index, row in df.iterrows():
    species_insert(con, entities=list(row))

df = pd.read_csv("init/genotypes.tsv", sep="\t")
for index, row in df.iterrows():
    genotype_insert(con, entities=list(row))

#######################################################################
############################ Coordinates ##############################
#######################################################################
df = pd.read_csv("init/coordinates.tsv", sep="\t")
for index, row in df.iterrows():
    gff3_file = gzip.open("inputs/gff3/"+row[3], mode='rt')
    gene_coordinate_list = []
    for gene in gff3_file:
        gene = gene.split("\t")
        if len(gene) == 1:
            # skip the first row
            continue
        if gene[2] == "gene":
            gene_id = gene[-1].split(";")
            gene_id = [i for i in gene_id if i.startswith('Name=')][0].replace('Name=', '').replace('\n', '') # delete \n if exists
            # Append list of: gene ID, genotype, chromsome, start, end, orientation
            gene_coordinate_list.append([gene_id, row[1], gene[0], gene[3], gene[4], gene[6]])
        else:
            continue
    gene_coordinates_insert(con, gene_coordinate_list)    

# Create a secondary key on the name column
cursorObj = con.cursor()
cursorObj.execute("CREATE INDEX coordinate_index_start ON gene_coordinates(genotype_id, gene_chr, gene_start)")
cursorObj.execute("CREATE INDEX coordinate_index_end ON gene_coordinates(genotype_id, gene_chr, gene_end)")
con.commit()

#######################################################################
########################### Protein Seqs ##############################
#######################################################################
df = pd.read_csv("init/proteins.tsv", sep="\t")
for index, row in df.iterrows():
    fasta_file = gzip.open("inputs/proteins/"+row[2], mode='rt')
    protein_seq_list = []
    for seq in SeqIO.parse(fasta_file, "fasta"):
        # Extract the locus ID that doesn't contain the isoform
        locus = seq.description.split(" ")
        for ls in locus:
            if "locus" in ls:
                locus = ls[6:]
        # Protein sequences are saved as as a Binary data type for compression
        protein_seq_list.append([seq.id, locus, row[0], row[1], len(seq.seq), 
                                 sqlite3.Binary(zlib.compress(str(seq.seq).encode('utf-8')))])
    protein_seq_insert(con, protein_seq_list)
# Create keys for protein IDs with isoform and without
cursorObj = con.cursor()
cursorObj.execute("CREATE INDEX proteins_isoforms_index ON protein_seqs(protein_isoform)")
cursorObj.execute("CREATE INDEX proteins_ids_index ON protein_seqs(protein_id)")
con.commit()    

#######################################################################
########################### Family Annotations ##############################
#######################################################################
df = pd.read_csv("init/families.tsv", sep="\t")
for index, row in df.iterrows():
    annot = pd.read_csv("inputs/families/"+row[3], sep="\t")
    annot.insert(1, 'species', row[0])
    annot.insert(2, 'genotype', row[1])
    annot.insert(3, 'source', row[2])
    family_annotation_insert(con, annot.values.tolist())
# Create keys for protein IDs with isoform and without
cursorObj = con.cursor()
cursorObj.execute("CREATE INDEX family_gene_index ON gene_families(protein_id)")
#cursorObj.execute("CREATE INDEX family_species_index ON gene_families(species_id)")
#cursorObj.execute("CREATE INDEX family_genotype_index ON gene_families(genotype_id)")
#cursorObj.execute("CREATE INDEX family_id_index ON gene_families(family_id)")
cursorObj.execute("CREATE INDEX family_name_index ON gene_families(family_name)")
cursorObj.execute("CREATE INDEX family_genotype_name_index ON gene_families(genotype_id, family_name)")
con.commit()    



#######################################################################
########################### Promoter Seqs #############################
#######################################################################
df = pd.read_csv("init/promoters.tsv", sep="\t")
for index, row in df.iterrows():
    fasta_file = gzip.open("inputs/promoters/"+row[5]+"_1kb_ATG.fasta.gz", mode='rt')
    promoter_seq_list = []
    for seq in SeqIO.parse(fasta_file, "fasta"):
        # TODO change db to number key to avoid duplicated name problems
        if row[0] == "ZmB73v3": 
            seq.id = seq.id + "v3"
        # Protein sequences are saved as as a Binary data type for compression
        promoter_seq_list.append([seq.id, row[0], "ATG", row[4], 
                                 sqlite3.Binary(zlib.compress(str(seq.seq).encode('utf-8')))])
    promoter_seq_insert(con, promoter_seq_list)
    
    fasta_file = gzip.open("inputs/promoters/"+row[5]+"_1kb_TSS.fasta.gz", mode='rt')
    promoter_seq_list = []
    for seq in SeqIO.parse(fasta_file, "fasta"):
        # TODO change db to number key to avoid duplicated name problems
        if row[0] == "ZmB73v3": 
            seq.id = seq.id + "v3"
        # Protein sequences are saved as as a Binary data type for compression
        promoter_seq_list.append([seq.id, row[0], "TSS", row[4], 
                                 sqlite3.Binary(zlib.compress(str(seq.seq).encode('utf-8')))])
    promoter_seq_insert(con, promoter_seq_list)
    
# Create a secondary key on the name column
cursorObj = con.cursor()
cursorObj.execute("CREATE INDEX promoter_index ON promoter_seqs(protein_id, promoter_kind)")
con.commit()

#######################################################################
############################ Annotations ##############################
#######################################################################

df = pd.read_csv("init/annotation_list.tsv", sep="\t")
# Every element is: gene_id, gene_species, gene_genotype, gene_annotation, annotation_source
gene_annotation_list = []
for index, row in df.iterrows():
    annot = pd.read_csv("inputs/annotations/"+row[3], sep="\t")
    annot = annot.drop_duplicates(subset="locusName")

    if row[0]=="Arabidopsis thaliana":
        annot = annot[["locusName", "rice-defline"]]
    elif row[0] in ["Panicum hallii", "Pharus latifolius", "Solanum lycopersicum", "Vigna unguiculata"]:
        annot = annot[["locusName", "Best-hit-arabi-defline"]]        
    else:
        annot = annot[["locusName", "arabi-defline"]]
    annot.insert(1, 'species', row[0])
    annot.insert(2, 'genotype', row[1])
    annot.insert(3, 'source', row[2])
    gene_annotation_insert(con, annot.values.tolist())

#######################################################################
########################### RNA-seq Files #############################
#######################################################################
    

df = pd.read_csv("inputs/omics/packages.tsv", sep="\t")
for index, row in df.iterrows():
    packages_insert(con, entities=list(row))

df = pd.read_csv("inputs/omics/studies.tsv", sep="\t")
for index, row in df.iterrows():
    studies_insert(con, entities=list(row))

df = pd.read_csv("inputs/omics/fastq.tsv", sep="\t")
for index, row in df.iterrows():
    fastq_insert(con, entities=list(row))

df = pd.read_csv("inputs/omics/bam.tsv", sep="\t")
for index, row in df.iterrows():
    bam_insert(con, entities=list(row))

In [6]:
annot

Unnamed: 0,gene,species,genotype,source,SSF,eval,description
0,Zm00001d020134,Zea mays,B73,SUPERFAMILY,37958,0.0,P-loop containing nucleoside triphosphate hydr...
1,Zm00001d020134,Zea mays,B73,SUPERFAMILY,37958,0.0,P-loop containing nucleoside triphosphate hydr...
2,Zm00001d021647,Zea mays,B73,SUPERFAMILY,37958,0.0,P-loop containing nucleoside triphosphate hydr...
3,Zm00001d021647,Zea mays,B73,SUPERFAMILY,37958,0.0,P-loop containing nucleoside triphosphate hydr...
4,Zm00001d020208,Zea mays,B73,SUPERFAMILY,37958,0.0,P-loop containing nucleoside triphosphate hydr...
...,...,...,...,...,...,...,...
36867,Zm00001d013760,Zea mays,B73,SUPERFAMILY,46406,0.0,ENTH/VHS domain
36868,Zm00001d017951,Zea mays,B73,SUPERFAMILY,49418,0.0,Nudix
36869,Zm00001d012826,Zea mays,B73,SUPERFAMILY,54271,0.0,Ribonuclease PH domain 2-like
36870,Zm00001d013151,Zea mays,B73,SUPERFAMILY,51412,0.0,Lipocalins


# SQNce Query Functions

In [27]:
rows = """Chr2\t12345\nChr3\t54354\nChr2\t5234354\n""".split("\n")
coordinate_list = [row.split("\t") for row in rows]
if coordinate_list[-1]==[""]:
    coordinate_list = coordinate_list[:-1]
for row in coordinate_list:
    if len(row) != 2:
        return(html.P("Number of columns is not 2. Use tab-seperated values."))


SyntaxError: 'return' outside function (<ipython-input-27-5da10337c74d>, line 7)

In [38]:
# Query to find neighboring genes
def get_SNP_neighbors(genotype, chromsome, coordinate, distance):
    con = sqlite3.connect('SQNce.db')
    cursorObj = con.cursor()
    df = pd.read_sql_query('''SELECT * 
                     FROM gene_coordinates 
                     WHERE genotype_id = "{0}"
                     AND gene_chr = "{1}"
                     AND gene_start BETWEEN {2} AND {3}
                     
                     UNION ALL
                     
                     SELECT * 
                     FROM gene_coordinates 
                     WHERE genotype_id = "{0}"
                     AND gene_chr = "{1}"
                     AND gene_start BETWEEN {2} AND {3}
                     '''.format(genotype, chromsome, coordinate-distance, coordinate+distance), con)
    # Should check why it returns the same row twice, probably need to correct the query
    df = df.drop_duplicates()
    df.insert(0, 'Query', pd.Series(["_".join([chromsome, str(coordinate)]) for x in range(len(df.index))]))
    return(df)

df = pd.DataFrame()
for entity in [["Chr2", 19681637], ["Chr2", 1234564], ["Chr4", 1234564], ["Chr2", 19681638]]:
      df = pd.concat([df, get_SNP_neighbors("Arabidopsis", entity[0], entity[1], 10000)])  
#df.drop_duplicates(subset=["gene_id"]).reset_index()
#df1 = get_SNP_neighbors("Arabidopsis", "Chr2", 19681637, 10000)
#df1
#pd.concat([df1, df1])

In [39]:
# Query to find neighboring genes and annotate them

df["annotation"] = annotation_select(con, df["gene_id"].to_list())
df

Unnamed: 0,Query,gene_id,genotype_id,gene_chr,gene_start,gene_end,gene_orientation,annotation
0,Chr2_19681637,AT2G48110,Arabidopsis,Chr2,19673293,19679711,+,"structural constituent of ribosome, putative, ..."
1,Chr2_19681637,AT2G48120,Arabidopsis,Chr2,19679730,19681546,+,"PAC, putative, expressed"
2,Chr2_19681637,AT2G48121,Arabidopsis,Chr2,19681637,19682391,+,"chloroplast ribonuclease III domain protein, p..."
3,Chr2_19681637,AT2G48130,Arabidopsis,Chr2,19685066,19685993,-,LTPL78 - Protease inhibitor/seed storage/LTP f...
4,Chr2_19681637,AT2G48140,Arabidopsis,Chr2,19686409,19687664,+,LTPL99 - Protease inhibitor/seed storage/LTP f...
5,Chr2_19681637,AT2G48150,Arabidopsis,Chr2,19687962,19689173,-,glutathione peroxidase domain containing prote...
6,Chr2_19681637,AT2G48160,Arabidopsis,Chr2,19689409,19696821,-,"PWWP domain containing protein, expressed"
0,Chr2_1234564,AT2G03955,Arabidopsis,Chr2,1238677,1239071,+,
0,Chr4_1234564,AT4G02770,Arabidopsis,Chr4,1229111,1229945,-,"photosystem I reaction center subunit II, chlo..."
1,Chr4_1234564,AT4G02780,Arabidopsis,Chr4,1237767,1244813,-,"ent-kaurene synthase, chloroplast precursor, p..."


In [9]:
def show_available_species(con):
    con = sqlite3.connect('SQNce.db')
    cursorObj = con.cursor()
    return(pd.read_sql_query("SELECT * FROM species", con))

show_available_species(con)

Unnamed: 0,species_name,common_name
0,Arabidopsis halleri,Ahalleri
1,Arabidopsis lyrata,Alyrata
2,Arabidopsis thaliana,Arabidopsis
3,Brassica oleracea,Boleracea
4,Brassica rapa,Brapa
5,Glycine max,Soybean
6,Hordeum vulgare,Barley
7,Medicago truncatula,Barrel medic
8,Oropetium thomaeum,Resurrection grass
9,Oryza sativa,Rice


In [None]:
cursorObj = con.cursor()
cursorObj.execute('''SELECT gene_id, gene_annotation 
                     FROM gene_annotations 
                     WHERE gene_id =  ?  ''', (entity,))
# (name,) - need the comma to treat it as a single item and not list of letters
selected = cursorObj.fetchall()[0]
od[selected[0]] = selected[1]

In [None]:
# TODO add documentation to SQNce queries

def protein_seq_select(con, entity_list):
    od = OrderedDict()
    for entity in entity_list:
        cursorObj = con.cursor()
        cursorObj.execute('''SELECT protein_variant, gene_annotation 
                             FROM protein_seqs 
                             WHERE protein_variant =  ?  ''', (entity,))
        # (name,) - need the comma to treat it as a single item and not list of letters
        selected = cursorObj.fetchall()[0]
        record = SeqRecord(Seq(zlib.decompress(selected[1]).decode(encoding='UTF-8')), 
                           id=selected[0], name="", description="")
        od[selected[0]] = record
        with open("selected.fasta", 'w') as handle:
            SeqIO.write(od.values(), handle, 'fasta')

In [36]:
def annotation_select(con, entity_list):
    ls = []
    for entity in entity_list:
        cursorObj = con.cursor()
        cursorObj.execute('''SELECT gene_id, gene_annotation 
                             FROM gene_annotations 
                             WHERE gene_id =  ?  ''', (entity,))
        # (name,) - need the comma to treat it as a single item and not list of letters
        selected = cursorObj.fetchall()
        if selected == []:
            ls.append("Gene not found")
        else:
            ls.append(selected[0][1])    
    return(ls)

con = sqlite3.connect('SQNce.db')
gene_list = ["Zm00001d010294", "dsa", "Sobic.002G128101", "AT2G34360", "Sobic.002G128101", "002G128101"]
df = pd.DataFrame({"name": gene_list, "annotation": annotation_select(con, gene_list) })
df.columns = ["GeneID", "annotation"]
df
#pd.DataFrame.from_dict(annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"], "dict"), 
#                       orient="index", columns=["annotation"])


Unnamed: 0,GeneID,annotation
0,Zm00001d010294,Ubiquitin-associated/translation elongation fa...
1,dsa,Gene not found
2,Sobic.002G128101,protein kinase family protein
3,AT2G34360,"MATE efflux family protein, putative, expressed"
4,Sobic.002G128101,protein kinase family protein
5,002G128101,Gene not found


In [33]:
annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"])

['Ubiquitin-associated/translation elongation factor EF1B protein', 'Gene not found', 'protein kinase family protein', 'MATE efflux family protein, putative, expressed']


['Ubiquitin-associated/translation elongation factor EF1B protein',
 'Gene not found',
 'protein kinase family protein',
 'MATE efflux family protein, putative, expressed']

In [47]:
test = pd.DataFrame.from_dict(annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"]), 
                              orient="index").reset_index()
test.columns = ["GeneID", "annotation"]
test 

Unnamed: 0,GeneID,annotation
0,Zm00001d010294,Ubiquitin-associated/translation elongation fa...
1,Sobic.002G128101,protein kinase family protein
2,AT2G34360,"MATE efflux family protein, putative, expressed"


# Example Queries

In [None]:
con = sqlite3.connect('SQNce.db')
# Use the above query functions to parse SQNce with your gene lists 
input_value = [your_gene_list]
protein_seq_select(con, input_value)
con.close()

In [14]:
[1,2,3] + [3,4,5,6]

[1, 2, 3, 3, 4, 5, 6]

In [7]:
[{ 'label': label, 'value': val} for label, val in [[1,2], [2,3]]]

[{'label': 1, 'value': 2}, {'label': 2, 'value': 3}]

In [9]:
def distinct_db_vals(db, table, column, custom_vals=[]):
    # Input is the column to select and from which table
    # Returns a list of all values in a specific table from SQNce.db
    # Custom vals are added to the front using nested list of [label, value]
    ls = [{ 'label': label, 'value': val} for label, val in custom_vals]
    con = sqlite3.connect(db) # deploy with this
    cursorObj = con.cursor()
    distinct_df = pd.read_sql_query('''SELECT DISTINCT {0} 
                                       FROM {1}'''.format(column, table), con)
    for name in distinct_df[column]:
        ls.append({'label': name, 'value': name})
    return(ls)
distinct_db_vals("SQNce.db", "gene_coordinates", "genotype_id",[[1,2], [2,3]])

[{'label': 1, 'value': 2},
 {'label': 2, 'value': 3},
 {'label': 'Arabidopsis', 'value': 'Arabidopsis'},
 {'label': 'B73v2', 'value': 'B73v2'},
 {'label': 'B73v4', 'value': 'B73v4'},
 {'label': 'Gmax', 'value': 'Gmax'},
 {'label': 'Sorghum', 'value': 'Sorghum'}]

In [49]:
def family_gene_select(gene_list):
    # Use an input list of genes to find their family assignments
    con = sqlite3.connect("SQNce.db")
    ls = []
    for gene in gene_list:
        cursorObj = con.cursor()
        cursorObj.execute('''SELECT protein_id, family_name 
                            FROM gene_families
                            WHERE protein_id =  ? ''', (gene,))
        selected = cursorObj.fetchall()
        if selected == []:
            ls.append("Gene not found")
        else:
            ls.append(selected[0][1])    
    return(ls)
family_gene_select(["Seita.5G010100", "test", "Zm00001d011673"])

['Cytochrome P450', 'Gene not found', 'Terpenoid synthases']

In [45]:
con = sqlite3.connect("SQNce.db") # deploy with this
cursorObj = con.cursor()
genotype = str("','".join(['Zea mays', "dsa",'Setaria italica']))
family = str("','".join(['Terpenoid synthases', 'Cytochrome P450']))
df = pd.read_sql_query("""SELECT protein_id, family_name 
                                   FROM gene_families
                                   WHERE species_id IN ('{0}') AND family_name IN ('{1}')""".format(genotype, family), con)
df

Unnamed: 0,protein_id,family_name
0,Seita.5G085000,Cytochrome P450
1,Seita.5G010200,Cytochrome P450
2,Seita.5G010100,Cytochrome P450
3,Seita.9G408200,Cytochrome P450
4,Seita.9G011300,Cytochrome P450
...,...,...
833,Zm00001d011673,Terpenoid synthases
834,Zm00001d009431,Terpenoid synthases
835,Zm00001d050159,Terpenoid synthases
836,Zm00001d014367,Terpenoid synthases
