In [1]:
import os
import gzip
import sqlite3
import zlib
from sqlite3 import Error
import pandas as pd
from collections import OrderedDict

from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# SQNce Creation Functions

In [3]:
# TODO add documentation to all SQNce creation functions

# Establish connection with SQNce.db, generating a new SQLite3 database if needed
def sql_connection():
    try:
        con = sqlite3.connect('SQNce-proteomes.db')
        print("Connection established.")
        return(con)
    except Error:
        print(Error)

# After establishing connection with SQNce create the specified tables
def sql_table(con):
    cursorObj = con.cursor()
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS protein_seqs(
                         protein_id text,
                         species_id text,
                         genotype_id text,
                         protein_length text,
                         protein_sequence blob,
                         FOREIGN KEY (genotype_id) REFERENCES genotypes(genotype_id)) 
                         """)   
    con.commit()

In [4]:
# Current implementation requires re-parsing of all the input files to create SQNce
# TODO SQNce update functions to parse input data only if not previously included 
if os.path.exists("SQNce-proteomes.db"): os.remove("SQNce-proteomes.db")
con = sql_connection()
sql_table(con)

Connection established.


# SQNce Data Input Functions

In [5]:
# TODO add documentation to all SQNce Data Input Functions
    
# https://stackoverflow.com/questions/18219779/bulk-insert-huge-data-into-sqlite-using-python
def protein_seq_insert(con, entity_list):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO protein_seqs(
                         protein_id,
                         species_id,
                         genotype_id, 
                         protein_length,  
                         protein_sequence) 
                         VALUES(?, ?, ?, ?, ?)""", entity_list)
    con.commit()

In [7]:
#######################################################################
########################### Protein Seqs ##############################
#######################################################################
df = pd.read_csv("init/proteins-db.tsv", sep="\t")
for index, row in df.iterrows():
    fasta_file = gzip.open("inputs/proteins/"+row[2], mode='rt')
    protein_seq_list = []
    for seq in SeqIO.parse(fasta_file, "fasta"):
        # Protein sequences are saved as as a Binary data type for compression
        protein_seq_list.append([seq.id, row[0], row[1], len(seq.seq), 
                                 sqlite3.Binary(zlib.compress(str(seq.seq).encode('utf-8')))])
    protein_seq_insert(con, protein_seq_list)
# Create keys for protein IDs with isoform and without
cursorObj = con.cursor()
cursorObj.execute("CREATE INDEX proteins_ids_index ON protein_seqs(protein_id)")
con.commit()    

# SQNce Query Functions

In [27]:
rows = """Chr2\t12345\nChr3\t54354\nChr2\t5234354\n""".split("\n")
coordinate_list = [row.split("\t") for row in rows]
if coordinate_list[-1]==[""]:
    coordinate_list = coordinate_list[:-1]
for row in coordinate_list:
    if len(row) != 2:
        return(html.P("Number of columns is not 2. Use tab-seperated values."))


SyntaxError: 'return' outside function (<ipython-input-27-5da10337c74d>, line 7)

In [38]:
# Query to find neighboring genes
def get_SNP_neighbors(genotype, chromsome, coordinate, distance):
    con = sqlite3.connect('SQNce.db')
    cursorObj = con.cursor()
    df = pd.read_sql_query('''SELECT * 
                     FROM gene_coordinates 
                     WHERE genotype_id = "{0}"
                     AND gene_chr = "{1}"
                     AND gene_start BETWEEN {2} AND {3}
                     
                     UNION ALL
                     
                     SELECT * 
                     FROM gene_coordinates 
                     WHERE genotype_id = "{0}"
                     AND gene_chr = "{1}"
                     AND gene_start BETWEEN {2} AND {3}
                     '''.format(genotype, chromsome, coordinate-distance, coordinate+distance), con)
    # Should check why it returns the same row twice, probably need to correct the query
    df = df.drop_duplicates()
    df.insert(0, 'Query', pd.Series(["_".join([chromsome, str(coordinate)]) for x in range(len(df.index))]))
    return(df)

df = pd.DataFrame()
for entity in [["Chr2", 19681637], ["Chr2", 1234564], ["Chr4", 1234564], ["Chr2", 19681638]]:
      df = pd.concat([df, get_SNP_neighbors("Arabidopsis", entity[0], entity[1], 10000)])  
#df.drop_duplicates(subset=["gene_id"]).reset_index()
#df1 = get_SNP_neighbors("Arabidopsis", "Chr2", 19681637, 10000)
#df1
#pd.concat([df1, df1])

In [39]:
# Query to find neighboring genes and annotate them

df["annotation"] = annotation_select(con, df["gene_id"].to_list())
df

Unnamed: 0,Query,gene_id,genotype_id,gene_chr,gene_start,gene_end,gene_orientation,annotation
0,Chr2_19681637,AT2G48110,Arabidopsis,Chr2,19673293,19679711,+,"structural constituent of ribosome, putative, ..."
1,Chr2_19681637,AT2G48120,Arabidopsis,Chr2,19679730,19681546,+,"PAC, putative, expressed"
2,Chr2_19681637,AT2G48121,Arabidopsis,Chr2,19681637,19682391,+,"chloroplast ribonuclease III domain protein, p..."
3,Chr2_19681637,AT2G48130,Arabidopsis,Chr2,19685066,19685993,-,LTPL78 - Protease inhibitor/seed storage/LTP f...
4,Chr2_19681637,AT2G48140,Arabidopsis,Chr2,19686409,19687664,+,LTPL99 - Protease inhibitor/seed storage/LTP f...
5,Chr2_19681637,AT2G48150,Arabidopsis,Chr2,19687962,19689173,-,glutathione peroxidase domain containing prote...
6,Chr2_19681637,AT2G48160,Arabidopsis,Chr2,19689409,19696821,-,"PWWP domain containing protein, expressed"
0,Chr2_1234564,AT2G03955,Arabidopsis,Chr2,1238677,1239071,+,
0,Chr4_1234564,AT4G02770,Arabidopsis,Chr4,1229111,1229945,-,"photosystem I reaction center subunit II, chlo..."
1,Chr4_1234564,AT4G02780,Arabidopsis,Chr4,1237767,1244813,-,"ent-kaurene synthase, chloroplast precursor, p..."


In [9]:
def show_available_species(con):
    con = sqlite3.connect('SQNce.db')
    cursorObj = con.cursor()
    return(pd.read_sql_query("SELECT * FROM species", con))

show_available_species(con)

Unnamed: 0,species_name,common_name
0,Arabidopsis halleri,Ahalleri
1,Arabidopsis lyrata,Alyrata
2,Arabidopsis thaliana,Arabidopsis
3,Brassica oleracea,Boleracea
4,Brassica rapa,Brapa
5,Glycine max,Soybean
6,Hordeum vulgare,Barley
7,Medicago truncatula,Barrel medic
8,Oropetium thomaeum,Resurrection grass
9,Oryza sativa,Rice


In [None]:
cursorObj = con.cursor()
cursorObj.execute('''SELECT gene_id, gene_annotation 
                     FROM gene_annotations 
                     WHERE gene_id =  ?  ''', (entity,))
# (name,) - need the comma to treat it as a single item and not list of letters
selected = cursorObj.fetchall()[0]
od[selected[0]] = selected[1]

In [None]:
# TODO add documentation to SQNce queries

def protein_seq_select(con, entity_list):
    od = OrderedDict()
    for entity in entity_list:
        cursorObj = con.cursor()
        cursorObj.execute('''SELECT protein_variant, gene_annotation 
                             FROM protein_seqs 
                             WHERE protein_variant =  ?  ''', (entity,))
        # (name,) - need the comma to treat it as a single item and not list of letters
        selected = cursorObj.fetchall()[0]
        record = SeqRecord(Seq(zlib.decompress(selected[1]).decode(encoding='UTF-8')), 
                           id=selected[0], name="", description="")
        od[selected[0]] = record
        with open("selected.fasta", 'w') as handle:
            SeqIO.write(od.values(), handle, 'fasta')

In [36]:
def annotation_select(con, entity_list):
    ls = []
    for entity in entity_list:
        cursorObj = con.cursor()
        cursorObj.execute('''SELECT gene_id, gene_annotation 
                             FROM gene_annotations 
                             WHERE gene_id =  ?  ''', (entity,))
        # (name,) - need the comma to treat it as a single item and not list of letters
        selected = cursorObj.fetchall()
        if selected == []:
            ls.append("Gene not found")
        else:
            ls.append(selected[0][1])    
    return(ls)

con = sqlite3.connect('SQNce.db')
gene_list = ["Zm00001d010294", "dsa", "Sobic.002G128101", "AT2G34360", "Sobic.002G128101", "002G128101"]
df = pd.DataFrame({"name": gene_list, "annotation": annotation_select(con, gene_list) })
df.columns = ["GeneID", "annotation"]
df
#pd.DataFrame.from_dict(annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"], "dict"), 
#                       orient="index", columns=["annotation"])


Unnamed: 0,GeneID,annotation
0,Zm00001d010294,Ubiquitin-associated/translation elongation fa...
1,dsa,Gene not found
2,Sobic.002G128101,protein kinase family protein
3,AT2G34360,"MATE efflux family protein, putative, expressed"
4,Sobic.002G128101,protein kinase family protein
5,002G128101,Gene not found


In [33]:
annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"])

['Ubiquitin-associated/translation elongation factor EF1B protein', 'Gene not found', 'protein kinase family protein', 'MATE efflux family protein, putative, expressed']


['Ubiquitin-associated/translation elongation factor EF1B protein',
 'Gene not found',
 'protein kinase family protein',
 'MATE efflux family protein, putative, expressed']

In [47]:
test = pd.DataFrame.from_dict(annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"]), 
                              orient="index").reset_index()
test.columns = ["GeneID", "annotation"]
test 

Unnamed: 0,GeneID,annotation
0,Zm00001d010294,Ubiquitin-associated/translation elongation fa...
1,Sobic.002G128101,protein kinase family protein
2,AT2G34360,"MATE efflux family protein, putative, expressed"


# Example Queries

In [None]:
con = sqlite3.connect('SQNce.db')
# Use the above query functions to parse SQNce with your gene lists 
input_value = [your_gene_list]
protein_seq_select(con, input_value)
con.close()

In [7]:
[{ 'label': label, 'value': val} for label, val in [[1,2], [2,3]]]

[{'label': 1, 'value': 2}, {'label': 2, 'value': 3}]

In [55]:
def distinct_db_vals(db, table, column, custom_vals=[], return_ls=False):
    # Input is the column to select and from which table
    # Returns a list of all values in a specific table from SQNce.db
    # Custom vals are added to the front using nested list of [label, value]
    ls = [{ 'label': label, 'value': val} for label, val in custom_vals]
    con = sqlite3.connect(db) # deploy with this
    cursorObj = con.cursor()
    distinct_df = pd.read_sql_query('''SELECT DISTINCT {0} 
                                       FROM {1}'''.format(column, table), con)
    if return_ls:
            return(distinct_df[column].to_list())
    for name in distinct_df[column]:
        ls.append({'label': name, 'value': name})
    return(ls)
distinct_db_vals("SQNce.db", "gene_coordinates", "genotype_id",[[1,2], [2,3]], True)

['Arabidopsis', 'B73v2', 'B73v4', 'Gmax', 'Sorghum']

In [49]:
def family_gene_select(gene_list):
    # Use an input list of genes to find their family assignments
    con = sqlite3.connect("SQNce.db")
    ls = []
    for gene in gene_list:
        cursorObj = con.cursor()
        cursorObj.execute('''SELECT protein_id, family_name 
                            FROM gene_families
                            WHERE protein_id =  ? ''', (gene,))
        selected = cursorObj.fetchall()
        if selected == []:
            ls.append("Gene not found")
        else:
            ls.append(selected[0][1])    
    return(ls)
family_gene_select(["Seita.5G010100", "test", "Zm00001d011673"])

['Cytochrome P450', 'Gene not found', 'Terpenoid synthases']

In [52]:
con = sqlite3.connect("SQNce.db") # deploy with this
cursorObj = con.cursor()
genotype = str("','".join(['Zea mays', "dsa",'Setaria italica']))
family = str("','".join(['Terpenoid synthases', 'Cytochrome P450']))
df = pd.read_sql_query("""SELECT protein_id, family_name 
                                   FROM gene_families
                                   WHERE species_id IN ('{0}') AND family_name IN ('{1}')""".format(genotype, family), con)
df

Unnamed: 0,protein_id,family_name
0,Seita.5G085000,Cytochrome P450
1,Seita.5G010200,Cytochrome P450
2,Seita.5G010100,Cytochrome P450
3,Seita.9G408200,Cytochrome P450
4,Seita.9G011300,Cytochrome P450
...,...,...
833,Zm00001d011673,Terpenoid synthases
834,Zm00001d009431,Terpenoid synthases
835,Zm00001d050159,Terpenoid synthases
836,Zm00001d014367,Terpenoid synthases


In [2]:
selected = "subject_id"
entity_list = ["Zm00001d014121", "Zm00001d014134, ""Zm00001d014136"]
con = sqlite3.connect("SQNce.db") # deploy with this
cursorObj = con.cursor()
entity_list_str = str("','".join(entity_list))
df = pd.read_sql_query("""SELECT * 
                        FROM BBHs
                        WHERE {0} IN ('{1}')""".format(selected, entity_list_str), con)
df

Unnamed: 0,subject_id,query_id,bit_score,subject_genotype,query_genotype
0,Zm00001d014121,Zm00023ab228620,1005,B73v4,CML247
1,Zm00001d014121,Zm00023ab144550,390,B73v4,CML247
2,Zm00001d014121,Zm00023ab142020,357,B73v4,CML247
3,Zm00001d014121,Zm00023ab171210,342,B73v4,CML247
4,Zm00001d014121,Zm00023ab228750,733,B73v4,CML247
...,...,...,...,...,...
1043,Zm00001d014121,Glyma.14G117200,409,B73v4,Wm82
1044,Zm00001d014121,Glyma.14G015100,384,B73v4,Wm82
1045,Zm00001d014121,Glyma.18G080400,370,B73v4,Wm82
1046,Zm00001d014121,Glyma.18G080200,384,B73v4,Wm82


In [16]:
# This is a stupid function but it seems to work correctly.
df.sort_values(['bit_score'], ascending=False).groupby(["subject_id", "query_genotype"]).agg({"bit_score": "first", "query_id": "first",}).reset_index()

Unnamed: 0,subject_id,query_genotype,bit_score,query_id
0,Zm00001d014121,B73v4,1015,Zm00001d014121
1,Zm00001d014121,B73v5,1013,Zm00001eb222540
2,Zm00001d014121,B97,1007,Zm00018ab231330
3,Zm00001d014121,CML103,1012,Zm00021ab225420
4,Zm00001d014121,CML228,1012,Zm00022ab224970
5,Zm00001d014121,CML247,1005,Zm00023ab228620
6,Zm00001d014121,CML277,1003,Zm00024ab225870
7,Zm00001d014121,CML322,1012,Zm00025ab231000
8,Zm00001d014121,CML333,1005,Zm00026ab225150
9,Zm00001d014121,CML52,1011,Zm00019ab212360


In [15]:
df[df["query_genotype"].isin(["B97", "dsa"])]

Unnamed: 0,subject_id,query_id,bit_score,subject_genotype,query_genotype
107,Zm00001d011673,Zm00018ab378850,709,B73v4,B97
108,Zm00001d011673,Zm00018ab160760,688,B73v4,B97
109,Zm00001d011673,Zm00018ab359540,615,B73v4,B97


In [30]:
test = pd.read_csv(os.path.join("init/BBHs_combs.tsv"), sep="\t")


['CML247',
 'CML69',
 'B73v5',
 'CML333',
 'Ki3',
 'Sbicolor',
 'CML277',
 'Oh7B',
 'CML52',
 'Il14H',
 'Sitalica',
 'P39',
 'M37W',
 'Oh43',
 'HP301',
 'Tzi8',
 'Osativa',
 'Mo18W',
 'Ki11',
 'M162W',
 'Col-0',
 'B73v4',
 'Ms71',
 'Sviridis',
 'Ky21',
 'NC350',
 'CML228',
 'CML103',
 'NC358',
 'CML322',
 'Tx303',
 'Pvirgatum',
 'B97',
 'Wm82']

test not in
