In [1]:
import os
import gzip
import sqlite3
from sqlite3 import Error
import pandas as pd
from collections import OrderedDict

from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

import zlib

# SQNce Creation Functions

In [9]:
# TODO add documentation to all SQNce creation functions

# Establish connection with SQNce.db, generating a new SQLite3 database if needed
def sql_connection(db_path='SQNce.db'):
    """Open a connection to the SQNce SQLite database.

    Parameters
    ----------
    db_path : str, optional
        Path handed to ``sqlite3.connect``. Defaults to 'SQNce.db', the
        file name used throughout this notebook.

    Returns
    -------
    sqlite3.Connection or None
        The open connection. If ``sqlite3.connect`` raises ``sqlite3.Error``
        the error is printed and None is returned instead of re-raising.
    """
    try:
        con = sqlite3.connect(db_path)
    except Error as err:
        # Report the exception instance; printing the `Error` class itself
        # would only show "<class 'sqlite3.Error'>" and hide the real cause.
        print(err)
        return None
    print("Connection established.")
    return con

# After establishing connection with SQNce create the specified tables
def sql_table(con):
    """Create the SQNce tables on ``con`` and commit.

    Four tables are created, each only if it does not exist yet and each
    declared WITHOUT ROWID with a text primary key: ``species``,
    ``genotypes``, ``protein_seqs`` and ``gene_annotations``.

    Parameters
    ----------
    con : sqlite3.Connection
        Open connection to the database that should receive the tables.
    """
    # NOTE(review): protein_length is declared as text although the loader
    # passes an integer length - confirm whether that is intended.
    table_definitions = (
        """CREATE TABLE IF NOT EXISTS species(
                         species_name text PRIMARY KEY, 
                         common_name text) 
                         WITHOUT ROWID""",
        """CREATE TABLE IF NOT EXISTS genotypes(
                         genotype_id text PRIMARY KEY, 
                         species_name text, 
                         genotype_name text,
                         FOREIGN KEY (species_name) REFERENCES species(species_name)) 
                         WITHOUT ROWID""",
        """CREATE TABLE IF NOT EXISTS protein_seqs(
                         protein_variant text PRIMARY KEY,
                         protein_id text,
                         genotype_id text,
                         protein_length text,
                         protein_sequence blob,
                         FOREIGN KEY (genotype_id) REFERENCES genotypes(genotype_id)) 
                         WITHOUT ROWID""",
        """CREATE TABLE IF NOT EXISTS gene_annotations(
                         gene_id text PRIMARY KEY,
                         gene_species text,
                         gene_genotype text,
                         annotation_source text,
                         gene_annotation text,
                         FOREIGN KEY (gene_genotype) REFERENCES genotypes(genotype_id)) 
                         WITHOUT ROWID""",
    )

    cursor = con.cursor()
    # Referenced tables come before the tables that point at them
    for ddl in table_definitions:
        cursor.execute(ddl)
    con.commit()

In [17]:
# The database is rebuilt from scratch here, so every input file has to be
# parsed again after this cell runs.
# TODO SQNce update functions to parse input data only if not previously included

# Drop the database file left by a previous run, if any
if os.path.exists("SQNce.db"):
    os.remove("SQNce.db")

# Open a fresh database and create its tables
con = sql_connection()
sql_table(con)

Connection established.


# SQNce Data Input Functions

In [11]:
# TODO add documentation to all SQNce Data Input Functions

def species_insert(con, entities):
    """Insert a single row into the ``species`` table and commit.

    Parameters
    ----------
    con : sqlite3.Connection
        Open database connection.
    entities : sequence
        The two values bound to the placeholders, in the order
        (species_name, common_name).
    """
    insert_species = """INSERT INTO species(
                         species_name, common_name) 
                         VALUES(?, ?)"""
    con.cursor().execute(insert_species, entities)
    con.commit()
    
def genotype_insert(con, entities):
    """Insert a single row into the ``genotypes`` table and commit.

    Parameters
    ----------
    con : sqlite3.Connection
        Open database connection.
    entities : sequence
        The three values bound to the placeholders, in the order
        (genotype_id, species_name, genotype_name).
    """
    insert_genotype = """INSERT INTO genotypes(
                         genotype_id, 
                         species_name, 
                         genotype_name) 
                         VALUES(?, ?, ?)"""
    con.cursor().execute(insert_genotype, entities)
    con.commit()

# https://stackoverflow.com/questions/18219779/bulk-insert-huge-data-into-sqlite-using-python
def protein_seq_insert(con, entity_list):
    """Bulk-insert rows into the ``protein_seqs`` table and commit once.

    Parameters
    ----------
    con : sqlite3.Connection
        Open database connection.
    entity_list : iterable of sequences
        One five-item sequence per row, in the order (protein_variant,
        protein_id, genotype_id, protein_length, protein_sequence).
    """
    insert_protein = """INSERT INTO protein_seqs(
                         protein_variant, 
                         protein_id, 
                         genotype_id, 
                         protein_length,  
                         protein_sequence) 
                         VALUES(?, ?, ?, ?, ?)"""
    # executemany sends the whole batch through a single statement
    batch_cursor = con.cursor()
    batch_cursor.executemany(insert_protein, entity_list)
    con.commit()
    
def gene_annotation_insert(con, entity_list):
    """Bulk-insert rows into the ``gene_annotations`` table and commit once.

    Parameters
    ----------
    con : sqlite3.Connection
        Open database connection.
    entity_list : iterable of sequences
        One five-item sequence per row, in the order (gene_id, gene_species,
        gene_genotype, annotation_source, gene_annotation).
    """
    insert_annotation = """INSERT INTO gene_annotations(
                         gene_id,
                         gene_species,
                         gene_genotype,
                         annotation_source,
                         gene_annotation) 
                         VALUES(?, ?, ?, ?, ?)"""
    batch_cursor = con.cursor()
    batch_cursor.executemany(insert_annotation, entity_list)
    con.commit()

In [None]:
# SQNce database is initiated using predefined TSV files
# TSV either contain the input data or reference input files to parse
# TODO add documentation to SQNce data input parsing

# Species: every TSV row is passed unchanged to species_insert
df = pd.read_csv("init/species.tsv", sep="\t")
for index, row in df.iterrows():
    species_insert(con, entities=list(row))

# Genotypes: every TSV row is passed unchanged to genotype_insert
df = pd.read_csv("init/genotypes.tsv", sep="\t")
for index, row in df.iterrows():
    genotype_insert(con, entities=list(row))

# Protein sequences: the first column of fasta_list.tsv is stored as the
# genotype_id and the third column names a gzipped FASTA file to parse.
# Columns are read with .iloc because plain row[0] on a string-labelled
# Series relies on a deprecated positional fallback.
df = pd.read_csv("init/fasta_list.tsv", sep="\t")
for index, row in df.iterrows():
    protein_seq_list = []
    # The context manager closes the gzip handle once the file is parsed
    with gzip.open("/files/longest/"+row.iloc[2], mode='rt') as fasta_file:
        for seq in SeqIO.parse(fasta_file, "fasta"):
            # TODO change db to number key to avoid duplicated name problems
            if row.iloc[0] == "ZmB73v3":
                seq.id = seq.id + "v3"
            # Protein sequences are saved as a Binary data type for compression
            protein_seq_list.append([seq.id, seq.id.split("_")[0], row.iloc[0], len(seq.seq),
                                     sqlite3.Binary(zlib.compress(str(seq.seq).encode('utf-8')))])
    # One bulk insert (and one commit) per FASTA file
    protein_seq_insert(con, protein_seq_list)
    
################## Not yet tested ##################
# NOTE(review): the following cell loads the same annotation files (choosing
# the defline column per species). gene_id is the PRIMARY KEY of
# gene_annotations, so only one of the two loaders can be run per database.
df = pd.read_csv("init/annotation_list.tsv", sep="\t")
# Every element is: gene_id, gene_species, gene_genotype, annotation_source, gene_annotation
gene_annotation_list = []
for index, row in df.iterrows():
    # Fourth column of annotation_list.tsv names the annotation file to read
    annot = pd.read_csv("inputs/"+row.iloc[3], sep="\t")
    annot = annot.drop_duplicates(subset="locusName")
    annot = annot[["locusName", "arabi-defline"]]
    # Insert species, genotype and source between the gene ID and its
    # annotation so the column order matches gene_annotation_insert
    annot.insert(1, 'species', row.iloc[0])
    annot.insert(2, 'genotype', row.iloc[1])
    annot.insert(3, 'source', row.iloc[2])
    # Insert the rows just built from this file (previously this passed the
    # unrelated `test` frame defined further down the notebook)
    gene_annotation_insert(con, annot.values.tolist())

In [18]:
################## Not yet tested ##################
df = pd.read_csv("init/annotation_list.tsv", sep="\t")
# Every element is: gene_id, gene_species, gene_genotype, annotation_source, gene_annotation
gene_annotation_list = []
for index, row in df.iterrows():
    # Columns are read with .iloc because plain row[0] on a string-labelled
    # Series relies on a deprecated positional fallback.
    # Fourth column of annotation_list.tsv names the annotation file to read
    annot = pd.read_csv("inputs/"+row.iloc[3], sep="\t")
    # Keep a single row per locus; gene_id is the table's primary key
    annot = annot.drop_duplicates(subset="locusName")
    # Arabidopsis rows take their description from the rice defline column,
    # every other species from the Arabidopsis defline column
    if row.iloc[0] == "Arabidopsis thaliana":
        annot = annot[["locusName", "rice-defline"]]
    else:
        annot = annot[["locusName", "arabi-defline"]]
    # Insert species, genotype and source between the gene ID and its
    # annotation so the column order matches gene_annotation_insert
    annot.insert(1, 'species', row.iloc[0])
    annot.insert(2, 'genotype', row.iloc[1])
    annot.insert(3, 'source', row.iloc[2])
    gene_annotation_insert(con, annot.values.tolist())

# SQNce Query Functions

In [None]:
# TODO add documentation to SQNce queries

def protein_seq_select(con, entity_list):
    """Write the sequences of the requested protein variants to ``selected.fasta``.

    Each identifier is looked up in ``protein_seqs`` by ``protein_variant``;
    the stored blob is zlib-decompressed and decoded as UTF-8 to recover the
    sequence. Records are written in the order of first appearance, and a
    repeated identifier is written only once.

    Parameters
    ----------
    con : sqlite3.Connection
        Open database connection.
    entity_list : iterable of str
        Protein variant identifiers to export.

    Returns
    -------
    None
        The result is the FASTA file ``selected.fasta`` in the current
        working directory. Nothing is written when ``entity_list`` is empty.

    Raises
    ------
    IndexError
        If an identifier has no row in ``protein_seqs``. No file is written
        in that case.
    """
    od = OrderedDict()
    cursorObj = con.cursor()
    for entity in entity_list:
        # protein_seqs stores the compressed sequence in protein_sequence;
        # it has no gene_annotation column.
        cursorObj.execute('''SELECT protein_variant, protein_sequence 
                             FROM protein_seqs 
                             WHERE protein_variant =  ?  ''', (entity,))
        # (name,) - need the comma to treat it as a single item and not list of letters
        selected = cursorObj.fetchone()
        if selected is None:
            raise IndexError(
                "protein_variant {!r} not found in protein_seqs".format(entity))
        record = SeqRecord(Seq(zlib.decompress(selected[1]).decode(encoding='UTF-8')),
                           id=selected[0], name="", description="")
        od[selected[0]] = record
    # Write the file once, after all lookups succeeded, instead of rewriting
    # it on every loop iteration.
    if od:
        with open("selected.fasta", 'w') as handle:
            SeqIO.write(od.values(), handle, 'fasta')

In [32]:
def annotation_select(con, entity_list):
    """Look up the annotation of each requested gene.

    Parameters
    ----------
    con : sqlite3.Connection
        Open database connection.
    entity_list : iterable of str
        Gene identifiers, matched against ``gene_annotations.gene_id``.

    Returns
    -------
    OrderedDict
        Maps gene_id to gene_annotation, in the order the identifiers first
        appear in ``entity_list``. Empty when ``entity_list`` is empty.

    Raises
    ------
    IndexError
        If an identifier has no row in ``gene_annotations``; the message
        names the missing identifier.
    """
    od = OrderedDict()
    # One cursor is enough for all lookups
    cursorObj = con.cursor()
    for entity in entity_list:
        cursorObj.execute('''SELECT gene_id, gene_annotation 
                             FROM gene_annotations 
                             WHERE gene_id =  ?  ''', (entity,))
        # (name,) - need the comma to treat it as a single item and not list of letters
        selected = cursorObj.fetchone()
        if selected is None:
            # Same exception type as indexing an empty result, but it says
            # which identifier was not found.
            raise IndexError(
                "gene_id {!r} not found in gene_annotations".format(entity))
        od[selected[0]] = selected[1]
    return od

# Reconnect to the database file on disk
con = sqlite3.connect("SQNce.db")

# Show the annotations of three example genes as a one-column table
# indexed by gene ID
pd.DataFrame.from_dict(
    annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"]),
    orient="index",
    columns=["annotation"],
)


OrderedDict([('Zm00001d010294',
              'Ubiquitin-associated/translation elongation factor EF1B protein'),
             ('Sobic.002G128101', 'protein kinase family protein'),
             ('AT2G34360', 'MATE efflux family protein, putative, expressed')])

In [35]:
# Raw result of the lookup: an OrderedDict keyed by gene ID
annotation_select(
    con,
    ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"],
)

'Ubiquitin-associated/translation elongation factor EF1B protein'

In [47]:
# Same lookup, with the gene ID moved out of the index into a regular column
test = pd.DataFrame.from_dict(
    annotation_select(con, ["Zm00001d010294", "Sobic.002G128101", "AT2G34360"]),
    orient="index",
).reset_index()
test.columns = ["GeneID", "annotation"]
test

Unnamed: 0,GeneID,annotation
0,Zm00001d010294,Ubiquitin-associated/translation elongation fa...
1,Sobic.002G128101,protein kinase family protein
2,AT2G34360,"MATE efflux family protein, putative, expressed"


# Example Queries

In [None]:
con = sqlite3.connect('SQNce.db')
try:
    # Use the above query functions to parse SQNce with your gene lists
    # `your_gene_list` is a placeholder: replace it with your own identifiers.
    # Each element of input_value is looked up as one protein_variant, and
    # the matching sequences are written to selected.fasta.
    input_value = [your_gene_list]
    protein_seq_select(con, input_value)
finally:
    # Close the connection even when the query above raises
    con.close()