In [1]:
import os
import gzip
import sqlite3
import zlib
import io
import numpy as np
#from sqlite3 import Error
import pandas as pd
from collections import OrderedDict

from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# SQNce Mapping-related Tables

In [2]:
# TODO add documentation to all SQNce creation functions

# Establish connection with SQNce.db, generating a new SQLite3 database if needed
def sql_connection():
    con = sqlite3.connect('SQNce.db')
    print("Connection established.")
    
# After establishing connection with SQNce create the specified tables
def sql_table(con):
    cursorObj = con.cursor()
    
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS mapping_traits(
                         experiment text,
                         trait text,
                         description text,
                         plot blob,
                         processed text,
                         score integer,
                         num_candidates integer)
                         """)   

    cursorObj.execute("""CREATE TABLE IF NOT EXISTS mapping_results(
                         experiment text,
                         trait text,
                         snp text,
                         chrom text,
                         pos integer,
                         ref text,
                         alt text,
                         effect REAL,
                         pval REAL)
                         """)
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS mapping_candidates(
                         gene_id text,
                         gene_symbol text,
                         gene_annotation text,
                         gene_start integer,
                         gene_end integer,
                         distance integer,
                         experiment text,
                         trait text,
                         snp text,
                         chrom text,
                         pos integer,
                         pval REAL,
                         effect REAL)
                         """)
    cursorObj.execute("""CREATE TABLE IF NOT EXISTS mapping_clusters(
                         experiment text,
                         trait text,
                         feature text,
                         description text)
                         """)
    
    con.commit()

In [3]:
con = sqlite3.connect('SQNce.db')
def drop_trait_tables(con):
    cursorObj = con.cursor()
    cursorObj.execute("""DROP TABLE mapping_traits""")
    cursorObj.execute("""DROP TABLE mapping_results""")
    cursorObj.execute("""DROP TABLE mapping_candidates""")
    cursorObj.execute("""DROP TABLE mapping_clusters""")
    con.commit()
drop_trait_tables(con)
con.close()

In [4]:
# Current implementation requires re-parsing of all the input files to create SQNce
# TODO SQNce update functions to parse input data only if not previously included 
#if os.path.exists("SQNce-proteomes.db"): os.remove("SQNce-proteomes.db")
con = sqlite3.connect('SQNce.db')
sql_table(con)
con.close()

# SQNce Mapping-related Functions

In [5]:
def mapping_traits_insert_traits(con, entities):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO mapping_traits(
                         experiment, 
                         trait, 
                         description,
                         plot,
                         processed,
                         score,
                         num_candidates) 
                         VALUES(?,?,?,?,?,?,?)""", entities)
    con.commit()

def mapping_traits_insert_results(con, entities):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO mapping_results(
                         experiment, 
                         trait, 
                         snp,
                         chrom,
                         pos,
                         ref,
                         alt,
                         effect,
                         pval) 
                         VALUES(?,?,?,?,?,?,?,?,?)""", entities)
    con.commit()
    
def mapping_traits_insert_candidates(con, entities):
    cursorObj = con.cursor()
    cursorObj.execute("""INSERT INTO mapping_candidates(
                         gene_id, 
                         gene_symbol, 
                         gene_annotation,
                         gene_start,
                         gene_end,
                         distance,
                         experiment,
                         trait,
                         snp,
                         chrom,
                         pos,
                         pval,
                         effect) 
                         VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?)""", entities)
    con.commit()
    
def mapping_traits_insert_clusters(con, entities):
    cursorObj = con.cursor()
    cursorObj.executemany("""INSERT INTO mapping_clusters(
                         experiment, 
                         trait, 
                         feature,
                         description) 
                         VALUES(?,?,?,?)""", entities)
    con.commit()




# SQNce Load the rMVP GLM GWAS Results

# Add the mapping results and traits to the tables

In [7]:
con = sqlite3.connect('SQNce.db')

description_dict = {"experiment1": "description1",
                    "experiment2": "description2"}

for experiment in ["experiment1", "experiment2"]:
    cwd = os.getcwd()
    folder_path = os.path.join(cwd, "mapping", experiment)

    # The name of the experiment is also the name of the folder

    fl_pvals_dict = {}
    fl_plots_dict = {}
    fl_name_set = set()
    for fl in os.listdir(folder_path):
        # Assume for now only my output files are used
        if fl.split(".")[-1] == "png":
            fl_name = fl.replace(".png", "")
            fl_plots_dict[fl_name] = fl
        if fl.split(".")[-1] == "csv":
            fl_name = fl.replace(".sig.csv", "")
            fl_pvals_dict[fl_name] = fl
        fl_name_set.add(fl_name)
    fl_name_set


    all_traits_list = []
    all_results_list = []

    # The name of the trait is also the name of the file
    for trait in fl_name_set:
        # Load the trait results dataframe
        filename = os.path.join(folder_path, fl_pvals_dict[trait])
        df = pd.read_csv(filename)
        if len(df) == 0:
            continue
        df = df.sort_values(df.columns[-1])

        # Load the trait plot binary variable
        filename = os.path.join(folder_path, fl_plots_dict[trait])
        with open(filename, 'rb') as file: plot_blob = file.read()
        
        description = description_dict[experiment]
        all_traits_list.append([experiment, trait, description, plot_blob, "Unannotated", 0, 0])    

        # List order: experiment,trait,snp,chrom,pos,ref,alt,effect,pval,plot
        #selected_rows_ix = [0]
        #selected_rows = df.iloc[selected_rows_ix,].values.tolist()
        selected_rows = df.values.tolist()

        for row in selected_rows:
            all_results_list.append([experiment, trait, row[0], row[1], row[2], row[3], row[4], row[5], row[7]])

    mapping_traits_insert_traits(con, all_traits_list)
    mapping_traits_insert_results(con, all_results_list)

# Adding clustered traits to mapping_clusters

In [8]:
con = sqlite3.connect('SQNce.db')
cwd = os.getcwd()
all_clusters_list = []
for experiment in ["experiment1", "experiment2"]:
    folder_path = os.path.join(cwd, "mapping", experiment+".tsv")
    tmp = pd.read_csv(folder_path, sep="\t")
    tmp.columns = ["feature", "trait"]
    tmp["experiment"] = experiment
    tmp["description"] = ""
    tmp = tmp[["experiment", "trait", "feature", "description"]]
    all_clusters_list += tmp.values.tolist() 
mapping_traits_insert_clusters(con, all_clusters_list)

# html.Img Related

In [None]:
from io import BytesIO
def pil_to_b64(im, enc_format="png", **kwargs):
    """
    Converts a PIL Image into base64 string for HTML displaying
    :param im: PIL Image object
    :param enc_format: The image format for displaying. If saved the image will have that extension.
    :return: base64 encoding
    """

    buff = BytesIO()
    im.save(buff, format=enc_format, **kwargs)
    encoded = base64.b64encode(buff.getvalue()).decode("utf-8")

    return encoded

# https://stackoverflow.com/questions/14348442/how-do-i-display-a-pil-image-object-in-a-template#14348661
import PIL
import io
from PIL import Image

picture_stream = io.BytesIO(fig)
picture = Image.open(picture_stream)
# In dash you can send the image to html.Img like this:
# html.Img(id="my-img",className="image", src="data:image/png;base64, " + pil_to_b64(picture))

# picture = PIL.Image.open(picture_stream)
# picture.show()
# pil_to_b64(picture)

# Creating thumbnails
(I ended up not using it but might be a nice option to keep in mind)

In [6]:
import os
from PIL import Image
# https://www.tutorialspoint.com/python_pillow/python_pillow_creating_thumbnails.htm
cwd = os.getcwd()
def tnails(fpath, fname):
    image = Image.open(os.path.join(fpath, fname))
    image.thumbnail((300,300))
    image.save(os.path.join(fpath, fname.replace(".png", ".thumb.png")))

folder_path = os.path.join(cwd, "mapping", "experiment_folder")

for fname in os.listdir(folder_path):
    if fname.split(".")[-1] == "csv":
        continue
    tnails(folder_path, fname)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\SagerFish\\Desktop\\PlantApp\\mapping\\experiment_folder'

# Groupby Bins

In [None]:
df = pd.read_sql_query('''SELECT * FROM mapping_candidates''', con)
df.head()