In [39]:
import base64
import gzip
import io
import os
import sqlite3
import zlib
from collections import OrderedDict
from sqlite3 import Error

import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# SQNce Mapping-related Tables

In [23]:
# TODO add documentation to all SQNce creation functions

# Establish connection with SQNce.db, generating a new SQLite3 database if needed
def sql_connection(db_path='SQNce.db'):
    """
    Open a connection to the SQNce SQLite3 database.
    SQLite creates the database file when it does not exist yet.
    :param db_path: Path of the database file; defaults to 'SQNce.db' in the working directory.
    :return: An open sqlite3.Connection, or None when the connection could not be
             established (the error message is printed in that case).
    """
    try:
        con = sqlite3.connect(db_path)
    except Error as err:
        # Print the caught exception itself: printing the bare `Error` class only
        # shows "<class 'sqlite3.Error'>" and hides what actually went wrong.
        print(err)
        return None
    print("Connection established.")
    return con

# After establishing connection with SQNce create the specified tables
def sql_table(con):
    """
    Create the three SQNce mapping tables (mapping_traits, mapping_results and
    mapping_candidates) unless they already exist, then commit.
    :param con: Open sqlite3 connection to the SQNce database
    """
    ddl_statements = (
        """CREATE TABLE IF NOT EXISTS mapping_traits(
                         experiment text,
                         trait text,
                         description text,
                         plot blob,
                         processed text,
                         score integer,
                         num_candidates integer)
                         """,
        """CREATE TABLE IF NOT EXISTS mapping_results(
                         experiment text,
                         trait text,
                         snp text,
                         chrom text,
                         pos integer,
                         ref text,
                         alt text,
                         effect REAL,
                         pval REAL)
                         """,
        """CREATE TABLE IF NOT EXISTS mapping_candidates(
                         gene_id text,
                         gene_symbol text,
                         gene_annotation text,
                         gene_start integer,
                         gene_end integer,
                         distance integer,
                         experiment text,
                         trait text,
                         snp text,
                         chrom text,
                         pos integer,
                         pval REAL,
                         effect REAL)
                         """,
    )

    cur = con.cursor()
    # One statement per table, executed in declaration order on the same cursor
    for ddl in ddl_statements:
        cur.execute(ddl)
    con.commit()

In [24]:
# Open SQNce.db so the existing mapping tables can be dropped in this cell
con = sql_connection()
def drop_trait_tables(con):
    """
    Drop the three SQNce mapping tables and commit.
    Tables that do not exist are skipped, so the function also works on a
    freshly created database.
    :param con: Open sqlite3 connection to the SQNce database
    """
    cursorObj = con.cursor()
    # IF EXISTS: a plain DROP TABLE raises sqlite3.OperationalError when the
    # table is missing, which happens whenever this runs before the tables
    # have been created.
    cursorObj.execute("""DROP TABLE IF EXISTS mapping_traits""")
    cursorObj.execute("""DROP TABLE IF EXISTS mapping_results""")
    cursorObj.execute("""DROP TABLE IF EXISTS mapping_candidates""")
    con.commit()
# Drop the mapping tables, then release the connection
drop_trait_tables(con)
con.close()

Connection established.


In [25]:
# The current implementation re-parses all of the input files every time SQNce is created
# TODO: add SQNce update functions that parse input data only if it was not previously included
#if os.path.exists("SQNce-proteomes.db"): os.remove("SQNce-proteomes.db")
# Open SQNce.db, create any mapping tables that are missing, then close the connection
con = sql_connection()
sql_table(con)
con.close()

Connection established.


# SQNce Mapping-related Functions

In [33]:
# Scratch check: one {"label": ..., "value": ...} dict per integer 0-5
# (presumably the options format of a Dash dropdown — TODO confirm)
[{"label": str(n), "value": n} for n in range(6)]

[{'label': '0', 'value': 0},
 {'label': '1', 'value': 1},
 {'label': '2', 'value': 2},
 {'label': '3', 'value': 3},
 {'label': '4', 'value': 4},
 {'label': '5', 'value': 5}]

In [26]:
def mapping_traits_insert_traits(con, entities):
    """
    Bulk-insert rows into the mapping_traits table and commit.
    :param con: Open sqlite3 connection to the SQNce database
    :param entities: Iterable of 7-item rows ordered as
                     (experiment, trait, description, plot, processed, score, num_candidates)
    """
    insert_statement = """INSERT INTO mapping_traits(
                         experiment, 
                         trait, 
                         description,
                         plot,
                         processed,
                         score,
                         num_candidates) 
                         VALUES(?,?,?,?,?,?,?)"""
    cur = con.cursor()
    cur.executemany(insert_statement, entities)
    con.commit()

def mapping_traits_insert_results(con, entities):
    """
    Bulk-insert rows into the mapping_results table and commit.
    :param con: Open sqlite3 connection to the SQNce database
    :param entities: Iterable of 9-item rows ordered as
                     (experiment, trait, snp, chrom, pos, ref, alt, effect, pval)
    """
    insert_statement = """INSERT INTO mapping_results(
                         experiment, 
                         trait, 
                         snp,
                         chrom,
                         pos,
                         ref,
                         alt,
                         effect,
                         pval) 
                         VALUES(?,?,?,?,?,?,?,?,?)"""
    cur = con.cursor()
    cur.executemany(insert_statement, entities)
    con.commit()
    
def mapping_traits_insert_candidates(con, entities):
    """
    Insert one row or a batch of rows into the mapping_candidates table and commit.
    :param con: Open sqlite3 connection to the SQNce database
    :param entities: Either a single 13-item row ordered as
                     (gene_id, gene_symbol, gene_annotation, gene_start, gene_end, distance,
                     experiment, trait, snp, chrom, pos, pval, effect),
                     or an iterable of such rows. An empty iterable inserts nothing.
    """
    insert_statement = """INSERT INTO mapping_candidates(
                         gene_id, 
                         gene_symbol, 
                         gene_annotation,
                         gene_start,
                         gene_end,
                         distance,
                         experiment,
                         trait,
                         snp,
                         chrom,
                         pos,
                         pval,
                         effect) 
                         VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?)"""
    rows = list(entities)
    cursorObj = con.cursor()

    # A single row starts with a scalar (the gene_id); a batch starts with a
    # row, i.e. a non-string iterable. Supporting both keeps single-row
    # callers working while matching the executemany-based sibling helpers.
    first = rows[0] if rows else None
    is_single_row = bool(rows) and (
        isinstance(first, (str, bytes)) or not hasattr(first, "__iter__")
    )

    if is_single_row:
        cursorObj.execute(insert_statement, rows)
    else:
        cursorObj.executemany(insert_statement, rows)
    con.commit()



# SQNce Load the rMVP GLM GWAS Results

In [27]:
# NOTE: `con` is deliberately left open at the end of this cell; a later cell
# reads mapping_candidates through a variable named `con`.
con = sql_connection()

# The name of the experiment is also the name of the folder
experiment = "282F_SC_5"

# Index the experiment folder by trait name.
# Assume for now only my output files are used: "<trait>.sig.csv" result tables
# and "<trait>.png" plots. Every other file in the folder is ignored.
fl_pvals_dict = {}
fl_plots_dict = {}
for fl in os.listdir(experiment):
    if fl.endswith(".sig.csv"):
        fl_pvals_dict[fl[:-len(".sig.csv")]] = fl
    elif fl.endswith(".png"):
        fl_plots_dict[fl[:-len(".png")]] = fl

# A trait can only be loaded when BOTH its table and its plot are present;
# looking up a trait that has just one of the two files would raise KeyError.
fl_name_set = set(fl_pvals_dict) & set(fl_plots_dict)

all_traits_list = []
all_results_list = []

# The name of the trait is also the name of the file.
# Sorted so rows are inserted in the same order on every run.
for trait in sorted(fl_name_set):
    # Load the trait results dataframe
    filename = os.path.join(experiment, fl_pvals_dict[trait])
    df = pd.read_csv(filename)
    if df.empty:
        continue
    # Order the SNPs by the last CSV column (presumably the p-value — TODO confirm)
    df = df.sort_values(df.columns[-1])

    # Load the trait plot binary variable
    filename = os.path.join(experiment, fl_plots_dict[trait])
    with open(filename, 'rb') as file:
        plot_blob = file.read()

    all_traits_list.append([experiment, trait, "rMVP GLM of 282_SC MR network with E5", plot_blob, "Unannotated", 0, 0])

    # Row order: experiment, trait, snp, chrom, pos, ref, alt, effect, pval
    # CSV columns 0-5 are used as-is and column 7 is stored as pval; column 6 is
    # skipped (presumably a standard-error column — TODO confirm against the rMVP output).
    for row in df.values.tolist():
        all_results_list.append([experiment, trait, *row[:6], row[7]])

mapping_traits_insert_traits(con, all_traits_list)
mapping_traits_insert_results(con, all_results_list)

Connection established.


# html.Img Related

In [None]:
from io import BytesIO
def pil_to_b64(im, enc_format="png", **kwargs):
    """
    Converts a PIL Image into a base64 string for HTML displaying
    :param im: PIL Image object (anything with a compatible ``save(fp, format=...)`` method)
    :param enc_format: The image format used to encode the image data, passed to ``im.save``
    :param kwargs: Extra keyword arguments forwarded unchanged to ``im.save``
    :return: base64 encoding of the saved image, as a UTF-8 string
    """
    # Serialize the image into an in-memory buffer instead of a file on disk
    buff = BytesIO()
    im.save(buff, format=enc_format, **kwargs)
    # Requires the module-level `import base64`
    return base64.b64encode(buff.getvalue()).decode("utf-8")

# https://stackoverflow.com/questions/14348442/how-do-i-display-a-pil-image-object-in-a-template#14348661
import PIL
import io
from PIL import Image

# Wrap raw image bytes in an in-memory stream and open them as a PIL image.
# NOTE(review): `fig` is not defined anywhere in the visible notebook; presumably it holds
# the bytes of a stored plot (e.g. a mapping_traits.plot blob) — confirm and define it before this cell.
picture_stream = io.BytesIO(fig)
picture = Image.open(picture_stream)
# In dash you can send the image to html.Img like this:
# html.Img(id="my-img",className="image", src="data:image/png;base64, " + pil_to_b64(picture))

# picture = PIL.Image.open(picture_stream)
# picture.show()
# pil_to_b64(picture)

# Groupby Bins

In [36]:
# Load every row of mapping_candidates into a DataFrame and preview the first rows.
# NOTE(review): relies on `con` still being open from an earlier cell — confirm.
df = pd.read_sql_query('''SELECT * FROM mapping_candidates''', con)
df.head()

Unnamed: 0,gene_id,gene_symbol,gene_annotation,gene_start,gene_end,distance,experiment,trait,snp,chrom,pos,pval,effect
0,Zm00001d049544,,UDP-Glycosyltransferase superfamily protein,34206911,34208549,18677,282F_SC_5,cc1539.GLM.1.29.GLM,4-32678274,4,34227226,6.043896e-10,-2e-06
1,Zm00001d023992,AGD11,Calcium-binding EF-hand family protein,34971922,34972401,64295,282F_SC_5,cc824.GLM.1.17.GLM,10-34087255,10,35036696,1.616159e-09,-2e-06
2,Zm00001d021653,ATGPT2,glucose-6-phosphate/phosphate translocator 2,159155247,159157703,5859,282F_SC_5,cc953.GLM.1.03.GLM,7-153855138,7,159149388,1.003216e-10,1e-06
3,Zm00001d029139,ATTPS21,terpene synthase 21,60187028,60190201,468,282F_SC_5,cc985.GLM.1.21.GLM,1-59472251,1,60190669,2.5410119999999997e-34,1.1e-05
4,Zm00001d051863,CYP715A1,"cytochrome P450, family 715, subfamily A, poly...",172596420,172598331,0,282F_SC_5,cc1847.GLM.1.07.GLM,4-169646677,4,172596435,1.110248e-10,3e-06


In [42]:
# Scratch check: int() drops the fractional part, so 1.7 becomes 1
x = 1.7
x = int(x)
print(x)

1


In [43]:
# Add an "rpos" column: `pos` rounded to the nearest multiple of 100,000 (round(x, -5))
df["rpos"] = df["pos"].apply(lambda x: round(x, -5))

In [45]:
# Plain-text dump of the whole DataFrame
print(df)

            gene_id gene_symbol  \
0    Zm00001d049544               
1    Zm00001d023992       AGD11   
2    Zm00001d021653      ATGPT2   
3    Zm00001d029139     ATTPS21   
4    Zm00001d051863    CYP715A1   
..              ...         ...   
124  Zm00001d005018               
125  Zm00001d040429        AGO4   
126  Zm00001d003355               
127  Zm00001d003356               
128  Zm00001d007718               

                                       gene_annotation  gene_start   gene_end  \
0          UDP-Glycosyltransferase superfamily protein    34206911   34208549   
1               Calcium-binding EF-hand family protein    34971922   34972401   
2         glucose-6-phosphate/phosphate translocator 2   159155247  159157703   
3                                  terpene synthase 21    60187028   60190201   
4    cytochrome P450, family 715, subfamily A, poly...   172596420  172598331   
..                                                 ...         ...        ...   
124         

In [None]:
# Group rows into bins of width 0.155 over [0, 1.155) on column "B" and sum each bin.
# NOTE(review): the `df` displayed earlier in this notebook has no "B" column, so this line
# looks like an unadapted template; presumably the intent is to bin a position column — confirm.
df.groupby(pd.cut(df["B"], np.arange(0, 1.0+0.155, 0.155))).sum()

In [None]:
df.