In [3]:
import os
import gzip
import sqlite3
import zlib
from sqlite3 import Error
import pandas as pd
from collections import OrderedDict

from Bio import SeqIO 
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

# SQNce Mapping-related Tables
***This part is still in development.***

In [4]:
# TODO add documentation to all SQNce creation functions

# Establish connection with the SQNce database, generating a new SQLite3 database if needed
def sql_connection(db_path='SQNce.db'):
    """Open (or create) the SQNce SQLite3 database and return the connection.

    Parameters
    ----------
    db_path : str, optional
        Path of the database file. Defaults to 'SQNce.db' in the working
        directory; ':memory:' opens a throwaway in-memory database.

    Returns
    -------
    sqlite3.Connection or None
        The open connection, or None when the database could not be opened
        (the sqlite3 error message is printed in that case).
    """
    try:
        con = sqlite3.connect(db_path)
    except sqlite3.Error as err:
        # Print the raised exception instance: printing the Error class itself
        # would only show "<class 'sqlite3.Error'>" and hide the actual cause.
        print(err)
        return None
    print("Connection established.")
    return con

# Once connected to SQNce, make sure the mapping tables exist
def sql_table(con):
    """Create the mapping_traits, mapping_results and mapping_candidates tables.

    Each table is created only if it does not exist yet, so calling this
    function on an already-initialised database leaves it unchanged. The
    statements are committed on `con` before returning.
    """
    ddl_statements = (
        # One row per (experiment, trait), including the binary plot image
        """CREATE TABLE IF NOT EXISTS mapping_traits(
                         experiment text,
                         trait text,
                         description text,
                         plot blob)
                         """,
        # One row per SNP result of a trait
        """CREATE TABLE IF NOT EXISTS mapping_results(
                         experiment text,
                         trait text,
                         snp text,
                         chrom text,
                         pos integer,
                         ref text,
                         alt text,
                         effect REAL,
                         pval REAL)
                         """,
        # One row per candidate gene linked to a SNP of a trait
        """CREATE TABLE IF NOT EXISTS mapping_candidates(
                         gene_id text,
                         gene_symbol text,
                         gene_description text, 
                         experiment text,
                         trait text,
                         candidate_count integer,
                         snp text,
                         chrom text,
                         pos integer)
                         """,
    )

    cur = con.cursor()
    for ddl in ddl_statements:
        cur.execute(ddl)
    con.commit()

In [6]:
# Current implementation requires re-parsing of all the input files to create SQNce
# TODO SQNce update functions to parse input data only if not previously included
# NOTE(review): the disabled line below targets "SQNce-proteomes.db", which is not
# the "SQNce.db" file opened by sql_connection() - confirm the file name before re-enabling.
#if os.path.exists("SQNce-proteomes.db"): os.remove("SQNce-proteomes.db")
con = sql_connection()
if con is None:
    # sql_connection() returns None when the database cannot be opened;
    # fail here with a clear message instead of an AttributeError further down.
    raise RuntimeError("Could not open the SQNce database.")
try:
    sql_table(con)
finally:
    # Release the database handle even if table creation fails
    con.close()

Connection established.


# SQNce Mapping-related Functions

In [7]:
def mapping_traits_insert_traits(con, entities):
    """Insert one row into the mapping_traits table and commit it.

    `entities` is a 4-item sequence bound, in order, to the columns
    experiment, trait, description and plot.
    """
    insert_sql = """INSERT INTO mapping_traits(
                         experiment, 
                         trait, 
                         description,
                         plot) 
                         VALUES(?,?,?,?)"""
    con.cursor().execute(insert_sql, entities)
    con.commit()

def mapping_traits_insert_results(con, entities):
    """Insert one row into the mapping_results table and commit it.

    `entities` is a 9-item sequence bound, in order, to the columns
    experiment, trait, snp, chrom, pos, ref, alt, effect and pval.
    """
    insert_sql = """INSERT INTO mapping_results(
                         experiment, 
                         trait, 
                         snp,
                         chrom,
                         pos,
                         ref,
                         alt,
                         effect,
                         pval) 
                         VALUES(?,?,?,?,?,?,?,?,?)"""
    con.cursor().execute(insert_sql, entities)
    con.commit()
    
def mapping_traits_insert_candidates(con, entities):
    """Insert one row into the mapping_candidates table and commit it.

    Parameters
    ----------
    con : sqlite3.Connection
        Open connection to a database that contains mapping_candidates.
    entities : sequence
        Nine values bound, in order, to the columns experiment, trait,
        gene_id, gene_symbol, gene_description, candidate_count, snp,
        chrom and pos. Note that this order differs from the column
        order of the table definition.
    """
    cursorObj = con.cursor()
    # Exactly one "?" per listed column (nine in total); the previous
    # VALUES clause contained a stray comma, which SQLite rejects as a
    # syntax error.
    cursorObj.execute("""INSERT INTO mapping_candidates(
                         experiment,
                         trait,
                         gene_id,
                         gene_symbol,
                         gene_description,
                         candidate_count,
                         snp,
                         chrom,
                         pos)
                         VALUES(?,?,?,?,?,?,?,?,?)""", entities)
    con.commit()



# SQNce Load the rMVP GLM GWAS Results

In [None]:
con = sql_connection()
if con is None:
    # sql_connection() returns None when the database cannot be opened
    raise RuntimeError("Could not open the SQNce database.")

# The name of the experiment is also the name of the folder
experiment = "experiment_name"

fl_pvals_dict = {}   # trait name -> results CSV file name
fl_plots_dict = {}   # trait name -> plot PNG file name
fl_name_set = set()  # every trait name seen in the folder

try:
    for fl in os.listdir(experiment):
        # Assume for now only my output files are used
        if fl.split(".")[-1] == "png":
            fl_name = fl.replace(".png", "")
            fl_plots_dict[fl_name] = fl
        elif fl.split(".")[-1] == "csv":
            fl_name = fl.replace(".sig.csv", "")
            fl_pvals_dict[fl_name] = fl
        else:
            # Neither a plot nor a results file: skip it, so that no undefined
            # or stale fl_name from a previous iteration is registered.
            continue
        fl_name_set.add(fl_name)

    # The name of the trait is also the name of the file.
    # Sorted so that rows are inserted in a reproducible order.
    for trait in sorted(fl_name_set):
        # A trait needs both its results CSV and its plot PNG to be loaded
        if trait not in fl_pvals_dict or trait not in fl_plots_dict:
            print("Skipping trait '{}': missing results CSV or plot PNG.".format(trait))
            continue

        # Load the trait results dataframe
        filename = os.path.join(experiment, fl_pvals_dict[trait])
        df = pd.read_csv(filename)
        if len(df) == 0:
            continue
        # Sort by the last column (presumably the p-value - TODO confirm)
        df = df.sort_values(df.columns[-1])

        # Load the trait plot binary variable
        filename = os.path.join(experiment, fl_plots_dict[trait])
        with open(filename, 'rb') as plot_file:
            plot_blob = plot_file.read()

        mapping_traits_insert_traits(con, [experiment, trait, "experiment description", plot_blob])

        # List order: experiment,trait,snp,chrom,pos,ref,alt,effect,pval
        # To load only selected rows instead of all of them:
        #selected_rows_ix = [0]
        #selected_rows = df.iloc[selected_rows_ix,].values.tolist()
        selected_rows = df.values.tolist()

        for row in selected_rows:
            # row[6] is deliberately not stored (presumably the standard error
            # column of the rMVP output - TODO confirm)
            mapping_traits_insert_results(con,
                [experiment, trait, row[0], row[1], row[2], row[3], row[4], row[5], row[7]])
finally:
    # Always release the database handle, even if a file fails to load
    con.close()

Connection established.
