# Setup

Import the required libraries and connect to the database.

In [1]:
from io import StringIO
import glob
import os
import re
# the peak callers all inherit from DatabaseApi, a class which offers an interface 
# to a sqlite database to store Calling Cards data
from callingcardstools.database_managers.yeast import HopsDb
from callingcardstools.PackageResources import Resources
import pandas as pd

# Handle used by the cells below to read data files bundled with the package
# (chromosome map, background hops, promoter definitions).
cc_resources = Resources()

# Create/open the database either in memory or at a specified location.
# NOTE(review): the path below is an absolute, machine-specific location --
# point it at your own sqlite file before running.
# Earlier database location, kept for reference:
#yeast_db = hopsdb("/home/oguzkhan/Desktop/cc_metadata/hops_db.sqlite")
yeast_db = HopsDb("/home/oguzkhan/projects/rank_response_shiny/data/qc_db_v2.sqlite")


Checking table column names...
Current database tables are valid


# Build the database -- preliminaries

Add the chromosome name map, background data, and any promoter definition 
tables

In [None]:
# Read the packaged reference data: the chromosome-name map, the two
# background hop sets, and the two promoter-region definitions.
chr_map = pd.read_csv(StringIO(cc_resources.yeast_chr_map))
# 'chr' holds the values of the 'numbered' column as strings
chr_map['chr'] = list(map(str, chr_map['numbered']))

# dSir4 background: tab-separated, columns named after the qbed fields
background_sir4 = pd.read_csv(
    StringIO(cc_resources.yeast_background_sir4),
    sep="\t",
    names=yeast_db.required_fields['qbed'],
)
# every row of a background set carries the same batch label
background_sir4['batch_id'] = 'dSir4'

# adh1 background: same layout; 'chr' is additionally forced to strings
background_adh1 = pd.read_csv(
    StringIO(cc_resources.yeast_background_adh1),
    sep="\t",
    names=yeast_db.required_fields['qbed'],
)
background_adh1['batch_id'] = 'minus_adh1'
background_adh1['chr'] = list(map(str, background_adh1['chr']))

# promoter definitions: bed6 columns, plus a 'common' column for the yiming set
yiming_promoters = pd.read_csv(
    StringIO(cc_resources.yeast_promoters_yiming),
    sep="\t",
    names=yeast_db.required_fields['bed6'] + ['common'],
)
not_orf = pd.read_csv(
    StringIO(cc_resources.yeast_promoters_not_orf),
    sep="\t",
    names=yeast_db.required_fields['bed6'],
)

# Write all five frames to the database. drop=True is passed for each one
# (presumably replacing an existing table of the same name -- confirm).
yeast_db.add_frame(chr_map, "chr_map", tablename='chr_map', drop=True)
yeast_db.add_frame(
    yiming_promoters, 'bed6',
    table_type='regions', tablename_suffix='yiming', drop=True,
)
yeast_db.add_frame(
    not_orf, 'bed6',
    table_type='regions', tablename_suffix='not_orf', drop=True,
)
yeast_db.add_frame(
    background_sir4, 'qbed',
    table_type='background', tablename_suffix='dSir4', drop=True,
)
yeast_db.add_frame(
    background_adh1, 'qbed',
    table_type="background", tablename_suffix='adh1', drop=True,
)

# Add batch QC data

In [None]:
# Barcode-details JSON files, keyed by the run number found in each path.
# This pattern also captures combined runs of the form run_<n>_<m>.
barcode_details = glob.glob('/mnt/scratch/calling_cards/sequence/*/*json')
barcode_details_run_numbers = [
    re.findall(r'run_\d+_\d+|run_\d+', json_path)[0]
    for json_path in barcode_details
]
barcode_details_dict = dict(zip(barcode_details_run_numbers, barcode_details))

# Id-to-barcode map TSV files, keyed by run number as well.
# NOTE(review): this pattern only captures run_<n>, so a combined run
# (run_<n>_<m>) that is not in the exclusion list below has no matching key
# here and the lookup raises KeyError.
id_to_bc_maps = glob.glob("/mnt/scratch/calling_cards/sequence/*/*/*tsv")
id_to_bc_map_run_numbers = [
    re.findall(r'run_\d+', tsv_path)[0] for tsv_path in id_to_bc_maps
]
id_to_bc_map_dict = dict(zip(id_to_bc_map_run_numbers, id_to_bc_maps))

# Pair both files for every run except the three excluded ones.
add_batch_qc_dict = {
    run_number: [json_path, id_to_bc_map_dict[run_number]]
    for run_number, json_path in barcode_details_dict.items()
    if run_number not in ('run_5301', 'run_5301_5088', 'run_5690')
}

In [None]:
# Register every retained run with the database: run number, barcode-details
# JSON path, and id-to-barcode map path.
for run_number, (barcode_details_path, id_to_bc_map_path) in add_batch_qc_dict.items():
    yeast_db.add_batch_qc(run_number, barcode_details_path, id_to_bc_map_path)

# Get experimental data 

Create a dataframe which stores the path to each ccf/qbed file, along with some 
additional info on each file, parsed out of the filepath.

In [None]:
def extract_descriptors(row: pd.Series) -> tuple:
    """Parse sample descriptors out of a ccf file path.

    Args:
        row (pd.Series): a row from the ccf_df below; only ``row['ccf']``
            (the path to the ccf file) is read

    Returns:
        tuple: ``(tf, tf_replicate, runNumber, sample)``, the items to add
            to the corresponding fields in the ccf_df

    Raises:
        IndexError: if the path contains no ``run_<digits>`` component
    """
    path = row['ccf']

    # sample: the file name without a leading "E<digits>_" prefix and without
    # the trailing "_JP<digits>.ccf" / ".ccf" suffix. The dots are escaped so
    # that only a literal ".ccf" extension is removed.
    sample = os.path.basename(path)
    sample = re.sub(r"^E\d+_", "", sample)
    sample = re.sub(r"_JP\d+\.ccf$", "", sample)
    sample = re.sub(r"\.ccf$", "", sample)

    # first run_<digits> occurrence anywhere in the path
    runNumber = re.findall(r'run_\d+', path)[0]
    tf_replicate = re.sub(r'\.ccf', '', sample)

    # tf: the sample name with the taqaI / v2 decorations removed, and any
    # single word character following "HAP4" dropped
    tf = re.sub(r'_taqaI.*\.ccf|_v2', '', sample)
    tf = re.sub(r'_taqaI*$', "", tf)
    tf = re.sub(r'_taqaIV2', '', tf)
    tf = re.sub(r'HAP4\w', 'HAP4', tf)

    return tf, tf_replicate, runNumber, sample

# One row per ccf file found in the pipeline output directories.
ccf_df = pd.DataFrame(
    {'ccf': glob.glob("/mnt/lts/sequence_data/yeast_cc/cc_pipeline_output/*/sig_promoter/data/*ccf")}
)
# Expand the descriptors parsed from each path into four new columns.
ccf_df[['tf', 'tf_replicate', 'runNumber', 'sample']] = ccf_df.apply(
    extract_descriptors,
    axis=1,
    result_type='expand',
)

In [None]:
# Every file starts out as replicate 'none'; two pairs of rows are then
# labelled '1' and '2' by hand.
# NOTE(review): the row labels 24-25 and 56-57 are hard-coded, so they depend
# on the order in which the ccf files were listed -- confirm they still select
# the intended files.
ccf_df = ccf_df.assign(replicate='none')
ccf_df.loc[24:25, 'replicate'] = ['1', '2']
ccf_df.loc[56:57, 'replicate'] = ['1', '2']

# Keep only the columns used to match against the batch table, with the run
# number exposed under the name 'batch'.
ccf_df = ccf_df.rename(columns={'runNumber': 'batch'})[
    ['ccf', 'tf', 'batch', 'replicate']
]

# Drop the three excluded runs.
ccf_df = ccf_df.loc[
    ~ccf_df['batch'].isin(['run_5301', 'run_5301_5088', 'run_5690'])
]

# Current contents of the batch table in the database.
batch_tbl = pd.read_sql_query(sql='Select * from batch', con=yeast_db.con)



# Associate each run with the batch_id in the database

In [None]:
# Left join on (batch, tf, replicate): every ccf row is kept and gains the
# batch table's columns; rows without a matching batch record get missing
# values there.
ccf_df_with_batch_id = ccf_df.merge(
    batch_tbl,
    how='left',
    on=['batch', 'tf', 'replicate'],
)

# Add each experiment's (e.g., a single TF replicate) ccf/qbed to the database

In [None]:
def add_ccf_to_db(row: pd.Series) -> None:
    """Load one ccf/qbed file and store it as an experiment table in yeast_db.

    Args:
        row (pd.Series): a row of the merged ccf/batch frame; ``row['ccf']``
            is the file path, ``row['id']`` is written to the ``batch_id``
            column and ``row['tf']`` is used as the table-name suffix
    """
    # The file is read with every required qbed field name except the last.
    # NOTE(review): presumably the omitted last field is batch_id, which is
    # filled in just below -- confirm against required_fields['qbed'].
    file_columns = yeast_db.required_fields['qbed'][:-1]
    hops = pd.read_csv(row['ccf'], sep='\t', names=file_columns)
    hops['batch_id'] = row['id']
    yeast_db.add_frame(
        hops,
        'qbed',
        table_type='experiment',
        tablename_suffix=row['tf'],
        fk_tablelist=['batch'],
    )

# Insert one experiment table per row of the merged frame.
# A plain loop is used rather than DataFrame.apply because add_ccf_to_db is
# called only for its side effect (writing to the database) and returns None.
# Older pandas releases also documented that apply may call the function
# twice on the first row, which would insert that file twice.
for _, ccf_row in ccf_df_with_batch_id.iterrows():
    add_ccf_to_db(ccf_row)

# Create aggregate views

For a given table of region (i.e., promoter) definitions, this creates a view 
for each background table and each experiment table, which aggregates the hops 
over each of the defined regions.

In [2]:
# aggregate each replicate over the regions

# Every table whose name starts with "background" or "experiment".
background_and_expr_tbls = [
    tbl_name
    for tbl_name in yeast_db.list_tables(yeast_db.con)
    if re.search(r"^background|^experiment", tbl_name)
]
# One aggregate view per (hops table, regions table) pair.
for regions_tbl in ('regions_yiming', 'regions_not_orf'):
    for qbed in background_and_expr_tbls:
        yeast_db.create_aggregate_view(qbed, regions_tbl)

# aggregate passing aggregated replicates over regions
#aggregate_hop_views = # list of views
#for agg_view in aggregate_hop_views:
#    yeast_db.aggregate_passing_replicate

# Create statistics tables

In [None]:
# Run the peak caller for every region set x background x experiment table,
# printing progress at each level.
for region_tbl in ('regions_yiming', 'regions_not_orf'):
    print(f"region: {region_tbl}")
    for background_tbl in ('background_adh1', 'background_dSir4'):
        print(f"background: {background_tbl}")
        # The table list is read again for every region/background pair.
        experiment_tbls = [
            tbl_name
            for tbl_name in yeast_db.list_tables(yeast_db.con)
            if re.match('^experiment_', tbl_name)
        ]
        for experiment_tbl in experiment_tbls:
            print(f"experiment: {experiment_tbl}")
            yeast_db.peak_caller(
                regions=region_tbl,
                background=background_tbl,
                experiment=experiment_tbl,
            )

# Adding additional runs to the database
Use this section to extend an existing database, for instance to add more replicates or another batch.

In [None]:
# Presumably the total hop count of the adh1 background table -- confirm
# against HopsDb.get_total_hops.
# NOTE(review): this rebinds background_adh1, which earlier in the notebook
# held the adh1 background DataFrame.
background_adh1 = yeast_db.get_total_hops('background_adh1')
# NOTE(review): this binds the method object itself; it is not called here.
# Presumably an experiment table name was meant to be passed -- confirm.
expr_hops = yeast_db.get_total_hops

# Create rank/response plots