# Example usage of GarNet

In [1]:
import numpy as np
import pandas as pd

# if you pip installed the package: 
import GarNet

# if you cloned the repo:
# import sys
# sys.path.append("../GarNet")

# Having issues importing despite the module existing in your virtual environment?
# See https://github.com/jupyter/notebook/issues/397

# Generate GarNet file

Skip this section if the GarNet file has already been generated. See the Appendix for how to obtain the UCSC reference file.

In [2]:
# Inputs: the UCSC gene reference and the motif locations (both .bed files).
reference_filepath = "../garnet_data/reference/ucsc_reference.hg19.bed"
motif_filepath = "../garnet_data/cisBP_hg19_SLIM/cisBP.hg19.LOD.10kb.chr1_SLIM.bed"
# Output: the motif-gene associations are written to this .tsv file.
garnet_filepath = "../garnet_data/cisBP_hg19_SLIM/garnetDB_cisBP.hg19.LOD.10kb.chr1_SLIM.tsv"

garnet_df = GarNet.construct_garnet_file(
    reference_filepath,
    motif_filepath,
    garnet_filepath,
)
# Preview the first rows of the returned association table.
garnet_df.head()

04:36:41 - GarNet: INFO -   - TSS file already exists here ../garnet_data/reference/ucsc_reference.hg19.tss.bed
04:36:41 - GarNet: INFO -   - Searching for motifs near genes. This may take a while...
04:36:48 - GarNet: INFO -   - 181422 motif-gene associations found and written to ../garnet_data/cisBP_hg19_SLIM/garnetDB_cisBP.hg19.LOD.10kb.chr1_SLIM.tsv


Unnamed: 0,motifChrom,motifStart,motifEnd,motifName,motifScore,motifStrand,geneName,tssStart,tssEnd,motif_gene_distance
0,chr1,17000,17008,HOXA4,8.846505,+,DDX11L1,11873,11874,5135
1,chr1,18055,18067,LMO2,8.768321,+,DDX11L1,11873,11874,6194
2,chr1,18339,18351,LMO2,8.901481,+,DDX11L1,11873,11874,6478
3,chr1,19086,19096,SREBF1,11.043803,-,DDX11L1,11873,11874,7213
4,chr1,19858,19866,HOXA4,8.539458,+,DDX11L1,11873,11874,7993


# Map peaks to genes using a549 data

This is accomplished by intersecting motifs with genes specified in the open chromatin bed file. 

In [4]:
# Open-chromatin peaks for the example (chr1 only, per the file name).
peaks_filepath = "a549_example_peaks.chr1.bed"
# NOTE(review): this is a different path from the chr1 SLIM GarNet file written
# in the "Generate GarNet file" section -- confirm this file exists locally.
garnet_filepath = "../data/garnetDB_cisBP.hg19.LOD.10kb.tsv"

df = GarNet.map_peaks(
    peaks_filepath,
    garnet_filepath,
)
# Preview the first rows of the mapped peaks.
df.head()

Unnamed: 0,chrom,start,end,motifName,motifScore,motifStrand,geneName,geneStart,geneEnd,motif_gene_distance
0,chr1,8086390,8086402,AC0021266,10.431478,-,ERRFI1,8086392,8086393,2
1,chr1,23913349,23913361,AC0021266,13.02428,-,MDS2,23907984,23907985,5365
2,chr1,24246686,24246698,AC0021266,12.883207,+,CNR2,24239816,24239817,-6882
3,chr1,24246686,24246698,AC0021266,12.883207,+,MIR378F,24255559,24255560,-8861
4,chr1,57297772,57297784,AC0021266,12.783137,-,BC048114,57292592,57292593,-5180


# Regress gene expression against TF Motif binding score

Perform the regression; plots are automatically generated in `regression_plots/`. 

In [5]:
expression_filepath = "a549_example_expression.txt"

# `df` is the peak-to-gene mapping produced by GarNet.map_peaks in the previous cell.
# The third argument "." is presumably the output directory -- TODO confirm
# against the GarNet documentation.
results = GarNet.TF_regression(
    df,
    expression_filepath,
    ".",
)
# Show the first ten rows of the regression results.
results.head(10)

04:48:00 - GarNet: INFO - Performing linear regression for 692 transcription factor expression profiles...


Unnamed: 0,Transcription Factor,Slope,P-Value,Targets
94,FOXO1,1.486838,0.004396,"PVRL4,TMEM63A,S100A14,EFNA1,SYTL1,LRRC8C,TGFB2..."
97,FOXO6,1.199703,0.008159,"GALNT2,SYTL1,PVRL4,TMEM63A,SOX13,ELF3,LAD1,S10..."
89,FOXL1,-2.024675,0.008195,"TMEM63A,SOX13,ELF3,PVRL4,EFNA1,CYR61,LRRC8C,PT..."
315,TEAD3,0.352063,0.011501,"PTGS2,SOX13,BCAN,NMNAT2,ZBTB7B,ACOT7,ADAM15,LA..."
341,VDR,0.968223,0.012346,"KIRREL,RAB25,SYTL1,CSF1,ELF3,PTPN14,TMEM61,LAD..."
366,ZNF274,0.818921,0.014866,"LEPRE1,NMNAT2,PVRL4,LRRC8B,S100A14,MFAP2,DVL1,..."
253,RXRA,-1.486082,0.01493,"ANXA9,RAB25,S100A14,ELF3,ARHGEF16,MFAP2,LMNA,A..."
193,NFIC,-0.074088,0.025818,"KIAA1522,ZBTB7B,ATF3,SOX13,BCAN,TACSTD2,THBS3,..."
240,RFX1,0.832647,0.033029,"CAMK2N1,ZBTB7B,PVRL4,LMNA,RPS6KA1,THBS3,LIX1L,..."
115,HIC2,-15.151852,0.035823,"S100A4,RPS6KA1,ELF3,ANXA9,PTAFR,LMNA,YARS,S100A6"


# Appendix

## Download and format reference file

### 1. Download UCSC known genes file

Visit the [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables?command=start) and select the following options:

 - **assembly**: Feb. 2009 (GRCh37/hg19)
 - **group**: Genes and Gene Predictions
 - **track**: UCSC Genes
 - **table**: knownGene
 - **region**: genome
 - **output format**: selected fields from primary and related tables

Press "get output", clear all fields, then check the following fields:

 - chrom
 - strand
 - txStart
 - txEnd
 - geneSymbol
 
Press "get output" again, and save the table locally. A copy (retrieved 9/11/2017) is saved in `../garnet_data/reference/ucsc_known_genes.hg19.tsv`.

### 2. Reformat to BED file

In [5]:
# Load the UCSC Table Browser export and prepare it for BED output.
# Column names are supplied explicitly, in the order the fields were selected
# (chrom, strand, txStart, txEnd, geneSymbol), using BED-style names.
# `comment="#"` tells pandas to ignore anything following a "#".
# The frame is built in one non-mutating chain (no `inplace=True`), so the
# cell gives the same result every time it is re-run.
ucsc_reference = (
    pd.read_csv(
        "../garnet_data/reference/ucsc_known_genes.hg19.tsv",
        sep="\t",
        names=["chrom", "strand", "chromStart", "chromEnd", "name"],
        comment="#",
    )
    # Remove exact duplicate rows; the surviving rows keep their original index labels.
    .drop_duplicates()
    # Placeholder score column: every row gets ".".
    .assign(score=".")
)
ucsc_reference.head()

Unnamed: 0,chrom,strand,chromStart,chromEnd,name,score
0,chr1,+,11873,14409,DDX11L1,.
3,chr1,-,14361,16765,WASH7P,.
4,chr1,-,16857,17751,WASH7P,.
5,chr1,-,15795,18061,WASH7P,.
6,chr1,-,14361,19759,WASH7P,.


In [6]:
# Put the columns in BED order: chrom, chromStart, chromEnd, name, score, strand.
ucsc_reference_bed = ucsc_reference.loc[
    :, ["chrom", "chromStart", "chromEnd", "name", "score", "strand"]
]
ucsc_reference_bed.head()

Unnamed: 0,chrom,chromStart,chromEnd,name,score,strand
0,chr1,11873,14409,DDX11L1,.,+
3,chr1,14361,16765,WASH7P,.,-
4,chr1,16857,17751,WASH7P,.,-
5,chr1,15795,18061,WASH7P,.,-
6,chr1,14361,19759,WASH7P,.,-


In [7]:
# Write the reference as a tab-separated file with no header row and no index column.
ucsc_reference_bed.to_csv(
    "../garnet_data/reference/ucsc_reference.hg19.bed",
    sep="\t",
    header=False,
    index=False,
)