# Example usage of GarNet

In [6]:
import numpy as np
import pandas as pd

# If you cloned the repo instead of installing the package, make the local
# checkout importable. The membership check keeps repeated runs of this cell
# from appending the same entry to sys.path again and again.
import sys
if "../GarNet" not in sys.path:
    sys.path.append("../GarNet")

# `garnet` has to be imported before it can be reloaded; without this line a
# fresh kernel fails on `reload(garnet)` with a NameError.
import garnet

# Having issues importing despite the module existing in the virtual environment?
# See https://github.com/jupyter/notebook/issues/397

# Re-execute the module so that edits to the garnet source are picked up
# without restarting the kernel. Kept as the last expression so the cell
# still displays the loaded module.
from importlib import reload
reload(garnet)

<module 'garnet' from '../GarNet/garnet.py'>

# Map peaks to genes using a549 data

This is accomplished by intersecting the open-chromatin peaks in the BED file with the motif–gene pairs in the GarNet reference file.

In [7]:
# Inputs for the mapping step: the example peaks file (BED) and the GarNet
# reference table (TSV) it is mapped against.
peaks_filepath = "a549_example_peaks.chr1.bed"
garnet_filepath = "../garnet_data/cisBP_hg19_SLIM/garnetDB_cisBP.hg19.LOD.10kb.chr1_SLIM.tsv"

# Arguments are positional: peaks file first, then the GarNet reference file.
df = garnet.map_peaks(
    peaks_filepath,
    garnet_filepath,
)

# Preview the first rows of the mapping result.
df.head()

Unnamed: 0,chrom,start,end,motifName,motifLODScore,motifStrand,geneName,geneStart,geneEnd,motif_gene_distance
0,chr1,565301,565308,HOXA4,7.539123,-,JA429830,566114,566115,813
1,chr1,565460,565467,HOXA4,7.539123,-,JA429830,566114,566115,654
2,chr1,568230,568241,GATA1,10.237757,-,JA429830,566114,566115,-2116
3,chr1,568231,568240,LMO2,8.409497,-,JA429830,566114,566115,-2117
4,chr1,570229,570235,HOXA4,7.118708,+,JA429830,566114,566115,-4121


# Regress gene expression against TF Motif binding score

Perform the regression; plots are automatically generated in `regression_plots/`.

In [8]:
# Expression data for the example, passed to the regression by path.
expression_filepath = "a549_example_expression.txt"

# Third positional argument of TF_regression.
# NOTE(review): presumably the directory under which `regression_plots/` is
# created -- confirm against the garnet source.
output_dir = "."

# Regress using the motif/gene mapping produced by the previous cell (`df`).
results = garnet.TF_regression(df, expression_filepath, output_dir)

# Show the first ten rows of the regression summary.
results.head(10)

02:35:27 - GarNet: INFO - Performing linear regression for 7 transcription factor expression profiles...
02:35:27 - GarNet: INFO - Performing linear regression for 7 transcription factor expression profiles...
02:35:27 - GarNet: INFO - Performing linear regression for 7 transcription factor expression profiles...


  chrom   start     end motifName  motifLODScore motifStrand geneName  \
0  chr1  858670  858680    SREBF1      10.496339           +   SAMD11   
1  chr1  858670  858680    SREBF1      10.496339           +   SAMD11   
2  chr1  858670  858680    SREBF1      10.496339           +   SAMD11   
3  chr1  873998  874008    SREBF1      10.366826           -   SAMD11   
4  chr1  880640  880652      LMO2       8.814717           -   SAMD11   

   geneStart  geneEnd  motif_gene_distance    name  expression  \
0     860529   860530                -1849  SAMD11        1.13   
1     861120   861121                -2440  SAMD11        1.13   
2     861301   861302                -2621  SAMD11        1.13   
3     874654   874655                 -656  SAMD11        1.13   
4     874654   874655                 5986  SAMD11        1.13   

   score_distance_corrected  
0                  8.724425  
1                  8.223752  
2                  8.076241  
3                  9.708588  
4             

Unnamed: 0,Transcription Factor,Slope,P-Value,Targets
4,HOXA4,0.252656,0.421671,"LMNA,TGFB2,TMEM63A,RAB25,EPB41,PTPN14,ID3,INAD..."
6,SREBF1,-1.391935,0.48299,"IER5,THBS3,KIAA1522,PIK3C2B,SAMD11,EPB41,GSTM3"
1,CEBPB,0.735577,0.63835,"DHCR24,PIK3C2B,NIPAL3,ST6GALNAC3,CYR61,NBL1,KCNK1"
0,ATF5,-0.119244,0.901884,"ACOT7,CSF1,SOX13,PBXIP1,PTGS2,ID3,IL6R,KIRREL,..."
5,LMO2,0.026257,0.977691,"ZBTB7B,S100A4,PVRL4,CAMK2N1,PLEKHO1,F11R,RASSF..."
3,GATA1,0.003935,0.991699,"ELF3,S100A4,KIAA1522,CYR61,NIPAL3,ARHGEF16,PLE..."
2,EGR3,0.00108,0.997791,"IER5,THBS3,ACOT7,EFNA1,ATF3,PLEKHO1,CYR61,RASS..."


# Appendix

## Download and format reference file

### 1. Download UCSC known genes file

Visit the [UCSC Table Browser](https://genome.ucsc.edu/cgi-bin/hgTables?command=start) and select the following options:

 - **assembly**: Feb. 2009 (GRCh37/hg19)
 - **group**: Genes and Gene Predictions
 - **track**: UCSC Genes
 - **table**: knownGene
 - **region**: genome
 - **output format**: selected fields from primary and related tables

Press "get output", clear all fields, then check the following fields:

 - chrom
 - strand
 - txStart
 - txEnd
 - geneSymbol
 
Press "get output" again, and save the table locally. A copy (retrieved 9/11/2017) is saved in `../garnet_data/reference/ucsc_known_genes.hg19.tsv`.

### 2. Reformat to BED file

In [None]:
# Load the UCSC Table Browser export saved in step 1.
# Columns are named explicitly, in the order the fields were selected
# (chrom, strand, txStart, txEnd, geneSymbol), using BED-style names for the
# last three. `comment="#"` makes pandas ignore everything from a "#" onward
# (presumably the export's own header line -- confirm against the file).
ucsc_reference = (
    pd.read_csv(
        "../garnet_data/reference/ucsc_known_genes.hg19.tsv",
        sep='\t',
        names=["chrom", "strand", "chromStart", "chromEnd", "name"],
        comment="#",
    )
    # Chained rather than `inplace=True`, so the cell builds the frame in one
    # expression and never mutates a frame an earlier cell displayed.
    .drop_duplicates()
    # "." fills the score column that the BED layout below expects.
    .assign(score=".")
)
ucsc_reference.head()

In [None]:
# Column order of the BED file written in the next step.
bed_columns = ["chrom", "chromStart", "chromEnd", "name", "score", "strand"]

# Select the reference columns in that order.
ucsc_reference_bed = ucsc_reference[bed_columns]
ucsc_reference_bed.head()

In [None]:
# Write the reordered table as a tab-separated file with no header row and
# no index column.
bed_output_path = "../garnet_data/reference/ucsc_reference.hg19.bed"
ucsc_reference_bed.to_csv(
    bed_output_path,
    sep='\t',
    header=False,
    index=False,
)