# 3D Structure

In [None]:
import glob
import os
import re
import subprocess
import urllib2

import cdpybio as cpb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None
import pybedtools as pbt
import seaborn as sns
import vcf as pyvcf

import cardipspy as cpy
import ciepy

%matplotlib inline

# Label shared by this notebook's temp, output and private output directories.
dy_name = '3d_structure'

# On hosts fl-hn1 / fl-hn2, point pybedtools at a sandbox temp directory
# under the project root.
import socket
if socket.gethostname() in ('fl-hn1', 'fl-hn2'):
    dy = os.path.join(ciepy.root, 'sandbox', 'tmp', dy_name)
    cpy.makedir(dy)
    pbt.set_tempdir(dy)

# Output directories for this notebook.
outdir, private_outdir = (
    os.path.join(ciepy.root, 'output', dy_name),
    os.path.join(ciepy.root, 'private_output', dy_name),
)
cpy.makedir(outdir)
cpy.makedir(private_outdir)

In [None]:
# Headerless table indexed by its first column; squeeze=True collapses a
# single remaining column into a Series.
tg = pd.read_table(
    cpy.gencode_transcript_gene,
    index_col=0,
    header=None,
    squeeze=True,
)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)

# Table stored under the eqtl_input output directory.
fn = os.path.join(
    ciepy.root,
    'output',
    'eqtl_input',
    'tpm_log_filtered_phe_std_norm_peer_resid.tsv',
)
exp = pd.read_table(fn, index_col=0)

genes = pbt.BedTool(cpy.gencode_gene_bed)

# Same file as tg, loaded a second time under the name t_to_g.
t_to_g = pd.read_table(
    cpy.gencode_transcript_gene,
    index_col=0,
    header=None,
    squeeze=True,
)

# EIDlegend.txt sits next to the Roadmap 15-state annotation file.
fn = os.path.join(
    os.path.dirname(cpy.roadmap_15_state_annotation),
    'EIDlegend.txt',
)
roadmap_ids = pd.read_table(fn, squeeze=True, index_col=0, header=None)

## TADs

I'd like to test whether eQTLs are enriched within TADs. Following Grubert et al. 2015, I
can mirror the position of each variant across its gene and check whether the mirrored
position falls within the same TAD as the gene as often as the real variant does. They also
shuffled the locations of the TADs and compared the number of distal (> 50 kb) QTLs falling
within the same TAD as their gene against the shuffled data. I can do the same analysis
using sets of null variants as well.

In [None]:
# Download TADs from Dixon et al. 2012. These coordinates are in hg18 so we'll download the
# liftOver chain files and convert the bed file.
#
# Both downloads are skipped when their target path already exists, so this cell can be
# re-run without fetching the files again.

# Directory expected to exist once the TAD tarball has been extracted.
dy = os.path.join(private_outdir, 'hESC')
if not os.path.exists(dy):
    # Fetch the tarball into private_outdir, unpack it there, then delete the tarball.
    out = os.path.join(private_outdir, 'hESC.domain.tar.gz')
    !curl http://132.239.201.216/mouse/hi-c/hESC.domain.tar.gz > {out}
    !tar -C {private_outdir} -xvf {out}
    !rm {out}
    
# Uncompressed hg18 -> hg19 liftOver chain file.
fn = os.path.join(private_outdir, 'hg18ToHg19.over.chain')
if not os.path.exists(fn):
    # NOTE(review): the shell redirect creates {fn} even if curl fails, so a failed
    # download would leave an empty/partial file that makes later runs skip this step.
    url = ('http://hgdownload.cse.ucsc.edu/goldenPath/hg18/liftOver/hg18ToHg19.over.chain.gz')
    !curl {url} | zcat > {fn}

In [None]:
# Load the hESC TAD calls and lift them over to hg19. The lifted intervals and
# the unmapped records are written to separate files in outdir.
mapped = os.path.join(outdir, 'hg19_hESC_tads_mapped.bed')
unmapped = os.path.join(outdir, 'hg19_hESC_tads_unmapped.txt')
tads = pbt.BedTool(
    os.path.join(private_outdir, 'hESC', 'combined', 'total.combined.domain'))
n = len(tads)  # number of TADs before conversion
tads = cpb.analysis.liftover_bed(
    tads,
    os.path.join(private_outdir, 'hg18ToHg19.over.chain'),
    mapped,
    unmapped,
)
print('Converted {} of {} TADs to hg19.'.format(len(tads), n))

## [Tang et al. 2015](http://www.sciencedirect.com/science/article/pii/S0092867415015044)

In [None]:
def _bed_text(frame):
    """Render a data frame as tab-separated, newline-terminated text."""
    rows = frame.astype(str).apply(lambda row: '\t'.join(row), axis=1)
    return '\n'.join(rows) + '\n'


def _before_underscore(name):
    """Return the part of ``name`` that precedes its first underscore."""
    return name.split('_')[0]


# Annotation name -> BedTool; passed to AnnotatedInteractions below.
annot_beds = {}

# H3K27ac
fn = os.path.join(
    ciepy.root, 'output', 'ji_et_al_2015_processing', 'h3k27ac_peaks.bed')
annot_beds['h3k27ac'] = pbt.BedTool(fn)

# Genes
annot_beds['gene'] = genes

# Promoters: keep only the part of each name before the first underscore.
promoters = pbt.BedTool('/publicdata/gencode_v19_20151104/promoters_by_gene.bed')
df = promoters.to_dataframe()
df.name = df.name.apply(_before_underscore)
s = _bed_text(df)
promoters = pbt.BedTool(s, from_string=True)
annot_beds['promoter'] = promoters

# TSS: strip the name suffix, look the result up in t_to_g, and drop rows
# that become identical afterwards.
tss = pbt.BedTool(cpy.gencode_tss_bed)
df = tss.to_dataframe()
df.name = t_to_g[df.name.apply(_before_underscore)].values
df = df.drop_duplicates()
s = _bed_text(df)
tss = pbt.BedTool(s, from_string=True)
annot_beds['tss'] = tss

# Exons: look each name up in t_to_g and drop rows that become identical
# afterwards.
exons = pbt.BedTool(cpy.gencode_exon_bed)
df = exons.to_dataframe()
df.name = t_to_g[df.name].values
df = df.drop_duplicates()
s = _bed_text(df)
exons = pbt.BedTool(s, from_string=True)
annot_beds['exon'] = exons

In [None]:
def parse_tang(url):
    """Read a gzipped interaction table from ``url`` into a data frame.

    The text fetched with ``cpb.general.read_gzipped_text_url`` is split into
    lines and each line into whitespace-separated fields, which must number
    exactly seven per line: chrom1, start1, end1, chrom2, start2, end2, freq.

    Parameters
    ----------
    url : str
        Location of the gzipped text file.

    Returns
    -------
    pandas.DataFrame
        One row per input line. The index labels have the form
        ``chrom1:start1-end1==chrom2:start2-end2``. The start, end and freq
        columns are converted to ``int``; the chrom columns stay as strings.
    """
    s = cpb.general.read_gzipped_text_url(url)
    lines = [x.split() for x in s.strip().split('\n')]
    df = pd.DataFrame(lines, columns=['chrom1', 'start1', 'end1',
                                      'chrom2', 'start2', 'end2', 'freq'])
    # Build the labels while the coordinates are still strings. The second
    # anchor must use start2 (the original code reused start1 here, which
    # produced wrong and potentially duplicated labels).
    df.index = (df.chrom1 + ':' + df.start1 + '-' + df.end1 + '==' +
                df.chrom2 + ':' + df.start2 + '-' + df.end2)
    for c in ['start1', 'start2', 'end1', 'end2', 'freq']:
        df[c] = df[c].astype(int)
    return df

In [None]:
# GM12878 RNAPII PET clusters, GEO sample GSM1872887:
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1872887
url = (
    'http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM1872887&format=file&file='
    'GSM1872887%5FGM12878%5FRNAPII%5FPET%5Fclusters%2Etxt%2Egz'
)
gm_rnap = parse_tang(url)
# Annotate the parsed interactions with the BED annotations in annot_beds.
gm_rnap_a = cpb.bedtools.AnnotatedInteractions(
    gm_rnap,
    annot_beds,
    completely_contains=['gene'],
)

In [None]:
# GM12878 CTCF PET clusters, GEO sample GSM1872886:
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1872886
url = (
    'http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM1872886&format=file&file='
    'GSM1872886%5FGM12878%5FCTCF%5FPET%5Fclusters%2Etxt%2Egz'
)
gm_ctcf = parse_tang(url)
# Annotate the parsed interactions with the BED annotations in annot_beds.
gm_ctcf_a = cpb.bedtools.AnnotatedInteractions(
    gm_ctcf,
    annot_beds,
    completely_contains=['gene'],
)

In [None]:
# HeLa CTCF PET clusters, GEO sample GSM1872888:
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1872888
url = (
    'http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM1872888&format=file&file='
    'GSM1872888%5FHeLa%5FCTCF%5FPET%5Fclusters%2Etxt%2Egz'
)
hela_ctcf = parse_tang(url)
# Annotate the parsed interactions with the BED annotations in annot_beds.
hela_ctcf_a = cpb.bedtools.AnnotatedInteractions(
    hela_ctcf,
    annot_beds,
    completely_contains=['gene'],
)

In [None]:
# HeLa RNAPII PET clusters, GEO sample GSM1872889:
# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM1872889
url = (
    'http://www.ncbi.nlm.nih.gov/geo/download/?acc=GSM1872889&format=file&file='
    'GSM1872889%5FHeLa%5FRNAPII%5FPET%5Fclusters%2Etxt%2Egz'
)
hela_rnap = parse_tang(url)
# Annotate the parsed interactions with the BED annotations in annot_beds.
hela_rnap_a = cpb.bedtools.AnnotatedInteractions(
    hela_rnap,
    annot_beds,
    completely_contains=['gene'],
)