# 3D Organization

In [1]:
import os
import subprocess

import numpy as np
import pandas as pd
import pybedtools as pbt

import cardipspy as cpy
import ciepy

%matplotlib inline

In [2]:
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)

fn = 'ftp://ftp.informatics.jax.org/pub/reports/HOM_AllOrganism.rpt'
conv = pd.read_table(fn, index_col=0, low_memory=False)
#conv.index = conv['HomoloGene ID']
mouse_genes = conv[conv['Common Organism Name'] == 'mouse, laboratory']
human_genes = conv[conv['Common Organism Name'] == 'human']

wit_nanog = pd.read_excel(os.path.join(ciepy.root, 'private_data', 'nature12420-s3.xls'))

In [3]:
wit_bt = pbt.BedTool('\n'.join(wit_nanog.chromosome + '\t' + 
                               wit_nanog.start.astype(str) + 
                               '\t' + wit_nanog.end.astype(str)) + '\n',
                     from_string=True)

In [4]:
def liftover_bed(bt, chain):
    """Lift over a bed file using a given chain file. Returns pandas data
    frame with old coordinates as index and new coordinates as columns. Regions
    that couldn't be lifted over are discarded."""
    import tempfile
    bt = bt.sort()
    mapped = tempfile.NamedTemporaryFile()
    unmapped = tempfile.NamedTemporaryFile()
    c = '{} {} {} {} {}'.format(cpy.liftOver, bt.fn, chain,
                                mapped.name, unmapped.name)
    subprocess.check_call(c, shell=True)
    with open(unmapped.name) as f:
        missing = pbt.BedTool(''.join([x for x in f.readlines()[1::2]]),
                              from_string=True)
    bt = bt.subtract(missing)
    bt_mapped = pbt.BedTool(mapped.name)
    old_loc = []
    for r in bt:
        old_loc.append('{}:{}-{}'.format(r.chrom, r.start, r.end))
    new_loc = []
    new_chrom = []
    new_start = []
    new_end = []
    for r in bt_mapped:
        new_loc.append('{}:{}-{}'.format(r.chrom, r.start, r.end))
        new_chrom.append(r.chrom)
        new_start.append(r.start)
        new_end.append(r.end)
    new_info = pd.DataFrame({'loc':new_loc, 
                             'chrom': new_chrom, 
                             'start': new_start,
                             'end': new_end}, 
                            index=old_loc)
    mapped.close()
    unmapped.close()
    return new_info

In [5]:
mouse_genes['symbol_upper'] = mouse_genes['Symbol'].apply(lambda x: x.upper())
mouse_genes.ix[mouse_genes.Synonyms.isnull() == False, 'synonyms_list'] = \
    mouse_genes.ix[mouse_genes.Synonyms.isnull() == False, 
                   'Synonyms'].apply(lambda x: [y.upper() for y in x.split('|')])
syns = set([item for sublist in mouse_genes.synonyms_list.dropna().values for item in sublist])
synonyms_list = mouse_genes.ix[mouse_genes.synonyms_list.isnull() == False, 'synonyms_list']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [6]:
wit_nanog['human_gene_name'] = np.nan
wit_nanog['HomoloGene ID'] = np.nan
for i in wit_nanog.index:
    g = wit_nanog.ix[i, 'associated gene'].upper()
    t = mouse_genes[mouse_genes.symbol_upper == g]
    if t.shape[0] == 0:
        if g in syns:
            t = synonyms_list[synonyms_list.apply(lambda x: g in x)]
    if t.shape[0] == 1:
        if t.index[0] in human_genes.index.values:
            wit_nanog.ix[i, 'HomoloGene ID'] = t.index[0]
            ht = human_genes.ix[t.index[0], 'Symbol']
            if type(ht) == str:
                wit_nanog.ix[i, 'human_gene_name'] = ht
            else:
                wit_nanog.ix[i, 'human_gene_name'] = '|'.join(ht.values)

In [7]:
wit_nanog['human_gene_id'] = np.nan
for i in wit_nanog.human_gene_name.dropna().index:
    t = gene_info[gene_info.gene_name == wit_nanog.ix[i, 'human_gene_name']]
    if t.shape[0] == 1:
        wit_nanog.ix[i, 'human_gene_id'] = t.index[0]
    elif t.shape[0] == 0:
        t = gene_info[gene_info.gene_name.apply(lambda x: x.upper()) == 
                      wit_nanog.ix[i, 'human_gene_name'].upper()]

In [8]:
tss = pbt.BedTool(cpy.gencode_tss_bed)

In [9]:
tss_2kb = tss.slop(l=2000, r=2000, g=pbt.genome_registry.hg19)

In [10]:
len(tss_2kb)

196520

In [11]:
t = tss_2kb.merge()

In [12]:
len(t)

70732

In [None]:
variant_regions = cpb.gencode.make_promoter_bed(cpy.gencode_gtf, merge_by_gene=True,
                                                   up=1000000, down=1000000, out=fn)