In [1]:
# Set up
%pylab inline

# Allow us to edit fonts in Illustrator
import matplotlib
matplotlib.rcParams['ps.useafm'] = True
matplotlib.rcParams['pdf.use14corefonts'] = True
matplotlib.rcParams['text.usetex'] = True

# Libraries
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.stats
import seaborn.apionly as sns
from collections import defaultdict

ANNOTDIR = "/storage/mgymrek/gtex/annotations/"
FEATUREDIR = "/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/"
HTDIR = "/storage/mgymrek/gtex/tfbs/encodedata/"
BEDDIR = "/storage/mgymrek/gtex/annotations/encode/"
mergefile = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/Merged_Best_causality.Table'
REGRSSION = '/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/LR_SummaryTest_Table.tsv'
SCORETYPE = ""
MINDIST = 1500 # For TSS
MINQ = 1 # FDR threshold. Ignored since overlap with Estrs later

# LIst of features to intersect with
FEATURES = {
    "3utr": os.path.join(ANNOTDIR, "3utr.bed"),
    "5utr": os.path.join(ANNOTDIR, "5utr.bed"),
    "coding": os.path.join(ANNOTDIR, "coding.bed"),
    "introns": os.path.join(ANNOTDIR, "introns.bed"),
    "H3K27ac": os.path.join(HTDIR, "H3K27ac_GM12878.bed.gz"),
    "H3K4me1": os.path.join(HTDIR, "H3K4me1_GM12878.bed.gz"),
    "H3K4me2": os.path.join(HTDIR, "H3K4me2_GM12878.bed.gz"),
    "H3K4me3": os.path.join(HTDIR, "H3K4me3_GM12878.bed.gz"),
    "H3K27me3": os.path.join(HTDIR, "H3K27me3_GM12878.bed.gz"),
    "H3K36me3": os.path.join(HTDIR, "H3K36me3_GM12878.bed.gz"),
}

percentiles = [0, 50] + list(np.arange(60, 101, 10))

Populating the interactive namespace from numpy and matplotlib


In [12]:
Motifs = pd.read_csv('/storage/resources/dbase/human/hg19/hg19.hipstr_reference_withmotif.bed', sep='\t', header=None)
Motifs.columns = ['chrom','str.start','str.end','motif.len','motif']
data = Motifs.loc[Motifs['motif.len']==1]
data['id'] = 'STR_'+data['str.start'].astype(str)
print(data.loc[data['motif']=='A'].shape, ' A/Ts and ',data.loc[data['motif']=='C'].shape, " G/Cs" )
data[['chrom','str.start','str.end','id']].to_csv('reg.bed', sep='\t',index=None, header=None)
for key in FEATURES.keys():
    #print(key)
    bedfile = FEATURES[key]
    annot = pd.read_csv(bedfile, sep="\t", header=None, skiprows=1)
    annot[[0,1,2]].to_csv('facts.bed', sep='\t',index=None, header=None)
    cmd = "bedtools intersect -a reg.bed -b facts.bed >Intersect.bed"
    returned_value = os.system(cmd)
    Inter = pd.read_csv('Intersect.bed', sep='\t', header=None)
    Feature_strs = list(set(list(Inter[3])))
    data[key] = np.where(data['id'].isin(Feature_strs), 1, 0)
    COUNT = data.loc[data[key]==1]
    print(COUNT.shape[0]*100/824767,'% overlaps with ',key, '(',COUNT.shape[0],')')#,COUNT.loc[COUNT['motif']=='A'].shape[0],' As',   )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(824767, 6)  A/Ts and  (7437, 6)  G/Cs


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1.4425892403551548 % overlaps with  H3K4me1 ( 11898 )
1.2496862750328275 % overlaps with  H3K36me3 ( 10307 )
1.4125201420522402 % overlaps with  H3K4me3 ( 11650 )
0.017580722798075092 % overlaps with  coding ( 145 )
0.07311155756716746 % overlaps with  H3K27me3 ( 603 )
0.2668632474383674 % overlaps with  5utr ( 2201 )
1.9223611031964178 % overlaps with  H3K27ac ( 15855 )
43.65075227306621 % overlaps with  introns ( 360017 )
2.449055308953923 % overlaps with  H3K4me2 ( 20199 )
1.1444444309726263 % overlaps with  3utr ( 9439 )


In [None]:
(824767, 6)  A/Ts and  (7437, 6)  G/Cs

1.4425892403551548 % overlaps with  H3K4me1 ( 11898 )
1.2496862750328275 % overlaps with  H3K36me3 ( 10307 )
1.4125201420522402 % overlaps with  H3K4me3 ( 11650 )
0.017580722798075092 % overlaps with  coding ( 145 )
0.07311155756716746 % overlaps with  H3K27me3 ( 603 )
0.2668632474383674 % overlaps with  5utr ( 2201 )
1.9223611031964178 % overlaps with  H3K27ac ( 15855 )
43.65075227306621 % overlaps with  introns ( 360017 )
2.449055308953923 % overlaps with  H3K4me2 ( 20199 )
1.1444444309726263 % overlaps with  3utr ( 9439 )