In [1]:
# Set up inline plotting. %pylab also populates the interactive namespace from
# numpy and matplotlib (see the message printed under this cell), so later
# cells may be using bare numpy/matplotlib names that came from here.
%pylab inline

# Allow us to edit fonts in Illustrator: font-related rcParams for the
# PostScript and PDF output, plus TeX-based text rendering.
# NOTE(review): text.usetex presumably needs a working LaTeX installation on
# the machine that runs this notebook -- confirm before re-running elsewhere.
import matplotlib
matplotlib.rcParams['ps.useafm'] = True
matplotlib.rcParams['pdf.use14corefonts'] = True
matplotlib.rcParams['text.usetex'] = True

# Libraries
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import scipy.stats
import seaborn.apionly as sns
from collections import defaultdict

# ---- Input locations -------------------------------------------------------
ANNOTDIR = "/storage/mgymrek/gtex/annotations/"
FEATUREDIR = "/storage/szfeupe/Runs/650GTEx_estr/Analysis_by_Tissue/"
HTDIR = "/storage/mgymrek/gtex/tfbs/encodedata/"
BEDDIR = "/storage/mgymrek/gtex/annotations/encode/"

# Both result tables live directly under FEATUREDIR.
mergefile = os.path.join(FEATUREDIR, "Merged_Best_causality.Table")
# (sic) the misspelled name is kept because other cells refer to it.
REGRSSION = os.path.join(FEATUREDIR, "LR_SummaryTest_Table.tsv")

# ---- Analysis parameters ---------------------------------------------------
SCORETYPE = ""
MINDIST = 1500  # For TSS
MINQ = 1  # FDR threshold. Ignored since overlap with eSTRs later

# List of features to intersect with: feature name -> BED file path.
# Gene-structure annotations are "<name>.bed" under ANNOTDIR; histone-mark
# files are "<mark>_GM12878.bed.gz" under HTDIR.
FEATURES = {
    region: os.path.join(ANNOTDIR, region + ".bed")
    for region in ("3utr", "5utr", "coding", "introns")
}
FEATURES.update({
    mark: os.path.join(HTDIR, mark + "_GM12878.bed.gz")
    for mark in ("H3K27ac", "H3K4me1", "H3K4me2", "H3K4me3", "H3K27me3", "H3K36me3")
})

# 0, 50, then every 10th percentile from 60 up to and including 100.
percentiles = [0, 50]
percentiles.extend(np.arange(60, 101, 10))

Populating the interactive namespace from numpy and matplotlib




In [20]:
# Load the per-STR regression summary and keep the homopolymer eSTRs:
# rows with E.tissues > 0 and period == 1.
# .copy() makes `data` an independent frame, so the per-feature columns added
# in the loop below are written to it directly instead of to a slice of the
# raw table (the pattern pandas flags with SettingWithCopyWarning).
data = pd.read_csv(REGRSSION, sep='\t')
data = data.loc[(data['E.tissues'] > 0) & (data['period'] == 1)].copy()

# Alternative input (disabled): every period-1 motif in the hg19 HipSTR
# reference instead of the regression table. To use it, uncomment the lines
# below so that they replace `data` before reg.bed is written.
# #Motif in general
# Motifs = pd.read_csv('/storage/resources/dbase/human/hg19/hg19.hipstr_reference_withmotif.bed', sep='\t', header=None)
# Motifs.columns = ['chrom','str.start','str.end','motif.len','motif']
# data = Motifs.loc[Motifs['motif.len']==1]
# data['id'] = 'STR_'+data['str.start'].astype(str)
# print(data.loc[data['motif']=='A'].shape, ' A/Ts and ',data.loc[data['motif']=='C'].shape, " G/Cs" )

# STR coordinates as a 4-column BED file (chrom, start, end, id); column 4 is
# what comes back from bedtools to identify the overlapping STRs.
data[['chrom','str.start','str.end','str.id']].to_csv('reg.bed', sep='\t', index=None, header=None)

# The command is the same for every feature: only the contents of facts.bed change.
cmd = "bedtools intersect -a reg.bed -b facts.bed >Intersect.bed"

for key, bedfile in FEATURES.items():
    # NOTE(review): skiprows=1 drops the first line of every annotation file --
    # presumably a header/track line; confirm it is not a real interval.
    annot = pd.read_csv(bedfile, sep="\t", header=None, skiprows=1)
    annot[[0,1,2]].to_csv('facts.bed', sep='\t', index=None, header=None)

    # Fail loudly if bedtools did not run: the shell redirect still creates an
    # (empty) Intersect.bed, which would otherwise be mistaken for "no overlap".
    returned_value = os.system(cmd)
    if returned_value != 0:
        raise RuntimeError(
            "bedtools intersect failed for feature '%s' (os.system returned %s)"
            % (key, returned_value)
        )

    # A feature that overlaps no STR yields an empty file, which pd.read_csv
    # refuses to parse; treat it as zero overlaps instead of crashing.
    if os.path.getsize('Intersect.bed') > 0:
        Inter = pd.read_csv('Intersect.bed', sep='\t', header=None)
        Feature_strs = list(Inter[3].unique())
    else:
        Feature_strs = []

    # 1/0 indicator column named after the feature.
    data[key] = data['str.id'].isin(Feature_strs).astype(int)
    COUNT = data.loc[data[key]==1]
    print(COUNT.shape[0]*100/data.shape[0],'% overlaps with ',key, '(',COUNT.shape[0],')')
    

2.546341463414634 % overlaps with  3utr ( 261 )
3.9414634146341463 % overlaps with  H3K4me3 ( 404 )
2.029268292682927 % overlaps with  H3K36me3 ( 208 )
0.5365853658536586 % overlaps with  5utr ( 55 )
0.03902439024390244 % overlaps with  coding ( 4 )
4.068292682926829 % overlaps with  H3K27ac ( 417 )
6.029268292682927 % overlaps with  H3K4me2 ( 618 )
2.8878048780487804 % overlaps with  H3K4me1 ( 296 )
0.17560975609756097 % overlaps with  H3K27me3 ( 18 )
63.21951219512195 % overlaps with  introns ( 6480 )


In [41]:
# Causality table: keep only complete rows, and expose the best STR's start
# coordinate under 'str.start' so it can serve as a merge key with `data`.
causal = pd.read_csv(mergefile, sep='\t').dropna()
causal['str.start'] = causal['best.str.start'].astype(int)

# Summary line: number of distinct genes, number of rows, shape of the
# subset with E.tissues > 1, and number of distinct genes in that subset.
multi_tissue = data.loc[data['E.tissues']>1]
print(len(set(data['gene'])), data.shape[0], multi_tissue.shape, len(set(multi_tissue['gene'])))

# STRs flagged as overlapping a 3' or 5' UTR, joined to their causality rows
# and restricted to best.score > 0.1.
in_utr = (data['3utr']==1) | (data['5utr']==1)
HP = data.loc[in_utr].merge(causal, on=['chrom', 'str.start','gene'], how='inner')
HP = HP.loc[HP['best.score']>0.1]

# Shapes of: all retained rows, the motif == 'A' rows, the motif == 'C' rows.
print(HP.shape, *(HP.loc[HP['motif']==base].shape for base in ('A', 'C')))

# Cell output: shape of the best.score > 0.8 subset over the report columns.
report_cols = ['gene.name', 'gene','E.tissues', 'chrom','best.score','str.start','str.end','best.q','introns','3utr','5utr','motif']
HP.loc[HP['best.score']>0.8][report_cols].shape


7070 10250 (3237, 87) 2687
(1189, 95) (1155, 95) (34, 95)


Unnamed: 0,gene.name,gene,E.tissues,chrom,best.score,str.start,str.end,best.q,introns,3utr,5utr,motif
1,METTL18,ENSG00000171806.7,8,chr1,0.957622,169759316,169759335,0.00331,1,0,0,A
91,HHLA3,ENSG00000197568.9,1,chr1,0.886121,70722142,70722155,0.007285,0,0,0,A
198,GPATCH4,ENSG00000160818.12,2,chr1,0.988064,156570363,156570379,0.001019,1,0,0,A
210,LRRIQ3,ENSG00000162620.11,8,chr1,0.905589,74669076,74669090,0.007285,1,0,0,A
235,KIF17,ENSG00000117245.8,1,chr1,0.884583,21017184,21017197,0.001857,1,0,0,A
261,SMPDL3B,ENSG00000130768.10,3,chr1,0.996349,28250549,28250559,0.001295,0,0,0,A
433,ATAD3B,ENSG00000160072.15,5,chr1,0.948717,1437601,1437611,0.000857,1,0,0,A
529,PKD2L1,ENSG00000107593.15,1,chr10,0.94933,102029009,102029026,0.001319,0,0,0,A
723,TTC17,ENSG00000052841.10,1,chr11,0.911543,43431366,43431409,0.002273,1,0,0,A
827,RPL27A,ENSG00000166441.8,1,chr11,0.855372,8630424,8630434,0.00162,0,0,0,A


In [None]:
(824767, 6)  A/Ts and  (7437, 6)  G/Cs
# All homopolymers (hg19 HipSTR reference, motif.len == 1)
1.4425892403551548 % overlaps with  H3K4me1 ( 11898 )
1.2496862750328275 % overlaps with  H3K36me3 ( 10307 )
1.4125201420522402 % overlaps with  H3K4me3 ( 11650 )
0.017580722798075092 % overlaps with  coding ( 145 )
0.07311155756716746 % overlaps with  H3K27me3 ( 603 )
0.2668632474383674 % overlaps with  5utr ( 2201 )
1.9223611031964178 % overlaps with  H3K27ac ( 15855 )
43.65075227306621 % overlaps with  introns ( 360017 )
2.449055308953923 % overlaps with  H3K4me2 ( 20199 )
1.1444444309726263 % overlaps with  3utr ( 9439 )
# All e-homopolymers (regression table rows with E.tissues > 0 and period == 1)
2.546341463414634 % overlaps with  3utr ( 261 )
3.9414634146341463 % overlaps with  H3K4me3 ( 404 )
2.029268292682927 % overlaps with  H3K36me3 ( 208 )
0.5365853658536586 % overlaps with  5utr ( 55 )
0.03902439024390244 % overlaps with  coding ( 4 )
4.068292682926829 % overlaps with  H3K27ac ( 417 )
6.029268292682927 % overlaps with  H3K4me2 ( 618 )
2.8878048780487804 % overlaps with  H3K4me1 ( 296 )
0.17560975609756097 % overlaps with  H3K27me3 ( 18 )
63.21951219512195 % overlaps with  introns ( 6480 )
