In [1]:
%load_ext rpy2.ipython

In [2]:
%%R -o gene2name
suppressMessages(suppressWarnings(library(tidyverse)))

GTF = '~/genomes/hg38/gencode.v34/gencode.v34.annotation.gtf'

gtf <- rtracklayer::import(GTF)
gene2name <- gtf[gtf$type == "gene"] %>% data.frame %>% column_to_rownames('gene_id') %>% dplyr::select('gene_name')

In [3]:
import pandas as pd
import numpy as np

In [197]:
!mkdir -p enrichment/
!mkdir -p enrichment/exon_level
!mkdir -p enrichment/gene_level

## event-level analysis
> you can also do an event-level analysis
where you use the exon-level results
but as genefile use ChIP-seq data intersected with ex1-ex2-ex3 event (basically upstrem and downstream exon included with the intron in the middle)

> for exon-level analysus
`JunctionSeqallGenes.results.txt.gz`
has the exon coordinates `chr	start	end	strand`
also has the transcripts that exon junction is in transcripts
you can go to gff flatfile and find exons upstream and downstream of the featyre at hand

___


__1) ex1-ex2-ex3 event__

(basically upstrem and downstream exon included with the intron in the middle)

In [26]:
!zcat annoFiles/JunctionSeq.flat.gff.gz | awk '{print $3}' | sort | uniq

aggregate_gene
exonic_part
splice_site


In [261]:
%%bash
zcat annoFiles/JunctionSeq.flat.gff.gz \
| grep "num 00[1-3];" \
| awk -v FS='[\t;]' -v OFS='\t' '{print $1,$2,$3,$4,$5,$6,$7,$8,$10}' \
| sort -k9,9 \
| groupBy -g 1,9 -c 9,4,5,6,7,8 -o count,min,max,first,first,first \
| awk 'gsub(" tx_set ENST","ENST")' \
| awk -v FS='\t' -v OFS='\t' '{print $1,$4,$5,$6,$7,$8,$2,$3}' \
| sort -k2,2 -k3,3 -n | sort -k1,1 -s \
> annoFiles/JunctionSeq.flat.exon1to3.bed

In [38]:
# !cat ~/genomes/hg38/gencode.v34/gencode.v34.annotation.gtf | head 

__2) Process event level `JunctionSeq` results__

In [4]:
def read_results_table(PATH):
    '''event-level analysis
    '''
    table = pd.read_csv(PATH,sep='\t').drop("featureID",axis=1).dropna(subset=['expr_case']).dropna(axis=1)
    # .drop("featureID",axis=1)
    table = pd.concat([
        gene2name.loc[table.geneID,:].reset_index(drop=True),
        table.reset_index(drop=True)
    ],axis=1).set_index(["geneID","gene_name","countbinID"])
    
    table=table[[i for i in table if len(set(table[i]))>1]]

    return table

In [5]:
sigtable = read_results_table('jscs/sigGenes.results.txt.gz')

In [8]:
set(sigtable.featureType)

{'exonic_part', 'splice_site'}

In [9]:
len(set(sigtable.index.to_frame().geneID))

29

In [111]:
[(i+1,col) for i,col in enumerate(pd.read_csv('jscs/sigGenes.results.txt.gz',sep='\t').columns)]

[(1, 'featureID'),
 (2, 'geneID'),
 (3, 'countbinID'),
 (4, 'testable'),
 (5, 'status'),
 (6, 'allZero'),
 (7, 'baseMean'),
 (8, 'baseVar'),
 (9, 'dispBeforeSharing'),
 (10, 'dispFitted'),
 (11, 'dispersion'),
 (12, 'pvalue'),
 (13, 'padjust'),
 (14, 'chr'),
 (15, 'start'),
 (16, 'end'),
 (17, 'strand'),
 (18, 'transcripts'),
 (19, 'featureType'),
 (20, 'padjust_noFilter'),
 (21, 'log2FC(case/ctrl)'),
 (22, 'log2FCvst(case/ctrl)'),
 (23, 'expr_ctrl'),
 (24, 'expr_case'),
 (25, 'geneWisePadj')]

In [105]:
ls annoFiles/JunctionSeq.flat.exon1to3.bed

JunctionSeq.flat.exon1to3.bed  [0m[38;5;9mJunctionSeq.flat.gff.gz[0m


In [262]:
%%bash 
zcat jscs/sigGenes.results.txt.gz \
| awk -v FS='\t' -v OFS='\t' 'NR>1 {print $14,$15,$16,$17,$18}' \
| bedtools intersect -wa -c -a annoFiles/JunctionSeq.flat.exon1to3.bed -b - \
| sort -k9,9rn \
| awk -v FS='\t' -v OFS='\t' '{print $7,$($9==0)*$9+($9>=1)*1}' | sort -u -k1,1 \
> jscs/sigGenes.results.exon1to3.txt

### ChIP-seq data

In [200]:
ls ChIP-seq-data/bed_files

GFP_1_DOX_pos_IDX1.rep1_sorted_peaks.narrowPeak.bed
GFP_2_DOX_pos_IDX2.rep1_sorted_peaks.narrowPeak.bed
s303_1_DOX_neg_IDX13.rep1_sorted_peaks.narrowPeak.bed
s303_1_DOX_pos_IDX7.rep1_sorted_peaks.narrowPeak.bed
s303_2_DOX_neg_IDX14.rep1_sorted_peaks.narrowPeak.bed
s303_2_DOX_pos_IDX8.rep1_sorted_peaks.narrowPeak.bed
sox9_1_DOX_neg_IDX11.rep1_sorted_peaks.narrowPeak.bed
SOX9_1_DOX_pos_IDX3.rep1_sorted_peaks.narrowPeak.bed
sox9_2_DOX_neg_IDX12.rep1_sorted_peaks.narrowPeak.bed
SOX9_2_DOX_pos_IDX4.rep1_sorted_peaks.narrowPeak.bed


In [263]:
%%bash 
cat \
ChIP-seq-data/bed_files/GFP_1_DOX_pos*.bed \
ChIP-seq-data/bed_files/GFP_2_DOX_pos*.bed \
| bedtools intersect -wa -c -a annoFiles/JunctionSeq.flat.exon1to3.bed -b - \
| awk '$9 >= 1{print $7}' | sort -u -k1,1 \
> enrichment/exon_level/GFP_DOX_pos.txt

In [264]:
%%bash 
cat \
ChIP-seq-data/bed_files/SOX9_1_DOX_pos*.bed \
ChIP-seq-data/bed_files/SOX9_2_DOX_pos*.bed \
| bedtools intersect -wa -c -a annoFiles/JunctionSeq.flat.exon1to3.bed -b - \
| awk '$9 >= 1{print $7}' | sort -u -k1,1 \
> enrichment/exon_level/SOX9_DOX_pos.txt

cat \
ChIP-seq-data/bed_files/sox9_1_DOX_neg*.bed \
ChIP-seq-data/bed_files/sox9_2_DOX_neg*.bed \
| bedtools intersect -wa -c -a annoFiles/JunctionSeq.flat.exon1to3.bed -b - \
| awk '$9 >= 1{print $7}' | sort -u -k1,1 \
> enrichment/exon_level/SOX9_DOX_neg.txt

In [265]:
%%bash 
cat \
ChIP-seq-data/bed_files/s303_1_DOX_neg*.bed \
ChIP-seq-data/bed_files/s303_2_DOX_neg*.bed \
| bedtools intersect -wa -c -a annoFiles/JunctionSeq.flat.exon1to3.bed -b - \
| awk '$9 >= 1{print $7}' | sort -u -k1,1  \
> enrichment/exon_level/S303_DOX_neg.txt

cat \
ChIP-seq-data/bed_files/s303_1_DOX_pos*.bed \
ChIP-seq-data/bed_files/s303_2_DOX_pos*.bed \
| bedtools intersect -wa -c -a annoFiles/JunctionSeq.flat.exon1to3.bed -b - \
| awk '$9 >= 1{print $7}' | sort -u -k1,1 \
> enrichment/exon_level/S303_DOX_pos.txt

`--exp` file:

In [267]:
%%bash 
echo -e "#event\tspliced" > enrichment/exon_level/splice_binary_table.txt
cat jscs/sigGenes.results.exon1to3.txt >> enrichment/exon_level/splice_binary_table.txt

In [9]:
# alltable = read_results_table('jscs/allGenes.results.txt.gz')

In [10]:
# len(set(alltable.index.to_frame().geneID))

In [11]:
# alltable.head()

In [268]:
%%bash 
export TEISERDIR='/data_gilbert/home/aarab/Workflows/TEISERv1.1'
exp="splice_binary_table.txt"

cd enrichment/exon_level/
for gene in *DOX*.txt; do

    perl ${TEISERDIR}/run_mi_gene_list.pl \
        --expfile=$exp \
        --genefile=$gene \
        --exptype=discrete \
        --species=human \
        --doremovedups=0 \
        --doremoveextra=0 
        # --ebins=11 \

    perl /data_gilbert/home/aarab/Workflows/TEISERv1.1/Scripts/teiser_draw_matrix.pl \
        --pvmatrixfile=splice_binary_table.txt_GENESET/splice_binary_table.txt.matrix \
        --summaryfile=splice_binary_table.txt_GENESET/splice_binary_table.txt.summary \
        --expfile=splice_binary_table.txt_GENESET/splice_binary_table.txt \
        --quantized=1 \
        --colmap=/data_gilbert/home/aarab/Workflows/TEISERv1.1/Scripts/HEATMAPS/cmap_1.txt --order=0 --min=-3 --max=3 --cluster=5 

    out=${gene/.txt/}
    rm -rf $out
    mv ${exp}_GENESET $out
    cp $out/${exp}.summary.pdf ${out}.pdf
done
cd ../../

The TEISERDIR environment variable is /data_gilbert/home/aarab/Workflows/TEISERv1.1
Mon Jan 31 17:25:14 PST 2022
Remove duplicates, create splice_binary_table.txt_GENESET/splice_binary_table.txt
step 2: seed optimization.
Expfile loaded: 93265 values...
Quantizing the input vector...Done
Number of clusters: 2
calculating the p-value matrix.
Allocating memory ... Done
doing stats: mi = 0.000000
pass = 1.000000
z = -0.103496
freq 0: 0.000097	9	93144	9	93265
freq 1: 0.000000	0	121	9	93265
step 7: drawing matrix.
Reading MI data ... Done.
Start drawing
0
Outputing EPS file splice_binary_table.txt_GENESET/splice_binary_table.txt.summary.eps
Convert to PDF splice_binary_table.txt_GENESET/splice_binary_table.txt.summary.pdf
ps2pdf -dEPSCrop -dAutoRotatePages=/None splice_binary_table.txt_GENESET/splice_binary_table.txt.summary.eps splice_binary_table.txt_GENESET/splice_binary_table.txt.summary.pdf
Finished.
Reading MI data ... Done.
Start drawing
0
Outputing EPS file splice_binary_table.txt_G

Option suffix requires an argument
Option suffix requires an argument
Option suffix requires an argument
Option suffix requires an argument
Option suffix requires an argument


___
## gene-level analysis
> to run teiser
with ChIP-seq data intersected with genes as genefile

> this is the gene based file
`JunctionSeqsigGenes.genewiseResults.txt.gz`
genes listed here go to class ‘1’
and all the other expressed genes class ‘0’



In [14]:
def read_genewiseResults_table(PATH):
    '''event-level analysis
    '''
    table = pd.read_csv(PATH,sep='\t')#.drop("featureID",axis=1).dropna(subset=['expr_dox']).dropna(axis=1)
    table = table.set_index(["geneID","geneName"])

    return table

In [15]:
genewise_table = read_genewiseResults_table("jscs/sigGenes.genewiseResults.txt.gz")
genewise_genes = set(genewise_table.index.get_level_values(1))

In [16]:
genewise_table

Unnamed: 0_level_0,Unnamed: 1_level_0,chr,start,end,strand,baseMean,geneWisePadj,mostSigID,mostSigPadjust,numExons,numKnown,numNovel,exonsSig,knownSig,novelSig,numFeatures,numSig
geneID,geneName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
ENSG00000179051.14,RCC2,chr1,17406759,17439677,-,4226.7,0.0009841564,J032,0.000537,19,13,0,1,1,0,19/13/0,1/1/0
ENSG00000053372.5,MRTO4,chr1,19251804,19260128,+,1654.2,0.005966044,E001,0.00361,13,8,0,1,0,0,13/8/0,1/0/0
ENSG00000163399.16,ATP1A1,chr1,116372667,116410261,+,15877.3,0.0131172,E002,0.00875,40,28,0,1,0,0,40/28/0,1/0/0
ENSG00000158716.9,DUSP23,chr1,159780931,159782543,+,544.5,0.008004604,E006,0.00534,7,3,0,1,0,0,7/3/0,1/0/0
ENSG00000116750.13,UCHL5,chr1,193012249,193060080,-,1049.6,0.008947062,E027,0.00606,37,20,0,2,0,0,37/20/0,2/0/0
ENSG00000175348.11,TMEM9B,chr11,8947201,8965011,-,404.2,0.01324392,E007,0.00905,14,8,0,1,0,0,14/8/0,1/0/0
ENSG00000149136.9,SSRP1,chr11,57325985,57335892,-,5121.7,0.01330073,E032,0.00898,33,20,0,1,0,0,33/20/0,1/0/0
ENSG00000167995.17,BEST1,chr11,61950062,61965515,+,49388.8,0.008909568,E030,0.00606,30,15,0,1,0,0,30/15/0,1/0/0
ENSG00000089597.18,GANAB,chr11,62624825,62646726,-,7773.9,0.00498296,E048,0.00293,59,33,0,2,1,0,59/33/0,2/1/0
ENSG00000165526.9,RPUSD4,chr11,126202095,126211692,-,774.5,0.01052282,E008,0.00702,30,12,0,1,0,0,30/12/0,1/0/0


In [19]:
def read_expression_table(PATH):
    '''
    '''
    table = pd.read_csv(PATH,sep='\t').dropna(subset=['expr_case']).dropna(axis=1)
    table["geneID"] = table.featureID.str.split(':').str[0]
    table["countbinID"] = table.featureID.str.split(':').str[1]
    
    table = pd.concat([
        gene2name.loc[table.geneID,:].reset_index(drop=True),
        table.drop("featureID",axis=1).reset_index(drop=True)
    ],axis=1).set_index(["geneID","gene_name","countbinID"])
    
    return table

In [20]:
exp_table = read_expression_table('jscs/allGenes.expression.data.txt.gz')

exp_table.index.get_level_values(1)
exp_genes = set(exp_table.index.get_level_values(1))

In [21]:
len(exp_genes)

8187

In [22]:
splice_binary_table = pd.DataFrame(
    {"spliced":np.zeros(len(exp_genes))},
    index=exp_genes, dtype=int
)

In [23]:
splice_binary_table

Unnamed: 0,spliced
APP,0
PIK3R4,0
ZFP36,0
SAV1,0
ZNF16,0
...,...
TSPAN1,0
COX7A2L,0
EOLA2,0
AC079305.2,0


In [24]:
splice_binary_table.loc[genewise_genes,:] = 1

In [43]:
splice_binary_table.to_csv("ChIP-seq-data/splice_binary_table.txt",sep='\t')

### ChIP-seq-data

    %%bash 
    GTF=~/genomes/hg38/gencode.v34/gencode.v34.annotation.gtf

    for f in ChIP-seq-data/bed_files/*.bed; do 
        o=${f/_IDX*/.txt}

        echo "================================================================"
        echo $o

        cat $GTF \
        | awk -v FS='\t' '$3=="gene"' \
        | bedtools intersect -wa -wb -a - -b $f \
        | cut -f 9,18 | uniq | awk -v FS='\t' '{print $1,$2}' \
        | awk -v FS=";" -v OFS="" '{gsub("\"",""); print $3}' \
        | awk -v FS=' ' -v OFS="" '{print $2}' > $o

    done


In [36]:
ls ChIP-seq-data/bed_files/*.txt

ChIP-seq-data/bed_files/GFP_1_DOX_pos.txt
ChIP-seq-data/bed_files/GFP_2_DOX_pos.txt
ChIP-seq-data/bed_files/s303_1_DOX_neg.txt
ChIP-seq-data/bed_files/s303_1_DOX_pos.txt
ChIP-seq-data/bed_files/s303_2_DOX_neg.txt
ChIP-seq-data/bed_files/s303_2_DOX_pos.txt
ChIP-seq-data/bed_files/sox9_1_DOX_neg.txt
ChIP-seq-data/bed_files/SOX9_1_DOX_pos.txt
ChIP-seq-data/bed_files/sox9_2_DOX_neg.txt
ChIP-seq-data/bed_files/SOX9_2_DOX_pos.txt


In [41]:
%%bash 
cat \
ChIP-seq-data/bed_files/SOX9_1_DOX_pos.txt \
ChIP-seq-data/bed_files/SOX9_2_DOX_pos.txt \
| uniq > ChIP-seq-data/SOX9_DOX_pos.txt

cat \
ChIP-seq-data/bed_files/sox9_1_DOX_neg.txt \
ChIP-seq-data/bed_files/sox9_2_DOX_neg.txt \
| uniq > ChIP-seq-data/SOX9_DOX_neg.txt

In [39]:
%%bash 
cat \
ChIP-seq-data/bed_files/GFP_1_DOX_pos.txt \
ChIP-seq-data/bed_files/GFP_2_DOX_pos.txt \
| uniq > ChIP-seq-data/GFP_DOX_pos.txt

In [40]:
%%bash 
cat \
ChIP-seq-data/bed_files/s303_1_DOX_neg.txt \
ChIP-seq-data/bed_files/s303_2_DOX_neg.txt \
| uniq > ChIP-seq-data/S303_DOX_neg.txt

cat \
ChIP-seq-data/bed_files/s303_1_DOX_pos.txt \
ChIP-seq-data/bed_files/s303_2_DOX_pos.txt \
| uniq > ChIP-seq-data/S303_DOX_pos.txt

In [45]:
ls ChIP-seq-data/*DOX*.txt

ChIP-seq-data/GFP_DOX_pos.txt   ChIP-seq-data/SOX9_DOX_neg.txt
ChIP-seq-data/S303_DOX_neg.txt  ChIP-seq-data/SOX9_DOX_pos.txt
ChIP-seq-data/S303_DOX_pos.txt


In [54]:
%%bash 
export TEISERDIR='/data_gilbert/home/aarab/Workflows/TEISERv1.1'
exp="splice_binary_table.txt"

cd ChIP-seq-data/
for gene in *DOX*.txt; do

    perl ${TEISERDIR}/run_mi_gene_list.pl \
        --expfile=$exp \
        --genefile=$gene \
        --exptype=discrete \
        --species=human \
        --doremovedups=0 \
        --doremoveextra=0 
        # --ebins=11 \

    perl /data_gilbert/home/aarab/Workflows/TEISERv1.1/Scripts/teiser_draw_matrix.pl \
        --pvmatrixfile=splice_binary_table.txt_GENESET/splice_binary_table.txt.matrix \
        --summaryfile=splice_binary_table.txt_GENESET/splice_binary_table.txt.summary \
        --expfile=splice_binary_table.txt_GENESET/splice_binary_table.txt \
        --quantized=1 \
        --colmap=/data_gilbert/home/aarab/Workflows/TEISERv1.1/Scripts/HEATMAPS/cmap_1.txt --order=0 --min=-3 --max=3 --cluster=5 

    out=${gene/.txt/}
    rm -rf $out
    mv ${exp}_GENESET $out
    cp $out/${exp}.summary.pdf ${out}.pdf
done
cd ../

The TEISERDIR environment variable is /data_gilbert/home/aarab/Workflows/TEISERv1.1
Mon Jan 24 20:13:31 PST 2022
Remove duplicates, create splice_binary_table.txt_GENESET/splice_binary_table.txt
step 2: seed optimization.
Expfile loaded: 8187 values...
Quantizing the input vector...Done
Number of clusters: 2
calculating the p-value matrix.
Allocating memory ... Done
doing stats: mi = 0.000001
pass = 1.000000
z = -0.078693
freq 0: 0.000245	2	8158	2	8187
freq 1: 0.000000	0	29	2	8187
step 7: drawing matrix.
Reading MI data ... Done.
Start drawing
0
Outputing EPS file splice_binary_table.txt_GENESET/splice_binary_table.txt.summary.eps
Convert to PDF splice_binary_table.txt_GENESET/splice_binary_table.txt.summary.pdf
ps2pdf -dEPSCrop -dAutoRotatePages=/None splice_binary_table.txt_GENESET/splice_binary_table.txt.summary.eps splice_binary_table.txt_GENESET/splice_binary_table.txt.summary.pdf
Finished.
Reading MI data ... Done.
Start drawing
0
Outputing EPS file splice_binary_table.txt_GENESE

Option suffix requires an argument
Option suffix requires an argument
Option suffix requires an argument
Option suffix requires an argument
Option suffix requires an argument


In [193]:
ls /data_gilbert/home/aarab/iPAGE/run_mi_gene_list.pl

/data_gilbert/home/aarab/iPAGE/run_mi_gene_list.pl


In [183]:
!echo ${TEISERDIR}




In [80]:
!date

Thu Jan 13 19:49:49 PST 2022
