In [5]:
%load_ext rpy2.ipython

In [6]:
%%R
library (MotifDb)
library (MotIV)
library (seqLogo)
# Flatten the best-match alignments of a motif-match result into a data frame.
#
# Args:
#   match: an S4 object with a `bestMatch` list slot. Only the first element
#          of `bestMatch` is read, via its `aligns` slot. Each alignment must
#          expose the slots TF@name, evalue, sequence, match and strand.
#          (Presumably the value returned by MotIV's motifMatch(); confirm.)
#
# Returns:
#   NA when `match@bestMatch` is empty. Otherwise a data frame with one row
#   per alignment and the columns name, eVal, sequence, match, strand (strings
#   are kept as character). When there are no alignments, an empty data frame.
MotIV.toTable <- function(match) {
  if (length(match@bestMatch) == 0) {
    return(NA)
  }

  alignments <- match@bestMatch[[1]]@aligns
  if (length(alignments) == 0) {
    return(data.frame(stringsAsFactors = FALSE))
  }

  # Build every one-row frame first and bind them in a single call; calling
  # rbind() inside the loop re-copies the accumulated frame on each iteration.
  rows <- lapply(alignments, function(aln) {
    data.frame(name = aln@TF@name, eVal = aln@evalue, sequence = aln@sequence,
               match = aln@match, strand = aln@strand,
               stringsAsFactors = FALSE)
  })

  # unname() so that names on the alignment list do not become row names.
  do.call(rbind, unname(rows))
}

Reference (MotifDb vignette): https://bioconductor.org/packages/release/bioc/vignettes/MotifDb/inst/doc/MotifDb.pdf

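Following the vignette, we now do a simple geneSymbol search and then examine the sub-MotifDb that the search returns. The vignette's example looks for all matrices associated with the well-known and highly conserved zinc-finger transcription factor Egr1; in this notebook the same search is run for each gene symbol in the RT112 and UC3 lists.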

In [9]:
%%R
# Read the first column of each headerless file; these values are later used
# as the gene symbols searched for in MotifDb. Argument names are spelled out
# in full (`stringsAsFactors`, logical `header`) rather than relying on partial
# matching and numeric-to-logical coercion.
RT112.TF <- read.csv("RT112.TF.txt", header = FALSE, stringsAsFactors = FALSE)[, 1]
UC3.TF <- read.csv("UC3.TF.txt", header = FALSE, stringsAsFactors = FALSE)[, 1]

## RT112.TF

In [94]:
%%R
# Print the RT112.TF vector (first column of RT112.TF.txt).
RT112.TF

 [1] "FLOT2"          "FLOT1"          "MISP"           "IGKV4-1"       
 [5] "HNRNPL"         "MATR3"          "RBM27"          "RBMX"          
 [9] "HNRNPUL1"       "PABPN1"         "HNRNPR"         "DHX15"         
[13] "FUS"            "HNRNPA0"        "THRAP3"         "HNRNPUL2"      
[17] "HNRNPUL2-BSCL2"


In [95]:
%%R
# For every RT112 symbol, record the positions of the MotifDb entries whose
# geneSymbol matches it. An element is integer(0) when nothing matched.
# NOTE(review): grep() treats the symbol as a case-insensitive pattern, so it
# also hits any geneSymbol that merely contains it (the "FUS" result listed
# below includes FUS3 entries). Filter the hits if exact matches are required.
gene.symbols <- values(MotifDb)$geneSymbol  # loop-invariant, so fetched once
indices <- list()
for (tf in RT112.TF) {
  indices[[tf]] <- grep(tf, gene.symbols, ignore.case = TRUE)
}

In [96]:
%%R
# Print the per-symbol match positions; integer(0) means no geneSymbol matched.
indices

$FLOT2
integer(0)

$FLOT1
integer(0)

$MISP
integer(0)

$`IGKV4-1`
integer(0)

$HNRNPL
[1] 3019

$MATR3
integer(0)

$RBM27
integer(0)

$RBMX
integer(0)

$HNRNPUL1
integer(0)

$PABPN1
integer(0)

$HNRNPR
integer(0)

$DHX15
integer(0)

$FUS
[1]  388 2637 2638 2763 4336 5052 6200

$HNRNPA0
[1] 3015

$THRAP3
integer(0)

$HNRNPUL2
integer(0)

$`HNRNPUL2-BSCL2`
integer(0)



In [97]:
%%R 
# Show the MotifDb entries at the positions found by the "FUS" search.
MotifDb[indices$FUS]

MotifDb object of length 7
| Created from downloaded public sources: 2013-Aug-30
| 7 position frequency matrices from 5 sources:
|         cisbp_1.02:    1
|              HOMER:    3
|        JASPAR_2014:    1
|         jaspar2016:    1
|         jaspar2018:    1
| 2 organism/s
|          Athaliana:    4
|              other:    3
Athaliana-cisbp_1.02-M2358_1.02 
NA-HOMER-EWS:ERG-fusion(ETS)/CADO_ES1-EWS:ERG-ChIP-Seq(SRA014231)/Homer 
NA-HOMER-EWS:FLI1-fusion(ETS)/SK_N_MC-EWS:FLI1-ChIP-Seq(SRA014231)/Homer 
NA-HOMER-PAX3:FKHR-fusion(Paired,Homeobox)/Rh4-PAX3:FKHR-ChIP-Seq(GSE19063)/Homer 
Athaliana-JASPAR_2014-FUS3-MA0565.1 
Athaliana-jaspar2016-FUS3-MA0565.1 
Athaliana-jaspar2018-FUS3-MA0565.1 


In [108]:
%%R
# Keep the MotifDb entries found by the "FUS" search and display them as a
# plain list of matrices.
motif <- MotifDb[indices$FUS]
as.list(motif)
# Alternative view (metadata table), left disabled:
# noquote (t (as.data.frame (values (motif))))

$`Athaliana-cisbp_1.02-M2358_1.02`
     1    2          3    4          5    6    7          8          9
A 0.16 0.09 0.00990099 0.01 0.98019802 0.01 0.00 0.00990099 0.27722772
C 0.25 0.59 0.01980198 0.99 0.00990099 0.00 0.00 0.96039604 0.03960396
G 0.44 0.03 0.96039604 0.00 0.00000000 0.00 0.99 0.01980198 0.60396040
T 0.15 0.29 0.00990099 0.00 0.00990099 0.99 0.01 0.00990099 0.07920792

$`NA-HOMER-EWS:ERG-fusion(ETS)/CADO_ES1-EWS:ERG-ChIP-Seq(SRA014231)/Homer`
      1     2     3     4     5     6     7     8     9    10
A 0.871 0.001 0.001 0.001 0.001 0.001 0.001 0.001 0.194 0.220
C 0.009 0.155 0.001 0.001 0.997 0.997 0.001 0.105 0.277 0.269
G 0.119 0.001 0.001 0.001 0.001 0.001 0.041 0.805 0.053 0.276
T 0.001 0.843 0.997 0.997 0.001 0.001 0.957 0.089 0.476 0.235

$`NA-HOMER-EWS:FLI1-fusion(ETS)/SK_N_MC-EWS:FLI1-ChIP-Seq(SRA014231)/Homer`
      1     2     3     4     5     6     7     8     9    10
A 0.356 0.638 0.001 0.802 0.001 0.001 0.997 0.997 0.644 0.001
C 0.230 0.001 0.997 0.1

In [167]:
%%R
# Compare the first motif of the "FUS" result against every matrix in MotifDb
# and tabulate the matches.
# NOTE(review): the original comment here read "eval < 1e-05", but `top = 26`
# limits the number of matches and is not an e-value cutoff. Presumably 26 was
# chosen by hand so that all retained hits fall under that e-value; confirm.
hits <- motifMatch(as.list(motif)[1], as.list(MotifDb), top = 26)
tbl.hits <- MotIV.toTable(hits)

# MotIV.toTable() returns NA when there is no best match; stop with a clear
# message instead of the "$ operator is invalid for atomic vectors" error.
if (!is.data.frame(tbl.hits)) {
  stop("motifMatch returned no best match to tabulate", call. = FALSE)
}

# Distinct aligned match strings, used later to build search patterns.
sequences <- unique(tbl.hits$match)
print(tbl.hits)


	Ungapped Alignment
	Scores read
	Database read
	Motif matches : 26
                                                       name         eVal
1                           Athaliana-cisbp_1.02-M2358_1.02 3.736678e-12
2                       Athaliana-JASPAR_2014-FUS3-MA0565.1 3.736678e-12
3                        Athaliana-jaspar2016-FUS3-MA0565.1 3.736678e-12
4                        Athaliana-jaspar2018-FUS3-MA0565.1 3.736678e-12
5                          NA-HOMER-NRF(NRF)/Promoter/Homer 1.322307e-09
6                                   Hsapiens-jolma2013-NRF1 1.432989e-09
7                                   Hsapiens-hPDI-HIST1H2BN 1.608303e-09
8  NA-HOMER-NRF1(NRF)/MCF7-NRF1-ChIP-Seq(Unpublished)/Homer 1.995037e-09
9                   Hsapiens-SwissRegulon-NRF1.SwissRegulon 1.996581e-09
10                 Mmusculus-HOCOMOCOv10-NRF1_MOUSE.H10MO.A 5.855833e-09
11                       Cparvum-UniPROBE-Cgd2_3490.UP00395 2.606320e-08
12                 Hsapiens-HOCOMOCOv10-ZN639_HUMAN.H10

In [170]:
%%R
# Turn each matched consensus string into a regular expression by replacing
# IUPAC ambiguity codes with character classes, then drop alignment gaps ("-")
# and write one pattern per line.
# The first six entries reproduce the original substitutions exactly. K, B, D,
# H and V complete the IUPAC nucleotide set so those codes no longer pass
# through as literal letters. Replacements contain only A/C/G/T and brackets,
# so a later substitution can never rewrite an earlier one.
iupac.classes <- c(N = "[ATCG]", Y = "[CT]", M = "[AC]", S = "[CG]",
                   W = "[AT]", R = "[AG]",
                   K = "[GT]", B = "[CGT]", D = "[AGT]", H = "[ACT]",
                   V = "[ACG]")
for (code in names(iupac.classes)) {
  sequences <- gsub(code, iupac.classes[[code]], sequences, fixed = TRUE)
}
sequences <- gsub("-", "", sequences, fixed = TRUE)
write.table(data.frame(sequences), "motif_of_interests.txt",
            col.names = FALSE, row.names = FALSE, sep = "\n", quote = FALSE)

## UC3.TF

In [90]:
%%R
# Print the UC3.TF vector (first column of UC3.TF.txt).
UC3.TF

 [1] "RBM27"    "ZC3H14"   "RBM14"    "PABPN1"   "RBM26"    "SRRT"    
 [7] "ZFC3H1"   "SRSF7"    "ILF2"     "NUDT21"   "RNPS1"    "SERPINE1"
[13] "SRSF6"   


In [91]:
%%R
# For every UC3 symbol, record the positions of the MotifDb entries whose
# geneSymbol matches it. An element is integer(0) when nothing matched.
# NOTE(review): grep() treats the symbol as a case-insensitive pattern, so it
# also hits any geneSymbol that merely contains it. Filter the hits if exact
# matches are required.
gene.symbols <- values(MotifDb)$geneSymbol  # loop-invariant, so fetched once
indices <- list()
for (tf in UC3.TF) {
  indices[[tf]] <- grep(tf, gene.symbols, ignore.case = TRUE)
}

In [92]:
%%R
# Print the per-symbol match positions; integer(0) means no geneSymbol matched.
indices

$RBM27
integer(0)

$ZC3H14
integer(0)

$RBM14
integer(0)

$PABPN1
integer(0)

$RBM26
integer(0)

$SRRT
integer(0)

$ZFC3H1
integer(0)

$SRSF7
integer(0)

$ILF2
integer(0)

$NUDT21
integer(0)

$RNPS1
integer(0)

$SERPINE1
integer(0)

$SRSF6
integer(0)

