Switch branches/tags
Nothing to show
Find file History
Fetching latest commit…
Cannot retrieve the latest commit at this time.
Permalink
..
Failed to load latest commit information.
Data
.gitignore
NOTES
README
fix_occratio_for_R.plx
gff2fasta.plx
look_at_occratio.R
maskSeq.plx
tallymer2gff3.plx
test-01.tmer
test.header

README

## 0) PICK YOUR TOOL

GT=/nfs/panda/ensemblgenomes/external/genometools/bin/gt





## 1) PICK THE ASSEMBLY TO FILTER

## WHEAT
FAS_DIR=Wheat_Assembly
FAS=161010_Chinese_Spring_v1.0_pseudomolecules
FAP=fasta

# INDEX OUT DIR
IDX_DIR=Wheat_Indexes

# REPEAT OUT DIR
RPT_DIR=Wheat_Repeats



## BARLEY
FAS_DIR=Barley_Assembly
FAS=Hordeum_vulgare.Hv_IBSC_PGSB_v2.dna.toplevel
FAP=fa

# INDEX OUT DIR
IDX_DIR=Barley_Indexes

# REPEAT OUT DIR
RPT_DIR=Barley_Repeats



## TEST
ll $FAS_DIR/$FAS.$FAP
ll $IDX_DIR
ll $RPT_DIR





## 1) First, we create the enhanced suffix array. We invoke gt
## suffixerator with options -tis, -suf, -lcp, -des, -ssp and -sds
## since LTRharvest needs the corresponding tables. Furthermore, we
## specify -dna, as we process DNA-sequences.

## NB: No need to repeat this if we have already run it and we are
## just changing parameters below!

time \
$GT suffixerator -v \
  -tis -suf -lcp -des -ssp -sds -dna \
  -db $FAS_DIR/$FAS.$FAP \
  -indexname $IDX_DIR/$FAS

md5sum $IDX_DIR/$FAS.{al1,des,sds,esq,ssp,suf,prj,lcp,llv}





## 2) Next we have a look at some kmer distributions using
## occratio. Note, this doesn't show the kmer frequency histogram, but
## just more general kmer counts against k.

## NB: No need to repeat this if we have already run it and a decision
## has been made below.

time \
$GT tallymer occratio -v \
  -minmersize 10 \
  -maxmersize 45 \
  -output unique nonunique nonuniquemulti total relative \
  -esa $IDX_DIR/$FAS \
> Data/$FAS.occratio.10.45.dump

md5sum Data/$FAS.occratio*


## Bah
./fix_occratio_for_R.plx \
    Data/$FAS.occratio.10.45.dump > \
    Data/$FAS.occratio.10.45.fix




## 3) After making a decision about what kmer size and occurance we
## want to filter based on the above, we need to build a specific
## index for those choices.

## Lets pick 19-mers to build an index for searching against?
## Lets pick 21-mers to build an index for searching against?
## Lets pick 41-mers to build an index for searching against?

#MER=19
MER=21
#MER=41

## Only care about kmers more frequent than this number in our index.
MIN=10
MIN=05

time \
$GT tallymer mkindex -v \
  -mersize $MER \
  -minocc $MIN \
  -esa $IDX_DIR/$FAS \
  -counts -pl \
  -indexname $IDX_DIR/$FAS.idx.$MER.$MIN

md5sum $IDX_DIR/$FAS.idx.$MER.$MIN.{mer,mct,mbd}





## 4) Finally we search a sequence against this index, with the idea
## of identifying repeats in the sequence.

# ## Testing...

# ## get a sequence for testing
# fastacmd \
#   -d $IDX_DIR/$FAS \
#   -s PGSC0003DMS000000001 \
# >    PGSC0003DMS000000001.fa

# $GT tallymer search -v \
#   -output qseqnum qpos counts sequence \
#   -tyr $IDX_DIR/$FAS.idx.$MER.$MIN \
#   -q PGSC0003DMS000000001.fa \
# | ./tallymer2gff3.plx -seq PGSC0003DMS000000001.fa



#echo \
time \
$GT tallymer search -v \
  -output qseqnum qpos counts  \
  -tyr $IDX_DIR/$FAS.idx.$MER.$MIN \
  -q $FAS_DIR/$FAS.$FAP \
   > $RPT_DIR/$FAS.rpt.$MER.$MIN.tmer

md5sum $RPT_DIR/$FAS.rpt.$MER.$MIN.tmer





## 5) Now we just need a script to process the tmer results into a GFF
## format file.

## Note that this script merges overlapping kmers and 'nearly
## overlapping' kmers to call repetative regions (greater than a
## certain minuimum size).

time \
./tallymer2gff3.plx -k $MER \
  -s $FAS_DIR/$FAS.$FAP \
     $RPT_DIR/$FAS.rpt.$MER.$MIN.tmer \
   > $RPT_DIR/$FAS.rpt.$MER.$MIN.gff

md5sum $RPT_DIR/$FAS.rpt.$MER.$MIN.gff

## (See the cov option! There is no need to re-run suffixerator,
## mkindex or search!)

time zcat $RPT_DIR/$FAS.rpt.$MER.$MIN.tmer |
./tallymer2gff3.plx -k $MER \
    --seq $FAS_DIR/$FAS.$FAP.header \
    --cov 99 \
    --gap 1 \
    > $RPT_DIR/$FAS.rpt.$MER.99-gap1.gff



## 6) Optionally mask the input sequence using the repeats.

./maskSeq.plx \
  -s $FAS_DIR/$FAS.$FAP \
  -f $RPT_DIR/$FAS.rpt.$MER.$MIN.gff \
   > $FAS_DIR/$FAS.kfilt.$MER.$MIN.$FAP

md5sum $FAS_DIR/$FAS.kfilt.$MER.$MIN.fa





## 7) Optionally convert the GFF into a fasta file of 'repeat
## sequences'.

./gff2fasta.plx \
  -s $FAS_DIR/$FAS.$FAP \
  -f $RPT_DIR/$FAS.rpt.$MER.$MIN.gff \
   > $RPT_DIR/$FAS.rpt.$MER.$MIN.$FAP

md5sum $RPT_DIR/$FAS.rpt.$MER.$MIN.fa





## 8) Finishing touches: make the sequences NR and search them back
## against the assembly... (as a sanity check).





## 9) Finishing touches: submit new repeats to a repeat database?




## NOTES:

## SGN pipeline:

#               basically, we take a model (for example, S. lycopersicum 
#               genome), we run RepeatScout (it have 4 steps)
# 16:10 <@aure> after the 2rd step we obtain a fasta file, we filter it with a 
#               RepeatScout script that use nseg and trf programs
# 16:11 <@aure> we also add another step where we compare with a set of genes 
#               and we remove the repeats that have hits with these genes from 
#               the pre-repeat set
# 16:12 <@aure> (they could be very big gene families)
# 16:13 <@aure> and after that we run RepeatMasker over the model. It will give 
#               us the position and the count of the repeats
# 16:13 <@aure> we are using the default parameters for RepeatScout
# 16:16 <@aure> (except -l length of l-mer to consider = 15, becuase i saw that 
#               in a paper... that currently I don't remember where I found it)

# ?

# http://genomebiology.com/2008/9/3/R61
# http://bioinformatics.oxfordjournals.org/cgi/content/abstract/21/suppl_1/i351