Skip to content

Commit

Permalink
added sp version of pyfasta to bootstrap: updated find_rna: added sto…
Browse files Browse the repository at this point in the history
…ps to run.sh
  • Loading branch information
gturco committed Oct 7, 2013
1 parent 140cf22 commit 0e1ca1c
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 11 deletions.
2 changes: 1 addition & 1 deletion bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def pip_install(dir_name):
if processing != 0: raise Exception('can not download processing READ REQUIREMENTS and TRY again!')
shapely = subprocess.call(['{0}/bin/pip'.format(dir_name), 'install', 'shapely'])
if shapely != 0: raise Exception('can not download shapely READ REQUIREMENTS and TRY again!')
pyfasta = subprocess.call(['{0}/bin/pip'.format(dir_name), 'install', 'pyfasta'])
pyfasta = subprocess.call(['{0}/bin/pip'.format(dir_name), 'install','-Iv', 'pyfasta==0.4.5'])
if pyfasta != 0: raise Exception('can not download pyfasta READ REQUIREMENTS and TRY again!')
scipy = subprocess.call(['{0}/bin/pip'.format(dir_name), 'install', 'scipy'])
if scipy != 0: raise Exception('can not download scipy READ REQUIREMENTS and TRY again!')
Expand Down
111 changes: 103 additions & 8 deletions pipeline/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,22 @@ DIR=data/${ORGA}_${ORGB}/
#
echo checking for unannotated proteins......
sh coann/co-anno.sh ${ORGA} ${ORGB} $QUOTA $BLAST_DIR

# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running checking for unannotated proteins" >&2
  exit "$status"
fi

echo finding syntenic regions...
sh quota.sh $DIR/${ORGA} $DIR/${ORGB} $QUOTA $NCPU
# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running quota-algn" >&2
  exit "$status"
fi


echo finding cns...
python scripts/find_cns.py \
-q $DIR/${ORGA}.fasta --qbed $DIR/${ORGA}.all.bed \
Expand All @@ -41,6 +55,13 @@ python scripts/find_cns.py \
--pair_fmt pair > $DIR/${ORGA}_${ORGB}.cns.txt
### if using blast+ --blast_path ${BLAST_DIR}\legacy_blast.pl \

# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error runnining find cns" >&2
  exit "$status"
fi


python scripts/localdup.py \
-q $DIR/${ORGA}.fasta --qbed $DIR/${ORGA}.all.bed \
-s $DIR/${ORGB}.fasta --sbed $DIR/${ORGB}.all.bed \
Expand All @@ -55,6 +76,12 @@ python scripts/localdup.py \
--qdups $DIR/${ORGA}.all.localdups \
--sdups $DIR/${ORGB}.all.localdups
### if using blast+ --blast_path ${BLAST_DIR}\legacy_blast.pl \
# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error runnining finding localdups" >&2
  exit "$status"
fi


python scripts/post_processing/cns_to_fasta.py \
-c $DIR/${ORGA}_${ORGB}.cns.txt.local \
Expand All @@ -65,23 +92,60 @@ python scripts/post_processing/cns_to_fasta.py \
--min_len=18 \
> $DIR/${ORGA}_${ORGB}.cns_test.fasta

# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error runnining cns2fasta" >&2
  exit "$status"
fi


echo removing cns that have hits in arabidopsis as rna or protein
wget -O data/at_protein.fasta ftp://ftp.arabidopsis.org/home/tair/Sequences/blast_datasets/TAIR10_blastsets/TAIR10_pep_20101214_updated
wget -O data/os_protein.fasta ftp://ftp.plantbiology.msu.edu/pub/data/Eukaryotic_Projects/o_sativa/annotation_dbs/pseudomolecules/version_6.1/all.dir/all.pep

# Download the TAIR10 peptide blast set unless a cached copy already exists.
# The transfer goes to a temporary ".part" name and is renamed only on
# success: `wget -O` creates its output file even when the download fails,
# and a truncated file would otherwise be taken for a valid cache next run.
if [ ! -f data/at_protein.fasta ]; then
  if wget -O data/at_protein.fasta.part ftp://ftp.arabidopsis.org/home/tair/Sequences/blast_datasets/TAIR10_blastsets/TAIR10_pep_20101214_updated; then
    mv data/at_protein.fasta.part data/at_protein.fasta
  else
    rm -f data/at_protein.fasta.part
    echo "Error downloading data/at_protein.fasta" >&2
    exit 1
  fi
fi

# Download the o_sativa peptide set (all.pep) unless a cached copy exists.
# The transfer goes to a temporary ".part" name and is renamed only on
# success: `wget -O` creates its output file even when the download fails,
# and a truncated file would otherwise be taken for a valid cache next run.
if [ ! -f data/os_protein.fasta ]; then
  if wget -O data/os_protein.fasta.part ftp://ftp.plantbiology.msu.edu/pub/data/Eukaryotic_Projects/o_sativa/annotation_dbs/pseudomolecules/version_6.1/all.dir/all.pep; then
    mv data/os_protein.fasta.part data/os_protein.fasta
  else
    rm -f data/os_protein.fasta.part
    echo "Error downloading data/os_protein.fasta" >&2
    exit 1
  fi
fi

# blastx the test CNS fasta against each protein database.
# Each run is checked on its own: with a single check after both commands,
# a failure of the first run was overwritten by the status of the second.
# The real status is propagated because a bare `exit` would return the
# status of the preceding `echo` (0).
bblast.py -b ${BLAST_DIR}/blastall -p blastx -d data/at_protein.fasta -i $DIR/${ORGA}_${ORGB}.cns_test.fasta -e 0.01 -m 8 -a ${NCPU} -o $DIR/at_protein.blast
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running bblast" >&2
  exit "$status"
fi

bblast.py -b ${BLAST_DIR}/blastall -p blastx -d data/os_protein.fasta -i $DIR/${ORGA}_${ORGB}.cns_test.fasta -e 0.01 -m 8 -a ${NCPU} -o $DIR/os_protein.blast
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running bblast" >&2
  exit "$status"
fi


python scripts/post_processing/find_exons.py \
--cns $DIR/${ORGA}_${ORGB}.cns.txt.local \
-o $DIR \
$DIR/at_protein.blast $DIR/os_protein.blast

### TODO: edit find_rna.py so that it looks for the correct cns fasta file
wget -O data/thaliana_v10.gff ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR10_genome_release/TAIR10_gff3/TAIR10_GFF3_genes_transposons.gff
wget -O data/thaliana_v10.description ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR10_genome_release/TAIR10_functional_descriptions
wget -O data/thaliana_v10.fasta ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR10_genome_release/TAIR10_chromosome_files/TAIR10_chr_all.fas
perl -pi -e "s/>(.).*/>\$1/" data/thaliana_v10.fasta
perl -pi -e "s/.{3}//" data/thaliana_v10.gff
# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running find exons" >&2
  exit "$status"
fi



# Download the TAIR10 GFF3 annotation unless a cached copy already exists,
# then drop the first three characters of every line (perl s/.{3}//)
# -- presumably the "Chr" prefix of the sequence ids; TODO confirm.
# Download and rewrite both happen on a temporary ".part" file that is
# renamed only when both succeed: `wget -O` creates its output even on a
# failed transfer, and a partial or unprocessed file must not be cached.
if [ ! -f data/thaliana_v10.gff ]; then
  if wget -O data/thaliana_v10.gff.part ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR10_genome_release/TAIR10_gff3/TAIR10_GFF3_genes_transposons.gff \
    && perl -pi -e "s/.{3}//" data/thaliana_v10.gff.part; then
    mv data/thaliana_v10.gff.part data/thaliana_v10.gff
  else
    rm -f data/thaliana_v10.gff.part
    echo "Error downloading data/thaliana_v10.gff" >&2
    exit 1
  fi
fi

# Download the TAIR10 functional descriptions unless a cached copy exists.
# The transfer goes to a temporary ".part" name and is renamed only on
# success: `wget -O` creates its output file even when the download fails,
# and a truncated file would otherwise be taken for a valid cache next run.
if [ ! -f data/thaliana_v10.description ]; then
  if wget -O data/thaliana_v10.description.part ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR10_genome_release/TAIR10_functional_descriptions; then
    mv data/thaliana_v10.description.part data/thaliana_v10.description
  else
    rm -f data/thaliana_v10.description.part
    echo "Error downloading data/thaliana_v10.description" >&2
    exit 1
  fi
fi

# Download the TAIR10 chromosome fasta unless a cached copy already exists,
# then shorten every header line to '>' plus its first character
# (perl s/>(.).*/>$1/).
# Download and rewrite both happen on a temporary ".part" file that is
# renamed only when both succeed: `wget -O` creates its output even on a
# failed transfer, and a partial or unprocessed file must not be cached.
if [ ! -f data/thaliana_v10.fasta ]; then
  if wget -O data/thaliana_v10.fasta.part ftp://ftp.arabidopsis.org/home/tair/Genes/TAIR10_genome_release/TAIR10_chromosome_files/TAIR10_chr_all.fas \
    && perl -pi -e "s/>(.).*/>\$1/" data/thaliana_v10.fasta.part; then
    mv data/thaliana_v10.fasta.part data/thaliana_v10.fasta
  else
    rm -f data/thaliana_v10.fasta.part
    echo "Error downloading data/thaliana_v10.fasta" >&2
    exit 1
  fi
fi


python scripts/post_processing/find_rna.py \
--blastpath $BLAST_DIR/blastall \
Expand All @@ -93,6 +157,14 @@ python scripts/post_processing/find_rna.py \
-o $DIR \
-d data/thaliana_v10.description


# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running find rna" >&2
  exit "$status"
fi


cp $DIR/${ORGA}_${ORGB}.raw.filtered.local $DIR/${ORGA}_${ORGB}.raw2.filtered.local

python scripts/post_processing/shuffle_protein_cns.py \
Expand All @@ -104,6 +176,12 @@ python scripts/post_processing/shuffle_protein_cns.py \
--pairs $DIR/${ORGA}_${ORGB}.pairs.txt.local \
#creates: $DIR/${ORGA}_${ORGB}.quota.with_new.orthology

# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running shuffle proteins" >&2
  exit "$status"
fi



python scripts/post_processing/assign.py \
Expand All @@ -118,6 +196,14 @@ python scripts/post_processing/assign.py \
--pair_fmt pair > $DIR/${ORGA}_${ORGB}.cns.assigned.csv.local


# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running assing cns to genes" >&2
  exit "$status"
fi



python scripts/post_processing/cns_to_fasta.py \
-c $DIR/${ORGA}_${ORGB}.cns.txt.real.local \
--qfasta $DIR/${ORGA}.genomic.masked.fasta \
Expand All @@ -126,6 +212,15 @@ python scripts/post_processing/cns_to_fasta.py \
--sorg ${ORGB} \
> $DIR/${ORGA}_${ORGB}.cns.fasta


# Stop the pipeline when the preceding command failed.
# A bare `exit` would return the status of the `echo` just before it (0),
# so the failure would be invisible to callers; propagate the real status.
status=$?
if [ "$status" -ne 0 ]; then
  echo "Error running cns to fasta" >&2
  exit "$status"
fi



### creates genelist and cnslist
sh list.sh ${ORGA} ${ORGB} $QUOTA $QDSGID $SDSGID

4 changes: 2 additions & 2 deletions pipeline/scripts/post_processing/find_rna.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,10 +130,10 @@ def make_cns_to_at_map(des_file, blast_file, gff, query, subject, outdir):
#print >>sys.stderr, "skipping main for testing"
main(options.gff, options.outdir, options.fasta)

import commands
import os
cmd = "bblast.py -b {0} -p blastn -e 0.001 -m 8 -W 7 -a 6 -i {1}/{2}_{3}.cns_test.fasta -d {1}/at_no_cds.fasta -o {4}".format(options.blastpath,options.outdir, options.query, options.subject, options.blast)
print >>sys.stderr, "executing\n %s" % cmd
print commands.getoutput(cmd)
print os.system(cmd)


make_cns_to_at_map(options.desc, options.blast, options.gff, options.query,
Expand Down

0 comments on commit 0e1ca1c

Please sign in to comment.