# Calculate YASS pairs for all genes

In [None]:
%%bash

# Build genes.bed: one tab-separated, BED-style record per GTF "gene" feature.
#   column 1: "chr" prepended to GTF column 1
#   column 2: GTF column 4 minus 1
#   column 3: GTF column 5
#   column 4: whitespace-split fields 10 and 12 joined by "_"
#             (presumably gene_id and gene_name -- verify against the GTF)
#   column 5: constant 255
#   column 6: GTF column 7
# Double quotes and semicolons are stripped from the whole output by tr.

gtf=/home/maticzkd/genomes/hg19/Homo_sapiens.GRCh37.75.gtf.gz

zcat "$gtf" \
  | awk 'BEGIN { OFS = "\t" }
         $3 == "gene" { print "chr"$1, $4-1, $5, $10"_"$12, 255, $7 }' \
  | tr -d '";' \
  > genes.bed

# report how many gene records were written
wc -l genes.bed

In [None]:
%%bash

# Keep only the genes that overlap an Alu annotation.
# With -u, bedtools reports each record of -a once if it has any overlap in -b.

alu_bed=/home/maticzkd/genomes/hg19/RepeatMasker_hg19_20161020_Alu.bed

bedtools intersect -u -a genes.bed -b "$alu_bed" > genes_with_alu.bed

# report how many genes remain
wc -l genes_with_alu.bed

In [None]:
%%bash

# Fetch the sequence of every gene in genes_with_alu.bed from the hg19 FASTA.
# -s makes the extraction strand-aware. No -name option is given, so the
# FASTA headers are whatever bedtools generates by default.

genome_fa=/home/maticzkd/genomes/hg19.fa

time bedtools getfasta -s -fi "$genome_fa" -bed genes_with_alu.bed > genes_with_alu.fa

# number of FASTA records written
grep -c '^>' genes_with_alu.fa

In [None]:
%%bash

# Split genes_with_alu.fa by record name into separate FASTA files under
# fasplitroot/. The directory is wiped and recreated first so files from an
# earlier run cannot linger.

split_dir=fasplitroot

rm -rf "$split_dir"
mkdir -p "$split_dir"

time faSplit byName genes_with_alu.fa "$split_dir/"

In [None]:
%%bash

# Count the *.fa files in a directory and print the number on stdout.
#
# Arguments: $1 - directory to look in
# Outputs:   the number of entries matching "$1"/*.fa, followed by a newline
#
# The count is done with a shell glob instead of `ls -l ... | wc -l`:
#   * no external command receives the expanded file list, so a directory
#     with a very large number of files cannot fail with
#     "Argument list too long";
#   * the result does not depend on parsing ls output.
# The function body runs in a subshell ( ... ) so that enabling nullglob
# (unmatched glob expands to nothing -> count 0) does not leak to the caller.
count_fa_files() (
    shopt -s nullglob
    matches=("$1"/*.fa)
    printf '%d\n' "${#matches[@]}"
)

count_fa_files fasplitroot

In [None]:
%%bash

# run yass for all genes
#
# For every fasplitroot/<name>.fa this runs `yass <file> -d 4 -r 1`, drops
# lines starting with "track", and stores the gzipped result as
# yassbed/<name>_yass.bed.gz. The stderr of the whole loop (a banner with the
# file name per gene plus anything the tools print) goes to yass.log.
# yassbed/ is wiped first so results from an earlier run cannot linger.

rm -rf yassbed
mkdir -p yassbed

time for fa in fasplitroot/*.fa
do
    # An unmatched glob is left as the literal pattern; skip it so that no
    # bogus "yassbed/*_yass.bed.gz" file is created when there is no input.
    [[ -e "$fa" ]] || continue

    # Parameter expansion instead of `basename` avoids one fork per gene.
    base=${fa##*/}
    out="yassbed/${base%.fa}_yass.bed"

    # per-gene banner in the log
    {
        echo ""
        echo ""
        echo "####################################################################################################"
        printf '%s\n' "$base"
    } >&2

    yass "$fa" -d 4 -r 1 | grep -v "^track" | gzip > "$out.gz"
done 2> yass.log

In [None]:
%%bash

# Merge the per-gene yass results into a single gzipped table, yassbed.gz.
# Some filtering of irrelevant alignments is done to reduce data size:
#   * the very first line of the concatenated input is copied through as-is
#     (presumably a header line -- verify against the yass output);
#   * lines starting with "#" are dropped;
#   * a remaining line is kept only if
#       col3 - col2 >= 50,  col6 - col5 >= 50  and  col2 - col6 + 1 <= 10000.

parts=(yassbed/*.bed.gz)
keep_rule='$3-$2>=50 && $6-$5>=50 && $2-$6+1<=10000'

{
    zcat "${parts[@]}" | head -n 1
    zcat "${parts[@]}" | grep -v '^#' | awk "$keep_rule"
} | gzip > yassbed.gz

In [None]:
%%bash

# Convert the merged yass hits to genomic coordinates in bed12 format.
# The R script is called with -o "" (presumably this makes it write to
# stdout -- verify against the script). Its output is sorted by column 1,
# then numerically by column 2; sort compresses its temporary files with
# gzip. Adjacent identical lines are collapsed by uniq before the result is
# gzipped.

time ./yass_reverse_repeats_to_bed12.R -i yassbed.gz -o "" \
  | sort -k1,1 -k2,2n --compress-program=gzip \
  | uniq \
  | gzip > yassbed12_genomic.bed.gz