# Scripts for masking the genome and filtering variation data

## Before you begin

#### Copy the tools to 'bin/' and in each cell below, define paths to the tools. Adjust '$ncpu', the number of computing threads used, within the script if necessary.

## 1. Masking the genome

### 1.1. RepeatMasker

In [None]:
# Soft-mask repeats in the reference with RepeatMasker and export the repeat
# intervals as a three-column, tab-separated table (sequence, start, end).
#
# NOTE(review): the cell assumes it is started two directory levels below the
# project root (star_protocols_saimaa); confirm before running.
cd ../.. #star_protocols_saimaa
work=$(pwd)
apps=$work/bin

refd=$work/data/reference
refs=$refd/norppa_12122017

rmask=$apps/RepeatMasker
ncpu=4

# RepeatMasker output directory and the GFF it is expected to write there.
# The GFF name is derived from the reference basename so the two cannot drift
# apart; for the current reference it is norppa_12122017.fa.out.gff.
rm_out=$refd/RM
rm_gff=$rm_out/${refs##*/}.fa.out.gff

# Per the RepeatMasker documentation: -xsmall soft-masks repeats (lower case),
# -gff additionally writes a GFF file, -dir sets the output directory and
# -pa the number of parallel jobs.
"$rmask" -xsmall -gff -dir "$rm_out" -pa "$ncpu" -species "Canis familiaris" "$refs.fa"

# Making a tab file for repetitive regions: keep columns 1, 4 and 5 of every
# line that contains no '#' (GFF sequence name, start, end).
awk 'BEGIN { OFS = "\t" } !/#/ { print $1, $4, $5 }' "$rm_gff" > "$rm_out/repeats.tab"

### 1.2. Positive masking: SNPable

#### 1.2.1. Splitting the reference FASTA and aligning the fragments

In [None]:
# SNPable step 1: cut the reference into short fragments with seqbility's
# splitfa, then align every fragment file back to the reference with bwa.
#
# NOTE(review): the cell assumes it is started two directory levels below the
# project root (star_protocols_saimaa); confirm before running.
cd ../.. #star_protocols_saimaa
work=$(pwd)
apps=$work/bin

refd=$work/data/reference
refs=$refd/norppa_12122017

seqbility=$apps/seqbility-20091110
bwa=$apps/bwa

maskdir=$refd/snpable

# Make sure the working directory exists, then clear files left by an earlier
# run (plain files only; rm is not recursive).
mkdir -p "$maskdir"
rm -f "$maskdir"/*

# Only proceed when the working directory can be entered; otherwise split and
# bwa would scatter their output into whatever directory we happen to be in.
if cd "$maskdir"; then
  # 35 is the fragment length handed to splitfa (the same value is later given
  # to gen_mask via -l). split writes chunks of 20,000,000 lines using its
  # default naming scheme: xaa, xab, ...
  "$seqbility/splitfa" "$refs.fa" 35 | split -l 20000000

  # The three-character glob x?? matches the chunks but not the derived
  # x??.sai / x??.sam files. The -e test skips the literal pattern when
  # nothing matched.
  for chunk in x??; do
    [ -e "$chunk" ] || continue
    "$bwa" aln -R 1000000 -O 3 -E 3 "$refs.fa" "$chunk" > "$chunk.sai"
  done

  for chunk in x??; do
    [ -e "$chunk" ] || continue
    "$bwa" samse "$refs.fa" "$chunk.sai" "$chunk" > "$chunk.sam"
  done
else
  echo "Cannot enter $maskdir; skipping fragment generation and alignment" >&2
fi


#### 1.2.3. Creating rawMask and final mask

In [None]:
# SNPable step 2: turn the alignments of the reference fragments into a raw
# mask and then into the final mask.
#
# NOTE(review): the cell assumes it is started two directory levels below the
# project root (star_protocols_saimaa); confirm before running.
cd ../.. #star_protocols_saimaa
work=$(pwd)
apps=$work/bin

refd=$work/data/reference
maskdir=$refd/snpable

seqbility=$apps/seqbility-20091110

# Run only inside the SNPable working directory. Without this guard a failed
# cd would leave empty rawMask_35.fa / mask_35_50.fa files in the wrong place.
if cd "$maskdir"; then
  # Feed all per-chunk SAM files (x??.sam) to gen_raw_mask.pl as one stream.
  cat x??.sam | "$seqbility/gen_raw_mask.pl" > rawMask_35.fa

  # -l 35 matches the fragment length used when the reference was split;
  # -r 0.5 is the stringency passed to gen_mask (see the SNPable notes).
  "$seqbility/gen_mask" -l 35 -r 0.5 rawMask_35.fa > mask_35_50.fa
else
  echo "Cannot enter $maskdir; skipping mask generation" >&2
fi


#### 1.2.4. Converting mask file to bed format

In [None]:
# SNPable step 3: convert the final mask to BED with makeMappabilityMask.py
# and merge the resulting pieces into a single bgzip-compressed positive mask.
#
# NOTE(review): the cell assumes it is started two directory levels below the
# project root (star_protocols_saimaa); confirm before running.
cd ../.. #star_protocols_saimaa
work=$(pwd)
apps=$work/bin

# refd must be set before maskdir is derived from it, and mask must be set
# before it is used as the output target; both were missing from this cell.
refd=$work/data/reference
maskdir=$refd/snpable
mask=$maskdir/norppa_12122017.posmask.bed.gz

# NOTE(review): the script takes no arguments here, so its input and output
# locations are presumably configured inside the script; verify they point at
# $maskdir/mask_35_50.fa and $maskdir/posmask.
python "$work/scripts/2.masking-filtering/makeMappabilityMask.py"

# Combining the files returned by makeMappabilityMask.py.
# All pieces are decompressed in glob (name) order and recompressed in one
# pass. Writing with '>' instead of appending keeps the result identical when
# the cell is re-run.
parts=("$maskdir"/posmask/*)
if [ -e "${parts[0]}" ]; then
  zcat "${parts[@]}" | bgzip -c > "$mask"
else
  echo "No mask files found in $maskdir/posmask; $mask was not written" >&2
fi


## 2. Filtering VCFs

In [None]:
# Filtering out repeat regions from the imputed and phased VCF, then keeping
# only sites covered by the SNPable positive mask.
#
# NOTE(review): the cell assumes it is started two directory levels below the
# project root (star_protocols_saimaa); confirm before running.
cd ../.. #star_protocols_saimaa
work=$(pwd)
apps=$work/bin

# refd has to be assigned before maskdir, which is derived from it; in the
# reverse order maskdir (and therefore mask) would resolve to /snpable/...
refd=$work/data/reference
maskdir=$refd/snpable

bcftools=$apps/bcftools

vcf_phased=$work/processed_data/1.mapping-variantcalling/beagle/saimaa_phased.vcf.gz
vcf_posmask=$work/processed_data/2.masking-filtering/saimaa_posm.vcf.gz
mask=$maskdir/norppa_12122017.posmask.bed.gz

# Filtering the VCF file based on RepeatMasker data and the positive mask
# produced by SNPable. The '^' prefix makes bcftools exclude the listed
# regions, so the first pass drops sites inside repeats and the second pass
# keeps only sites inside the positive mask, writing compressed VCF (-Oz).
"$bcftools" view -T "^$refd/RM/repeats.tab" "$vcf_phased" \
  | "$bcftools" view -T "$mask" -Oz -o "$vcf_posmask"
