# Code to set up tracks for COVID-relevant regulatory architecture
# General tracks

### Genes

In [None]:
%%bash

# Fetch the UCSC command-line utilities into the working directory.
# -nc leaves an already-downloaded binary untouched.
ucsc_bin_url=http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64
for tool in gtfToGenePred genePredToBed bigWigToWig; do
    wget --quiet -nc "${ucsc_bin_url}/${tool}"
done
# Downloaded files are not executable by default
chmod +x gtfToGenePred genePredToBed bigWigToWig


In [None]:
%%bash

# Build a coordinate-sorted BED12 gene track for hg19 (GRCh37).
# NOTE: "v37" in the file names refers to the genome build (GRCh37), not the GENCODE
# release: the annotation downloaded below is GENCODE release 30 lifted to GRCh37.

# Get gencode GTF (basic or full); -nc skips the download if the archive is already present
wget --quiet -nc -O gencode_v37.gtf.gz \
    ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/gencode.v30lift37.basic.annotation.gtf.gz
# Alternative: full (non-basic) annotation
#wget --quiet -O gencode_v37.gtf.gz \
#    ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_30/GRCh37_mapping/gencode.v30lift37.annotation.gtf.gz
# -f: overwrite a gencode_v37.gtf left behind by an interrupted earlier run
gunzip -f gencode_v37.gtf.gz

# Get chromosome sizes and drop chrM plus the _hap, Un_gl and random contigs
wget -nc --quiet http://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/hg19.chrom.sizes
grep -v -e chrM -e _hap -e Un_gl -e random hg19.chrom.sizes > hg19.chrom.filtered.sizes
rm hg19.chrom.sizes

# Convert GTF to extended genePred; -geneNameAsName2 puts the gene name in column 12 (name2)
./gtfToGenePred -genePredExt -geneNameAsName2 gencode_v37.gtf gencode_v37.genePred
# Swap column 1 (transcript ID) with column 12 (gene name) so the BED name field
# carries the gene name. FS/OFS are set to tab explicitly: assigning to a field makes
# awk rebuild the record with OFS, which would otherwise turn the tab-delimited
# genePred into a space-delimited one.
awk 'BEGIN { FS = OFS = "\t" } { t = $1; $1 = $12; $12 = t; print }' \
    gencode_v37.genePred > gencode_v37_name.genePred
# Convert genePred to bed12
./genePredToBed gencode_v37_name.genePred gencode_v37.bed12
# Sort bed12 by chromosome, then numerically by start
sort -k1,1 -k2,2n gencode_v37.bed12 > /input_dir/corona_analysis/annotations/gencode_v37.sorted.bed
# Cleanup: remove every intermediate, including the name-swapped genePred
rm gencode_v37.gtf \
    gencode_v37.genePred \
    gencode_v37_name.genePred \
    gencode_v37.bed12


In [None]:
%%bash

# Build a coordinate-sorted BED12 gene track for hg38 (GRCh38).
# NOTE: "v38" in the file names refers to the genome build (GRCh38), not the GENCODE
# release: the annotation downloaded below is GENCODE release 32.

# Get gencode GTF (basic); -nc skips the download if the archive is already present
wget --quiet -nc -O gencode_v38.gtf.gz \
    ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_32/gencode.v32.basic.annotation.gtf.gz
# -f: overwrite a gencode_v38.gtf left behind by an interrupted earlier run
gunzip -f gencode_v38.gtf.gz

# Get chromosome sizes and keep only the primary chromosomes.
# hg38 contig names differ from hg19: alternate/fix sequences end in _alt/_fix and
# unplaced contigs are chrUn_<accession> (upper-case accessions), so the hg19
# patterns (_hap, Un_gl) would leave them in the filtered file.
wget -nc --quiet http://hgdownload.cse.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes
grep -v -e chrM -e _alt -e _fix -e _random -e chrUn_ hg38.chrom.sizes > hg38.chrom.filtered.sizes
rm hg38.chrom.sizes

# Convert GTF to extended genePred; -geneNameAsName2 puts the gene name in column 12 (name2)
./gtfToGenePred -genePredExt -geneNameAsName2 gencode_v38.gtf gencode_v38.genePred
# Swap column 1 (transcript ID) with column 12 (gene name) so the BED name field
# carries the gene name. FS/OFS are set to tab explicitly: assigning to a field makes
# awk rebuild the record with OFS, which would otherwise turn the tab-delimited
# genePred into a space-delimited one.
awk 'BEGIN { FS = OFS = "\t" } { t = $1; $1 = $12; $12 = t; print }' \
    gencode_v38.genePred > gencode_v38_name.genePred
# Convert genePred to bed12
./genePredToBed gencode_v38_name.genePred gencode_v38.bed12
# Sort bed12 by chromosome, then numerically by start
sort -k1,1 -k2,2n gencode_v38.bed12 > /input_dir/corona_analysis/annotations/gencode_v38.sorted.bed
# Cleanup: remove every intermediate, including the name-swapped genePred
rm gencode_v38.gtf \
    gencode_v38.genePred \
    gencode_v38_name.genePred \
    gencode_v38.bed12


### Repeats

#### L1 elements

In [None]:
%%bash

# L1Base2 full-length (>4500 nt) LINE-1 elements in hg38 coordinates.
# The liftover to hg19 is done in a separate step.
# Intact-only subset (not used here): http://l1base.charite.de/BED/hsflil1_3836.bed
l1_bed=hsflnil1_8438_rm.bed
wget --quiet "http://l1base.charite.de/BED/${l1_bed}"
# Sort and store under the annotations directory, then drop the raw download
sort-bed "${l1_bed}" > /input_dir/corona_analysis/annotations/hsflnil1_hg38_sorted.bed
rm "${l1_bed}"


In [None]:

# Lift the sorted hg38 LINE-element BED over to hg19 coordinates.
# NOTE(review): canonical_chrom_filter presumably restricts the output to canonical
# chromosomes -- confirm against liftover_bed's definition (not visible in this section).
liftover_bed(
    from_genome="hg38",
    to_genome="hg19",
    in_bed="/input_dir/corona_analysis/annotations/hsflnil1_hg38_sorted.bed",
    out_bed="/input_dir/corona_analysis/annotations/hsflnil1_hg19_sorted.bed",
    canonical_chrom_filter=True,
)

