
## Make zebrafish genome annotation directory

## Get zebrafish genome fasta

Get zebrafish fasta file using 2bit from UCSC

## Get zebrafish gene annotations

Get zebrafish gene annotations in GTF form from Gencode (v33)


In [None]:
%%bash

mkdir -p /data_dir/zebrafish_experiments/annotations/zebrafish
cd /data_dir/zebrafish_experiments/annotations/zebrafish


In [None]:
%%bash

ANNOTATION_DIR="/data_dir/zebrafish_experiments/annotations/zebrafish/"
curl -s -L ftp://ftp.ensembl.org/pub/release-99/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna_sm.primary_assembly.fa.gz > \
    $ANNOTATION_DIR"Danio_rerio.GRCz11.dna_sm.primary_assembly.fa.gz"
gunzip $ANNOTATION_DIR"Danio_rerio.GRCz11.dna_sm.primary_assembly.fa.gz"
#Toss everything except canonical chromosomes
awk "/^>/ {n++} n>27 {exit} {print}" $ANNOTATION_DIR"Danio_rerio.GRCz11.dna_sm.primary_assembly.fa" > GRCz11_filt_dna_sm.fa


In [None]:
%%bash
curl -s -L ftp://ftp.ensembl.org/pub/release-99/gtf/danio_rerio/Danio_rerio.GRCz11.99.gtf.gz > \
    $ANNOTATION_DIR"Danio_rerio.GRCz11.99.gtf.gz"
gunzip $ANNOTATION_DIR"Danio_rerio.GRCz11.99.gtf.gz"

## Download whitelist for chromiums


In [None]:
%%bash
#V2
curl -s -L https://raw.githubusercontent.com/10XGenomics/cellranger/master/lib/python/cellranger/barcodes/737K-august-2016.txt > \
     /data_dir/zebrafish_experiments/annotations/Chromium_v2_barcodes.txt

#V3
curl -s -L https://github.com/10XGenomics/cellranger/raw/master/lib/python/cellranger/barcodes/3M-february-2018.txt.gz > \
     /data_dir/zebrafish_experiments/annotations/Chromium_v3_barcodes.txt.gz
gunzip /data_dir/zebrafish_experiments/annotations/Chromium_v3_barcodes.txt.gz


# Make genome annotations for STAR and RSEM


In [None]:
%%bash
#Generate genome indices for STAR
mkdir -p /data_dir/zebrafish_experiments/annotations/zebrafish/STAR_ix
STAR --runThreadN 12 \
     --runMode genomeGenerate --outTmpDir /data_dir/zebrafish_experiments/temp/star_tmp \
     --genomeDir /data_dir/zebrafish_experiments/annotations/zebrafish/STAR_ix \
     --genomeFastaFiles /data_dir/zebrafish_experiments/annotations/zebrafish/GRCz11_filt_dna_sm.fa \
     --sjdbGTFfile /data_dir/zebrafish_experiments/annotations/zebrafish/Danio_rerio.GRCz11.99.gtf \
     --sjdbOverhang 149 --limitGenomeGenerateRAM 30000000000 --genomeSAsparseD 2


In [None]:
%%bash

#Generate genome annotations for RSEM
rsem-prepare-reference --gtf /data_dir/zebrafish_experiments/annotations/zebrafish/Danio_rerio.GRCz11.99.gtf  \
                       /data_dir/zebrafish_experiments/annotations/zebrafish/GRCz11_filt_dna_sm.fa \
                       /data_dir/zebrafish_experiments/annotations/zebrafish/STAR_ix 


# Generate annotation for Alevin scRNA-seq analysis

In [4]:
%%bash

cd /data_dir/zebrafish_experiments/annotations/zebrafish/

#Get hard masked GRCz11 genome and filter for only canonical chromosomes
curl -s -L ftp://ftp.ensembl.org/pub/release-99/fasta/danio_rerio/dna/Danio_rerio.GRCz11.dna_rm.primary_assembly.fa.gz > \
    /data_dir/zebrafish_experiments/annotations/zebrafish/Danio_rerio.GRCz11.dna_rm.primary_assembly.fa.gz
gunzip Danio_rerio.GRCz11.dna_rm.primary_assembly.fa.gz
#Toss everything except canonical chromosomes
awk "/^>/ {n++} n>26 {exit} {print}"  /data_dir/zebrafish_experiments/annotations/zebrafish/Danio_rerio.GRCz11.dna_rm.primary_assembly.fa > GRCz11_filt_dna_rm.fa
#Get list of decoy sequences for salmon
grep "^>" GRCz11_filt_dna_rm.fa | cut -d " " -f 1 > decoys.txt
sed -i.bak -e 's/>//g' decoys.txt


gzip: Danio_rerio.GRCz11.dna_rm.primary_assembly.fa already exists;	not overwritten


In [6]:
%%bash

cd /data_dir/zebrafish_experiments/annotations/zebrafish/
#Get cDNA of transcripts for GRCz11
curl -s -L http://ftp.ensembl.org/pub/release-99/fasta/danio_rerio/cdna/Danio_rerio.GRCz11.cdna.all.fa.gz > \
    Danio_rerio.GRCz11.cdna.all.fa.gz
gunzip Danio_rerio.GRCz11.cdna.all.fa.gz

#Filter the transcripts for only those in STAR 
cut -f 1 /data_dir/zebrafish_experiments/annotations/zebrafish/STAR_ix/transcriptInfo.tab | tail -n +2 > in_enst_names.txt
awk '{ if ((NR>1)&&($0~/^>/)) { printf("\n%s", $0); } else if (NR==1) { printf("%s", $0); } else { printf("\t%s", $0); } }' \
    Danio_rerio.GRCz11.cdna.all.fa \
    | grep -Ff in_enst_names.txt - \
    | tr "\t" "\n" > GRCz11_cdna_filt.fa

# coding_genes_hgnc.txt = list of all HGNC protein coding symbols from Biomart using "protein_coding" filter
#Filter the transcripts to ONLY protein coding
awk '{ if ((NR>1)&&($0~/^>/)) { printf("\n%s", $0); } else if (NR==1) { printf("%s", $0); } else { printf("\t%s", $0); } }' \
    GRCz11_cdna_filt.fa \
    | grep -Ff zebra_hgnc_protein.txt - \
    | tr "\t" "\n" > GRCz11_cdna_coding_filt.fa


gzip: Danio_rerio.GRCz11.cdna.all.fa already exists;	not overwritten


### Get cDNA rRNA from 

https://m.ensembl.org/biomart/martview

drop sequences in: GRCz11_cDNA_rRNA.fa rememember, JUST ensembl transcript ID w/ version!



In [22]:
%%bash
#Make gentrome for Salmon
#mkdir /data_dir/zebrafish_experiments/annotations/zebrafish/salmon_ann
cd /data_dir/zebrafish_experiments/annotations/zebrafish/salmon_ann
cat /data_dir/zebrafish_experiments/annotations/zebrafish/GRCz11_cdna_coding_filt.fa \
    /data_dir/zebrafish_experiments/annotations/zebrafish/GRCz11_cDNA_rRNA.fa \
    /data_dir/zebrafish_experiments/annotations/zebrafish/GRCz11_filt_dna_rm.fa \
    > gentrome_GRCz11_filt.fa 

#Full cdna file: Homo_sapiens.GRCh38.cdna.all.fa
#Filtered cdna file: Hg38_cdna_coding_filt.fa


In [None]:
%%bash
#Reduce ram used w/ sparse
salmon index --sparse -t gentrome_GRCz11_filt.fa -d ../decoys.txt -p 14 -i salmon_GRCz11_index


In [29]:
gtf_parse = "/data_dir/zebrafish_experiments/annotations/zebrafish/Danio_rerio.GRCz11.99.gtf"
mt_tran_out = "/data_dir/zebrafish_experiments/annotations/zebrafish/gencode_mt.txt"
out_tran_map = "/data_dir/zebrafish_experiments/annotations/zebrafish/salmon_GRCz11_gencode_tran2gene.txt"

tran_gene_dict = dict()
tran_name_dict = dict()

with open(gtf_parse, "r") as gene_in:
    with open(mt_tran_out,"w") as mito_out:
        for line in gene_in:
            if line.startswith("#"):
                continue
            else:
                arr = line.strip().split("\t")
                if (arr[2] == "transcript"):
                    tmp_arr = arr[-1].strip(';').strip().split(";")
                    tmp_arr = tmp_arr[:-1]
                    arr_tran = [feature.strip().split(" ") for feature in tmp_arr]
                    cur_tran_dict = {key: value for (key, value) in arr_tran}
                    tran_gene_dict[cur_tran_dict["transcript_id"]+"."+cur_tran_dict["transcript_version"]] = cur_tran_dict["gene_id"]+"."+cur_tran_dict["gene_version"]
                    tran_name_dict[cur_tran_dict["transcript_id"]+"."+cur_tran_dict["transcript_version"]] = cur_tran_dict["gene_name"]

                    if arr[0] == "MT":
                        mito_out.write(cur_tran_dict["transcript_id"].strip("\"")+"."+cur_tran_dict["transcript_version"].strip("\"") + "\n")
                        
                        

with open(out_tran_map, "w") as kal_tran:
    for tran in tran_gene_dict.keys():
        out_line = tran.replace("\"","") + "\t" + tran_gene_dict[tran].replace("\"","") + "\n"
        kal_tran.write(out_line)
        

In [30]:
%%bash

#Add rRNA to transcript2gene map
grep ">" /data_dir/zebrafish_experiments/annotations/zebrafish/GRCz11_cDNA_rRNA.fa | \
    sed 's/>//' | awk '{print $1 "\t" $1}' \
    >> /data_dir/zebrafish_experiments/annotations/zebrafish/salmon_GRCz11_gencode_tran2gene.txt


In [31]:
%%bash

#Get transcript IDs of rRNA w/ version ID
grep ">" /data_dir/zebrafish_experiments/annotations/zebrafish/GRCz11_cDNA_rRNA.fa | \
    sed 's/>//' > \
    /data_dir/zebrafish_experiments/annotations/zebrafish/rRNA_ensembl.txt
