Skip to content

Creating RNAseq annotation files from Ensembl

Chris Miller edited this page Jul 29, 2019 · 11 revisions

This is a rough script showing the necessary files and the commands to create them.

ens_version=95
genome_ref_build_dir=/gscmnt/gc2560/core/model_data/01b6db5e23924e738efc3e398838bf12/build0b5a8d3795254ae9bd6fa80420656257/
dirname=GRC-mouse-build38_mouse_${ens_version}_38

# make a directory structure to hold the results:
mkdir -p $dirname/rna_seq_annotation/hisat2.1.0_index
cd $dirname
for i in all_sequences.fa all_sequences.fa.fai all_sequences.dict all_sequences.genome;do 
  if [[ -s $genome_ref_build_dir/$i ]];then 
    #linked twice for convenience, more than anything else
    ln -s $genome_ref_build_dir/$i
    ln -s $genome_ref_build_dir/$i hisat2.1.0_index/
  else
     echo "ERROR: reference file $i not found in $genome_ref_build_dir"
  fi
done

#get the GTF from ensembl
wget ftp://ftp.ensembl.org/pub/release-${ens_version}/gtf/mus_musculus/Mus_musculus.GRCm38.${ens_version}.gtf.gz
gunzip Mus_musculus.GRCm38.${ens_version}.gtf.gz

#ONLY if this is human, then we need to do this step (converting 1 -> chr1, etc)
# inputs are the dict, the synonyms file from the appropriate vep cache, and the gtf
perl ~cmiller/oneoffs/convertEnsemblGTF.pl all_sequences.dict /gscmnt/gc2560/core/cwl/inputs/VEP_cache/homo_sapiens/95_GRCh38/chr_synonyms.txt Homo_sapiens.GRCh38.95.gtf >zz && mv zz Homo_sapiens.GRCh38.95.gtf

#get the cDNA from ensembl
wget ftp://ftp.ensembl.org/pub/release-${ens_version}/fasta/mus_musculus/cdna/Mus_musculus.GRCm38.cdna.all.fa.gz

#create the kallisto index
mkdir logs
bsub -M 16000000 -R 'select[mem>16000] span[hosts=1] rusage[mem=16000]' -oo logs/kallisto_index.log -q research-hpc -a "docker(mgibio/rnaseq)" -J kallisto "kallisto index -i Mus_musculus.GRCm38.cdna.all.fa.kallisto.idx Mus_musculus.GRCm38.cdna.all.fa.gz"


#make the refflat file
/gscuser/cmiller/usr/bin/gtfToGenePred -genePredExt -geneNameAsName2 -ignoreGroupsWithoutExons Mus_musculus.GRCm38.${ens_version}.gtf /dev/stdout | awk 'BEGIN { OFS="\t"} {print $12, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10}' >Mus_musculus.GRCm38.95.refFlat.txt

#make the ribosomal file
cat all_sequences.dict >Mus_musculus.GRCm38.${ens_version}.ribo_intervals
grep 'gene_biotype "rRNA"' Mus_musculus.GRCm38.${ens_version}.gtf | awk '$3 == "exon"' | cut -f1,4,5,7,9 | perl -lane '/transcript_id "([^"]+)"/ or die "no transcript_id on $.";print join "\t", (@F[0,1,2,3], $1)' | sort -k1V -k2n -k3n >>Mus_musculus.GRCm38.${ens_version}.ribo_intervals

#make a hisat index
#NOTE - if you're just updating the ensembl build and not the reference, you can skip this step and copy the other index over!
echo "This is a placeholder file to pass to CWL to refer to all the index files and be a parameter to HISAT2." >hisat2.1.0_index/mm38
bsub -M 16000000 -R 'select[mem>16000] span[hosts=1] rusage[mem=16000]' -p 8 -oo logs/hisat.log -q research-hpc -a "docker(mgibio/rnaseq)" -J hisat "/opt/hisat2/hisat2*/hisat2-build -p 8 $genome_ref_build_dir/all_sequences.fa hisat2.1.0_index/mm38"

#make STAR index
#NOTE - STAR version 2.7.0f
LSF_DOCKER_PRESERVE_ENVIRONMENT=false bsub -J star_index -n 10 -q research-hpc -oo logs/star_index.log -R "select[mem>64000] rusage[mem=64000]" -M 64000000 -a "docker(mgibio/star)" /bin/bash -c "STAR --runMode genomeGenerate --runThreadN 10 --genomeDir star_2.7.0f_index --genomeFastaFiles all_sequences.fa --sjdbGTFfile Mus_musculus.GRCm38.95.gtf --sjdbOverhang 100"

#kallisto gene translation table
#from an image with biomart/R like 'docker(chrisamiller/docker-genomic-analysis)':
Rscript ~cmiller/oneoffs/ensembl_transcript_to_gene_table.R http://jan2019.archive.ensembl.org/ mmusculus ensembl95.transcriptToGene.tsv
Clone this wiki locally