In [15]:
using CSV, DataFrames, StatsBase, BioSequences, FASTX

In [16]:
struct Feature
    type::String
    start::Int
    stop::Int
    strand::Char
    phase::Int
    attributes::Dict{String, String}
end

struct Gene
    id::String
    gene::Feature
    mRNAs::Vector{Feature}
    tRNAs::Vector{Feature}
    rRNAs::Vector{Feature}
    ncRNAs::Vector{Feature}
    CDSs::Vector{Feature}
    exons::Vector{Feature}
    introns::Vector{Feature}
    esites::Vector{Feature}
end

function attributes2dict(attribute_string::String)
    dict = Dict{String, String}()
    attributes = split(attribute_string, ";")
    for attribute in attributes
        kvs = split(attribute, "=")
        dict[first(kvs)] = last(kvs)
    end
    dict
end

function createFeature(row)
    Feature(row.feature, row.start, row.stop, row.strand[1], row.phase == "." ? 0 : parse(Int, row.phase), attributes2dict(row.attributes))
end


createFeature (generic function with 1 method)

In [17]:
function add_feature(gene, feature)
    #check feature is on same strand as the gene
    @assert feature.strand == gene.gene.strand "strand error: $(string(feature)) $(string(gene.gene))"
    #check feature is within gene
    @assert feature.start >= gene.gene.start && feature.stop <= gene.gene.stop "boundary error: $(string(feature)) $(string(gene.gene))"
    if feature.type == "mRNA"
        push!(gene.mRNAs, feature)
    elseif feature.type == "tRNA"
        push!(gene.tRNAs, feature)
    elseif feature.type == "rRNA"
        push!(gene.rRNAs, feature)
    elseif feature.type == "ncRNA"
        push!(gene.ncRNAs, feature)
    elseif feature.type == "CDS"
        push!(gene.CDSs, feature)
    elseif feature.type == "exon"
        push!(gene.exons, feature)
    elseif feature.type == "intron"
        push!(gene.introns, feature)
    elseif feature.type == "misc_feature"
        push!(gene.esites, feature)
    else
        println("unknown feature type: ", feature.type)
    end
end

add_feature (generic function with 1 method)

In [18]:
function add_features(feature_type::String)
    gff_features = filter(x->x.feature == feature_type, gff)
    for row in eachrow(gff_features)
        f = createFeature(row)
        gene = get(f.attributes, "gene", nothing)
        if isnothing(gene)
            println("no gene attribute for ", f.attributes["ID"])
        else
            mygene = findfirst(x -> x.id == gene, genes) 
            if isnothing(mygene)
                println("no gene found matching ", gene)
            else
                add_feature(genes[mygene], f)
            end
        end
    end
end

add_features (generic function with 1 method)

In [19]:
ref = FASTA.Reader(open("Phylloglossum_drummondii_mitochondria.fasta")) do infile; first(infile); end
refseq = FASTA.sequence(LongDNA{4}, ref)
refseqcomp = complement(refseq)

gff = CSV.File("Phylloglossum_drummondii_mitochondria.gff", comment = "#", header = ["sequence", "software", "feature", "start", "stop", "score", "strand", "phase", "attributes"]) |> DataFrame

Row,sequence,software,feature,start,stop,score,strand,phase,attributes
Unnamed: 0_level_1,String,String15,String15,Int64,Int64,String15,String1,String1,String
1,Phylloglossum_drummondii_mitochondria,Geneious,region,1,363297,.,+,0,Is_circular=true
2,Phylloglossum_drummondii_mitochondria,Pyrimid,misc_feature,4101,4101,7.16E-206,+,.,note=C to U RNA editing (frequency 0.83)
3,Phylloglossum_drummondii_mitochondria,Geneious,CDS,4441,4815,.,+,0,codon_start=1;gene=rps12
4,Phylloglossum_drummondii_mitochondria,Geneious,gene,4441,4815,.,+,.,Name=rps12;ID=rps12
5,Phylloglossum_drummondii_mitochondria,Pyrimid,misc_feature,4495,4495,1.34E-19,+,.,note=C to U RNA editing (frequency 0.23)
6,Phylloglossum_drummondii_mitochondria,Pyrimid,misc_feature,4614,4614,1.42E-319,+,.,note=C to U RNA editing (frequency 0.76)
7,Phylloglossum_drummondii_mitochondria,Pyrimid,misc_feature,4661,4661,.,+,.,note=C to U RNA editing (frequency 0.78)
8,Phylloglossum_drummondii_mitochondria,Pyrimid,misc_feature,5183,5183,1.53E-92,+,.,note=C to U RNA editing (frequency 0.37)
9,Phylloglossum_drummondii_mitochondria,Pyrimid,misc_feature,5647,5647,4.23E-13,+,.,note=C to U RNA editing (frequency 0.20)
10,Phylloglossum_drummondii_mitochondria,Pyrimid,misc_feature,6996,6996,1.69E-11,+,.,note=C to U RNA editing (frequency 0.14)


In [20]:
gffgenes = filter(x->x.feature == "gene", gff)
genes = Gene[]
for row in eachrow(gffgenes)
    gene = createFeature(row)
    id = get(gene.attributes, "ID", nothing)
    if isnothing(id)
        println("no ID for ", gene.attributes["Name"])
    else
        push!(genes, Gene(gene.attributes["ID"], gene, Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[]))
    end
end
genes

80-element Vector{Gene}:
 Gene("rps12", Feature("gene", 4441, 4815, '+', 0, Dict("Name" => "rps12", "ID" => "rps12")), Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[])
 Gene("cox3", Feature("gene", 7505, 9079, '+', 0, Dict("Name" => "cox3", "ID" => "cox3")), Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[])
 Gene("cox1", Feature("gene", 17633, 26147, '-', 0, Dict("Name" => "cox1", "ID" => "cox1")), Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[])
 Gene("atp4", Feature("gene", 28349, 28921, '-', 0, Dict("Name" => "atp4", "ID" => "atp4")), Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[])
 Gene("trnK-UUU", Feature("gene", 30526, 30598, '-', 0, Dict("Name" => "trnK-UUU", "ID" => "trnK-UUU")), Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[], Feature[])
 Gene("trnS-GGA", Feature("gene", 35833, 35920, '-', 0, Dic

In [21]:
#check for duplicate genes
geneids = [x.id for x in genes]
if length(geneids) ≠ length(unique(geneids))
    println("gene IDs contain duplicates")
    println(filter(x -> last(x) > 1, countmap(geneids)))
end

In [22]:
add_features("tRNA")
add_features("rRNA")
add_features("ncRNA")
add_features("CDS")
for gene in genes
    if gene.gene.strand == '-'
        reverse!(gene.CDSs)
    end
end

In [23]:
genes

80-element Vector{Gene}:
 Gene("rps12", Feature("gene", 4441, 4815, '+', 0, Dict("Name" => "rps12", "ID" => "rps12")), Feature[], Feature[], Feature[], Feature[], Feature[Feature("CDS", 4441, 4815, '+', 0, Dict("gene" => "rps12", "codon_start" => "1"))], Feature[], Feature[], Feature[])
 Gene("cox3", Feature("gene", 7505, 9079, '+', 0, Dict("Name" => "cox3", "ID" => "cox3")), Feature[], Feature[], Feature[], Feature[], Feature[Feature("CDS", 7505, 7675, '+', 0, Dict("gene" => "cox3", "codon_start" => "1")), Feature("CDS", 8453, 9079, '+', 0, Dict("gene" => "cox3", "codon_start" => "1"))], Feature[], Feature[], Feature[])
 Gene("cox1", Feature("gene", 17633, 26147, '-', 0, Dict("Name" => "cox1", "ID" => "cox1")), Feature[], Feature[], Feature[], Feature[], Feature[Feature("CDS", 17633, 18079, '-', 0, Dict("gene" => "cox1")), Feature("CDS", 19334, 19487, '-', 0, Dict("gene" => "cox1")), Feature("CDS", 22345, 23014, '-', 0, Dict("gene" => "cox1")), Feature("CDS", 25823, 26147, '-', 0, Dic

In [24]:
function calculate_codon_position(gene::Gene, position::Int)
    @assert position >= gene.gene.start && position <= gene.gene.stop
    sort!(gene.CDSs, by=x->x.start)
    if gene.gene.strand == '-'; reverse!(gene.CDSs); end
    seq = LongDNA{4}()
    cds_position = 0
    in_cds = false
    for cds in gene.CDSs
        if gene.gene.strand == '+' && position >= cds.stop
            cds_position += cds.stop - cds.start + 1
        elseif gene.gene.strand == '-' && position <= cds.start
            cds_position += cds.stop - cds.start + 1
        elseif gene.gene.strand == '+' && position >= cds.start
            cds_position += position - cds.start + 1
            in_cds = true
        elseif gene.gene.strand == '-' && position <= cds.stop
            cds_position += cds.stop - position + 1
            in_cds = true
        end
        append!(seq, gene.gene.strand == '+' ? LongDNA{4}(refseq[cds.start:cds.stop]) : reverse_complement(LongDNA{4}(refseq[cds.start:cds.stop])))
    end
    codon_no = codon_position = 0
    codon = LongDNA{4}([DNA_Gap, DNA_Gap, DNA_Gap])
    if in_cds
        codon_no = ceil(Int, cds_position/3.0)
        codon_position = cds_position - 3 * (codon_no - 1)
        codon = seq[3 * codon_no - 2:3 * codon_no]
    else
        cds_position = gene.gene.strand == '+' ? position - gene.gene.start + 1 : gene.gene.stop - position + 1
    end
    return cds_position, codon_no, codon_position, codon
end

calculate_codon_position (generic function with 1 method)

In [25]:
dna_stops = LongDNA{4}.([[DNA_T, DNA_A, DNA_A], [DNA_T, DNA_G, DNA_A], [DNA_T, DNA_A, DNA_G]])
rna_stops = convert.(LongRNA{4}, dna_stops)

3-element Vector{LongSequence{RNAAlphabet{4}}}:
 UAA
 UGA
 UAG

In [26]:
function prepact_name(position, gene, base, cds_position, aa, edited_aa)
    if ismissing(gene) || haskey(gene.gene.attributes, "pseudo") || haskey(gene.gene.attributes, "pseudogene")
        return string(position) * "e" * string(base)
    elseif aa == AA_Gap
        return gene.id * "e" * string(base) * string(cds_position)
    else
        return gene.id * "e" * string(base) * string(cds_position) * string(aa) * string(edited_aa)
    end
end

prepact_name (generic function with 1 method)

In [27]:
esites = DataFrame(genome = String[], strand = Char[], position = Int[], id = String[], reference_base = DNA[], edited_base = RNA[], proportion_edited = Float64[], gene = String[], cds_position = Int[], codon_position = Int[],
    codon = LongDNA{4}[], edited_codon = LongRNA{4}[], aa = AminoAcid[], edited_aa = AminoAcid[], synonymous = Union{Missing, Bool}[], creates_start = Bool[], creates_stop = Bool[], removes_stop = Bool[],
     preceding_base = RNA[], subsequent_base = RNA[])
gff_esites = filter(x->occursin("RNA editing", x.attributes), gff)
for row in eachrow(gff_esites)
    genome = "mt"
    position = row.start
    refbase = refseq[position]
    strand = '-'
    reference_base = complement(refbase)
    if refbase == DNA_C || refbase == DNA_T
        strand = '+'
        reference_base = refbase
    end
    edited_base = RNA(row.attributes[11])
    if refbase == DNA_C
        @assert edited_base == RNA_U
    elseif refbase == DNA_T
        @assert edited_base == RNA_C
    end
    proportion_edited = parse(Float64, row.attributes[end-5:end-2])
    mygene = missing
    gene = ""
    cds_position = codon_number = codon_position = 0
    blank = LongDNA{4}([DNA_Gap, DNA_Gap, DNA_Gap])
    codon = blank
    for g in genes
        if position >= g.gene.start && position <= g.gene.stop && strand == g.gene.strand
            mygene = g
            gene = mygene.id
            cds_position, codon_number, codon_position, codon = calculate_codon_position(mygene, position)
            break
        end
    end
    edited_codon = LongRNA{4}([RNA_Gap, RNA_Gap, RNA_Gap])
    aa = AA_Gap
    edited_aa = AA_Gap
    synonymous = missing
    creates_start = creates_stop = removes_stop = false
    if codon ≠ blank
        edited_codon = convert(LongRNA{4}, codon)
        edited_codon[codon_position] = edited_base
        aa = BioSequences.translate(codon)[1]
        edited_aa = BioSequences.translate(edited_codon)[1]
        synonymous = aa == edited_aa
        creates_start = codon_number == 1 && edited_codon == LongRNA{4}([RNA_A, RNA_U, RNA_G])
        creates_stop = position == (strand == '+' ? mygene.gene.stop - 2 : mygene.gene.start + 2)
        removes_stop = codon ∈ dna_stops
    end
    preceding_base = strand == '+' ? refseq[position - 1] : refseqcomp[position + 1]
    subsequent_base = strand == '+' ? refseq[position + 1] : refseqcomp[position - 1]
    id = prepact_name(position, mygene, edited_base, cds_position, aa, edited_aa)
    push!(esites, (genome, strand, position, id, reference_base, edited_base, proportion_edited, gene, cds_position, codon_position, codon, edited_codon, aa, edited_aa,
     synonymous, creates_start, creates_stop, removes_stop, preceding_base, subsequent_base))
end
esites


Row,genome,strand,position,id,reference_base,edited_base,proportion_edited,gene,cds_position,codon_position,codon,edited_codon,aa,edited_aa,synonymous,creates_start,creates_stop,removes_stop,preceding_base,subsequent_base
Unnamed: 0_level_1,String,Char,Int64,String,DNA,RNA,Float64,String,Int64,Int64,LongSequ…,LongSequ…,AminoAcid,AminoAcid,Bool?,Bool,Bool,Bool,RNA,RNA
1,mt,+,4101,4101eU,C,U,0.8,,0,0,---,---,-,-,missing,false,false,false,C,G
2,mt,+,4495,rps12eU55LL,C,U,0.2,rps12,55,1,CTA,UUA,L,L,true,false,false,false,A,U
3,mt,+,4614,rps12eU174TT,C,U,0.7,rps12,174,3,ACC,ACU,T,T,true,false,false,false,C,A
4,mt,+,4661,rps12eU221SL,C,U,0.7,rps12,221,2,TCG,UUG,S,L,false,false,false,false,U,G
5,mt,+,5183,5183eU,C,U,0.3,,0,0,---,---,-,-,missing,false,false,false,U,A
6,mt,+,5647,5647eU,C,U,0.2,,0,0,---,---,-,-,missing,false,false,false,U,A
7,mt,+,6996,6996eU,C,U,0.1,,0,0,---,---,-,-,missing,false,false,false,C,U
8,mt,+,7449,7449eU,C,U,0.6,,0,0,---,---,-,-,missing,false,false,false,A,U
9,mt,+,8548,cox3eU267FF,C,U,0.1,cox3,267,3,TTC,UUU,F,F,true,false,false,false,U,A
10,mt,+,8568,cox3eU287SF,C,U,0.9,cox3,287,2,TCT,UUU,S,F,false,false,false,false,U,U


In [28]:
CSV.write("mt editing events.tsv", esites, delim='\t')

"mt editing events.tsv"

In [29]:
nrow(esites)

362

In [30]:
countmap(esites.codon_position)

Dict{Int64, Int64} with 4 entries:
  0 => 84
  2 => 173
  3 => 27
  1 => 78

In [32]:
count(skipmissing(esites.synonymous) .== true)

33

In [33]:
countmap(esites.preceding_base)

Dict{RNA, Int64} with 4 entries:
  RNA_C => 106
  RNA_U => 204
  RNA_A => 43
  RNA_G => 9

In [18]:
count(esites.creates_start)

7

In [19]:
count(esites.creates_stop)

7

In [20]:
count(esites.removes_stop)

4