In [1]:
library(tidyverse)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.4.1     [32m✔[39m [34mpurrr  [39m 1.0.1
[32m✔[39m [34mtibble [39m 3.2.1     [32m✔[39m [34mdplyr  [39m 1.1.2
[32m✔[39m [34mtidyr  [39m 1.3.0     [32m✔[39m [34mstringr[39m 1.5.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [2]:
# Read the "All" sheet of the curation spreadsheet through its CSV export URL.
# annotation_source is forced to character; every other column type is guessed
# by readr. janitor::clean_names() then normalises the column names.
a = janitor::clean_names(
    read_csv(
        "https://docs.google.com/spreadsheets/d/1rlotaQ-u83HLd5ZGjR-UnCxC2Hzg8gzEWCGy37s_J7c/gviz/tq?tqx=out:csv&sheet=All",
        col_types = cols(annotation_source = "c")
    )
)

[1m[22mNew names:
[36m•[39m `` -> `...7`
[36m•[39m `` -> `...28`


In [3]:
# Lookup table: kevin_decision value (name) -> annotation source to take (value).
# The names double as the list of accepted kevin_decision values.
annotation_srcs = c(
    aug_liftoff_cds_agree_use_liftoff                         = "liftoff",
    aug_liftoff_cds_disagree_use_aug                          = "augustus",
    aug_liftoff_pasa_agree_subgene_tama_merges_use_pasa       = "pasa",
    aug_liftoff_pasa_cds_agree_use_pasa                       = "pasa",
    aug_pasa_cds_agree_use_pasa                               = "pasa",
    aug_pasa_cds_disagree_but_cds_too_long_for_isoseq_use_aug = "augustus",
    liftoff_augustus_agree_subgene_isoseq_merges_use_liftoff  = "liftoff",
    only_aug                                                  = "augustus",
    only_liftoff                                              = "liftoff",
    pasa_cds_different_use_pasa                               = "pasa",
    pasa_liftoff_cds_agree_use_pasa                           = "pasa",
    TOO_HARD                                                  = "manual"
)

In [4]:
# Sanity check on the curated sheet. Among rows whose chromosome does not start
# with "at6137", keep those where at least one curated column holds something
# unexpected, then display them.
funky = a %>%
    filter(
        !grepl("^at6137", chromosome),
        # no chromosome at all
        is.na(chromosome) |
            # a decision is given but is not one of the lookup's names
            !(is.na(kevin_decision) | kevin_decision %in% names(annotation_srcs)) |
            # readiness is missing or is anything other than R / NR
            !(readiness %in% c("R", "NR")) |
            # an annotation source is given but is not a known one
            !(is.na(annotation_source) | annotation_source %in% c("do_not_export", unique(annotation_srcs))) |
            # a pseudogene label is given but lacks an accepted prefix
            !(is.na(pseudogene) | grepl("^protopseudogene_?|^pseudogene_?|^remnant_?", pseudogene, perl = TRUE))
    )
funky

number,luisa_id,chromosome,locus_start,locus_end,locus_name,x7,strand,locus_coords,chrom_gene,⋯,kevin_decision,manual_annotation_status,notes,pseudogene,readiness,annotation_source,luisa_fix_me_please,not_nlr,v4,x28
<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>
,,,,,,,,,,⋯,,,remaining HR4 cluster members,,NR,,,,,
,,,,,,,,,,⋯,,,HR4 cluster members (highest so far) with total of 22.,,NR,,,,,
,,,,,,,,at9336_9_u244:51927-54716,,⋯,,,not nlr,,NR,,,,,
,,,,,,,,at9336_9_u244:54934-56168,,⋯,,,not nlr,,NR,,,,,


In [5]:
# Save the flagged rows for inspection. The output directory is created first
# (a no-op if it already exists) so the cell also runs in a fresh checkout.
dir.create("tmp", showWarnings = FALSE, recursive = TRUE)
write_tsv(funky, "tmp/funky.tsv")

In [6]:
# Build the per-locus decision table from the curated sheet, keeping only rows
# that have a locus_name.
# transmute() evaluates its arguments in order, so later expressions see the
# columns defined by earlier ones: the annotation_source rule below tests the
# cleaned manual_annotation_status and the filled-in readiness.
core_decisions = a %>%
    transmute(
        accession = sub("_1_chr.", "", chromosome),
        chromosome, locus_start, locus_end, locus_name, strand, locus_coords, kevin_decision, pseudogene,

        # Keep the raw status so an "already ..." entry can still be detected below.
        status_already = manual_annotation_status,
        # luisa's "already doing" means the same as "doing": strip the prefix.
        manual_annotation_status = sub("already ", "", manual_annotation_status),
        # An explicit readiness wins. Otherwise, for TOO_HARD loci, take it from the
        # manual annotation status: done = ready, doing = not ready. Anything else
        # (including a blank status) leaves readiness as it was, i.e. NA.
        readiness = case_when(
            !is.na(readiness) ~ readiness,
            kevin_decision == "TOO_HARD" & manual_annotation_status == "done" ~ "R",
            kevin_decision == "TOO_HARD" & manual_annotation_status == "doing" ~ "NR",
            TRUE ~ readiness
        ),
        # "#ERROR!" in the sheet is a "+" that was interpreted as a broken formula.
        strand = case_when(
            strand == "#ERROR!" ~ "+",
            TRUE ~ strand
        ),
        # Flag for loci with no manual input: not TOO_HARD, no annotation_source in
        # the sheet, no manual_annotation_status, and not the
        # aug_liftoff_cds_disagree_use_aug decision.
        take_auto = !(kevin_decision == "TOO_HARD" | !is.na(annotation_source) | !is.na(manual_annotation_status)) &
            kevin_decision != "aug_liftoff_cds_disagree_use_aug",
        # Keep manual decisions on the annotation source; otherwise derive it,
        # falling back to the annotation_srcs lookup. First matching rule wins.
        annotation_source = case_when(
            !is.na(annotation_source) ~ annotation_source,
            grepl("already", status_already) ~ "do_not_export",
            # Element-wise `&` throughout: the scalar `&&` used here before errors
            # on length > 1 vectors in R >= 4.3 and, in older R, decided this rule
            # for every row from the first row alone. The former is.na() test is
            # dropped because the first rule already handles non-NA sources.
            kevin_decision == "TOO_HARD" & manual_annotation_status == "done" & readiness == "R" ~ "manual",
            # unname() so the column is a plain character vector without the
            # lookup's element names.
            kevin_decision %in% names(annotation_srcs) ~ unname(annotation_srcs[kevin_decision])
        )
    ) %>%
    filter(!is.na(locus_name))

In [7]:
# Loci marked ready ("R"). filter() also drops rows whose readiness is NA.
ready = filter(core_decisions, readiness == "R")

In [8]:
# Export the ready loci, writing NA as an empty field. The output directory is
# created first (a no-op if it already exists) so the cell also runs in a fresh
# checkout.
dir.create("post-curation", showWarnings = FALSE, recursive = TRUE)
write_tsv(ready, "post-curation/ready.tsv", na = "")

In [9]:
# Consistency check: list every (accession, locus_name) pair that appears with
# more than one distinct combination of chromosome, coordinates and strand.
# count() replaces group_by() + summarise(): same rows and columns, but the
# result is ungrouped and no "grouped output" message is emitted.
core_decisions %>%
    select(accession, chromosome, locus_start, locus_end, locus_name, strand) %>%
    unique() %>%
    count(accession, locus_name) %>%
    filter(n != 1)

[1m[22m`summarise()` has grouped output by 'accession'. You can override using the `.groups` argument.


accession,locus_name,n
<chr>,<chr>,<int>


In [10]:
# Number of ready loci per accession, leaving out those marked do_not_export
# (rows with an NA annotation_source are dropped by the comparison as well).
ready %>%
    filter(annotation_source != "do_not_export") %>%
    count(accession)

accession,n
<chr>,<int>
at6923,245
at6929,222
at7143,241
at8285,225
at9104,261
at9336,227
at9503,253
at9578,247
at9744,247
at9762,241
