Merge cd5e51b into 312f5f3
cecileane committed Jul 13, 2020
2 parents 312f5f3 + cd5e51b commit d5de63b
Showing 30 changed files with 154 additions and 156 deletions.
4 changes: 2 additions & 2 deletions Project.toml
@@ -27,9 +27,9 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"
[compat]
BioSequences = "1.0, 1.1"
BioSymbols = "3.0, 3.1"
CSV = "0.4, 0.5, 0.6"
CSV = "0.4, 0.5, 0.6, 0.7"
Combinatorics = "0.7, 1.0"
DataFrames = "0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20"
DataFrames = "0.21"
DataStructures = "0.9, 0.10, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17"
Distributions = "0.15, 0.16, 0.17, 0.18, 0.19, 0.20, 0.21, 0.22, 0.23"
GLM = "1.1, 1.2, 1.3"
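These compat bumps pair with the reading idiom used throughout this commit: wrapping `CSV.File` in `DataFrame!` instead of calling `CSV.read`. A minimal sketch, with a hypothetical file name:

```julia
using CSV, DataFrames  # CSV 0.6/0.7, DataFrames 0.21

# old idiom: df = CSV.read("tableCF.csv")
df = DataFrame!(CSV.File("tableCF.csv"))  # builds a DataFrame from the parsed file without copying columns
```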
2 changes: 1 addition & 1 deletion README.md
@@ -3,7 +3,7 @@
[![Build Status](https://travis-ci.org/crsl4/PhyloNetworks.jl.svg)](https://travis-ci.org/crsl4/PhyloNetworks.jl)
[![](https://img.shields.io/badge/docs-stable-blue.svg)](https://crsl4.github.io/PhyloNetworks.jl/stable)
[![](https://img.shields.io/badge/docs-dev-blue.svg)](https://crsl4.github.io/PhyloNetworks.jl/dev)
[![codecov.io](http://codecov.io/github/crsl4/PhyloNetworks.jl/coverage.svg?branch=master)](http://codecov.io/github/crsl4/PhyloNetworks.jl?branch=master)
[![codecov](https://codecov.io/gh/crsl4/PhyloNetworks.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/crsl4/PhyloNetworks.jl)
[![Coverage Status](https://coveralls.io/repos/crsl4/PhyloNetworks.jl/badge.svg?branch=master&service=github)](https://coveralls.io/github/crsl4/PhyloNetworks?branch=master)

## Overview
5 changes: 3 additions & 2 deletions docs/Project.toml
@@ -14,6 +14,7 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d"

[compat]
BioSymbols = "3.0, 3.1"
DataFrames = "0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20"
CSV = "0.4, 0.5, 0.6, 0.7"
DataFrames = "0.21"
Documenter = "~0.24"
RCall = "0.11, 0.12, = 0.13.1, = 0.13.2, = 0.13.3, = 0.13.4"
RCall = "0.13.7"
5 changes: 5 additions & 0 deletions docs/readme.md
@@ -107,6 +107,11 @@ pkg> add PhyloPlots#master # to get the master branch: done by make.jl
julia> include("make.jl")
```

or, after project & manifest setup:
```shell
julia --project --color=yes -e 'include("make.jl")'
```

it will:
- test the `jldoctest` blocks of examples in the docstrings
- create or update a `build/` directory with html files.
4 changes: 2 additions & 2 deletions docs/src/man/bootstrap.md
@@ -11,8 +11,8 @@ There are two ways to do a bootstrap analysis.

- From quartet CFs with credibility intervals, such as if we used BUCKy. The [TICR pipeline](@ref) outputs a CF table with extra columns for credibility intervals. We could then read that table and get bootstrap networks like this, and tweak options as needed:
```julia
using CSV
df = CSV.read("tableCF_withCI.csv")
using DataFrames, CSV
df = DataFrame!(CSV.File("tableCF_withCI.csv"))
bootnet = bootsnaq(startnetwork, df, hmax=1, filename="bootstrap")
```

2 changes: 1 addition & 1 deletion docs/src/man/inputdata.md
@@ -136,7 +136,7 @@ first to read the file and convert it to a 'DataFrame' object,
and then to convert this DataFrame into a DataCF object.
```@repl qcf
using CSV, DataFrames
dat = CSV.read(buckyCFfile);
dat = DataFrame!(CSV.File(buckyCFfile));
first(dat, 6) # to see the first 6 rows
buckyCF = readTableCF(dat)
writeTableCF(buckyCF)
2 changes: 1 addition & 1 deletion docs/src/man/multiplealleles.md
@@ -8,7 +8,7 @@ to a species, and if only the species-level network needs to be estimated,
then the following functions can be used:

```julia
tm = CSV.read(mappingFile) # taxon map as a data frame
tm = DataFrame!(CSV.File(mappingFile)) # taxon map as a data frame
taxonmap = Dict(tm[i,:allele] => tm[i,:species] for i in 1:110) # taxon map as a dictionary
```
4 changes: 2 additions & 2 deletions docs/src/man/nj.md
@@ -1,6 +1,6 @@
```@setup nj
using PhyloNetworks
using CSV
using DataFrames, CSV
```
# Neighbor joining

@@ -14,7 +14,7 @@ names (headers) are used as taxon names. Rows are assumed to
correspond to taxa in the same order as they do in columns.

```@repl nj
D = CSV.read(joinpath(dirname(pathof(PhyloNetworks)), "..","examples","caudata_dist.txt"));
D = DataFrame!(CSV.File(joinpath(dirname(pathof(PhyloNetworks)), "..","examples","caudata_dist.txt")));
tree = nj(D)
```

2 changes: 1 addition & 1 deletion docs/src/man/parsimony.md
@@ -34,7 +34,7 @@ First, we need to read the trait table as a DataFrame object:
```@repl parsimony
using CSV, DataFrames
csvfile = joinpath(dirname(pathof(PhyloNetworks)), "..","examples","Swadesh.csv");
dat = CSV.read(csvfile);
dat = DataFrame!(CSV.File(csvfile));
first(dat, 6) # to see the first 6 rows
```

4 changes: 2 additions & 2 deletions docs/src/man/snaq_plot.md
@@ -221,8 +221,8 @@ using Distributed
addprocs(nruns)
@everywhere using PhyloNetworks
net0 = readTopology("astraltree.tre");
using CSV
df_sp = CSV.read("tableCF_speciesNames.csv", categorical=false);
using DataFrames, CSV
df_sp = DataFrame!(CSV.File("tableCF_speciesNames.csv", pool=false));
d_sp = readTableCF!(df_sp);
net = snaq!(net0, d_sp, hmax=h, filename=outputfile, seed=seed, runs=nruns)
```
2 changes: 1 addition & 1 deletion docs/src/man/trait_tree.md
@@ -485,7 +485,7 @@ We can use this dataframe as regressors in the `phyloNetworklm` function.

```@example tree_trait
dat = DataFrame(trait = trait_sh, tipNames = tipLabels(sim_sh)) # Data
dat = join(dat, df_shift, on=:tipNames) # join the two
dat = innerjoin(dat, df_shift, on=:tipNames) # join the two
fit_sh = phyloNetworklm(@formula(trait ~ shift_6), dat, truenet) # fit
```
Here, because there is only one hybrid in the network, we can directly
2 changes: 1 addition & 1 deletion src/PhyloNetworks.jl
@@ -15,7 +15,7 @@ module PhyloNetworks
using BioSymbols
using Combinatorics: combinations
using CSV
using DataFrames
using DataFrames # innerjoin new in v0.21
using DataStructures # for updateInCycle with priority queue
using Distributions #for RateVariationAcrossSites
using GLM # for the lm function
2 changes: 1 addition & 1 deletion src/auxiliary.jl
@@ -1599,7 +1599,7 @@ from node 1 to node 2, the 3-cycle (left) is shrunk as on the right:
hybrid
with new branch lengths:
new tA = tA + (γ1.t1 + γ2γ3.t3)/(γ1+γ2γ3),
new tA = tA + (γ1.t1 + γ2γ3.(t2+t3))/(γ1+γ2γ3),
new tB = tB + t2,
provided that γ1, γ2=1-γ1, and γ3 are not missing. If one of them is missing
then γ1 and γ2 remain as is, and e3 is deleted naively,
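A small numeric check of the corrected branch-length formula above, with hypothetical values for the γ's and branch lengths:

```julia
# hypothetical values, only to illustrate the corrected docstring formula
γ1, γ3 = 0.7, 0.6
γ2 = 1 - γ1
tA, tB, t1, t2, t3 = 1.0, 2.0, 0.1, 0.2, 0.3

new_tA = tA + (γ1*t1 + γ2*γ3*(t2 + t3)) / (γ1 + γ2*γ3)  # the fix: t2+t3, not t3 alone
new_tB = tB + t2
```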
13 changes: 7 additions & 6 deletions src/bootstrap.jl
@@ -19,7 +19,7 @@ if not given as absolute paths.
"""
function readBootstrapTrees(filelist::AbstractString; relative2listfile=true::Bool)
filelistdir = dirname(filelist)
bootfiles = CSV.read(filelist, header=false, types=Dict(1=>String))
bootfiles = DataFrame!(CSV.File(filelist, header=false, types=Dict(1=>String)))
size(bootfiles)[2] > 0 ||
error("there should be a column in file $filelist: with a single bootstrap file name on each row (no header)")
ngenes = size(bootfiles)[1]
@@ -113,9 +113,9 @@ optional argument: `delim=','` by default: how columns are delimited.
function sampleCFfromCI(df::DataFrame, seed=0::Integer)
@debug "order of columns should be: t1,t2,t3,t4,cf1234,cf1324,cf1423,cf1234LO,cf1234HI,..."
size(df,2) == 13 || size(df,2) == 14 || @warn "sampleCFfromCI function assumes table from TICR: CF, CFlo, CFhi"
obsCFcol = [findfirst(isequal(:CF12_34), DataFrames.names(df)),
findfirst(isequal(:CF13_24), DataFrames.names(df)),
findfirst(isequal(:CF14_23), DataFrames.names(df))]
obsCFcol = [findfirst(isequal(:CF12_34), DataFrames.propertynames(df)),
findfirst(isequal(:CF13_24), DataFrames.propertynames(df)),
findfirst(isequal(:CF14_23), DataFrames.propertynames(df))]
nothing ∉ obsCFcol || error("""CF columns were not found: should be named like 'CF12_34'""")
obsCFcol == [5,8,11] ||
@warn """CF columns were found, but not in the expected columns.
@@ -154,7 +154,8 @@ function sampleCFfromCI!(df::DataFrame, seed=0::Integer)
return df
end

sampleCFfromCI(file::AbstractString; delim=','::Char,seed=0::Integer) = sampleCFfromCI(CSV.read(file, delim=delim),seed)
sampleCFfromCI(file::AbstractString; delim=','::Char,seed=0::Integer) =
sampleCFfromCI(DataFrame!(CSV.File(file, delim=delim)),seed)

# function that will do bootstrap of snaq estimation in series
# it repeats optTopRuns nrep times
@@ -345,7 +346,7 @@ function bootsnaq(startnet::HybridNetwork, data::Union{DataFrame,Vector{Vector{H
error("Input data not recognized: $(typeof(data))")

if !inputastrees
(DataFrames.names(data)[[6,7,9,10,12,13]] == [:CF12_34_lo,:CF12_34_hi,:CF13_24_lo,:CF13_24_hi,:CF14_23_lo,:CF14_23_hi]) ||
(DataFrames.propertynames(data)[[6,7,9,10,12,13]] == [:CF12_34_lo,:CF12_34_hi,:CF13_24_lo,:CF13_24_hi,:CF14_23_lo,:CF14_23_hi]) ||
@warn """assume table with CI from TICR: CFlo, CFhi in columns 6,7; 9,10; and 12,13.
Found different column names: $(DataFrames.names(data)[[6,7,9,10,12,13]])"""
else # check 1+ genes, each with 1+ trees, all with h=0.
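The `names` → `propertynames` switch here (and in the files below) reflects that `DataFrames.names(df)` returns `String`s in DataFrames 0.21, while these lookups compare against `Symbol`s. A toy sketch:

```julia
using DataFrames

df = DataFrame(CF12_34 = [0.5], CF13_24 = [0.3], CF14_23 = [0.2])  # toy table
names(df)          # ["CF12_34", "CF13_24", "CF14_23"]  (Strings in 0.21)
propertynames(df)  # [:CF12_34, :CF13_24, :CF14_23]     (Symbols)
findfirst(isequal(:CF12_34), propertynames(df))  # 1
```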
4 changes: 2 additions & 2 deletions src/moves_semidirected.jl
@@ -743,7 +743,7 @@ individual. If a species is in the network but not listed in the mapping file,
the tip for that species is left as is. Species listed in the mapping file
but not present in the network are ignored.
The mapping file should be readable by `CSV.read` and contain two columns:
The mapping file should be readable by `CSV.File` and contain two columns:
one for the species names and one for the individual (or allele) names.
fixit: make this function more flexible by accepting column names
@@ -775,7 +775,7 @@ julia> species_constraints
```
"""
function mapindividuals(net::HybridNetwork, mappingFile::String)
mappingDF = CSV.read(mappingFile)
mappingDF = DataFrame!(CSV.File(mappingFile))
specieslist = unique(mappingDF[:, 1])
individualnet = deepcopy(net)
constraints = TopologyConstraint[]
33 changes: 16 additions & 17 deletions src/multipleAlleles.jl
@@ -17,9 +17,9 @@ Optional arguments:
- file name to write/save resulting CF table. If not specified, then the output
data frame is not saved to a file.
- column numbers for the taxon names. 1-4 by default.
- any keyword arguments that `CSV.read` would accept.
- any keyword arguments that `CSV.File` would accept.
For example, delim=',' by default: columns are delimited by commas.
Unless specified otherwise by the user, `categorical`=false
Unless specified otherwise by the user, `pool`=false
(to read taxon names as Strings, not levels of a categorical factor,
for combining the 4 columns with taxon names more easily).
The same CSV arguments are used to read both input file (mapping file and quartet file)
@@ -28,26 +28,23 @@ See also [`mapAllelesCFtable!`](@ref) to input DataFrames instead of file names.
If a `filename` is specified, such as "quartetCF_speciesNames.csv"
in the example below, this file is best read later with the option
`categorical=false`. example:
`pool=false`. example:
```julia
mapAllelesCFtable("allele-species-map.csv", "allele-quartet-CF.csv";
filename = "quartetCF_speciesNames.csv")
df_sp = CSV.read("quartetCF_speciesNames.csv"); # DataFrame object
df_sp = DataFrame!(CSV.File("quartetCF_speciesNames.csv")); # DataFrame object
dataCF_specieslevel = readTableCF!(df_sp); # DataCF object
```
"""
function mapAllelesCFtable(alleleDF::AbstractString, cfDF::AbstractString;
filename=""::AbstractString, columns=Int[]::Vector{Int}, CSVargs...)
# force categorical=false unless the user wants otherwise
if :categorical ∉ [pair[1] for pair in CSVargs]
CSVargs = (CSVargs..., :categorical=>false)
# force pool=false unless the user wants otherwise
if :pool ∉ [pair[1] for pair in CSVargs]
CSVargs = (CSVargs..., :pool=>false)
end
# force :copycols = true in CSV arguments, even if user asks otherwise
CSVargs = [pair for pair in CSVargs if pair[1] != :copycols]
CSVargs = (CSVargs..., :copycols=>true)
d = CSV.read(alleleDF; CSVargs...)
d2 = CSV.read(cfDF; CSVargs...)
d = DataFrame!(CSV.File(alleleDF; CSVargs...))
d2 = DataFrame!(CSV.File(cfDF; CSVargs...))
mapAllelesCFtable!(d2,d, columns, filename != "", filename)
end

@@ -187,9 +184,11 @@ end
# (if info on number of genes is provided) or simple average
function mergeRows(df::DataFrame, cols::Vector{Int})
sorttaxa!(df, cols) # sort taxa alphabetically within each row
colnam = names(df)[cols[5:end]]
df = aggregate(df, names(df)[cols[1:4]], mean);
rename!(df, Dict((Symbol(n, "_mean"), n) for n in colnam) )
colnamtax = DataFrames.propertynames(df)[cols[1:4]]
colnam = DataFrames.propertynames(df)[cols[5:end]]
df = combine(groupby(df, colnamtax, sort=false, skipmissing=false),
colnam .=> mean .=> colnam)
# rename!(df, Dict((Symbol(n, "_mean"), n) for n in colnam) )
n4tax = size(df,1) # total number of 4-taxon sets
print("$n4tax unique 4-taxon sets were found. CF values of repeated 4-taxon sets will be averaged")
println((length(cols)>7 ? " (ngenes too)." : "."))
@@ -267,8 +266,8 @@ end
# function to check that the allele df has one column labelled alleles and one column labelled species
function checkMapDF(alleleDF::DataFrame)
size(alleleDF,2) >= 2 || error("Allele-Species matching Dataframe should have at least 2 columns")
:allele in names(alleleDF) || error("In allele mapping file there is no column named allele")
:species in names(alleleDF) || error("In allele mapping file there is no column named species")
:allele in DataFrames.propertynames(alleleDF) || error("In allele mapping file there is no column named allele")
:species in DataFrames.propertynames(alleleDF) || error("In allele mapping file there is no column named species")
end


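The `aggregate` call removed from `mergeRows` above is replaced by the `groupby`/`combine` idiom of DataFrames 0.21. A minimal sketch with a hypothetical CF table:

```julia
using DataFrames, Statistics

# toy table: 4 taxon columns, then CF columns to average over repeated 4-taxon sets
df = DataFrame(t1=["a","a"], t2=["b","b"], t3=["c","c"], t4=["d","d"],
               CF12_34=[0.5,0.7], CF13_24=[0.3,0.2], CF14_23=[0.2,0.1])
taxcols = propertynames(df)[1:4]
cfcols  = propertynames(df)[5:end]
combine(groupby(df, taxcols), cfcols .=> mean .=> cfcols)  # one row per unique 4-taxon set
```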
2 changes: 1 addition & 1 deletion src/nj.jl
@@ -160,5 +160,5 @@ For the algorithm, see
See [`nj!`](@ref) for using a matrix as input.
"""
function nj(D::DataFrame; force_nonnegative_edges::Bool=false)
nj!(convert(Matrix{Float64}, D), string.(names(D)); force_nonnegative_edges=force_nonnegative_edges)
nj!(convert(Matrix{Float64}, D), names(D); force_nonnegative_edges=force_nonnegative_edges)
end
20 changes: 10 additions & 10 deletions src/parsimony.jl
@@ -140,10 +140,10 @@ function parsimonyDiscreteFitch(net::HybridNetwork, tips::Dict{String,T}) where
end

function parsimonyDiscreteFitch(net::HybridNetwork, dat::DataFrame)
i = findfirst(isequal(:taxon), DataFrames.names(dat))
if i===nothing i = findfirst(isequal(:species), DataFrames.names(dat)); end
i = findfirst(isequal(:taxon), DataFrames.propertynames(dat))
if i===nothing i = findfirst(isequal(:species), DataFrames.propertynames(dat)); end
if i===nothing i=1; end # first column if no column named "taxon" or "species"
j = findfirst(isequal(:trait), DataFrames.names(dat))
j = findfirst(isequal(:trait), DataFrames.propertynames(dat))
if j===nothing j=2; end
if i==j
error("""expecting taxon names in column 'taxon', or 'species' or column 1,
@@ -252,10 +252,10 @@ function parsimonySoftwired(net::HybridNetwork, tips::Dict{String,T}) where {T}
end

function parsimonySoftwired(net::HybridNetwork, dat::DataFrame)
i = findfirst(isequal(:taxon), DataFrames.names(dat))
if i===nothing i = findfirst(isequal(:species), DataFrames.names(dat)); end
i = findfirst(isequal(:taxon), DataFrames.propertynames(dat))
if i===nothing i = findfirst(isequal(:species), DataFrames.propertynames(dat)); end
if i===nothing i=1; end # first column if no column named "taxon" or "species"
j = findfirst(isequal(:trait), DataFrames.names(dat))
j = findfirst(isequal(:trait), DataFrames.propertynames(dat))
if j===nothing j=2; end
if i==j
error("""expecting taxon names in column 'taxon', or 'species' or column 1,
@@ -447,7 +447,7 @@ function readfastatodna(fastafile::String, countPatterns=false::Bool)

#create dat here
dat = DataFrame(siteList)
insertcols!(dat, 1, taxon = species)
insertcols!(dat, 1, :taxon => species)
return (dat, weights)
end
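`insertcols!` in DataFrames 0.21 takes `Pair`s rather than keyword arguments, hence the change above. A short, hypothetical sketch:

```julia
using DataFrames

dat = DataFrame(site1 = ["A", "C"])                  # hypothetical alignment column
insertcols!(dat, 1, :taxon => ["taxon1", "taxon2"])  # Pair syntax, as in the change above
```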

@@ -465,8 +465,8 @@ Warning:
- will use all other columns as characters
"""
function readCSVtoArray(dat::DataFrame)
i = findfirst(isequal(:taxon), DataFrames.names(dat))
if i===nothing i = findfirst(isequal(:species), DataFrames.names(dat)); end
i = findfirst(isequal(:taxon), DataFrames.propertynames(dat))
if i===nothing i = findfirst(isequal(:species), DataFrames.propertynames(dat)); end
if i===nothing
@warn "expecting taxon names in column 'taxon', or 'species', so will assume column 1"
i = 1
@@ -494,7 +494,7 @@ function readCSVtoArray(dat::DataFrame)
end

function readCSVtoArray(filename::String)
dat = CSV.read(filename)
dat = DataFrame!(CSV.File(filename))
readCSVtoArray(dat)
end

19 changes: 9 additions & 10 deletions src/readData.jl
@@ -89,7 +89,7 @@ The last version modifies the input data frame, if species are represented by mu
for instance (see [`readTableCF!`](@ref)(data frame, columns)).
"""
function readTableCF(file::AbstractString; delim=','::Char, summaryfile=""::AbstractString)
df = CSV.read(file, delim=delim)
df = DataFrame!(CSV.File(file, delim=delim))
readTableCF!(df, summaryfile=summaryfile)
end

@@ -105,10 +105,10 @@ function readTableCF!(df::DataFrames.DataFrame; summaryfile=""::AbstractString)
[:CF13_24, Symbol("CF13.24"), :obsCF13],
[:CF14_23, Symbol("CF14.23"), :obsCF14]
]
obsCFcol = [findfirst(x-> x ∈ alternativecolnames[1], DataFrames.names(df)),
findfirst(x-> x ∈ alternativecolnames[2], DataFrames.names(df)),
findfirst(x-> x ∈ alternativecolnames[3], DataFrames.names(df))]
ngenecol = findfirst(isequal(:ngenes), DataFrames.names(df))
obsCFcol = [findfirst(x-> x ∈ alternativecolnames[1], DataFrames.propertynames(df)),
findfirst(x-> x ∈ alternativecolnames[2], DataFrames.propertynames(df)),
findfirst(x-> x ∈ alternativecolnames[3], DataFrames.propertynames(df))]
ngenecol = findfirst(isequal(:ngenes), DataFrames.propertynames(df))
withngenes = ngenecol !== nothing
if nothing in obsCFcol # one or more col names for CFs were not found
size(df,2) == (withngenes ? 8 : 7) ||
@@ -597,7 +597,7 @@ data: [1.0, 0.0, 0.0, 0.5]
julia> df = writeTableCF(q,t); # to get a DataFrame that can be saved to a file later
julia> show(df, allcols=true, splitcols=false)
5×8 DataFrames.DataFrame
5×8 DataFrame
│ Row │ t1 │ t2 │ t3 │ t4 │ CF12_34 │ CF13_24 │ CF14_23 │ ngenes │
│ │ String │ String │ String │ String │ Float64 │ Float64 │ Float64 │ Float64 │
├─────┼────────┼────────┼────────┼────────┼─────────┼─────────┼─────────┼─────────┤
@@ -617,7 +617,7 @@ Reading in trees, looking at 5 quartets in each...
**
julia> show(writeTableCF(q,t), allcols=true, splitcols=false)
5×8 DataFrames.DataFrame
5×8 DataFrame
│ Row │ t1 │ t2 │ t3 │ t4 │ CF12_34 │ CF13_24 │ CF14_23 │ ngenes │
│ │ String │ String │ String │ String │ Float64 │ Float64 │ Float64 │ Float64 │
├─────┼────────┼────────┼────────┼────────┼──────────┼──────────┼──────────┼─────────┤
@@ -1092,9 +1092,8 @@ function updateBL!(net::HybridNetwork,d::DataCF)
end
parts = edgesParts(net)
df = makeTable(net,parts,d)
x = by(df, :edge, Nquartets= :CF => length,
edgeL = :CF => x -> -log(3/2*(1. - mean(x))))
# ommitting columns: meanCF= :CF => mean, sdCF= :CF => std
x = combine(groupby(df, :edge), nrow => :Nquartets,
:CF => (x -> -log(3/2*(1. - mean(x)))) => :edgeL)
edges = x[!,:edge]
lengths = x[!,:edgeL]
for i in 1:length(edges)
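The `by` call in `updateBL!` becomes `combine` over a `groupby`, its DataFrames 0.21 equivalent. A toy sketch with hypothetical data:

```julia
using DataFrames, Statistics

df = DataFrame(edge = [1, 1, 2], CF = [0.4, 0.6, 0.9])  # hypothetical quartet CFs per edge
x = combine(groupby(df, :edge),
            nrow => :Nquartets,
            :CF => (v -> -log(3/2 * (1.0 - mean(v)))) => :edgeL)
x[!, :edgeL]  # estimated internal edge lengths, one per edge
```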
