Fix hcat of hcat with non-identical names

diegozea · Jun 3, 2021 · f42cf9b
1 parent 2146ae3
commit f42cf9b
Show file tree

Hide file tree

Showing 4 changed files with 61 additions and 7 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -1,10 +1,18 @@
 ## MIToS.jl Release Notes
 
+### Changes from v2.8.1 to v2.8.5
+
+* Fix bugs when concatenating concatenated MSAs using `hcat`.
+
+### Changes from v2.8.1 to v2.8.4
+
+* Ensure that `gaussdca` uses the correct project file.
+
 ### Changes from v2.8.1 to v2.8.3
 
 * Increase `PairwiseListMatrices` required version.
 
-* Fix bug when concatenating concatenated MSAs using `hcat`.
+* Fix bugs when concatenating concatenated MSAs using `hcat`.
 
 ### Changes from v2.8.0 to v2.8.1
 

diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "MIToS"
 uuid = "51bafb47-8a16-5ded-8b04-24ef4eede0b5"
-version = "2.8.4"
+version = "2.8.5"
 
 [deps]
 ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63"

diff --git a/src/MSA/Concatenation.jl b/src/MSA/Concatenation.jl
@@ -62,11 +62,14 @@ function _concatenate_annotfile(data::Annotations...)
 	annotfile
 end
 
-function _get_seqname_mapping(concatenated_seqnames)
+function _get_seqname_mapping(concatenated_seqnames, msas...)
 	mapping = Dict{Tuple{Int, String}, String}()
-	for concatenated_seqname in concatenated_seqnames
-		for (i, seqname) in enumerate(split(concatenated_seqname, "_&_"))
-			mapping[(i, seqname)] = concatenated_seqname
+	seq_names = hcat([sequencenames(msa) for msa in msas]...)
+	nseq, nmsa = size(seq_names)
+	@assert nseq == length(concatenated_seqnames)
+	for i in 1:nseq
+		for j in 1:nmsa
+			mapping[(j, seq_names[i, j])] = concatenated_seqnames[i]
 		end
 	end
 	mapping
@@ -156,7 +159,7 @@ function Base.hcat(msa::T...) where T <: AnnotatedAlignedObject
 	colnames = _concatenated_col_names(msa...)
 	setnames!(concatenated_msa, seqnames, 1)
 	setnames!(concatenated_msa, colnames, 2)
-	seqname_mapping = _get_seqname_mapping(seqnames)
+	seqname_mapping = _get_seqname_mapping(seqnames, msa...)
 	seq_lengths = _get_seq_lengths(msa...)
 	old_annot = annotations.([msa...])
 	new_annot = Annotations(

diff --git a/test/MSA/Concatenation.jl b/test/MSA/Concatenation.jl
@@ -91,9 +91,12 @@
 
     @testset "Inception" begin
         concatenated_in = hcat(msa, msa_2)
+        concatenated_diff_a = hcat(msa[[2, 1], :], msa_2)
+        concatenated_diff_b = hcat(msa_2, msa[[2, 1], :])
 
         @testset "concatenated concatenated" begin
             concatenated_out = hcat(concatenated_in, concatenated_in)
+            concat_ab = hcat(concatenated_diff_a, concatenated_diff_b)
 
             @test size(concatenated_out) == (2, 8)
             @test sequencenames(concatenated_out) == ["ONE", "TWO"]
@@ -111,6 +114,27 @@
             @test getannotresidue(concatenated_out, "TWO", "OnlyTWO") == "yyyyyyyy"
             @test getannotcolumn(concatenated_out, "example") == "  HE  HE"
             @test gethcatmapping(concatenated_out) == [1, 1, 2, 2, 3, 3, 4, 4]
+
+            @test size(concat_ab) == (2, 8)
+            @test sequencenames(concat_ab) == [
+                "TWO_&_ONE_&_ONE_&_TWO", "ONE_&_TWO_&_TWO_&_ONE"]
+            @test columnnames(concat_ab) == [
+                "1_1", "1_2", "2_1", "2_2", "3_1", "3_2", "4_1", "4_2"]
+            @test getcolumnmapping(concat_ab) == [1, 2, 1, 2, 1, 2, 1, 2]
+            @test getsequencemapping(concat_ab, 
+                "TWO_&_ONE_&_ONE_&_TWO") == [1, 2, 1, 2, 1, 2, 1, 2]
+            @test getsequencemapping(concat_ab, 
+                "ONE_&_TWO_&_TWO_&_ONE") == [1, 2, 1, 2, 1, 2, 1, 2]
+            @test getannotresidue(concat_ab, 
+                "TWO_&_ONE_&_ONE_&_TWO", "example") == "cdababcd"
+            @test getannotresidue(concat_ab, 
+                "ONE_&_TWO_&_TWO_&_ONE", "example") == "abcdcdab"
+            @test getannotresidue(concat_ab, 
+                "TWO_&_ONE_&_ONE_&_TWO", "OnlyONE") == "  xxxx  "
+            @test getannotresidue(concat_ab, 
+                "TWO_&_ONE_&_ONE_&_TWO", "OnlyTWO") == "yy    yy"
+            @test getannotcolumn(concat_ab, "example") == "  HEHE  "
+            @test gethcatmapping(concat_ab) == [1, 1, 2, 2, 3, 3, 4, 4]
         end
 
         @testset "concatenated non_concatenated" begin
@@ -138,6 +162,25 @@
                 end
                 @test gethcatmapping(concatenated_out) == [1, 1, 2, 2, 3, 3]
             end
+
+            concat_a = hcat(concatenated_diff_a, msa)
+
+            @test size(concat_a) == (2, 6)
+            @test sequencenames(concat_a) == [
+                "TWO_&_ONE_&_ONE", "ONE_&_TWO_&_TWO"]
+            @test columnnames(concat_a) == [
+                "1_1", "1_2", "2_1", "2_2", "3_1", "3_2"]
+            @test getcolumnmapping(concat_a) == [1, 2, 1, 2, 1, 2]
+            @test getsequencemapping(concat_a, "TWO_&_ONE_&_ONE") == [1, 2, 1, 2, 1, 2]
+            @test getsequencemapping(concat_a, "ONE_&_TWO_&_TWO") == [1, 2, 1, 2, 1, 2]
+            @test getannotresidue(concat_a, "TWO_&_ONE_&_ONE", "example") == "cdabab"
+            @test getannotresidue(concat_a, "ONE_&_TWO_&_TWO", "example") == "abcdcd"
+            @test getannotresidue(concat_a, 
+                "TWO_&_ONE_&_ONE", "OnlyONE") == "  xxxx"
+            @test getannotresidue(concat_a, 
+                "TWO_&_ONE_&_ONE", "OnlyTWO") == "yy    "
+            @test getannotcolumn(concat_a, "example") == "  HE  "
+            @test gethcatmapping(concat_a) == [1, 1, 2, 2, 3, 3]
         end
     end
 end