-
Notifications
You must be signed in to change notification settings - Fork 0
/
tree_mut_branches.R
125 lines (114 loc) · 4.51 KB
/
tree_mut_branches.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Find mutations in a tree.
## General code for broad use in other phylogenies.
## Feb 28th 2019
######
## Generate ancestral datastructures -> ancestral_reconstruction from treetime
## Phylo tree from file.
## Read seq and get the aa sequence too-> aligned.
#####
create_ancestral_data = function(tree_file, aln_file, rdata_file = NULL, obj_names = c('tre', 'nt', 'aa')){
library(tidyverse);library(ape); library(Biostrings)
## Read the data.
tre = read.nexus(tree_file)
nt = readDNAStringSet(aln_file) ## Make sure the seq are in frame.
## Translate to AA.
aa = gsub('-', '', nt) %>% DNAStringSet() %>% translate
if(length(unique(width(aa))) > 10){
## Make an alignment if the translated sequences are not of the same length.
writeXStringSet(aa, 'tmp.fa')
system("muscle -in tmp.fa -out tmp_aa.fa")
aa = readAAStringSet("tmp_aa.fa")
file.remove("tmp.fa")
file.remove("tmp_aa.fa")
}
## If rdata-file is null -> return a list
if(!is.null(rdata_file)){
save(tre, nt, aa, file = rdata_file)
return(paste("Tree and sequence alignment successfully stored in", rdata_file))
}else{
assign(obj_names[1], tre, env = globalenv())
assign(obj_names[2], nt, env = globalenv())
assign(obj_names[3], aa, env = globalenv())
return(paste("Tree and sequence alignments sucessfully loaded in global environment: ", obj_names))
}
}
######
## Find mutations between a given sequnece and a reference. StringSets
## Output dataframe with old, new characters, position and label columns.
#####
find_mutations = function(x, ref){
## Check that both are X StringSets.
if( ! class(x) %in% c('AAStringSet', 'DNAStringSet') ) stop ("Weird x, not XXStringSet")
if( ! class(ref) %in% c('AAStringSet', 'DNAStringSet') ) stop ("Weird ref, not XXStringSet")
if(width(x) != width(ref)) stop()
## Search variable positions and find the prev and new character.
cm = consensusMatrix(c(ref,x))
var_pos = apply(cm, 2, function(i) sum(i > 0) > 1) %>% which
## Get characters.
old_char = sapply(var_pos, function(i) substr(ref, i, i))
new_char = sapply(var_pos, function(i) substr(x, i, i))
if(length(var_pos) == 0) old_char = new_char = vector()
mut = data.frame(pos= var_pos, old = old_char, new = new_char,
label = paste(old_char, var_pos, new_char, sep = ''))
return( mut )
}
######
## Map mutations to tree branches.
## Input: tre - phylo; nt and aa DNA or AAStringsets.
## Output: tree tibble with branch.mut_aa / nt with mutation labels.
#####
mutations_per_node = function(tre, nt, aa = FALSE){
library(tidyverse);library(tidytree); library(ape)
#####
##Checking input
#####
## Check the tree is a phylo or tree tible str.
if("tbl_tree" %in% class(tre)){
tree_dt = tre
}else if(class(tre) == 'phylo'){
tree_dt = as_tibble(tre)
}else{
stop("Tree is not in a phylo or tbl tree data structure")
}
## Check the seq is a DNAStringSet and all samples have same length.
if(class(nt) != 'DNAStringSet') stop('Nt is not a DNAStringSet')
if(length(unique(width(nt))) > 1) stop('Sequences of different length are present')
## Check the names match with the seq.
if( length(setdiff(tree_dt$label, names(nt)) ) > 0 | nrow(tree_dt) < length(nt) ) stop("Missing entries")
nt = nt[tree_dt$label] ## Keep same order in tree and in sequences.
## Check if aminoacid level stuff should be checked too.
if(class(aa) == 'logical'){
if(!aa){
lvs = list(nt = nt)
}else{
lvs = list(nt = nt, aa = translate(nt))
}
}else if(class(aa) == 'AAStringSet'){
lvs = list(nt = nt, aa = aa)
}else{
stop('Incorrect aa parameter')
}
#####
## Main code.
#####
## Put sequences in same order as the tips/nodes in the tibble.
for(ch in names(lvs)) lvs[[ch]] = lvs[[ch]][tree_dt$label]
## Add the names of the columns we need.
tree_dt[,paste('branch.mut', names(lvs), sep = '_')] = ''
for(nn in nrow(tree_dt):1 ){
## Method: Just compare the sample and their parents unless I am already in the tree root.
## Keep stuff in index value -> node is index for the dataframe
nn_dad = parent(tree_dt, nn)$node
if(length(nn_dad) ==0 ) next
for(ch in names(lvs)){
## Get sequences for node and their parent.
x = lvs[[ch]][nn]
ref = lvs[[ch]][nn_dad]
if(x == ref) tmp = data.frame() else tmp = find_mutations(x, ref)
if(nrow(tmp) > 0){
tree_dt[nn, paste('branch.mut', ch, sep = '_')] = paste(tmp$label, collapse = "-")
}
}
}
return(tree_dt)
}