<br><br>
<h1 style="font-size:36px" align="center"> Analyse Ancestral States</h1><br><br><br><br><br><br>

In [20]:
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import SeqIO
import copy
import json
import os
import csv
import tqdm
import re
import math
import matplotlib.pyplot as plt
class Node:
    """One ancestral node: its reconstructed residues plus per-position probabilities.

    Residues are added one alignment column at a time with ``append``; a gapped
    GRASP sequence of the same length can then be attached with
    ``add_grasp_sequence`` to mark which columns are gaps in the ancestor.
    """

    def __init__(self):
        # Residues appended so far, one per alignment column.
        self.sequence = Seq("")
        # One entry per column, stored exactly as passed to ``append``.
        self.probabilities = []
        # Gapped sequence attached by ``add_grasp_sequence`` (empty until then).
        self.grasp_sequence = Seq("")
        # One bool per column: True where the GRASP sequence has a gap ("-").
        self.ancestral_positions = []
        # Number of NON-gap characters in the GRASP sequence (despite the name).
        self.gapped_sequence_length = 0

    def append(self, amino_acid, position, probability):
        """Append the residue and probability entry for the next column.

        Args:
            amino_acid: residue character(s) to add to ``sequence``.
            position: 1-based column index; after appending it must equal both
                the sequence length and the number of probability entries.
            probability: probability entry for this column, stored unchanged.

        Raises:
            Exception: if ``position`` disagrees with the stored lengths, e.g.
                because rows arrived out of order or a column was skipped.
        """
        self.sequence = self.sequence + Seq(amino_acid)
        self.probabilities.append(probability)
        if(len(self.sequence) != position or len(self.probabilities) != position):
            # Dump the inconsistent state before failing to ease debugging.
            print(self.sequence, position, self.probabilities)
            print(len(self.sequence))
            print(len(self.probabilities))
            raise Exception("Something went wrong, the lengths of probabilies, sequences and position are unequal.\nCheck input file")

    def add_grasp_sequence(self, grasp_sequence):
        """Attach a gapped GRASP sequence and derive the gap bookkeeping.

        The derived attributes are recomputed from scratch on every call, so
        re-running the notebook cell that attaches GRASP sequences does not
        inflate ``gapped_sequence_length`` or ``ancestral_positions``.

        Args:
            grasp_sequence: gapped sequence with one character per column.

        Raises:
            Exception: if its length differs from ``len(self.sequence)``.
        """
        if(len(grasp_sequence) != len(self.sequence)):
            raise Exception("Length of grasp sequence and IqTree sequence are not the same, something has gone wrong")
        self.grasp_sequence = grasp_sequence
        gap_flags = [char == "-" for char in grasp_sequence]
        self.ancestral_positions = gap_flags
        self.gapped_sequence_length = len(gap_flags) - sum(gap_flags)

    def __str__(self):
        return "Node(sequence={})".format(self.sequence)

# Amino acid for each index of a per-column probability entry; the order must
# match the probability columns handed to Node.append.
# NOTE(review): presumably the p_A..p_V column order of the state file — confirm.
aa_positions = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
def get_most_likely_sequence(node):
    """Return the sequence made of the highest-probability residue per column.

    Probabilities are converted to ``float`` before comparison. They may be
    stored as raw text fields, and comparing text is lexicographic, which picks
    the wrong residue for values such as ``"9e-05"`` versus ``"0.9"``.

    Args:
        node: object whose ``probabilities`` holds, per column, one value for
            each entry of ``aa_positions``.

    Returns:
        A ``Seq`` with one residue per column (first maximum wins on ties).
    """
    residues = []
    for probability in node.probabilities:
        values = [float(p) for p in probability]
        residues.append(aa_positions[values.index(max(values))])
    return Seq("".join(residues))

def get_mean_probability(node):
    """Return the mean, over all columns, of each column's highest probability.

    Values are converted to ``float`` before taking the maximum so that text
    fields are compared numerically rather than lexicographically.

    Args:
        node: object with ``probabilities`` (one entry per column) and
            ``sequence`` (its length is the divisor).

    Raises:
        ZeroDivisionError: if ``node.sequence`` is empty.
    """
    total_prob = sum(
        max(float(p) for p in probability) for probability in node.probabilities
    )
    return total_prob / len(node.sequence)

def get_gapped_sequence(node):
    """Return the node's residues with "-" wherever the GRASP sequence has a gap.

    The result has one character per character of ``node.grasp_sequence``.
    """
    gapped = Seq("")
    for column, grasp_char in enumerate(node.grasp_sequence):
        # Gap columns are masked; every other column keeps the node's residue.
        residue = "-" if grasp_char == "-" else node.sequence[column]
        gapped = gapped + Seq(residue)
    return gapped

def get_ungapped_sequence(node):
    """Return the node's residues for the columns where GRASP has no gap.

    Columns whose ``node.grasp_sequence`` character is "-" are dropped.
    """
    kept = Seq("")
    for column, grasp_char in enumerate(node.grasp_sequence):
        if grasp_char == "-":
            continue
        kept = kept + Seq(node.sequence[column])
    return kept

def get_gapped_probabilities(node):
    """Return one value per column: 0 at GRASP gaps, else the top probability.

    Values are converted to ``float`` before taking the maximum so that text
    fields are compared numerically rather than lexicographically.

    Args:
        node: object with ``probabilities`` (one entry per column) and a
            ``grasp_sequence`` of at least the same length.

    Returns:
        A list as long as ``node.probabilities``.
    """
    probabilities = []
    for column, probability in enumerate(node.probabilities):
        if node.grasp_sequence[column] == "-":
            probabilities.append(0)
        else:
            probabilities.append(max(float(p) for p in probability))
    return probabilities

def get_mean_gapped_probabilities(node):
    """Return the mean top probability over the columns that are not GRASP gaps.

    Values are converted to ``float`` before taking the maximum so that text
    fields are compared numerically rather than lexicographically.

    Args:
        node: object with ``probabilities``, ``grasp_sequence`` and
            ``gapped_sequence_length`` (the non-gap column count, used as the
            divisor).

    Raises:
        ZeroDivisionError: if ``node.gapped_sequence_length`` is 0.
    """
    total_prob = sum(
        max(float(p) for p in probability)
        for column, probability in enumerate(node.probabilities)
        if node.grasp_sequence[column] != "-"
    )
    return total_prob / node.gapped_sequence_length

def generate_select_uncertain_residues(node,uncertainty_threshold):
    """Print a ``select #1/A:...`` line listing the low-confidence residues.

    A column is reported when it is not a gap in ``node.grasp_sequence`` and
    its highest probability is below ``uncertainty_threshold``. Reported numbers
    are 1-based positions in the ungapped sequence (gap columns are not
    counted). Probabilities are converted to ``float`` before comparison so
    text fields are compared numerically.

    Args:
        node: object with ``probabilities`` and a ``grasp_sequence`` covering
            every column (the GRASP sequence must already be attached).
        uncertainty_threshold: columns whose top probability is strictly below
            this value are reported.

    Returns:
        None; the selection line is written to standard output.
    """
    uncertain_positions = []
    gaps_seen = 0  # running count of gap columns left of the current one
    for column, probability in enumerate(node.probabilities):
        if node.grasp_sequence[column] == "-":
            gaps_seen += 1
            continue
        if max(float(p) for p in probability) < uncertainty_threshold:
            uncertain_positions.append(column - gaps_seen + 1)
    print(f"select #1/A:{','.join([str(x) for x in uncertain_positions])}")

<h3 style="font-size:24px"> Define Parameters</h3><br>

In [2]:
# Root of the dataset being analysed; change this one line to switch datasets.
dataset_directory = "../processed_sequences/clustal_hmm_5_per_cluster_18022023"
ancestors_directory = f"{dataset_directory}/ancestors"

# FASTA file of gapped GRASP ancestor sequences.
grasp_ancestors_filename = f"{ancestors_directory}/seqs11_ancestors.fa"
# Text file with one "<node number>,<description>" entry per line.
interesting_nodes_filename = f"{dataset_directory}/interesting_ancestors.txt"
# Tab-separated state file with per-node, per-site residue probabilities.
state_filename = f"{dataset_directory}/seqs11.txt.state"
# Output locations (only referenced by the commented-out SeqIO.write calls).
sequence_output_filename = f"{ancestors_directory}/interesting_ancestors.fa"
sequence_output_directory = f"{ancestors_directory}/"

In [3]:
# work_dir = "../processed_sequences/clustal_hmm_10_per_cluster_test_copy/"

# # state_filename = None

# # for filename in os.listdir(work_dir):
# #     if filename.endswith(".state"):
# #         state_filename = os.path.join(work_dir, filename)
# #         break

# files_by_node = {}
# for node, dirs, files in os.walk(work_dir):
#     for file in files:
#         if re.search(r'.*N\d+\.tsv', file):
#             files_by_node[node.split("/")[-1]] = os.path.join(node, file)
# Each non-blank line is "<node number>,<description>". Only the first comma
# separates the fields, so a description may itself contain commas.
interesting_nodes = []
with open(interesting_nodes_filename, "r") as file:
    for line in file:
        line = line.rstrip("\r\n")
        if not line.strip():
            # A blank (e.g. trailing) line would otherwise become [''] and
            # fail later as a node lookup.
            continue
        fields = line.split(",", 1)
        # The node number is used as a dictionary key, so drop stray padding.
        fields[0] = fields[0].strip()
        interesting_nodes.append(fields)


<h3 style="font-size:24px"> Load State File</h3><br>

In [4]:
# Read the whole tab-separated state file into memory as a list of rows.
# newline="" lets the csv module do its own line-ending handling, as its
# documentation requires for file objects passed to csv.reader.
with open(state_filename, "r", newline="") as file:
    reader = csv.reader(file, delimiter="\t")
    tsv_data = list(reader)

In [5]:
# Build one Node per node number found in the state file.
# Row layout as used below: [node name, site position, residue, probabilities...].
nodes = {}
header = True
for row in tqdm.tqdm(tsv_data):
    if not row:
        # csv.reader yields [] for blank lines; row[0] would raise IndexError.
        continue
    if "#" in row[0]:
        # Comment line.
        continue
    if header:
        # The first non-comment row holds the column names.
        header = False
        continue
    # Nodes are keyed by the digits of their name, normalised via int().
    node_num = str(int(re.search(r'\d+', row[0]).group()))
    if node_num not in nodes:
        nodes[node_num] = Node()
    nodes[node_num].append(row[2], int(row[1]), row[3:])


100%|██████████| 5122392/5122392 [00:44<00:00, 113962.58it/s]


In [6]:
# Attach each gapped GRASP ancestor to the matching node and keep a lookup of
# the raw GRASP sequences by node key.
grasp_sequences = list(SeqIO.parse(grasp_ancestors_filename, "fasta"))
grasp_nodes = {}
for sequence in grasp_sequences:
    # The record id is one prefix character followed by a number; the node key
    # is that number plus one.
    # NOTE(review): presumably GRASP numbers ancestors from 0 while the state
    # file starts at 1 — confirm.
    grasp_number = int(sequence.id[1:])
    node = str(grasp_number + 1)
    nodes[node].add_grasp_sequence(sequence.seq)
    grasp_nodes[node] = sequence.seq

In [23]:
def Analyse_Node(node_index,node_description):
    """Print a summary of one node and plot its per-column top probabilities.

    Args:
        node_index: key of the node in the global ``nodes`` dictionary.
        node_description: free-text label echoed in the printed header.

    Prints the mean residue probability over non-gap columns, the ungapped
    sequence and its length, then shows a scatter plot with one point per
    column (gap columns are plotted at 0) labelled with the gapped sequence.
    """
    target = nodes[node_index]
    print(f"Analysing Node{node_index}, {node_description}")
    print(f"Mean residue probability: {get_mean_gapped_probabilities(target)}")
    print(get_ungapped_sequence(target))
    print(target.gapped_sequence_length)

    gapped_residues = get_gapped_sequence(target)
    column_probabilities = get_gapped_probabilities(target)

    # Very wide canvas so that each column's residue label stays legible.
    figure, axes = plt.subplots(figsize=(100, 5))

    # One point per column: x is the column index, y its top probability.
    column_indices = range(len(column_probabilities))
    axes.scatter(column_indices, column_probabilities)

    # Label every column with the residue (or "-") at that position.
    tick_positions = range(len(gapped_residues))
    axes.set_xticks(tick_positions)
    axes.set_xticklabels(gapped_residues)

    axes.set_xlabel('Sequence position')
    axes.set_ylabel('Probability')

    plt.show()

# Residues whose top probability is below this value are reported as uncertain.
uncertainty_threshold = 0.5

# For every interesting node: print its record id and description, then the
# selection line for its uncertain residues. The record id embeds the mean
# residue probability ("mrp") rounded to two decimals.
sequences_to_save = []
for node in interesting_nodes:
#     Analyse_Node(node[0],node[1])
    ancestor_node = nodes[node[0]]
    # Named record_id rather than id so the builtin id() is not shadowed for
    # the rest of the kernel session.
    record_id = f"Node{node[0]}_{round(get_mean_gapped_probabilities(ancestor_node),2)}_mrp"
    print(record_id,node[1])
    seq_rec = SeqRecord(seq=get_ungapped_sequence(ancestor_node),id=record_id,description=node[1])
    generate_select_uncertain_residues(ancestor_node,uncertainty_threshold)
#     sequences_to_save.append(seq_rec)
#     SeqIO.write(seq_rec, os.path.join(sequence_output_directory,f"Node{node[0]}.fa"), "fasta")

# SeqIO.write(sequences_to_save, sequence_output_filename, "fasta")


Node664_0.71_mrp  Bottromycin Ancestor
select #1/A:2,6,24,25,26,28,33,36,37,38,40,42,43,47,52,56,72,76,77,81,84,87,89,93,97,98,104,114,116,119,122,126,129,148,149,153,164,169,171,174,176,177,178,179,195,196,198,199,200,202,203,216,220,229,232,234,236,239,241,243,244,247,252,260,266,272,273,274,275,276,279,280,281,285,287,288,293,295,296,300,301,303,304,309,312,313,314,315,319,321,334,338,346
Node1125_0.75_mrp  Global Azoline YcaOs
select #1/A:8,12,24,25,28,31,32,35,36,45,47,56,67,68,69,70,71,73,91,105,107,108,110,119,127,156,167,168,176,177,180,182,183,200,201,202,204,205,206,209,211,224,234,242,244,247,254,256,257,258,260,261,263,265,268,272,273,275,276,278,283,285,286,287,291,294,295,298,303,304,314,324,338,339,341,342,343,347,349,360,364,367,372,374,375,377,378,385
Node2539_0.9_mrp  Tridomain YcaO Ancestor 1
select #1/A:69,71,100,116,119,174,211,246,254,257,293,294,366
Node66_0.72_mrp  Global Ancestor. ancestor between EcYcaO and TfuA YcaOs
select #1/A:4,24,25,27,28,31,32,42,43,46,5

In [None]:
# Ad-hoc inspection of a single node: its ungapped sequence, then the number of
# non-gap columns, each on its own line.
node = nodes["1126"]
print(get_ungapped_sequence(node), node.gapped_sequence_length, sep="\n")

<h3 style="font-size:24px"> Scratch: Compare an Ancestor with an Extant Sequence</h3><br>

In [None]:
# FASTA file holding a single reconstructed ancestor (only its first record is read below).
ancestor_filename = "../processed_sequences/clustal_hmm_5_per_cluster_18022023/ancestors/Node2539.fa"
# FASTA file of extant sequences to compare against (only its first record is read below).
extant_filename = "../raw_sequences/Azoline/Cyanobactins/Cyanobactins_TruD/TruD_Fasta.Fa"

In [None]:
# Take the first record of each FASTA file and print the two sequences one
# after the other for a visual comparison.
ancestor_records = list(SeqIO.parse(ancestor_filename, "fasta"))
extant_records = list(SeqIO.parse(extant_filename, "fasta"))
ancestor = ancestor_records[0]
extant = extant_records[0]
print(ancestor.seq)
print(extant.seq)