In [19]:
import os
import glob
from Bio import SeqIO
from tqdm.auto import tqdm
import pylev
import matplotlib.pyplot as plt
import numpy as np
import copy
import csv
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

In [4]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
def print_alignment(seq1,seq2,gap_cost = -5):
    alignment = pairwise2.align.globalms(seq1, seq2,2,-1, gap_cost,-0.3)#gap penalty of -5 seems to work well at preventing them from forming!
    al1 = alignment[0][0]
    al2 = alignment[0][1]

    i = 0
    print("Alignment:")
    while(i <= len(al1)):
        sub_al1 = al1[i:min(len(al1),i+70)]
        sub_al2 = al2[i:min(len(al2),i+70)]
        
        print(f"{i}-{min(len(al1),i+70)}")
        print(sub_al1)
        print("".join(["|" if sub_al1[j] == sub_al2[j] else " " if sub_al1[j] == "-" or sub_al2[j] == "-" else "." for j in range(0,len(sub_al1))]))
        print(sub_al2)
        print()
        i += 70
    
    gap_percentage = ( len(al1)-max(len(seq1),len(seq2)) )/len(al1)*100
    print(f"Gap percentage: {gap_percentage}")
    


In [33]:
non_gapped_filename = "../mock_data/gapped_msa_test/not_gapped/example.phy"

def LoadSequences(filename):
    seq_recs = [seqrec for seqrec in SeqIO.parse(filename,"phylip")]
    seq_dict = {}
    for seq_rec in seq_recs:
        seq_dict[seq_rec.id] = seq_rec.seq
    return seq_dict

non_gapped_seq_dict = LoadSequences(non_gapped_filename)

print_alignment(non_gapped_seq_dict["Human"], non_gapped_seq_dict["LngfishAu"],gap_cost=-5)

Alignment:
0-70
CTACCACACCCCAGGAAACAGCAGTGATTAACCTTTAGCATAAACGAAGTTTAACTAAGCTATAAAGGGT
||.||||||||||||||.||||||||||||||.||.|||||||.|||||.||.|||.|||.|....|||.
CTCCCACACCCCAGGAACCAGCAGTGATTAACATTGAGCATAAGCGAAGCTTGACTCAGCCACCTCGGGC

70-100
TGGTCAATTTCGTGCAGCCACCGCGGTCAC
.|||.||..||||||||||||||||||.|.
CGGTAAACCTCGTGCAGCCACCGCGGTTAT

Gap percentage: 0.0


In [34]:
non_gapped_asr_state_filename = "../mock_data/gapped_msa_test/not_gapped/example.phy.state"

def LoadStateFile(filename):
    seqs = {}
    reader = csv.reader(open(filename), delimiter="\t")
    while True:
        if "#" not in next(reader)[0]:
            break
    for row in reader:
        if row[0] not in seqs:
            seqs[row[0]] = ""
        seqs[row[0]] += row[2]
    for node, raw_seq in seqs.items(): 
        seqs[node] = Seq(raw_seq)
    return seqs
    
non_gapped_asr_seqs = LoadStateFile(non_gapped_asr_state_filename)
print(non_gapped_asr_seqs["Node12"])

CTACCACACCCCAGGAAACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTTATAAAGGGTTGGTCAATTTCGTGCAGCCACCGCGGTCAT


In [35]:
print("Human, Opossum")
print_alignment(non_gapped_seq_dict["Human"],non_gapped_seq_dict["Opossum"])
print("Opossum, Node12")
print_alignment(non_gapped_seq_dict["Opossum"],non_gapped_asr_seqs["Node12"])
print("Human, Node12")
print_alignment(non_gapped_seq_dict["Human"],non_gapped_asr_seqs["Node12"])

Human, Opossum
Alignment:
0-70
CTACCACACCCCAGGAAACAGCAGTGATTAACCTTTAGCATAAACGAAGTTTAACTAAGCTATAAAGGGT
||.|||||||||||||.||||||||||||||..||.||||||||||||||||.||||||..||..|||||
CTTCCACACCCCAGGAGACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTCATTTAGGGT

70-100
TGGTCAATTTCGTGCAGCCACCGCGGTCAC
|||||||||||||||||||||||||||||.
TGGTCAATTTCGTGCAGCCACCGCGGTCAT

Gap percentage: 0.0
Opossum, Node12
Alignment:
0-70
CTTCCACACCCCAGGAGACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTCATTTAGGGT
||.|||||||||||||.|||||||||||||||||||||||||||||||||||||||||||.||..|||||
CTACCACACCCCAGGAAACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTTATAAAGGGT

70-100
TGGTCAATTTCGTGCAGCCACCGCGGTCAT
||||||||||||||||||||||||||||||
TGGTCAATTTCGTGCAGCCACCGCGGTCAT

Gap percentage: 0.0
Human, Node12
Alignment:
0-70
CTACCACACCCCAGGAAACAGCAGTGATTAACCTTTAGCATAAACGAAGTTTAACTAAGCTATAAAGGGT
|||||||||||||||||||||||||||||||..||.||||||||||||||||.||||||.||||||||||
CTACCACACCCCAGGAAACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTTATAAAGGGT

70-100
TGGTCAATTTCGTGCA

In [44]:
deletion_sequences_filename = "../mock_data/gapped_msa_test/gapped/example.phy"
deletion_asr_state_filename = "../mock_data/gapped_msa_test/gapped/example.phy.state"

deletion_seq_dict = LoadSequences(deletion_sequences_filename)
deletion_asr_seqs = LoadStateFile(non_gapped_asr_state_filename)

print("Human, Opossum")
print_alignment(deletion_seq_dict["Human"],deletion_seq_dict["Opossum"])
print("Human, Cow")
print_alignment(deletion_seq_dict["Human"],deletion_seq_dict["Cow"])
print("Opossum, Node12")
print_alignment(deletion_seq_dict["Opossum"],deletion_asr_seqs["Node12"])
print("Human, Node12")
print_alignment(deletion_seq_dict["Human"],deletion_asr_seqs["Node12"])

Human, Opossum
Alignment:
0-70
---CCACACCCCAGGAAACAGCAGTGATTAACCTTTAGCATAAACGAAGTTTAACTAAGCTATAAAGGGT
||||||||||||||||.||||||||||||||..||.||||||||||||||||.||||||..||..|||||
---CCACACCCCAGGAGACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTCATTTAGGGT

70-100
TGGTCAATTTCGTGCAGCCACCGCGGTCAC
|||||||||||||||||||||||||||||.
TGGTCAATTTCGTGCAGCCACCGCGGTCAT

Gap percentage: 0.0
Human, Cow
Alignment:
0-70
---CCACACCCCAGGAAACAGCAGTGATTAACCTTTAGCATAAACGAAGTTTAACTAAGCTATAAAGGGT
|||||||||||||||||||||||||||..||..||.||||||||||||||||.||||||.||||||||||
---CCACACCCCAGGAAACAGCAGTGACAAAAATTAAGCATAAACGAAGTTTGACTAAGTTATAAAGGGT

70-100
TGGTCAATTTCGTGCAGCCACCGCGGTCAC
||||.|||.||||||||||||||||||||.
TGGTAAATCTCGTGCAGCCACCGCGGTCAT

Gap percentage: 0.0
Opossum, Node12
Alignment:
0-70
---CCACACCCCAGGAGACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTCATTTAGGGT
   |||||||||||||.|||||||||||||||||||||||||||||||||||||||||||.||..|||||
CTACCACACCCCAGGAAACAGCAGTGATTAAAATTAAGCATAAACGAAGTTTGACTAAGTTATAAAGGGT

70-100
TGGTCAATTTCGTGCAGCC