In [22]:
from Bio import SeqIO

# Print the sequence of every record parsed from the FASTA-format alignment file.
# NOTE(review): the relative path presumably resolves against the notebook's
# working directory — confirm the file ships alongside the notebook.
for record in SeqIO.parse("COVID19_TP10_CLUSTAL_MSA.fasta", "fasta"):
    print(record.seq)

MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEIFRSDTLYLTQDLFLPFYSNVTGFHTINHTFGNPVIPFKDGIYFAATEKSNVVRGWVFGSTMNNKSQSVIIINNSTNVVIRACNFELCDNPFFAVSKPMGTQTHTMIFDNAFNCTFEYISDAFSLDVSEKSGNFKHLREFVFKNKDGFLYVYKGYQPIDVVRDLPSGFNTLKPIFKLPLGINITNFRAILTAFSPAQDIWGTSAAAYFVGYLKPTTFMLKYDENGTITDAVDCSQNPLAELKCSVKSFEIDKGIYQTSNFRVVPSGDVVRFPNITNLCPFGEVFNATKFPSVYAWERKKISNCVADYSVLYNSTFFSTFKCYGVSATKLNDLCFSNVYADSFVVKGDDVRQIAPGQTGVIADYNYKLPDDFMGCVLAWNTRNIDATSTGNYNYKYRYLRHGKLRPFERDISNVPFSPDGKPCTPPALNCYWPLNDYGFYTTTGIGYQPYRVVVLSFELLNAPATVCGPKLSTDLIKNQCVNFNFNGLTGTGVLTPSSKRFQPFQQFGRDVSDFTDSVRDPKTSEILDISPCSFGGVSVITPGTNASSEVAVLYQDVNCTDVSTAIHADQLTPAWRIYSTGNNVFQTQAGCLIGAEHVDTSYECDIPIGAGICASYHTVSLLRSTSQKSIVAYTMSLGADSSIAYSNNTIAIPTNFSISITTEVMPVSMAKTSVDCNMYICGDSTECANLLLQYGSFCTQLNRALSGIAAEQDRNTREVFAQVKQMYKTPTLKYFGGFNFSQILPDPLKPTKRSFIEDLLFNKVTLADAGFMKQYGECLGDINARDLICAQKFNGLTVLPPLLTDDMIAAYTAALVSGTATAGWTFGAGAALQIPFAMQMAYRFNGIGVTQNVLYENQKQIANQFNKAISQIQESLTTTSTALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEI

In [24]:
from pymsa import MSA, Entropy, PercentageOfNonGaps, PercentageOfTotallyConservedColumns, Star, SumOfPairs
from pymsa import PAM250, Blosum62, FileMatrix
from pymsa.util.fasta import print_alignment


def run_all_scores(sequences: list) -> None:
    """Print an alignment followed by a series of pymsa scores computed on it.

    Args:
        sequences: list of two-item entries; item 0 of each entry is used as
            the sequence identifier and item 1 as the aligned sequence.

    Returns:
        None. Every result is reported through ``print``.

    Note:
        The PAM380 score loads its substitution matrix from the relative path
        ``PAM380.txt`` (presumably resolved against the working directory —
        confirm against pymsa's ``FileMatrix``).
    """

    def report(template, scorer):
        # Evaluate one score object and print it through its message template.
        print(template.format(scorer.compute()))

    # Split the (id, sequence) entries into the two parallel lists MSA is given.
    aligned_sequences = [entry[1] for entry in sequences]
    sequences_id = [entry[0] for entry in sequences]

    alignment = MSA(aligned_sequences, sequences_id)
    print_alignment(alignment)

    # Percentage of non-gaps and totally conserved columns
    # (both score objects are created before either one is computed).
    non_gap_score = PercentageOfNonGaps(alignment)
    conserved_score = PercentageOfTotallyConservedColumns(alignment)
    report("Percentage of non-gaps: {0} %", non_gap_score)
    report("Percentage of totally conserved columns: {0}", conserved_score)

    # Entropy
    report("Entropy score: {0}", Entropy(alignment))

    # Sum of pairs, once per substitution matrix
    report("Sum of Pairs score (Blosum62): {0}", SumOfPairs(alignment, Blosum62()))
    report("Sum of Pairs score (PAM250): {0}", SumOfPairs(alignment, PAM250()))
    report("Sum of Pairs score (PAM380): {0}", SumOfPairs(alignment, FileMatrix('PAM380.txt')))

    # Star, once per built-in substitution matrix
    report("Star score (Blosum62): {0}", Star(alignment, Blosum62()))
    report("Star score (PAM250): {0}", Star(alignment, PAM250()))


if __name__ == '__main__':
    # Hard-coded two-sequence alignment, written as an id -> aligned-sequence
    # mapping and flattened into the list of (id, sequence) tuples that
    # run_all_scores is called with. Each sequence is split into 80-character
    # chunks joined by implicit string concatenation.
    # NOTE(review): presumably copied from the CLUSTAL alignment file parsed
    # earlier in the notebook — confirm, or load it from that file instead.
    sequences = list({
        # Full-length spike glycoprotein entry (no gap characters).
        "sp|P59594|SPIKE_CVHSA  Spike glycoprotein OS=Human SARS coronavirus OX=694009 GN=S PE=1 SV=1": (
            "MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEIFRSDTLYLTQDLFLPFYSNVTGFHTINHTFGNPV"
            "IPFKDGIYFAATEKSNVVRGWVFGSTMNNKSQSVIIINNSTNVVIRACNFELCDNPFFAVSKPMGTQTHTMIFDNAFNCT"
            "FEYISDAFSLDVSEKSGNFKHLREFVFKNKDGFLYVYKGYQPIDVVRDLPSGFNTLKPIFKLPLGINITNFRAILTAFSP"
            "AQDIWGTSAAAYFVGYLKPTTFMLKYDENGTITDAVDCSQNPLAELKCSVKSFEIDKGIYQTSNFRVVPSGDVVRFPNIT"
            "NLCPFGEVFNATKFPSVYAWERKKISNCVADYSVLYNSTFFSTFKCYGVSATKLNDLCFSNVYADSFVVKGDDVRQIAPG"
            "QTGVIADYNYKLPDDFMGCVLAWNTRNIDATSTGNYNYKYRYLRHGKLRPFERDISNVPFSPDGKPCTPPALNCYWPLND"
            "YGFYTTTGIGYQPYRVVVLSFELLNAPATVCGPKLSTDLIKNQCVNFNFNGLTGTGVLTPSSKRFQPFQQFGRDVSDFTD"
            "SVRDPKTSEILDISPCSFGGVSVITPGTNASSEVAVLYQDVNCTDVSTAIHADQLTPAWRIYSTGNNVFQTQAGCLIGAE"
            "HVDTSYECDIPIGAGICASYHTVSLLRSTSQKSIVAYTMSLGADSSIAYSNNTIAIPTNFSISITTEVMPVSMAKTSVDC"
            "NMYICGDSTECANLLLQYGSFCTQLNRALSGIAAEQDRNTREVFAQVKQMYKTPTLKYFGGFNFSQILPDPLKPTKRSFI"
            "EDLLFNKVTLADAGFMKQYGECLGDINARDLICAQKFNGLTVLPPLLTDDMIAAYTAALVSGTATAGWTFGAGAALQIPF"
            "AMQMAYRFNGIGVTQNVLYENQKQIANQFNKAISQIQESLTTTSTALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLN"
            "DILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLMSFPQAAPH"
            "GVVFLHVTYVPSQERNFTTAPAICHEGKAYFPREGVFVFNGTSWFITQRNFFSPQIITTDNTFVSGNCDVVIGIINNTVY"
            "DPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELGKYEQYIKWPWYVWL"
            "GFIAGLIAIVMVTILLCCMTSCCSCLKGACSCGSCCKFDEDDSEPVLKGVKLHYT"
        ),
        # "tp10" entry: mostly '-' gap characters with three short residue runs.
        "tp10": (
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "---------AGYL-------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "----------------------LGKINLK---------------------------------------------------"
            "-----------------------------------------------------------ALAALAKKIL-----------"
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "--------------------------------------------------------------------------------"
            "-------------------------------------------------------"
        ),
    }.items())
    run_all_scores(sequences)

sp|P59594|SPIKE_CVHSA  Spike glycoprotein OS=Human SARS coronavirus OX=694009 GN=S PE=1 SV=1	MFIFLLFLTLTSGSDLDRCTTFDDVQAPNYTQHTSSMRGVYYPDEIFRSDTLYLTQDLFLPFYSNVTGFHTINHTFGNPVIPFKDGIYFAATEKSNVVRG
tp10                                                                                        	----------------------------------------------------------------------------------------------------

sp|P59594|SPIKE_CVHSA  Spike glycoprotein OS=Human SARS coronavirus OX=694009 GN=S PE=1 SV=1	WVFGSTMNNKSQSVIIINNSTNVVIRACNFELCDNPFFAVSKPMGTQTHTMIFDNAFNCTFEYISDAFSLDVSEKSGNFKHLREFVFKNKDGFLYVYKGY
tp10                                                                                        	----------------------------------------------------------------------------------------------------

sp|P59594|SPIKE_CVHSA  Spike glycoprotein OS=Human SARS coronavirus OX=694009 GN=S PE=1 SV=1	QPIDVVRDLPSGFNTLKPIFKLPLGINITNFRAILTAFSPAQDIWGTSA[44m[97mA[0m[46m[97mA[0m[44m[97mY[0mFVGYLKPTTFMLKYDENGTITDAVDCSQNPLAELK