In [3]:
from sequence import *
from phylo import *

In [4]:
print(readNewick('cyp1a1.tree'))

((((CP11_DOG:0.0573186,CP11_SHEEP:0.0573186):0.0718983,(((CP11_HUMAN:0.0234765,CP11_MONKEY:0.0234765):0.0416636,((CP11_HAMSTER:0.0154805,CP11_MOUSE:0.0154805):0.0316513,CP11_RAT:0.0316513):0.0416636):0.0647976,CP11_RABIT:0.0647976):0.0718983):0.123896,CP11_GuineaPig:0.123896):0.23279,(((CP11_COD:0.0662806,CP11_FlatFish:0.0662806):0.071057,((CP11_ToadFish:0.0316513,CP11_BREAM:0.0316513):0.0489659,CP11_SCUP:0.0489659):0.071057):0.10074,CP11_TROUT:0.10074):0.23279):0.23279


In [5]:
my_tree = readNewick('cyp1a1.tree') # Read the newick file
aln = readClustalFile('cyp1a1.aln', Protein_Alphabet) # Load the alignment

mynode = my_tree.findLabel("CP11_HUMAN")
print(mynode.getSequence()) # Look at selected node after loading the tree

my_tree.putAlignment(aln)   # Associate the tree with the alignment
print(mynode.getSequence()) # Look at selected node again to see new info

None
CP11_HUMAN: AGFDTVTTAISWSLMYLVMNPRVQRKIQEELDAFILETFRHSSFIIFGMGKRKCCIGETIARWEVF


In [6]:
print("Here's the tree labeled by part of sequence:") 
print(my_tree.strSequences(30, 40)) # Print the tree with labels being the content at columns 30-39

print("\n\nHere's the tree labeled by the whole sequence:")
print(my_tree.strSequences()) # Print the tree with labels being the content of all columns

Here's the tree labeled by part of sequence:
((((LDAFILETFR:0.0573186,LDAFILETFR:0.0573186):0.0718983,(((LDAFILETFR:0.0234765,LDAFILETFR:0.0234765):0.0416636,((LDAFILETFR:0.0154805,LDAFILETFR:0.0154805):0.0316513,LDAFILETFR:0.0316513):0.0416636):0.0647976,LDAVIMETFR:0.0647976):0.0718983):0.123896,LDAFISEVFR:0.123896):0.23279,(((IKAFIFEIFR:0.0662806,IEAFILEILR:0.0662806):0.071057,((IKAFILEIFR:0.0316513,MKAFILEIFR:0.0316513):0.0489659,MNAFILETFR:0.0489659):0.071057):0.10074,LKAFILEIFR:0.10074):0.23279):0.23279


Here's the tree labeled by the whole sequence:
((((AGFDTVTTAISWSLLYLVTNPNVQKKIQKELDAFILETFRHASFILFGLGKRKCCIGETIARLEVF:0.0573186,AGFDTVTTAISWSLLYLVTSPRVQKKIQEELDAFILETFRHSSFIIFGLGKRQCCIGEIIARLEVF:0.0573186):0.0718983,(((AGFDTVTTAISWSLMYLVMNPRVQRKIQEELDAFILETFRHSSFIIFGMGKRKCCIGETIARWEVF:0.0234765,AGFDTVTTAISWSLMYLVTNPRVQRKIQEELDAFILETFRHSSFILFGLGKRKCCIGETIARWEVF:0.0234765):0.0416636,((AGFDTVTTAISWSLMYLVTNPGVQRKIQEELDAFILETFRHSSFTLFGLGKRKCCIGETIGRLEVF:0.0154805,AGFDTVTTAISWSLMYLVTNP

In [7]:
# Extract the nodes for each clade
left_root_children = my_tree.root.left.getDescendants(transitive = True)
right_root_children = my_tree.root.right.getDescendants(transitive = True)

# Get the corresponding sequences
print("Left child")
left_list = []
right_list = []
for ele in left_root_children:
    if ele.getSequence() is not None:
        left_list.append(ele.sequence)
        print(ele.getSequence())
print("\nRight child")
for ele in right_root_children:
    if ele.getSequence() is not None:
        right_list.append(ele.sequence)
        print(ele.getSequence())

# Build two new alignments for each clade
left_aln = Alignment(left_list)
right_aln = Alignment(right_list)

# Calculate consensus sequences
print("Mammals consensus:", left_aln.getConsensus())
print("Fish consensus:", right_aln.getConsensus())

Left child
CP11_GuineaPig: AGFDTITTAISWSLLYLVMNPRIQKKIQEELDAFISEVFRYSSFTIFGLGKRRCCLGEVIGRWEVF
CP11_DOG: AGFDTVTTAISWSLLYLVTNPNVQKKIQKELDAFILETFRHASFILFGLGKRKCCIGETIARLEVF
CP11_SHEEP: AGFDTVTTAISWSLLYLVTSPRVQKKIQEELDAFILETFRHSSFIIFGLGKRQCCIGEIIARLEVF
CP11_RABIT: AGFDTVTTAISWSLMYLVTKPRIQRKIQEELDAVIMETFRHTSFLLFGLGKRKCCIGETIGRLEVF
CP11_HUMAN: AGFDTVTTAISWSLMYLVMNPRVQRKIQEELDAFILETFRHSSFIIFGMGKRKCCIGETIARWEVF
CP11_MONKEY: AGFDTVTTAISWSLMYLVTNPRVQRKIQEELDAFILETFRHSSFILFGLGKRKCCIGETIARWEVF
CP11_RAT: AGFDTITTAISWSLMYLVTNPRIQRKIQEELDAFILETFRHSSFILFGLGKRKCCIGETIGRLEVF
CP11_HAMSTER: AGFDTVTTAISWSLMYLVTNPGVQRKIQEELDAFILETFRHSSFTLFGLGKRKCCIGETIGRLEVF
CP11_MOUSE: AGFDTVTTAISWSLMYLVTNPRVQRKIQEELDAFILETFRHSSFTLFGLGKRKCCIGETIGRSEVF

Right child
CP11_TROUT: AGFDTISTALSWAVVYLVAYPEIQERLHQELKAFILEIFRHSSFLVFGMDKRRCCIGEAIGRNEVF
CP11_COD: AGFDTVSTALSWSVMYLVAHPEIQERLHQEIKAFIFEIFRHSSFMLFGMGKRRCCIGEMVARNEVF
CP11_FlatFish: AGFDTVSTALSWSVMYLVAHPEIQERLYQEIEAFILEILRHSSFMAFGMGKRRCCIGEVIARNEVY
CP11_SCUP: AGFDTISTALSWS

### Part 2 : Exploring evolutionary relationships in yeast MalS proteins

In [11]:
import csv
yeasts_list = []
yeasts = dict()
with open('sugars.csv', 'rt') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        if row[0] != "Yeast": #Ignore the header
            yeasts_list.append(row[0])
            #Update dictionary where key is species (row[0])
            #Value is list of booleans 
            yeasts[row[0]] = [y == 'True' for y in row[1:]]
print(yeasts)
print("\nyeasts_list: ",yeasts_list)

{'S.cerevisiae': [True, True, True, True, True, True, True, True, True], 'S.paradoxus': [True, True, True, True, True, True, True, True, True], 'S.mikatae': [True, True, True, True, False, True, True, True, True], 'S.kudriavzevii': [True, True, True, True, True, True, True, True, True], 'S.bayanus': [True, True, True, True, True, True, True, True, False], 'S.castelli': [False, True, False, False, False, True, False, False, False], 'C.glabrata': [False, True, False, False, False, False, False, False, False], 'K.polysporus': [False, True, False, False, False, False, False, False, True], 'K.thermotolerans': [True, True, True, True, True, False, True, True, True], 'L.waltii': [True, True, True, False, True, True, False, True, True], 'S.kluyverii': [True, True, False, False, True, False, False, True, True], 'K.lactis': [True, True, True, True, True, False, False, True, True], 'A.gossypii': [True, False, False, False, False, False, False, False, False], 'L.elongisporus': [True, True, True, T

In [12]:
# read fasta file
seqs = readFastaFile('MalS.fa')

Here we need to run multiple times, until the result settle

In [16]:
# remove unmatched seq
for item in seqs:
    if item.name not in yeasts_list:
        seqs.remove(item)
print(len(seqs))

31


In [17]:
for seq in seqs:
    # print(seq.name, len(seq), len(seq.alphabet))
    seq.name = seq.info.replace(" ", "_")
    print(seq.name)

# save filtered seqs to a new fasta file
writeFastaFile('select.fa', seqs)

K.thermotolerans_GI:255711056
S.kudriavzevii_IFO1802_c1888
S.mikatae_IFO1815_c789
S.paradoxus_N_45
S.cerevisiae_YPS606
S.cerevisiae_S288c_IMA2
S.cerevisiae_S288c_IMA4
S.cerevisiae_S288c_IMA3
S.paradoxus_UFRJ50791
S.bayanus_MYC623
S.kudriavzevii_IFO1802_c1565
S.mikatae_IFO1815_c633
S.cerevisiae_S288c_IMA1
S.paradoxus_CBS432
S.cerevisiae_YIIc17_E5
S.cerevisiae_MAL32
S.cerevisiae_MAL12
S.mikatae_IFO1815_c203
S.kudriavzevii_IFO1802_c1970
K.thermotolerans_GI:255715867
S.kluyverii_SAKL0A05698g
S.kluyverii_SAKL0A05654g
S.kluyverii_SAKL0A00154g
S.kluyverii_SAKL0C00176g
S.cerevisiae_S288c_IMA5
S.cerevisiae_273614N
S.paradoxus_DBVPG6304
K.thermotolerans_GI:255719660
K.thermotolerans_GI:255719187
K.lactis_GI:50312678
L.elongisporus_GI:149243808


### Exercise 3 : Infer the phylogenetic relationships among select MalS proteins

Below is the code to generate a tree for the CYP1A1 alignment using the UPGMA method.

In [18]:
aln = readClustalFile('cyp1a1.aln', Protein_Alphabet)
my_tree = runUPGMA(aln, 'fractional')
print(my_tree)
writeNewickFile('my_cyp1a1.nwk', my_tree)

((CP11_GuineaPig:0.10511363636363635,((CP11_SHEEP:0.05303030303030303,CP11_DOG:0.05303030303030303):0.011994949494949496,((((CP11_HAMSTER:0.015151515151515152,CP11_MOUSE:0.015151515151515152):0.015151515151515152,CP11_RAT:0.030303030303030304):0.00883838383838384,(CP11_HUMAN:0.022727272727272728,CP11_MONKEY:0.022727272727272728):0.016414141414141416):0.01994949494949494,CP11_RABIT:0.05909090909090908):0.005934343434343445):0.040088383838383826):0.06856762065095401,(((CP11_FlatFish:0.06060606060606061,CP11_COD:0.06060606060606061):0.0037878787878787845,(CP11_SCUP:0.045454545454545456,(CP11_ToadFish:0.030303030303030304,CP11_BREAM:0.030303030303030304):0.015151515151515152):0.018939393939393936):0.023484848484848483,CP11_TROUT:0.08787878787878788):0.08580246913580249):0.0


In [20]:
aln = readClustalFile('MalS_select.aln', Protein_Alphabet)
my_tree = runUPGMA(aln, 'poisson')
print(my_tree)
writeNewickFile('my_MalS.nwk', my_tree)

(((((((S_kluyverii_SAKL0A05654g:0.07203228321923372,S_kluyverii_SAKL0A05698g:0.07203228321923372):0.06425830176061018,(S_kluyverii_SAKL0C00176g:0.06497678563831935,S_kluyverii_SAKL0A00154g:0.06497678563831935):0.07131379934152454):0.052365614190002435,((S_cerevisiae_S288c_IMA5:0.019321631290719628,S_cerevisiae_273614N:0.019321631290719628):0.02303670380766093,S_paradoxus_DBVPG6304:0.04235833509838056):0.14629786407146578):0.01731863257403715,((((S_kudriavzevii_IFO1802_c1888:0.05269600536442903,(S_paradoxus_N_45:0.026587381243628062,S_mikatae_IFO1815_c789:0.026587381243628062):0.02610862412080097):0.008401221173280016,(S_kudriavzevii_IFO1802_c1565:0.03983314818577777,(S_bayanus_MYC623:0.03733663677140231,(S_cerevisiae_S288c_IMA1:0.03577531263635547,((((S_cerevisiae_S288c_IMA4:-0.0,S_cerevisiae_S288c_IMA3:-0.0):0.003585053446167813,(S_cerevisiae_YPS606:0.0017889106739419833,S_cerevisiae_S288c_IMA2:0.0017889106739419833):0.0017961427722258299):0.016455352326200396,S_paradoxus_UFRJ50791:0.

to generate a Newick string of the MalS tree where the tips are labeled with the sequence at positions 65-70.

In [21]:
my_tree.putAlignment(aln)
# Print the tree with labels being the content at columns 65-70
print(my_tree.strSequences(65, 70))

(((((((LGVDA:0.07203228321923372,LGVDA:0.07203228321923372):0.06425830176061018,(LGVDA:0.06497678563831935,VGIDA:0.06497678563831935):0.07131379934152454):0.052365614190002435,((LGVDA:0.019321631290719628,LGVDA:0.019321631290719628):0.02303670380766093,LGVDA:0.04235833509838056):0.14629786407146578):0.01731863257403715,((((LGVDA:0.05269600536442903,(LGADA:0.026587381243628062,LGADA:0.026587381243628062):0.02610862412080097):0.008401221173280016,(LGADA:0.03983314818577777,(LGADA:0.03733663677140231,(LGADA:0.03577531263635547,((((LGTDA,LGTDA):0.003585053446167813,(LGADA:0.0017889106739419833,LGADA:0.0017889106739419833):0.0017961427722258299):0.016455352326200396,LGADA:0.02004040577236821):0.009193855048833528,LGADA:0.029234260821201737):0.006541051815153735):0.0015613241350468376):0.0024965114143754574):0.02126407835193128):0.022189684911931032,LGADA:0.08328691144964008):0.08034817571029375,((LGADA:0.07606190129054118,((((LGVDA:0.0008984728383508152,LGVDA:0.0008984728383508152):0.001350

### Exercise 4 : Ancestral sequence reconstruction of MalS proteins

In [22]:
aln.writeHTML('MalS_select.html')

'<html><head><meta content="text/html; charset=ISO-8859-1" http-equiv="Content-Type">\n<title>Sequence Alignment</title>\n</head><body><pre>\n                                       1         2         3         4         5         6         7         8         9         1         1         1         1         1         1         1         1         1         1         2         2         2         2         2         2         2         2         2         2         3         3         3         3         3         3         3         3         3         3         4         4         4         4         4         4         4         4         4         4         5         5         5         5         5         5         5         5         5         5         600\n                                       0         0         0         0         0         0         0         0         0         0         1         2         3         4         5         6         7         8         9    

In [46]:
extracted_list = []
for i in range(len(aln)):
    extracted = aln[i][172]+aln[i][231]+aln[i][232]+aln[i][233]+aln[i][234]+aln[i][294]+aln[i][295]+aln[i][324]+aln[i][437]
    seq = Sequence(name='seq', sequence=extracted, alphabet=Protein_wGAP , gappy=True)
    extracted_list.append(seq)
print(extracted_list)

[<sequence.Sequence object at 0x000002BBDE9063A0>, <sequence.Sequence object at 0x000002BBDE9064C0>, <sequence.Sequence object at 0x000002BBDE9063D0>, <sequence.Sequence object at 0x000002BBDE906460>, <sequence.Sequence object at 0x000002BBDE906490>, <sequence.Sequence object at 0x000002BBDE9064F0>, <sequence.Sequence object at 0x000002BBDE906520>, <sequence.Sequence object at 0x000002BBDE906550>, <sequence.Sequence object at 0x000002BBDE9065B0>, <sequence.Sequence object at 0x000002BBDE9065E0>, <sequence.Sequence object at 0x000002BBDE906610>, <sequence.Sequence object at 0x000002BBDE906640>, <sequence.Sequence object at 0x000002BBDE906670>, <sequence.Sequence object at 0x000002BBDE9066A0>, <sequence.Sequence object at 0x000002BBDE9066D0>, <sequence.Sequence object at 0x000002BBDE906700>, <sequence.Sequence object at 0x000002BBDE906730>, <sequence.Sequence object at 0x000002BBDE906760>, <sequence.Sequence object at 0x000002BBDE906790>, <sequence.Sequence object at 0x000002BBDE9067C0>,

In [45]:
print(extracted_list[0])
test_aln = Alignment(extracted_list)

YGSLYQHVI
