# Augur for Influenza A - HA protein

In [1]:
import json

## Metadata is required

Build the metadata table by parsing the header lines of the FASTA file.

In [60]:
from Bio import SeqIO
import re


# Gather the full description line of every sequence in the input FASTA.
records = [
    entry.description
    for entry in SeqIO.parse("../data/fluHA_sequences.fasta", "fasta")
]

# Can I compile these regexes together? 

# Get strains
# Greedy match: the strain runs from "Strain Name:" to the last pair of
# consecutive digits on the header line, so it always ends in digits (the
# year-extraction step further down relies on that). The former "|d{4}"
# alternative was missing its backslash and could only ever match the literal
# text "dddd", so it is dropped. Raw strings keep "\d" from being read as an
# invalid string escape.
strainsearch = re.compile(r'Strain Name:(?P<strain>.*\d{2})')

# Get ids
# Text after "gb:": capital letters + digits, ":", digits, "-", four digits.
idsearch = re.compile(r'gb:(?P<id>[A-Z]+\d+[:]\d+[-]\d{4})')

strainsList = []
idList = []
for record in records:
    id_match = idsearch.search(record)
    strain_match = strainsearch.search(record)

    # Fail with the offending header instead of an opaque AttributeError on None.
    if id_match is None or strain_match is None:
        raise ValueError(
            "Could not parse id/strain from FASTA header: " + repr(record)
        )

    # Both lists are appended in lockstep so they stay index-aligned.
    idList.append(id_match.group('id'))
    strainsList.append(strain_match.group('strain'))


# Get years
# Preferred rule: the digits at the start of the last "/"-separated field, so
# that a suffix after the year (as in the tip label "A/WSN/1933TS61") is not
# mistaken for the year itself.
lastFieldYear = re.compile(r'/(?P<year>\d{4}|\d{2})(?!\d)[^/]*$')
# Fallback (the original rule): the last four, else the last two, characters
# of the strain name when they are digits.
yearsSearch = re.compile(r'(?P<year>\d{4}$|\d{2}$)')

yearsList = []
for strain in strainsList:
    y = lastFieldYear.search(strain) or yearsSearch.search(strain)
    if y is None:
        raise ValueError("No year found in strain name: " + repr(strain))
    year = y.group('year')

    # Fix dates such that all years are 4 digits.
    # NOTE(review): two-digit years are assumed to belong to the 1900s;
    # confirm the dataset holds no two-digit years from another century.
    if len(year) == 2:
        year = '19' + year

    # Present dates in yyyy-mm-dd format with X's where month/day are N/A.
    yearsList.append(year + '-XX' + '-XX')


# Concatenate id strings with strain strings for unique identifiers.
#
# Spaces are removed, and ":" is replaced by "_". The id pattern captures a
# ":" (e.g. "CY021709:19-1719"), but ":" is a reserved character in Newick and
# the tip labels printed from the tree files in this notebook show it
# rewritten to "_" ("CY021709_19-1719|..."). Normalising it here keeps the
# names in the FASTA, the metadata table and the trees identical.
newID = [
    (seq_id + "|" + strain_name).replace(" ", "").replace(":", "_")
    for seq_id, strain_name in zip(idList, strainsList)
]

# for id in newID:
#     print(id)
# metadata has columns:
# strain	virus	accession	date	region	country	division	city	db	segment


In [61]:
from Bio import SeqIO

original_file = r"../data/fluHA_sequences.fasta"
corrected_file = r"../data/corrected.fasta"

# Copy every sequence to corrected_file, replacing its header with the
# matching entry of newID (same order as the records in original_file).
written = 0
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    # Parse from the handle that is already open rather than re-opening the path.
    for position, record in enumerate(SeqIO.parse(original, 'fasta')):
        if position >= len(newID):
            raise ValueError(
                "FASTA holds more sequences than generated identifiers ("
                + str(len(newID)) + ")"
            )
        # id and description are set to the same text so the written header
        # is exactly the new identifier.
        record.id = newID[position]
        record.description = newID[position]
        SeqIO.write(record, corrected, 'fasta')
        written += 1

# A shorter FASTA would otherwise go unnoticed and leave metadata rows
# without a sequence.
if written != len(newID):
    raise ValueError(
        "Wrote " + str(written) + " sequences but have "
        + str(len(newID)) + " identifiers"
    )

In [62]:
import pandas as pd

# Two-column metadata table: the unique sequence name ('strain') paired with
# its partially known date string ('date').
metadata = [(name, date) for name, date in zip(newID, yearsList)]

df = pd.DataFrame(metadata, columns=['strain', 'date'])

# Tab-separated, with a header row and without the pandas index column.
df.to_csv(
    '../data/metadata.tsv',
    sep='\t',
    index=False,
    header=True,
)

## Tree with 2 sequences per group

Tree takes 0.82 seconds to build

In [5]:
# Build a raw tree from the 2-sequences-per-group alignment.
# The captured log below shows augur running IQ-TREE for this step.
!augur tree \
  --alignment ../results/aligned-2.fasta \
  --output ../results/tree_raw-2.nwk

Building a tree via:
	iqtree -ninit 2 -n 2 -me 0.05 -nt 1 -s ../results/aligned-2-delim.fasta -m GTR > ../results/aligned-2-delim.iqtree.log
	Nguyen et al: IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies.
	Mol. Biol. Evol., 32:268-274. https://doi.org/10.1093/molbev/msu300

Building original tree took 0.8197963237762451 seconds


In [6]:
from ete3 import Tree


# Load the raw Newick tree (2 sequences per group) and print its text rendering.
newick_path = "../results/tree_raw-2.nwk"
tree_raw = Tree(newick_path)
print(tree_raw)


   /-CY021709_19-1719|A/AA/Huston/1945
  |
  |   /-CY009452_20-1720|A/Weiss/43
  |--|
  |   \-CY147366_20-1720|A/Weiss/JY2/1943
  |
  |         /-CY009276_19-1719|A/Bel/1942
  |      /-|
  |     |   \-CY146769_19-1719|A/Bellamy/JY2/1942
  |     |
  |     |         /-CY147326_20-1720|A/BH/JY2/1935
  |     |        |
  |     |      /-|      /-U08903_1-1746|A/NWS/1933
  |     |     |  |   /-|
  |     |     |   \-|   \-CY090845_20-1720|A/UnitedKingdom/1-MA/1933
  |     |     |     |
  |   /-|     |      \-CY010788_20-1717|A/WSN/1933TS61
  |  |  |   /-|
  |  |  |  |  |      /-S62154_33-1733|A/AlmaAta/1417/84
  |  |  |  |  |   /-|
  |  |  |  |  |  |  |   /-CY021957_7-1707|A/NewJersey/1976
  |  |  |  |  |  |   \-|
  |  |  |  |  |  |     |   /-CY026139_17-1717|A/Wisconsin/301/1976
  |  |  |  |   \-|      \-|
  |  |  |  |     |        |   /-U53162_1-1778|A/Wisconsin/4754/1994
  |  |  |  |     |         \-|
  |  |   \-|     |            \-U53163_1-1778|A/Wisconsin/4755/1994
  |  |     |     |
-

## Tree with 5 sequences per group

In [7]:
# Make sure the output directory exists (-p: no error if it already does).
!mkdir -p ../results/

# Subsample the renamed sequences: grouped by year, 5 sequences per group,
# with a minimum date of 1918 (as the flag names indicate).
# NOTE(review): --exclude is given the literal text "None"; presumably a
# placeholder for "no exclusion file" - confirm augur ignores it.
!augur filter \
  --sequences ../data/corrected.fasta \
  --metadata ../data/metadata.tsv \
  --exclude None \
  --output ../results/filtered-5.fasta \
  --group-by year \
  --sequences-per-group 5 \
  --min-date 1918

In [8]:
# Align the 5-per-group subsample; the captured log below shows augur
# delegating to mafft.
!augur align \
  --sequences ../results/filtered-5.fasta \
  --output ../results/aligned-5.fasta 


using mafft to align via:
	mafft --reorder --anysymbol --thread 1 ../results/filtered-5.fasta 1> ../results/aligned-5.fasta 2> ../results/aligned-5.fasta.log 

	Katoh et al, Nucleic Acid Research, vol 30, issue 14
	https://doi.org/10.1093%2Fnar%2Fgkf436



Tree takes 2.04 seconds to build

In [9]:
# Build a raw tree from the 5-sequences-per-group alignment.
# The captured log below shows augur running IQ-TREE for this step.
!augur tree \
  --alignment ../results/aligned-5.fasta \
  --output ../results/tree_raw-5.nwk

Building a tree via:
	iqtree -ninit 2 -n 2 -me 0.05 -nt 1 -s ../results/aligned-5-delim.fasta -m GTR > ../results/aligned-5-delim.iqtree.log
	Nguyen et al: IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies.
	Mol. Biol. Evol., 32:268-274. https://doi.org/10.1093/molbev/msu300

Building original tree took 2.038604736328125 seconds


In [10]:
from ete3 import Tree


# Load the raw Newick tree (5 sequences per group) and print its text rendering.
newick_path = "../results/tree_raw-5.nwk"
tree_raw = Tree(newick_path)
print(tree_raw)


   /-CY021709_19-1719|A/AA/Huston/1945
  |
  |   /-CY009452_20-1720|A/Weiss/43
  |--|
  |   \-CY147366_20-1720|A/Weiss/JY2/1943
  |
  |      /-CY020285_11-1711|A/AA/Marton/1943
  |   /-|
  |  |   \-CY020461_11-1708|A/Iowa/1943
  |  |
  |  |         /-CY009276_19-1719|A/Bel/1942
  |  |      /-|
  |  |     |   \-CY146769_19-1719|A/Bellamy/JY2/1942
  |  |     |
  |  |     |                     /-CY019955_11-1708|A/Alaska/1935
  |  |     |                  /-|
  |  |     |               /-|   \-Z54287_1-1728|A/Mongolia/153/88
  |  |     |              |  |
  |  |     |            /-|   \-CY084006_21-1718|A/PuertoRico/8-WG/1934
  |  |     |           |  |
  |  |     |           |  |   /-CY146857_4-1701|A/PuertoRico/8-SV40/1934
  |  |     |           |   \-|
  |  |     |           |     |   /-CY147494_4-1701|A/PuertoRico/8-SV8/1934
  |  |     |         /-|      \-|
  |  |     |        |  |        |   /-CY146873_1-1698|A/PuertoRico/8-SV120/1934
  |  |     |        |  |         \-|
  |  |   /

## Tree with 10 sequences per group

In [11]:
# Make sure the output directory exists (-p: no error if it already does).
!mkdir -p ../results/

# Subsample the renamed sequences: grouped by year, 10 sequences per group,
# with a minimum date of 1918 (as the flag names indicate).
# NOTE(review): --exclude is given the literal text "None"; presumably a
# placeholder for "no exclusion file" - confirm augur ignores it.
!augur filter \
  --sequences ../data/corrected.fasta \
  --metadata ../data/metadata.tsv \
  --exclude None \
  --output ../results/filtered-10.fasta \
  --group-by year \
  --sequences-per-group 10 \
  --min-date 1918

In [12]:
# Align the 10-per-group subsample; the captured log below shows augur
# delegating to mafft.
!augur align \
  --sequences ../results/filtered-10.fasta \
  --output ../results/aligned-10.fasta 


using mafft to align via:
	mafft --reorder --anysymbol --thread 1 ../results/filtered-10.fasta 1> ../results/aligned-10.fasta 2> ../results/aligned-10.fasta.log 

	Katoh et al, Nucleic Acid Research, vol 30, issue 14
	https://doi.org/10.1093%2Fnar%2Fgkf436



Tree takes 2.98 seconds to build

In [13]:
# Build a raw tree from the 10-sequences-per-group alignment.
# The captured log below shows augur running IQ-TREE for this step.
!augur tree \
  --alignment ../results/aligned-10.fasta \
  --output ../results/tree_raw-10.nwk

Building a tree via:
	iqtree -ninit 2 -n 2 -me 0.05 -nt 1 -s ../results/aligned-10-delim.fasta -m GTR > ../results/aligned-10-delim.iqtree.log
	Nguyen et al: IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies.
	Mol. Biol. Evol., 32:268-274. https://doi.org/10.1093/molbev/msu300

Building original tree took 2.984182834625244 seconds


In [14]:
from ete3 import Tree


# Load the raw Newick tree (10 sequences per group) and print its text rendering.
newick_path = "../results/tree_raw-10.nwk"
tree_raw = Tree(newick_path)
print(tree_raw)


   /-J02176_1-1775|A/WSN/1933
  |
  |   /-CY010788_20-1717|A/WSN/1933TS61
  |--|
  |   \-HE802059_33-1730|A/WSN/1933
  |
  |   /-CY034132_6-1703|A/WSN/1933
  |  |
  |  |   /-DQ508905_1-1698|A/Wilson-Smith/1933
  |  |  |
  |  |  |      /-CY096811_11-1711|A/bh/1935
  |  |  |   /-|
  |  |  |  |   \-CY147326_20-1720|A/BH/JY2/1935
  |  |  |  |
  |  |  |  |            /-U08903_1-1746|A/NWS/1933
--|  |  |  |         /-|
  |  |  |  |      /-|   \-U08904_1-1746|A/WS/1933
  |  |  |  |     |  |
  |  |  |  |   /-|   \-CY009604_20-1720|A/Wilson-Smith/1933
  |  |  |  |  |  |
  |  |  |  |  |  |   /-CY090845_20-1720|A/UnitedKingdom/1-MA/1933
  |  |  |  |  |   \-|
  |  |  |  |  |      \-CY045756_1-1701|A/UnitedKingdom/1/1933
  |  |  |  |  |
  |  |  |  |  |                                 /-CY019955_11-1708|A/Alaska/1935
  |  |  |  |  |                              /-|
  |  |  |  |  |                             |   \-Z54287_1-1728|A/Mongolia/153/88
  |  |  |  |  |                           /-|
  |  | 

In [2]:
# Make sure the output directory exists (-p: no error if it already does).
!mkdir -p ../results/

# Subsample the renamed sequences: grouped by year, 2 sequences per group,
# with a minimum date of 1918 (as the flag names indicate).
# NOTE(review): --exclude is given the literal text "None"; presumably a
# placeholder for "no exclusion file" - confirm augur ignores it.
# NOTE(review): this cell writes filtered-2.fasta, the first input of the
# 2-per-group pipeline, yet it sits after the cells that read the derived
# aligned-2.fasta - confirm the intended cell order.
!augur filter \
  --sequences ../data/corrected.fasta \
  --metadata ../data/metadata.tsv \
  --exclude None \
  --output ../results/filtered-2.fasta \
  --group-by year \
  --sequences-per-group 2 \
  --min-date 1918

In [4]:
# Align the 2-per-group subsample; the captured log below shows augur
# delegating to mafft.
!augur align \
  --sequences ../results/filtered-2.fasta \
  --output ../results/aligned-2.fasta 


using mafft to align via:
	mafft --reorder --anysymbol --thread 1 ../results/filtered-2.fasta 1> ../results/aligned-2.fasta 2> ../results/aligned-2.fasta.log 

	Katoh et al, Nucleic Acid Research, vol 30, issue 14
	https://doi.org/10.1093%2Fnar%2Fgkf436



In [4]:
# Refine a raw tree into a time-resolved tree (--timetree), writing the new
# tree plus a JSON file of node data.
# NOTE(review): no visible cell in this notebook writes
# ../results/tree_raw.nwk or ../results/aligned.fasta (the tree/align cells
# write tree_raw-N.nwk and aligned-N.fasta) - confirm these inputs are current.
# NOTE(review): the captured output of this cell ends in "ALMOST NO VALID DATE
# CONSTRAINTS"; check that the tip names in the tree match the 'strain' column
# of metadata.tsv and the sequence names in the alignment.
!augur refine \
  --tree ../results/tree_raw.nwk \
  --alignment ../results/aligned.fasta \
  --metadata ../data/metadata.tsv \
  --output-tree ../results/tree.nwk \
  --output-node-data ../results/branch_lengths.json \
  --timetree \
  --coalescent opt \
  --date-confidence \
  --date-inference joint 
#   --clock-filter-iqd 4



































0.21	ERROR: At least 30\% terminal nodes cannot be assigned with a sequence!



0.26	-ERROR: ALMOST NO VALID DATE CONSTRAINTS, EXITING
Traceback (most recent call last):
  File "/home/gboyle/.local/bin//augur", line 11, in <module>
    sys.exit(main())
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/__main__.py", line 10, in main
    return augur.run( argv[1:] )
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/__init__.py", line 66, in run
    return args.__command__.run(args)
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/refine.py", line 187, in run
    clock_filter_iqd=args.clock_filter_iqd)
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/refine.py", line 36, in refine
    verbose=verbosity, gtr='JC69')
  File "/home/gboyle/.local/lib/python3.6/site-packages/treetime/treetime.py", line 31, in __init__
    super(TreeTime, self).__init__(*args, **kwargs)
  File "/home/gboyle/.local/lib/python3.6/