# Augur for Influenza A - HA protein

In [1]:
import json

## Metadata is required

Making metadata by parsing FASTA

In [60]:
from Bio import SeqIO
import re


records = []
for record in SeqIO.parse("../data/fluHA_sequences.fasta", "fasta"):
    records.append(record.description)

# Can I compile these regexes together? 

# Get strains
strainsearch = re.compile('Strain Name:(?P<strain>.*\d{2}|d{4})')

# Get ids
idsearch = re.compile('gb:(?P<id>[A-Z]+\d+[:]\d+[-]\d{4})')
# idsearch_exact = re.compile('(?P<id>gb:[A-Z]+\d+[:]\d+[-]\d{4})')

strainsList = []
idList = []
for record in records:
        i = idsearch.search(record)
#         i = idsearch_exact.search(record)
        id = i.group('id')
        idList.append(id)

        s = strainsearch.search(record)
        strain = s.group('strain')
        strainsList.append(strain)


# Get years
yearsSearch = re.compile('(?P<year>\d{4}$|\d{2}$)')
yearsList = []
for strain in strainsList:
    y = yearsSearch.search(strain)
    year = y.group('year')
    yearsList.append(year)
    
# Fix dates such that all years are 4 digits 
index = 0
for year in yearsList:
    if len(year) == 2:
        yearsList[index] = '19' + year
    index += 1

# Fix dates such that dates are presented in yyyy-mm-dd format with X's where N/A
index = 0
for year in yearsList:
    yearsList[index] = year + '-XX' + '-XX'
    index += 1


# Concatenate strain strings with id strings for unique identifiers
newID = []
counter = 0
for id in idList:
    newID.append(id + "|" + strainsList[counter])
    
    # Remove spaces
    if " " in newID[counter]:
        newID[counter] = newID[counter].replace(" ", "")
    counter += 1

# for id in newID:
#     print(id)
# metadata has columns:
# strain	virus	accession	date	region	country	division	city	db	segment


In [61]:
from Bio import SeqIO

original_file = r"../data/fluHA_sequences.fasta"
corrected_file = r"../data/corrected.fasta"

position = 0
with open(original_file) as original, open(corrected_file, 'w') as corrected:
    records = SeqIO.parse(original_file, 'fasta')
    for record in records:
#         print(record.id)
        record.id = newID[position]
        record.description = newID[position]
        position += 1
#         print(record.id)             
        SeqIO.write(record, corrected, 'fasta')

In [62]:
import pandas as pd

# metadata_id = list(zip(fasta_ids, strainsList, yearsList))
# metadata = list(zip(idList, strainsList, yearsList))
metadata = list(zip(newID, yearsList))

# df = pd.DataFrame(data = metadata, columns=['id', 'strain', 'year'])
df = pd.DataFrame(data = metadata, columns=['strain', 'date'])

df.to_csv('../data/metadata.tsv',sep='\t',index=False,header=True)

In [9]:
!mkdir -p ../results/

!augur filter \
  --sequences ../data/corrected.fasta \
  --metadata ../data/metadata.tsv \
  --exclude None \
  --output ../results/filtered.fasta \
  --group-by year \
  --sequences-per-group 2 \
  --min-date 1918

### Does pairing down the zika metadata result in the same error?

In [11]:
zika = pd.read_csv("../data/metadata-Copy1.tsv", sep='\t')

col_list = ['strain','date']
newmeta = zika[col_list]
newmeta.to_csv('../data/new-zika-metadata.tsv',sep='\t',index=False,header=True)


In [12]:
!mkdir -p ../results/

!augur filter \
  --sequences ../data/sequences-Copy1.fasta \
  --metadata ../data/new-zika-metadata.tsv \
  #--exclude zika-tutorial/config/dropped_strains.txt \
  --output ../results/filtered.fasta \
#   --group-by year \
#   --sequences-per-group 2 \
#   --min-date 1918

Traceback (most recent call last):
  File "/home/gboyle/.local/bin//augur", line 11, in <module>
    sys.exit(main())
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/__main__.py", line 10, in main
    return augur.run( argv[1:] )
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/__init__.py", line 66, in run
    return args.__command__.run(args)
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/filter.py", line 293, in run
    SeqIO.write(seq_to_keep, args.output, 'fasta')
  File "/home/gboyle/.local/lib/python3.6/site-packages/Bio/SeqIO/__init__.py", line 529, in write
    fp.write(format_function(record))
AttributeError: 'NoneType' object has no attribute 'write'


### How much of the metadata can we remove and still have it survive the `augur filter` command?

Can we remove a single item from the metadata file?

Removed paper_url

In [13]:
zika = pd.read_csv("../data/metadata-Copy1.tsv", sep='\t')

newmeta = zika.drop(['paper_url'], axis=1)
newmeta.to_csv('../data/new-zika-metadata.tsv',sep='\t',index=False,header=True)
# metadata has columns:
# strain	virus	accession	date	region	country	division	city	db	segment	authors	url	title	journal	paper_url

In [14]:
!mkdir -p ../results/

!augur filter \
  --sequences ../data/sequences-Copy1.fasta \
  --metadata ../data/new-zika-metadata.tsv \
  #--exclude zika-tutorial/config/dropped_strains.txt \
  --output ../results/filtered.fasta \
#   --group-by year \
#   --sequences-per-group 2 \
#   --min-date 1918

Traceback (most recent call last):
  File "/home/gboyle/.local/bin//augur", line 11, in <module>
    sys.exit(main())
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/__main__.py", line 10, in main
    return augur.run( argv[1:] )
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/__init__.py", line 66, in run
    return args.__command__.run(args)
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/filter.py", line 293, in run
    SeqIO.write(seq_to_keep, args.output, 'fasta')
  File "/home/gboyle/.local/lib/python3.6/site-packages/Bio/SeqIO/__init__.py", line 529, in write
    fp.write(format_function(record))
AttributeError: 'NoneType' object has no attribute 'write'


Turns out `augur filter` just didn't like that there was no `--exclude` command. `--exclude None` resolves the issue.

In [23]:
# Test with zika-tutorial data 
!mkdir -p ../results/

!augur filter \
  --sequences ../data/sequences-Copy1.fasta \
  --metadata ../data/metadata-Copy1.tsv \
  --exclude None#zika-tutorial/config/dropped_strains.txt \
  --output ../results/filtered.fasta \
  --group-by year \
  --sequences-per-group 20 \
  --min-date 2012

### Resume augur

In [10]:
!augur align \
  --sequences ../results/filtered.fasta \
  --output ../results/aligned.fasta 


using mafft to align via:
	mafft --reorder --anysymbol --thread 1 ../results/filtered.fasta 1> ../results/aligned.fasta 2> ../results/aligned.fasta.log 

	Katoh et al, Nucleic Acid Research, vol 30, issue 14
	https://doi.org/10.1093%2Fnar%2Fgkf436



In [11]:
!augur tree \
  --alignment ../results/aligned.fasta \
  --output ../results/tree_raw.nwk

Building a tree via:
	iqtree -ninit 2 -n 2 -me 0.05 -nt 1 -s ../results/aligned-delim.fasta -m GTR > ../results/aligned-delim.iqtree.log
	Nguyen et al: IQ-TREE: A fast and effective stochastic algorithm for estimating maximum likelihood phylogenies.
	Mol. Biol. Evol., 32:268-274. https://doi.org/10.1093/molbev/msu300

Building original tree took 0.8095102310180664 seconds


In [12]:
from ete3 import Tree


tree_raw = Tree("../results/tree_raw.nwk")
print(tree_raw)


   /-CY021709_19-1719|A/AA/Huston/1945
  |
  |   /-CY020285_11-1711|A/AA/Marton/1943
  |--|
  |   \-CY020461_11-1708|A/Iowa/1943
  |
  |         /-CY009276_19-1719|A/Bel/1942
  |      /-|
  |     |   \-CY146769_19-1719|A/Bellamy/JY2/1942
  |     |
  |     |                  /-CY019955_11-1708|A/Alaska/1935
  |     |               /-|
  |     |            /-|   \-Z54287_1-1728|A/Mongolia/153/88
  |     |           |  |
  |     |         /-|   \-CY146881_4-1701|A/PuertoRico/8-CV10/1934
  |   /-|        |  |
  |  |  |      /-|   \-CY146857_4-1701|A/PuertoRico/8-SV40/1934
  |  |  |     |  |
  |  |  |   /-|   \-CY020445_11-1711|A/Henry/1936
  |  |  |  |  |
--|  |  |  |   \-CY009324_19-1719|A/Melbourne/1935
  |  |  |  |
  |  |  |  |         /-U08903_1-1746|A/NWS/1933
  |  |   \-|      /-|
  |  |     |   /-|   \-CY010788_20-1717|A/WSN/1933TS61
  |  |     |  |  |
  |  |     |  |   \-CY045756_1-1701|A/UnitedKingdom/1/1933
  |  |     |  |
  |  |     |  |      /-S62154_33-1733|A/AlmaAta/1417/84


In [4]:
!augur refine \
  --tree ../results/tree_raw.nwk \
  --alignment ../results/aligned.fasta \
  --metadata ../data/metadata.tsv \
  --output-tree ../results/tree.nwk \
  --output-node-data ../results/branch_lengths.json \
  --timetree \
  --coalescent opt \
  --date-confidence \
  --date-inference joint 
#   --clock-filter-iqd 4



































0.21	ERROR: At least 30\% terminal nodes cannot be assigned with a sequence!



0.26	-ERROR: ALMOST NO VALID DATE CONSTRAINTS, EXITING
Traceback (most recent call last):
  File "/home/gboyle/.local/bin//augur", line 11, in <module>
    sys.exit(main())
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/__main__.py", line 10, in main
    return augur.run( argv[1:] )
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/__init__.py", line 66, in run
    return args.__command__.run(args)
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/refine.py", line 187, in run
    clock_filter_iqd=args.clock_filter_iqd)
  File "/home/gboyle/.local/lib/python3.6/site-packages/augur/refine.py", line 36, in refine
    verbose=verbosity, gtr='JC69')
  File "/home/gboyle/.local/lib/python3.6/site-packages/treetime/treetime.py", line 31, in __init__
    super(TreeTime, self).__init__(*args, **kwargs)
  File "/home/gboyle/.local/lib/python3.6/