# Purpose

Format the new sequences and their metadata in the same fashion as the IRD data, so that both can be combined into a single dataset.

In [156]:
import pandas as pd
import numpy as np
from Bio import SeqIO
from random import sample
from collections import defaultdict

In [166]:
# Materialise every record of the IRD FASTA file as a list so that it can be
# counted here and reused by later cells.
ird_sequences = list(SeqIO.parse('ird_seqs_name_as_accession.fasta', 'fasta'))
len(ird_sequences)

21868

In [170]:
# Materialise every record of the new Alaska waterfowl FASTA file as a list
# and report how many records it holds.
new_sequences = list(SeqIO.parse('Alaska_waterfowl_79viruses_seqs_20151223.fasta', 'fasta'))
len(new_sequences)

647

In [172]:
# Join the IRD records and the new records (IRD first) into one list and
# write them out as a single FASTA file. SeqIO.write is left as the last
# expression so the cell displays the number of records written.
combined_sequences = ird_sequences + new_sequences
SeqIO.write(combined_sequences, '20160104_brandt Sequences.fasta', 'fasta')

22515

In [None]:
# Write the IRD-like table for the sequences. There should be 2648 rows at the end.
# The IRD tables have the following columns:
# - Segment (number)
# - Protein Name (string)
# - Sequence Accession (alphanumeric)
# - Complete Genome (Yes/No)
# - Segment Length (number)
# - Subtype (alphanumeric)
# - Collection Date (date)
# - Host Species (string)
# - Country (string)
# - State/Province (string)
# - Flu Season (alphanumeric)
# - Strain Name (alphanumeric)

In [159]:
# Load the IRD metadata table.
# na_filter=False disables pandas' missing-value detection, so every cell is
# kept as the literal text found in the file (presumably so that the protein
# name 'NA' is not turned into NaN -- TODO confirm).
ird_df = pd.read_csv(
    '20160104_brandt_ird.csv',
    parse_dates=['Collection Date'],
    na_filter=False,
)

# Strain name: keep only the text before the first '(' (this drops the
# trailing subtype annotation), then replace spaces with underscores.
strain_without_subtype = ird_df['Strain Name'].str.split('(').str[0]
ird_df['Strain Name'] = strain_without_subtype.str.replace(' ', '_')

# Host species: keep the text between the first and second ':' (the original
# notes describe this as removing the "IRD" prefix), then strip "/Avian".
host_after_prefix = ird_df['Host Species'].str.split(':').str[1]
ird_df['Host Species'] = host_after_prefix.str.replace('/Avian', '')

ird_df

Unnamed: 0,Segment,Protein Name,Sequence Accession,Complete Genome,Segment Length,Subtype,Collection Date,Host Species,Country,State/Province,Flu Season,Strain Name
0,7,"M1,M2",KU289739,Yes,1027,H7N3,2010-02-18,American Black Duck,USA,Delaware,09-10,A/American_black_duck/Delaware/A00870108/2010
1,5,NP,KU289741,Yes,1540,H7N3,2010-02-18,American Black Duck,USA,Delaware,09-10,A/American_black_duck/Delaware/A00870108/2010
2,8,"NS1,NS2",KU289742,Yes,865,H7N3,2010-02-18,American Black Duck,USA,Delaware,09-10,A/American_black_duck/Delaware/A00870108/2010
3,3,"PA,PA-N155,PA-N182,PA-X protein(+61)",KU289743,Yes,2208,H7N3,2010-02-18,American Black Duck,USA,Delaware,09-10,A/American_black_duck/Delaware/A00870108/2010
4,1,PB2,KU289745,Yes,2307,H7N3,2010-02-18,American Black Duck,USA,Delaware,09-10,A/American_black_duck/Delaware/A00870108/2010
5,4,HA,KU289738,Yes,1706,H7N3,2010-02-18,American Black Duck,USA,Delaware,09-10,A/American_black_duck/Delaware/A00870108/2010
6,6,,KU289740,Yes,1428,H7N3,2010-02-18,American Black Duck,USA,Delaware,09-10,A/American_black_duck/Delaware/A00870108/2010
7,2,"PB1,PB1-F2,PB1-N40",KU289744,Yes,2316,H7N3,2010-02-18,American Black Duck,USA,Delaware,09-10,A/American_black_duck/Delaware/A00870108/2010
8,1,PB2,CY079459,Yes,2307,H5N2,2008-10-26,American Black Duck,USA,Illinois,08-09,A/American_black_duck/Illinois/08OS2688/2008
9,8,"NS1,NS2",CY079456,Yes,852,H5N2,2008-10-26,American Black Duck,USA,Illinois,08-09,A/American_black_duck/Illinois/08OS2688/2008


In [160]:
# The sequences of the 79 new viruses that have been added to the dataset do
# not come with IRD-formatted metadata, so it is built here.
# Load their metadata and replace spaces with underscores in the strain names.
newseqs_df = pd.read_csv(
    'Alaska_waterfowl_79viruses_metadata_20151223.csv',
    parse_dates=['CollectionDate'],
).assign(Strain_name=lambda frame: frame['Strain_name'].str.replace(' ', '_'))

# Display only: the strain-indexed view is shown but not assigned, so
# newseqs_df itself keeps its default integer index.
newseqs_df.set_index('Strain_name')

Unnamed: 0_level_0,Subtype,Blinded Number,Organism Name provided by collaborator,Updated Organism Name (names that changed are in blue),Special Note_JCVIs,Complete/Draft,CEIRS Sample ID,UPDATED?,Age,Age_Brandt,...,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35
Strain_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A/northern_pintail/Interior_Alaska/10BM03314/2010,H4N4,NIGSP_CEIRS_CIP105_RUNC_00058,Influenza A virus (A/northern pintail/Interior...,Influenza A virus (A/northern pintail/Interior...,,Complete,10BM03314,Y,After hatch year,AHY,...,,,,,,,,,,
A/mallard/Interior_Alaska/10BM12027/2010,H12N5,NIGSP_CEIRS_CIP105_RUNC_00061,Influenza A virus (A/mallard/Interior Alaska/1...,Influenza A virus (A/mallard/Interior Alaska/1...,,Complete,10BM12027,Y,Hatch year,HY,...,,,,,,,,,,
A/northern_pintail/Interior_Alaska/10BM12537/2010,H12N5,NIGSP_CEIRS_CIP105_RUNC_00062,Influenza A virus (A/northern pintail/Interior...,Influenza A virus (A/northern pintail/Interior...,,Complete,10BM12537,Y,Hatch year,HY,...,,,,,,,,,,
A/mallard/Interior_Alaska/10BM13225/2010,H12N5,NIGSP_CEIRS_CIP105_RUNC_00064,Influenza A virus (A/mallard/Interior Alaska/1...,Influenza A virus (A/mallard/Interior Alaska/1...,,Complete,10BM13225,Y,Hatch year,HY,...,,,,,,,,,,
A/mallard/Interior_Alaska/10BM13335/2010,H12N5,NIGSP_CEIRS_CIP105_RUNC_00065,Influenza A virus (A/mallard/Interior Alaska/1...,Influenza A virus (A/mallard/Interior Alaska/1...,,Complete,10BM13335,Y,Hatch year,HY,...,,,,,,,,,,
A/northern_pintail/Interior_Alaska/10BM13543/2010,H12N5,NIGSP_CEIRS_CIP105_RUNC_00066,Influenza A virus (A/Northern pintail/Interior...,Influenza A virus (A/northern pintail/Interior...,,Complete,10BM13543,Y,Hatch year,HY,...,,,,,,,,,,
A/northern_pintail/Interior_Alaska/10BM15017/2010,H12N5,NIGSP_CEIRS_CIP105_RUNC_00070,Influenza A virus (A/Northern pintail/Interior...,Influenza A virus (A/northern pintail/Interior...,,Complete,10BM15017,Y,Hatch year,HY,...,,,,,,,,,,
A/American_green-winged_teal/Interior_Alaska/11BM00293/2011,H10N5,NIGSP_CEIRS_CIP105_RUNC_00074,Influenza A virus (A/American green-winged tea...,Influenza A virus (A/American green-winged tea...,,Complete,11BM00293,Y,After hatch year,AHY,...,,,,,,,,,,
A/American_green-winged_teal/Interior_Alaska/11BM00304/2011,H10N5,NIGSP_CEIRS_CIP105_RUNC_00075,Influenza A virus (A/American green-winged tea...,Influenza A virus (A/American green-winged tea...,,Complete,11BM00304,Y,After hatch year,AHY,...,,,,,,,,,,
A/mallard/Interior_Alaska/11BM01009/2011,H12N5,NIGSP_CEIRS_CIP105_RUNC_00076,Influenza A virus (A/mallard/Interior Alaska/1...,Influenza A virus (A/mallard/Interior Alaska/1...,,Complete,11BM01009,Y,After hatch year,AHY,...,,,,,,,,,,


In [161]:
# Parse the new FASTA records into a list and divide the record count by 8
# (the number of influenza A genome segments). A non-integer result means the
# file does not hold a whole multiple of eight records.
newseqs = list(SeqIO.parse('Alaska_waterfowl_79viruses_seqs_20151223.fasta', 'fasta'))
n_new_records = len(newseqs)
n_new_records / 8

80.875

In [162]:
# Segment number -> segment name.
segnum_name = {
    1: 'PB2',
    2: 'PB1',
    3: 'PA',
    4: 'HA',
    5: 'NP',
    6: 'NA',
    7: 'M',
    8: 'NS',
}

# Reverse lookup: segment name -> segment number.
name_segnum = {name: number for number, name in segnum_name.items()}
name_segnum

{'HA': 4, 'M': 7, 'NA': 6, 'NP': 5, 'NS': 8, 'PA': 3, 'PB1': 2, 'PB2': 1}

In [163]:
# Build one IRD-style metadata record per new FASTA sequence.
#
# Each record id is split on '|': field 0 is used as the strain name and
# field 1 as the segment name. Subtype, collection date and host species are
# looked up in the new-sequence metadata table by strain name; the remaining
# columns are filled with fixed placeholder values.

# Index the metadata by strain name once. The lookup table does not change
# inside the loop, so there is no need to rebuild it for every field of every
# sequence.
newseqs_by_strain = newseqs_df.set_index('Strain_name')

newstrains_data = []
for s in newseqs:
    id_fields = s.id.split('|')
    strain_name = id_fields[0]
    segment_name = id_fields[1]
    d = dict()
    # Raises KeyError if the segment name is not one of the eight known names.
    d['Segment'] = name_segnum[segment_name]
    d['Protein Name'] = segment_name
    d['Sequence Accession'] = 'Not assigned'
    d['Complete Genome'] = 'Unconfirmed'
    d['Segment Length'] = len(s.seq)
    # Label-based lookup with .loc; the .ix indexer used previously is
    # deprecated and no longer exists in current pandas releases.
    strain_metadata = newseqs_by_strain.loc[strain_name]
    d['Subtype'] = strain_metadata['Subtype']
    d['Collection Date'] = strain_metadata['CollectionDate']
    d['Host Species'] = strain_metadata['Species_final']
    d['Country'] = 'USA'
    d['State/Province'] = 'Alaska'
    d['Flu Season'] = np.nan
    d['Strain Name'] = strain_name
    newstrains_data.append(d)
newstrains_df = pd.DataFrame(newstrains_data)

In [164]:
# Sanity check: records built per 8 genome segments. A non-integer value
# means the record count is not a whole multiple of eight.
n_new_strain_records = len(newstrains_data)
n_new_strain_records / 8

80.875

In [165]:
# Stack the IRD table and the new-strain table into one frame with a fresh
# 0..n-1 index. pd.concat is used because DataFrame.append is deprecated and
# has been removed from current pandas releases.
final_df = pd.concat([ird_df, newstrains_df]).reset_index(drop=True)

# Replace every protein name (IRD rows carry values such as "M1,M2") with the
# plain segment name derived from the segment number. An unknown segment
# number raises KeyError rather than being silently passed through.
final_df['Protein Name'] = final_df['Segment'].apply(lambda x: segnum_name[x])
# Overwrite the accession of every row, IRD rows included, with
# "<strain name>|<segment name>".
final_df['Sequence Accession'] = final_df['Strain Name'].str.cat(final_df['Protein Name'], sep='|')
# NOTE: the segment-6 protein name is the literal string 'NA'. pandas'
# default CSV reader treats 'NA' as missing, so read this file back with
# na_filter=False (or keep_default_na=False) to preserve it.
final_df.to_csv('20160104_brandt Sequences.csv')
final_df

Unnamed: 0,Collection Date,Complete Genome,Country,Flu Season,Host Species,Protein Name,Segment,Segment Length,Sequence Accession,State/Province,Strain Name,Subtype
0,2010-02-18,Yes,USA,09-10,American Black Duck,M,7,1027,A/American_black_duck/Delaware/A00870108/2010|M,Delaware,A/American_black_duck/Delaware/A00870108/2010,H7N3
1,2010-02-18,Yes,USA,09-10,American Black Duck,NP,5,1540,A/American_black_duck/Delaware/A00870108/2010|NP,Delaware,A/American_black_duck/Delaware/A00870108/2010,H7N3
2,2010-02-18,Yes,USA,09-10,American Black Duck,NS,8,865,A/American_black_duck/Delaware/A00870108/2010|NS,Delaware,A/American_black_duck/Delaware/A00870108/2010,H7N3
3,2010-02-18,Yes,USA,09-10,American Black Duck,PA,3,2208,A/American_black_duck/Delaware/A00870108/2010|PA,Delaware,A/American_black_duck/Delaware/A00870108/2010,H7N3
4,2010-02-18,Yes,USA,09-10,American Black Duck,PB2,1,2307,A/American_black_duck/Delaware/A00870108/2010|PB2,Delaware,A/American_black_duck/Delaware/A00870108/2010,H7N3
5,2010-02-18,Yes,USA,09-10,American Black Duck,HA,4,1706,A/American_black_duck/Delaware/A00870108/2010|HA,Delaware,A/American_black_duck/Delaware/A00870108/2010,H7N3
6,2010-02-18,Yes,USA,09-10,American Black Duck,,6,1428,A/American_black_duck/Delaware/A00870108/2010|NA,Delaware,A/American_black_duck/Delaware/A00870108/2010,H7N3
7,2010-02-18,Yes,USA,09-10,American Black Duck,PB1,2,2316,A/American_black_duck/Delaware/A00870108/2010|PB1,Delaware,A/American_black_duck/Delaware/A00870108/2010,H7N3
8,2008-10-26,Yes,USA,08-09,American Black Duck,PB2,1,2307,A/American_black_duck/Illinois/08OS2688/2008|PB2,Illinois,A/American_black_duck/Illinois/08OS2688/2008,H5N2
9,2008-10-26,Yes,USA,08-09,American Black Duck,NS,8,852,A/American_black_duck/Illinois/08OS2688/2008|NS,Illinois,A/American_black_duck/Illinois/08OS2688/2008,H5N2
