In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import brewer2mpl
import colorsys
import math
import dendropy as dp
import json
import os

from datetime import datetime
from Bio import AlignIO, SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Levenshtein import distance
from itertools import combinations, product, permutations
from time import time
from __future__ import division
from collections import Counter, defaultdict
from copy import deepcopy
from random import shuffle, choice, sample
from scipy.stats.mstats import mquantiles
from scipy.stats import norm, expon, poisson, binom
from scipy.misc import comb
from IPython.display import Math
from networkx.readwrite import json_graph 

%matplotlib inline

In [6]:
# Directory that holds the MDS coordinate CSVs (and receives the plots below).
dir_handle = 'mds_coordinates'

# Map the integer prefix of each filename (the text before the first '_') to
# the filename itself; only names containing both 'avg' and 'csv' are kept.
coordinate_files = dict(
    (int(fname.split('_')[0]), fname)
    for fname in os.listdir(dir_handle)
    if 'avg' in fname and 'csv' in fname
)

In [56]:
# Integer code assigned to each host category; 'Other' is the catch-all.
host_codes = dict(Avian=0, Swine=1, Human=2, Environment=3, Other=4)

# Reverse lookup: integer code -> host category name.
code_hosts = dict((code, host) for host, code in host_codes.items())

def get_host_code(x, host_codes):
    """Translate a host label into its integer code.

    Parameters
    ----------
    x : str, or a missing value such as NaN / None
        Host label. When it contains '/', only the second '/'-separated
        field is looked up.
    host_codes : dict
        Mapping of host name -> code. Its 'Other' entry is the fallback.

    Returns
    -------
    The code of the matched host, or ``host_codes['Other']`` when the label
    is not a key of ``host_codes`` or is not a string at all.
    """
    # Missing entries may reach this function as NaN (a float) or None, on
    # which `'/' in x` raises TypeError. Classify them as 'Other' instead.
    if not hasattr(x, 'split'):
        return host_codes['Other']

    if '/' in x:
        x = x.split('/')[1]

    # Membership is tested on the dict directly; no need to build .keys().
    if x in host_codes:
        return host_codes[x]
    return host_codes['Other']
    
def get_state(x):
    """Pull one field out of a '/'-delimited name, chosen by field count.

    A name with five fields yields its third field; a name with four fields
    yields its second. Any other field count returns None.
    """
    fields = x.split('/')
    # field count -> index of the field to return
    field_index = {5: 2, 4: 1}.get(len(fields))
    return None if field_index is None else fields[field_index]

In [45]:
# Load the sequence metadata; 'Collection Date' is parsed as datetimes on read.
masterfile = pd.read_csv('20150112_All_IRD_HA_Sequences.csv', parse_dates=['Collection Date'])

# --- Clean the raw text columns -------------------------------------------
# Remove '*' characters from both ends of each accession.
masterfile['Sequence Accession'] = masterfile['Sequence Accession'].str.strip('*')
# Keep only the part of the strain name before the first '('.
masterfile['Strain Name'] = masterfile['Strain Name'].apply(
    lambda name: name.split('(')[0] if '(' in name else name)
# Keep the second ':'-separated field of the host string.
masterfile['Host Species'] = masterfile['Host Species'].str.split(':').str.get(1)

# --- Derived columns ------------------------------------------------------
masterfile['Host Code'] = masterfile['Host Species'].apply(get_host_code, args=(host_codes,))
masterfile['Year'] = masterfile['Collection Date'].apply(lambda stamp: stamp.year)
# Re-derived from the strain name; replaces any 'State/Province' column read from the CSV.
masterfile['State/Province'] = masterfile['Strain Name'].apply(get_state)

# Index rows by accession so other frames can be joined against this table.
masterfile = masterfile.set_index('Sequence Accession')

In [46]:
# Display the metadata table, now indexed by 'Sequence Accession', to check the derived columns.
masterfile

Unnamed: 0_level_0,Name,Complete Genome,Segment,Segment Length,Subtype,Collection Date,Host Species,Country,State/Province,Flu Season,Strain Name,Host Code,Year
Sequence Accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
K00429,HA,No,4,1730,H7N7,2009-06-03,,USA,Mass,-N/A-,A/seal/Mass/1/1980,4,2009
CY083910,HA,Yes,4,1752,H1N1,2009-11-23,Human,Denmark,Aalborg,10-Sep,A/Aalborg/INS132/2009,2,2009
CY063606,HA,Yes,4,1734,H1N1,2009-12-02,Human,Denmark,Aalborg,10-Sep,A/Aalborg/INS133/2009,2,2009
CY083776,HA,No,4,1748,H1N1,2009-11-25,Human,Denmark,Aalborg,10-Sep,A/Aalborg/INS282/2009,2,2009
CY073725,HA,Yes,4,1734,H1N1,2009-12-07,Human,Denmark,Aalborg,10-Sep,A/Aalborg/INS283/2009,2,2009
CY062691,HA,Yes,4,1734,H1N1,2009-11-24,Human,Denmark,Aarhus,10-Sep,A/Aarhus/INS116/2009,2,2009
CY062699,HA,Yes,4,1734,H1N1,2009-11-27,Human,Denmark,Aarhus,10-Sep,A/Aarhus/INS118/2009,2,2009
CY066935,HA,Yes,4,1734,H1N1,2009-11-19,Human,Denmark,Aarhus,10-Sep,A/Aarhus/INS236/2009,2,2009
CY066943,HA,Yes,4,1734,H1N1,2009-11-23,Human,Denmark,Aarhus,10-Sep,A/Aarhus/INS237/2009,2,2009
CY066951,HA,Yes,4,1734,H1N1,2009-11-24,Human,Denmark,Aarhus,10-Sep,A/Aarhus/INS238/2009,2,2009


In [57]:
# Color MDS plots by host: one scatter plot per coordinate file, saved as a PDF
# in the same directory the coordinates were read from.
host_code_colormap = {0:'r', 1:'b', 2:'g', 3:'orange', 4:'purple'}
# Iterate in sorted key order so the progress printout is deterministic.
for i, f in sorted(coordinate_files.items()):
    print(i, f)
    coords = pd.read_csv(os.path.join(dir_handle, f), index_col=0)
    # Index-on-index join; presumably the coordinate index holds sequence
    # accessions matching masterfile's index -- TODO confirm.
    coords_joined = coords.join(masterfile)
    # Draw on an explicit figure/axes pair instead of pyplot's implicit state.
    fig = plt.figure(i)
    ax = fig.add_subplot(111)
    for group, data in coords_joined.groupby('Host Code'):
        ax.scatter(data['x'], data['y'], color=host_code_colormap[group], label=code_hosts[group])
    ax.legend()
    ax.set_title('Subgraph {0} MDS'.format(i))
    # Write under dir_handle rather than a hard-coded 'mds_coordinates/', so
    # inputs and outputs stay together if dir_handle is changed.
    fig.savefig(os.path.join(dir_handle, '{0}_mds_by_host.pdf'.format(i)))
    # Close this specific figure so figures do not accumulate across iterations.
    plt.close(fig)

(0, '0_mds_avg_coordinates.csv')
(1, '1_mds_avg_coordinates.csv')
(2, '2_mds_avg_coordinates.csv')
(3, '3_mds_avg_coordinates.csv')
(4, '4_mds_avg_coordinates.csv')
(5, '5_mds_avg_coordinates.csv')
(6, '6_mds_avg_coordinates.csv')
(7, '7_mds_avg_coordinates.csv')
(8, '8_mds_avg_coordinates.csv')
(9, '9_mds_avg_coordinates.csv')
(10, '10_mds_avg_coordinates.csv')
(11, '11_mds_avg_coordinates.csv')
(12, '12_mds_avg_coordinates.csv')
(13, '13_mds_avg_coordinates.csv')
(14, '14_mds_avg_coordinates.csv')
(15, '15_mds_avg_coordinates.csv')
(16, '16_mds_avg_coordinates.csv')
(17, '17_mds_avg_coordinates.csv')
(18, '18_mds_avg_coordinates.csv')
(19, '19_mds_avg_coordinates.csv')
(20, '20_mds_avg_coordinates.csv')
(21, '21_mds_avg_coordinates.csv')
(22, '22_mds_avg_coordinates.csv')
(23, '23_mds_avg_coordinates.csv')
(24, '24_mds_avg_coordinates.csv')
(25, '25_mds_avg_coordinates.csv')
(26, '26_mds_avg_coordinates.csv')
(27, '27_mds_avg_coordinates.csv')
(28, '28_mds_avg_coordinates.csv')
(29, 

In [82]:
# Color MDS plots by collection year.
#
# NOTE(review): `norm` shadows scipy.stats.norm imported in the first cell, and
# `cmap_red` actually holds the 'gist_rainbow' colormap. Both names are kept
# because later cells read them; rename them together if this is cleaned up.
norm = plt.Normalize()
# autoscale() only takes the min and max of the data, so no sort is needed.
norm.autoscale(masterfile['Year'].values)

cmap_red = plt.get_cmap('gist_rainbow')

for i, f in coordinate_files.items():
    coords = pd.read_csv(os.path.join(dir_handle, f), index_col=0)
    coords_joined = coords.join(masterfile)
    # Draw on an explicit figure/axes pair instead of pyplot's implicit state.
    fig = plt.figure(i, figsize=(12, 12))
    ax = fig.add_subplot(111)
    for group, data in coords_joined.groupby('Year'):
        ax.scatter(data['x'], data['y'], color=cmap_red(norm(group)), label=group)
    ax.set_title('Subgraph {0} MDS by Year'.format(i))
    ax.legend()
    # Write under dir_handle rather than a hard-coded 'mds_coordinates/', so
    # inputs and outputs stay together if dir_handle is changed.
    fig.savefig(os.path.join(dir_handle, '{0}_mds_by_year.pdf'.format(i)))
    # Close this specific figure so figures do not accumulate across iterations.
    plt.close(fig)

In [74]:
# Re-run the autoscale of the existing `norm` object over the sorted Year values.
year_values = sorted(masterfile['Year'].values)
norm.autoscale(year_values)


<matplotlib.colors.Normalize at 0x136af1e10>

In [77]:
# Print, for every distinct Year in the table, the colour tuple that the
# current `norm` / `cmap_red` pair assigns to it.
for year, _rows in masterfile.groupby('Year'):
    rgba = cmap_red(norm(year))
    print(year, rgba)

(1980, (0.0, 0.0, 0.0, 1.0))
(1981, (0.25622745098039218, 0.0, 0.29279215686274512, 1.0))
(1982, (0.47845294117647058, 0.0, 0.54507058823529408, 1.0))
(1983, (0.51501764705882347, 0.0, 0.58169019607843131, 1.0))
(1984, (0.34507647058823532, 0.0, 0.62354117647058815, 1.0))
(1985, (0.052284313725490472, 0.0, 0.66016078431372538, 1.0))
(1986, (0.0, 0.0, 0.77258235294117639, 1.0))
(1987, (0.0, 0.036603921568627448, 0.86670000000000003, 1.0))
(1988, (0.0, 0.3294352941176471, 0.86670000000000003, 1.0))
(1989, (0.0, 0.50067843137254897, 0.86670000000000003, 1.0))
(1990, (0.0, 0.5843176470588235, 0.86670000000000003, 1.0))
(1991, (0.0, 0.62877254901960777, 0.78042549019607865, 1.0))
(1992, (0.0, 0.66669999999999996, 0.65885294117647075, 1.0))
(1993, (0.0, 0.66669999999999996, 0.58561372549019608, 1.0))
(1994, (0.0, 0.6510058823529411, 0.40781764705882351, 1.0))
(1995, (0.0, 0.61438627450980388, 0.11502549019607844, 1.0))
(1996, (0.0, 0.65488823529411766, 0.0, 1.0))
(1997, (0.0, 0.7385313725490