In [1]:
import pandas as pd
import numpy as np
from itertools import combinations, chain
from sklearn.decomposition import PCA
from scipy.spatial.distance import braycurtis

import holoviews as hv
import bokeh.io

hv.extension('bokeh')
bokeh.io.output_notebook()

# Town labels
town_dict = {'(48.6326, 3.4845)':['Louan-Villegruis','LO','France'],
             '(44.3203, 3.0658)':['Sévérac-le-Château',' MC','France'],
             '(47.4532, 0.5949)':['Angers','AN','France'],
             '(46.3764, 6.1202)':['Divonne les Bains','DB','France'],
             '(43.3529, 1.447)':['Espelette','ES','France'],
             '(48.659, 6.1415)':['Nancy','NA','France'],
             '(48.7922, 8.6354)':['Langenbrand','SL','Germany'],
             '(51.0017, 7.0383)':['Cologne-Bonn','CB','Germany']}

town_dict2 = {}

for i in town_dict:
    town_dict2[i] = ', '.join([town_dict[i][0],town_dict[i][-1]])

In [2]:
alldata = pd.read_csv('data/filtData.csv')

alldata.head()

Unnamed: 0,taxonomy,run,reads,low tax,geo_loc_name,latitude,longitude,sample,coord,Phylum
0,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.000481,o__Bacteroidales,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
1,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.080886,g__Bacteroides,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
2,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.073182,s__acidifaciens,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
3,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.155513,s__uniformis,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
4,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.000963,g__Parabacteroides,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes


In [3]:
transit_df = pd.read_csv('data/transittimes.csv')

transit_df.head()

Unnamed: 0,loc1,loc2,dist (km),adj dist (au)
0,"Divonne les Bains, France","Espelette, France",498.513044,113.470674
1,"Divonne les Bains, France","Sévérac-le-Château, France",330.474243,330.474243
2,"Cologne-Bonn, Germany","Louan-Villegruis, France",366.533095,17.157575
3,"Louan-Villegruis, France","Sévérac-le-Château, France",480.574395,71.453359
4,"Divonne les Bains, France","Louan-Villegruis, France",319.546489,59.667409


In [4]:
pca_df = alldata.loc[:,['taxonomy','sample','reads']]

pca_df = pca_df.pivot(columns='taxonomy',index='sample')
pca_df = pca_df.droplevel(0,axis=1)

pca_df.head()

taxonomy,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__acidifaciens,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__uniformis,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Parabacteroides;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__S24-7;g__;s__,Root;k__Bacteria;p__Deferribacteres;c__Deferribacteres;o__Deferribacterales;f__Deferribacteraceae;g__Mucispirillum;s__schaedleri,Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__,...,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus];s__gnavus,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__;s__,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Oscillospira;s__,Root;k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Flexispira;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Helicobacter;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Helicobacter;s__apodemus,Root;k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__;s__
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERS194036,0.000481,0.080886,0.073182,0.155513,0.000963,0.015888,0.002889,0.279249,0.001926,0.001926,...,0.00337,0.022629,0.10207,0.114107,0.0,0.0,0.0,0.002407,0.001926,0.000481
ERS194037,0.0,0.031843,0.026332,0.264544,0.000306,0.0,0.007961,0.38763,0.0,0.000306,...,0.00398,0.011635,0.00643,0.000612,0.0,0.0,0.04654,0.0,0.000306,0.0
ERS194038,0.050063,0.043179,0.001877,0.209324,0.000626,0.006571,0.005632,0.055069,0.000626,0.0,...,0.005006,0.072278,0.154881,0.000626,0.033792,0.00438,0.191802,0.001877,0.005632,0.0
ERS194039,0.001012,0.022267,0.362348,0.078947,0.003036,0.0,0.318826,0.072874,0.003036,0.003036,...,0.003036,0.013158,0.009109,0.0,0.001012,0.002024,0.002024,0.0,0.0,0.001012
ERS194040,0.0,0.01886,0.384994,0.114391,0.00205,0.0,0.297253,0.0328,0.00574,0.00779,...,0.00082,0.01722,0.00984,0.0,0.0,0.00287,0.00697,0.00205,0.00738,0.00041


In [5]:
pca_taxon = PCA(n_components=2)
pc_analysis = pca_taxon.fit_transform(pca_df)

x = [i[0] for i in pc_analysis]
y = [i[1] for i in pc_analysis]
samp = pca_df.index

pc_df = pd.DataFrame(list(zip(samp,x,y)),columns=['sample','PC1','PC2'])

for i in pc_df.index:
    sample = pc_df.loc[i,'sample']
    subdf = alldata[alldata['sample'] == sample]
    
    pc_df.loc[i,'coord'] = subdf.iloc[0]['coord']

pc_df.head()

  pc_df.loc[i,'coord'] = subdf.iloc[0]['coord']


Unnamed: 0,sample,PC1,PC2,coord
0,ERS194036,0.040307,-0.147742,"(47.4532, 0.5949)"
1,ERS194037,-0.038629,-0.236362,"(47.4532, 0.5949)"
2,ERS194038,-0.072992,0.038153,"(47.4532, 0.5949)"
3,ERS194039,0.100357,-0.067975,"(47.4532, 0.5949)"
4,ERS194040,0.129284,-0.045646,"(47.4532, 0.5949)"


In [6]:
hv.Scatter(pc_df,
          kdims=['PC1'],
          vdims=['PC2','sample','coord']
          ).opts(height=500,
                 width=800,
                 legend_position='right',
                 color='coord',
                 cmap='Category10',
                 tools=['hover'])

In [7]:
sample_pairs = list(combinations(pca_df.index,2))
sample_pairs = [list(i) for i in sample_pairs]

pca_df.loc[sample_pairs[0]]

taxonomy,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__acidifaciens,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__uniformis,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Parabacteroides;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__S24-7;g__;s__,Root;k__Bacteria;p__Deferribacteres;c__Deferribacteres;o__Deferribacterales;f__Deferribacteraceae;g__Mucispirillum;s__schaedleri,Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__,...,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus];s__gnavus,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__;s__,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Oscillospira;s__,Root;k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Flexispira;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Helicobacter;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Helicobacter;s__apodemus,Root;k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__;s__
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERS194036,0.000481,0.080886,0.073182,0.155513,0.000963,0.015888,0.002889,0.279249,0.001926,0.001926,...,0.00337,0.022629,0.10207,0.114107,0.0,0.0,0.0,0.002407,0.001926,0.000481
ERS194037,0.0,0.031843,0.026332,0.264544,0.000306,0.0,0.007961,0.38763,0.0,0.000306,...,0.00398,0.011635,0.00643,0.000612,0.0,0.0,0.04654,0.0,0.000306,0.0


In [8]:
bc_ls = []

for sp in sample_pairs:
    subdf = pca_df.loc[sp]
    bc_dist = braycurtis(subdf.iloc[0],subdf.iloc[1])
    
    bc_ls.append((sp[0],sp[1],bc_dist))
    
bc_df = pd.DataFrame(bc_ls,columns=['sample1','sample2','braycurtis'])

bc_df.head()

Unnamed: 0,sample1,sample2,braycurtis
0,ERS194036,ERS194037,0.383489
1,ERS194036,ERS194038,0.504865
2,ERS194036,ERS194039,0.642923
3,ERS194036,ERS194040,0.643051
4,ERS194036,ERS194041,0.698617


In [9]:
for i in bc_df.index:
    sample = bc_df.loc[i,'sample1']
    subdf = alldata[alldata['sample'] == sample]
    
    bc_df.loc[i,'coord1'] = subdf.iloc[0]['coord']
    
    sample = bc_df.loc[i,'sample2']
    subdf = alldata[alldata['sample'] == sample]
    
    bc_df.loc[i,'coord2'] = subdf.iloc[0]['coord']

bc_df.head()

  bc_df.loc[i,'coord1'] = subdf.iloc[0]['coord']
  bc_df.loc[i,'coord2'] = subdf.iloc[0]['coord']


Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2
0,ERS194036,ERS194037,0.383489,"(47.4532, 0.5949)","(47.4532, 0.5949)"
1,ERS194036,ERS194038,0.504865,"(47.4532, 0.5949)","(47.4532, 0.5949)"
2,ERS194036,ERS194039,0.642923,"(47.4532, 0.5949)","(47.4532, 0.5949)"
3,ERS194036,ERS194040,0.643051,"(47.4532, 0.5949)","(47.4532, 0.5949)"
4,ERS194036,ERS194041,0.698617,"(47.4532, 0.5949)","(47.4532, 0.5949)"


In [10]:
for i in bc_df.index:
    coord = bc_df.loc[i,'coord1']
    bc_df.loc[i,'loc1'] = town_dict2[coord]
    
    coord = bc_df.loc[i,'coord2']
    bc_df.loc[i,'loc2'] = town_dict2[coord]

bc_df.head()

  bc_df.loc[i,'loc1'] = town_dict2[coord]
  bc_df.loc[i,'loc2'] = town_dict2[coord]


Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2,loc1,loc2
0,ERS194036,ERS194037,0.383489,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
1,ERS194036,ERS194038,0.504865,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
2,ERS194036,ERS194039,0.642923,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
3,ERS194036,ERS194040,0.643051,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
4,ERS194036,ERS194041,0.698617,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"


In [11]:
transit = transit_df.copy()

transit['loc12'] = transit['loc1'] + transit['loc2']
transit.set_index('loc12',inplace=True)

bc_locs = list(zip(bc_df['loc1'],bc_df['loc2']))
bc_locs = [np.sort(i) for i in bc_locs]
bc_locs = [''.join(i) for i in bc_locs]

bc_df['loc12'] = bc_locs 

bc_df.head()

Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2,loc1,loc2,loc12
0,ERS194036,ERS194037,0.383489,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
1,ERS194036,ERS194038,0.504865,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
2,ERS194036,ERS194039,0.642923,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
3,ERS194036,ERS194040,0.643051,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
4,ERS194036,ERS194041,0.698617,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"


In [12]:
for i in np.unique(bc_df['loc12']):
    subdf = bc_df[bc_df['loc12'] == i]
    
    bc_avg = np.mean(subdf['braycurtis'])
    bc_std = np.std(subdf['braycurtis'])
    
    transit.loc[i,'avg braycurtis'] = bc_avg
    transit.loc[i,'std braycurtis'] = bc_std
    
transit.head()

Unnamed: 0_level_0,loc1,loc2,dist (km),adj dist (au),avg braycurtis,std braycurtis
loc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Divonne les Bains, FranceEspelette, France","Divonne les Bains, France","Espelette, France",498.513044,113.470674,0.762022,0.170915
"Divonne les Bains, FranceSévérac-le-Château, France","Divonne les Bains, France","Sévérac-le-Château, France",330.474243,330.474243,0.741006,0.167194
"Cologne-Bonn, GermanyLouan-Villegruis, France","Cologne-Bonn, Germany","Louan-Villegruis, France",366.533095,17.157575,0.706124,0.160358
"Louan-Villegruis, FranceSévérac-le-Château, France","Louan-Villegruis, France","Sévérac-le-Château, France",480.574395,71.453359,0.668665,0.159226
"Divonne les Bains, FranceLouan-Villegruis, France","Divonne les Bains, France","Louan-Villegruis, France",319.546489,59.667409,0.754587,0.197596


In [13]:
for i in transit.index:
    if type(transit.loc[i,'loc1']) == str:
        pass
    else:
        location = i.split(', ')
        location = ', '.join([location[0],location[-1]])
        
        transit.loc[i,'loc1'] = location
        transit.loc[i,'loc2'] = location
        
        transit.loc[i,'dist (km)'] = 0
        transit.loc[i,'adj dist (au)'] = 0
        
transit.tail()

Unnamed: 0_level_0,loc1,loc2,dist (km),adj dist (au),avg braycurtis,std braycurtis
loc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Espelette, FranceEspelette, France","Espelette, France","Espelette, France",0.0,0.0,0.699556,0.174781
"Langenbrand, GermanyLangenbrand, Germany","Langenbrand, Germany","Langenbrand, Germany",0.0,0.0,0.61548,0.167082
"Louan-Villegruis, FranceLouan-Villegruis, France","Louan-Villegruis, France","Louan-Villegruis, France",0.0,0.0,0.681945,0.203802
"Nancy, FranceNancy, France","Nancy, France","Nancy, France",0.0,0.0,0.726954,0.184673
"Sévérac-le-Château, FranceSévérac-le-Château, France","Sévérac-le-Château, France","Sévérac-le-Château, France",0.0,0.0,0.613801,0.16625


In [28]:
dist_plot = hv.Scatter(transit,
                      kdims=['dist (km)'],
                      vdims=['avg braycurtis','loc1','loc2']
                      ).opts(height=400,
                             width=400,
                             size=5,
                             alpha=0.5,
                            color='blue',
                            tools=['hover'])

dist_error = hv.ErrorBars(list(zip(transit['dist (km)'],transit['avg braycurtis'],transit['std braycurtis'])))

dist_plot = dist_plot * dist_error

adj_plot = hv.Scatter(transit,
                      kdims=['adj dist (au)'],
                      vdims=['avg braycurtis','loc1','loc2']
                      ).opts(height=400,
                             width=400,
                             size=5,
                             alpha=0.5,
                             xlim=(-90,400),
                             title = 'Full Taxonomy Bray-Curtis',
                            color='tomato',
                            tools=['hover'])

adj_error = hv.ErrorBars(list(zip(transit['adj dist (au)'],transit['avg braycurtis'],transit['std braycurtis'])))

adj_plot = adj_plot * adj_error

adj_plot + dist_plot

## Clustering by phylum only

In [15]:
pca_df = alldata.loc[:,['Phylum','sample','reads']]

pca_df = pca_df.groupby(['Phylum','sample']).reads.sum().reset_index()

pca_df = pca_df.pivot(columns='Phylum',index='sample')

pca_df = pca_df.droplevel(0,axis=1)

pca_df.head()

Phylum,p__Bacteroidetes,p__Deferribacteres,p__Firmicutes,p__Proteobacteria
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ERS194036,0.609052,0.001926,0.384208,0.004815
ERS194037,0.718616,0.0,0.234538,0.046846
ERS194038,0.37234,0.000626,0.389549,0.237484
ERS194039,0.859312,0.003036,0.131579,0.006073
ERS194040,0.850349,0.00574,0.124231,0.01968


In [16]:
pca_taxon = PCA(n_components=2)
pc_analysis = pca_taxon.fit_transform(pca_df)

x = [i[0] for i in pc_analysis]
y = [i[1] for i in pc_analysis]
samp = pca_df.index

pc_df = pd.DataFrame(list(zip(samp,x,y)),columns=['sample','PC1','PC2'])

for i in pc_df.index:
    sample = pc_df.loc[i,'sample']
    subdf = alldata[alldata['sample'] == sample]
    
    pc_df.loc[i,'coord'] = subdf.iloc[0]['coord']

pc_df.head()

  pc_df.loc[i,'coord'] = subdf.iloc[0]['coord']


Unnamed: 0,sample,PC1,PC2,coord
0,ERS194036,-0.145801,0.216901,"(47.4532, 0.5949)"
1,ERS194037,-0.248232,0.05665,"(47.4532, 0.5949)"
2,ERS194038,0.171264,0.119846,"(47.4532, 0.5949)"
3,ERS194039,-0.408608,-0.022347,"(47.4532, 0.5949)"
4,ERS194040,-0.395754,-0.034806,"(47.4532, 0.5949)"


In [17]:
hv.Scatter(pc_df,
          kdims=['PC1'],
          vdims=['PC2','sample','coord']
          ).opts(height=500,
                 width=800,
                 size=5,
                 alpha=0.5,
                 legend_position='right',
                 color='coord',
                 cmap='Category10',
                 tools=['hover'])

In [18]:
sample_pairs = list(combinations(pca_df.index,2))
sample_pairs = [list(i) for i in sample_pairs]

pca_df.loc[sample_pairs[0]]

Phylum,p__Bacteroidetes,p__Deferribacteres,p__Firmicutes,p__Proteobacteria
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ERS194036,0.609052,0.001926,0.384208,0.004815
ERS194037,0.718616,0.0,0.234538,0.046846


In [19]:
bc_ls = []

for sp in sample_pairs:
    subdf = pca_df.loc[sp]
    bc_dist = braycurtis(subdf.iloc[0],subdf.iloc[1])
    
    bc_ls.append((sp[0],sp[1],bc_dist))
    
bc_df = pd.DataFrame(bc_ls,columns=['sample1','sample2','braycurtis'])

bc_df.head()

Unnamed: 0,sample1,sample2,braycurtis
0,ERS194036,ERS194037,0.151596
1,ERS194036,ERS194038,0.238011
2,ERS194036,ERS194039,0.252629
3,ERS194036,ERS194040,0.259977
4,ERS194036,ERS194041,0.295459


In [20]:
for i in bc_df.index:
    sample = bc_df.loc[i,'sample1']
    subdf = alldata[alldata['sample'] == sample]
    
    bc_df.loc[i,'coord1'] = subdf.iloc[0]['coord']
    
    sample = bc_df.loc[i,'sample2']
    subdf = alldata[alldata['sample'] == sample]
    
    bc_df.loc[i,'coord2'] = subdf.iloc[0]['coord']

bc_df.head()

  bc_df.loc[i,'coord1'] = subdf.iloc[0]['coord']
  bc_df.loc[i,'coord2'] = subdf.iloc[0]['coord']


Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2
0,ERS194036,ERS194037,0.151596,"(47.4532, 0.5949)","(47.4532, 0.5949)"
1,ERS194036,ERS194038,0.238011,"(47.4532, 0.5949)","(47.4532, 0.5949)"
2,ERS194036,ERS194039,0.252629,"(47.4532, 0.5949)","(47.4532, 0.5949)"
3,ERS194036,ERS194040,0.259977,"(47.4532, 0.5949)","(47.4532, 0.5949)"
4,ERS194036,ERS194041,0.295459,"(47.4532, 0.5949)","(47.4532, 0.5949)"


In [21]:
for i in bc_df.index:
    coord = bc_df.loc[i,'coord1']
    bc_df.loc[i,'loc1'] = town_dict2[coord]
    
    coord = bc_df.loc[i,'coord2']
    bc_df.loc[i,'loc2'] = town_dict2[coord]

bc_df.head()

  bc_df.loc[i,'loc1'] = town_dict2[coord]
  bc_df.loc[i,'loc2'] = town_dict2[coord]


Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2,loc1,loc2
0,ERS194036,ERS194037,0.151596,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
1,ERS194036,ERS194038,0.238011,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
2,ERS194036,ERS194039,0.252629,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
3,ERS194036,ERS194040,0.259977,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
4,ERS194036,ERS194041,0.295459,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"


In [22]:
transit = transit_df.copy()

transit['loc12'] = transit['loc1'] + transit['loc2']
transit.set_index('loc12',inplace=True)

bc_locs = list(zip(bc_df['loc1'],bc_df['loc2']))
bc_locs = [np.sort(i) for i in bc_locs]
bc_locs = [''.join(i) for i in bc_locs]

bc_df['loc12'] = bc_locs 

bc_df.head()

Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2,loc1,loc2,loc12
0,ERS194036,ERS194037,0.151596,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
1,ERS194036,ERS194038,0.238011,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
2,ERS194036,ERS194039,0.252629,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
3,ERS194036,ERS194040,0.259977,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
4,ERS194036,ERS194041,0.295459,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"


In [23]:
for i in np.unique(bc_df['loc12']):
    subdf = bc_df[bc_df['loc12'] == i]
    
    bc_avg = np.mean(subdf['braycurtis'])
    bc_std = np.std(subdf['braycurtis'])
    
    transit.loc[i,'avg braycurtis'] = bc_avg
    transit.loc[i,'std braycurtis'] = bc_std
    
transit.head()

Unnamed: 0_level_0,loc1,loc2,dist (km),adj dist (au),avg braycurtis,std braycurtis
loc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Divonne les Bains, FranceEspelette, France","Divonne les Bains, France","Espelette, France",498.513044,113.470674,0.486552,0.249475
"Divonne les Bains, FranceSévérac-le-Château, France","Divonne les Bains, France","Sévérac-le-Château, France",330.474243,330.474243,0.446084,0.238867
"Cologne-Bonn, GermanyLouan-Villegruis, France","Cologne-Bonn, Germany","Louan-Villegruis, France",366.533095,17.157575,0.469347,0.228518
"Louan-Villegruis, FranceSévérac-le-Château, France","Louan-Villegruis, France","Sévérac-le-Château, France",480.574395,71.453359,0.420841,0.211914
"Divonne les Bains, FranceLouan-Villegruis, France","Divonne les Bains, France","Louan-Villegruis, France",319.546489,59.667409,0.45877,0.289337


In [24]:
for i in transit.index:
    if type(transit.loc[i,'loc1']) == str:
        pass
    else:
        location = i.split(', ')
        location = ', '.join([location[0],location[-1]])
        
        transit.loc[i,'loc1'] = location
        transit.loc[i,'loc2'] = location
        
        transit.loc[i,'dist (km)'] = 0
        transit.loc[i,'adj dist (au)'] = 0
        
transit.tail()

Unnamed: 0_level_0,loc1,loc2,dist (km),adj dist (au),avg braycurtis,std braycurtis
loc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Espelette, FranceEspelette, France","Espelette, France","Espelette, France",0.0,0.0,0.448535,0.205724
"Langenbrand, GermanyLangenbrand, Germany","Langenbrand, Germany","Langenbrand, Germany",0.0,0.0,0.300602,0.145839
"Louan-Villegruis, FranceLouan-Villegruis, France","Louan-Villegruis, France","Louan-Villegruis, France",0.0,0.0,0.41992,0.251573
"Nancy, FranceNancy, France","Nancy, France","Nancy, France",0.0,0.0,0.47377,0.279929
"Sévérac-le-Château, FranceSévérac-le-Château, France","Sévérac-le-Château, France","Sévérac-le-Château, France",0.0,0.0,0.372103,0.183137


In [27]:
dist_plot = hv.Scatter(transit,
                      kdims=['dist (km)'],
                      vdims=['avg braycurtis','loc1','loc2']
                      ).opts(height=400,
                             width=400,
                             size=5,
                             alpha=0.5,
                            color='blue',
                            tools=['hover'])

dist_error = hv.ErrorBars(list(zip(transit['dist (km)'],transit['avg braycurtis'],transit['std braycurtis'])))

dist_plot = dist_plot * dist_error

adj_plot = hv.Scatter(transit,
                      kdims=['adj dist (au)'],
                      vdims=['avg braycurtis','loc1','loc2']
                      ).opts(height=400,
                             width=400,
                             size=5,
                             alpha=0.5,
                             xlim=(-90,400),
                              title = 'Phylum Bray-Curtis',
                            color='tomato',
                            tools=['hover'])

adj_error = hv.ErrorBars(list(zip(transit['adj dist (au)'],transit['avg braycurtis'],transit['std braycurtis'])))

adj_plot = adj_plot * adj_error

adj_plot + dist_plot

In [26]:
%load_ext watermark
%watermark -v --iversions

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.15.0

bokeh    : 3.2.1
pandas   : 2.1.1
holoviews: 1.18.0
numpy    : 1.24.3

