# PCA and distance analysis

**Drew Honson**

**30 November 2023**

In [2]:
import pandas as pd
import numpy as np
from itertools import combinations, chain
from sklearn.decomposition import PCA
from scipy.spatial.distance import braycurtis
from sklearn.metrics import jaccard_score

import holoviews as hv
import bokeh.io

hv.extension('bokeh')
bokeh.io.output_notebook()

# Town labels, keyed by the '(latitude, longitude)' string used in the data.
# Each value is [town name, abbreviation, country].
town_dict = {
    '(48.6326, 3.4845)': ['Louan-Villegruis', 'LO', 'France'],
    '(44.3203, 3.0658)': ['Sévérac-le-Château', ' MC', 'France'],
    '(47.4532, 0.5949)': ['Angers', 'AN', 'France'],
    '(46.3764, 6.1202)': ['Divonne les Bains', 'DB', 'France'],
    '(43.3529, 1.447)': ['Espelette', 'ES', 'France'],
    '(48.659, 6.1415)': ['Nancy', 'NA', 'France'],
    '(48.7922, 8.6354)': ['Langenbrand', 'SL', 'Germany'],
    '(51.0017, 7.0383)': ['Cologne-Bonn', 'CB', 'Germany'],
}

# Human-readable 'Town, Country' label for every coordinate string.
town_dict2 = {
    coord_key: ', '.join([site_info[0], site_info[-1]])
    for coord_key, site_info in town_dict.items()
}

## Import data

In [3]:
# Load the filtered read table; every later cell reads its 'taxonomy',
# 'Phylum', 'sample', 'reads' and 'coord' columns from this frame.
alldata = pd.read_csv('data/filtData.csv')

# Preview the first rows.
alldata.head()

Unnamed: 0,taxonomy,run,reads,low tax,geo_loc_name,latitude,longitude,sample,coord,Phylum
0,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.000481,o__Bacteroidales,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
1,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.080886,g__Bacteroides,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
2,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.073182,s__acidifaciens,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
3,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.155513,s__uniformis,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes
4,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroid...,ERR197719,0.000963,g__Parabacteroides,France,47.4532,0.5949,ERS194036,"(47.4532, 0.5949)",p__Bacteroidetes


In [4]:
# Load the site-pair distance table ('loc1', 'loc2', 'dist (km)',
# 'adj dist (au)'); it is copied, not modified, by the later analysis cells.
transit_df = pd.read_csv('data/transittimes.csv')

# Preview the first rows.
transit_df.head()

Unnamed: 0,loc1,loc2,dist (km),adj dist (au)
0,"Angers, France","Espelette, France",460.751877,1246.77154
1,"Cologne-Bonn, Germany","Langenbrand, Germany",270.996044,4094.58763
2,"Angers, France","Louan-Villegruis, France",251.659635,1139.709862
3,"Cologne-Bonn, Germany","Sévérac-le-Château, France",799.990157,25814.522367
4,"Langenbrand, Germany","Louan-Villegruis, France",378.26948,1652.25272


## PCA: Full taxonomy

To begin, I performed principal component analysis using scikit-learn. The objective was to see whether clustering the data by microbial content would also cluster it by sampling site.

To start, the data is pivoted to suit the requirements of scikit-learn:

In [5]:
# Reshape to a sample x taxonomy table: one row per sample, one column per
# full taxonomy string, each cell holding that sample's 'reads' value.
pca_df = alldata[['taxonomy', 'sample', 'reads']].pivot(
    index='sample',
    columns='taxonomy',
    values='reads',
)

pca_df.head()

taxonomy,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__acidifaciens,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__uniformis,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Porphyromonadaceae;g__Parabacteroides;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Rikenellaceae;g__;s__,Root;k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__S24-7;g__;s__,Root;k__Bacteria;p__Deferribacteres;c__Deferribacteres;o__Deferribacterales;f__Deferribacteraceae;g__Mucispirillum;s__schaedleri,Root;k__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Lactobacillaceae;g__Lactobacillus;s__,...,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Lachnospiraceae;g__[Ruminococcus];s__gnavus,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__;s__,Root;k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales;f__Ruminococcaceae;g__Oscillospira;s__,Root;k__Bacteria;p__Firmicutes;c__Erysipelotrichi;o__Erysipelotrichales;f__Erysipelotrichaceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Deltaproteobacteria;o__Desulfovibrionales;f__Desulfovibrionaceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Flexispira;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__Helicobacter;s__,Root;k__Bacteria;p__Proteobacteria;c__Epsilonproteobacteria;o__Campylobacterales;f__Helicobacteraceae;g__H
elicobacter;s__apodemus,Root;k__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacteriales;f__Enterobacteriaceae;g__;s__
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERS194036,0.000481,0.080886,0.073182,0.155513,0.000963,0.015888,0.002889,0.279249,0.001926,0.001926,...,0.00337,0.022629,0.10207,0.114107,0.0,0.0,0.0,0.002407,0.001926,0.000481
ERS194037,0.0,0.031843,0.026332,0.264544,0.000306,0.0,0.007961,0.38763,0.0,0.000306,...,0.00398,0.011635,0.00643,0.000612,0.0,0.0,0.04654,0.0,0.000306,0.0
ERS194038,0.050063,0.043179,0.001877,0.209324,0.000626,0.006571,0.005632,0.055069,0.000626,0.0,...,0.005006,0.072278,0.154881,0.000626,0.033792,0.00438,0.191802,0.001877,0.005632,0.0
ERS194039,0.001012,0.022267,0.362348,0.078947,0.003036,0.0,0.318826,0.072874,0.003036,0.003036,...,0.003036,0.013158,0.009109,0.0,0.001012,0.002024,0.002024,0.0,0.0,0.001012
ERS194040,0.0,0.01886,0.384994,0.114391,0.00205,0.0,0.297253,0.0328,0.00574,0.00779,...,0.00082,0.01722,0.00984,0.0,0.0,0.00287,0.00697,0.00205,0.00738,0.00041


A three-component PCA is then performed. The PC1, PC2 and PC3 coordinates are then paired to their respective samples, and the samples are used to recover the sampling site:

In [6]:
# Project every sample onto the first three principal components of the
# sample x taxonomy read table.
pca_taxon = PCA(n_components=3)
pc_analysis = pca_taxon.fit_transform(pca_df)

# fit_transform keeps the row order of pca_df, so its index labels the rows.
pc_df = pd.DataFrame(pc_analysis, columns=['PC1', 'PC2', 'PC3'])
pc_df.insert(0, 'sample', pca_df.index)

# Attach each sample's site coordinate. drop_duplicates keeps the first row
# per sample, i.e. the same row the previous per-sample filter picked with
# .iloc[0]. One lookup plus a whole-column assignment replaces the row-by-row
# .loc writes, which rescanned alldata for every sample and wrote strings
# into a column that starts out as float NaN (pandas 2.x warns about that).
site_by_sample = alldata.drop_duplicates('sample').set_index('sample')['coord']
pc_df['coord'] = pc_df['sample'].map(site_by_sample)

pc_df.head()

  pc_df.loc[i,'coord'] = subdf.iloc[0]['coord']


Unnamed: 0,sample,PC1,PC2,PC3,coord
0,ERS194036,0.040307,-0.147742,-0.034057,"(47.4532, 0.5949)"
1,ERS194037,-0.038629,-0.236362,-0.099671,"(47.4532, 0.5949)"
2,ERS194038,-0.072992,0.038153,-0.109815,"(47.4532, 0.5949)"
3,ERS194039,0.100357,-0.067975,-0.073118,"(47.4532, 0.5949)"
4,ERS194040,0.129284,-0.045646,-0.11076,"(47.4532, 0.5949)"


Visualizing the data, there is no obvious clustering and certainly no apparent pattern based on coordinates.

In [13]:
# Styling common to both PCA panels; points are coloured by site coordinate.
pca_scatter_opts = dict(
    height=500,
    size=5,
    legend_position='right',
    color='coord',
    cmap='Category10',
    fontscale=1.2,
    tools=['hover'],
)

# PC1 vs PC2, legend hidden.
alltax12 = hv.Scatter(
    pc_df,
    kdims=['PC1'],
    vdims=['PC2', 'sample', 'coord'],
).opts(width=600, show_legend=False, **pca_scatter_opts)

# PC1 vs PC3, wider so the legend fits on the right.
alltax13 = hv.Scatter(
    pc_df,
    kdims=['PC1'],
    vdims=['PC3', 'sample', 'coord'],
).opts(width=750, **pca_scatter_opts)

alltax12 + alltax13

## Bray-Curtis Distance

I next performed a Bray-Curtis dissimilarity test as described in Linnenbrink et al. 2013. Essentially this treats every sample as an ecological site. Conventionally, Bray-Curtis distance is calculated based on a number of individual specimens from two sites. For each species found in both sites, the lower number of specimens is noted. These numbers are then summed ($C_{ij}$), doubled, and divided by the total number of specimens recorded at the two sites ($S_i + S_j$); the dissimilarity is one minus this ratio, $BC_{ij} = 1 - \frac{2C_{ij}}{S_i + S_j}$. For this analysis, normalized reads rather than specimens are used.

It is unclear to me whether this analysis is appropriate at all for sequencing data and if so what numbers should be used. Regardless, Bray-Curtis dissimilarity is very sensitive to differences in amount of data collected at each site. In this case, sequencing depth would be the main source of these errors. The depth for each sample was similar but the normalizations applied probably compress some distances and exaggerate others. It might be worth repeating this with either raw or simply depth-normalized reads.

Conveniently, scipy has a Bray-Curtis function so I did not need to code one up myself.

In [14]:
# Every ordered pair of distinct samples: both [a, b] and [b, a] are kept,
# only self-pairs are skipped.
sample_ids = pca_df.index
sample_pairs = [
    [first, second]
    for first in sample_ids
    for second in sample_ids
    if first != second
]

sample_pairs[:5]

[['ERS194036', 'ERS194037'],
 ['ERS194036', 'ERS194038'],
 ['ERS194036', 'ERS194039'],
 ['ERS194036', 'ERS194040'],
 ['ERS194036', 'ERS194041']]

In [15]:
# Bray-Curtis dissimilarity between the two read profiles of each sample pair.
bc_ls = [
    (first, second, braycurtis(pca_df.loc[first], pca_df.loc[second]))
    for first, second in sample_pairs
]

bc_df = pd.DataFrame(bc_ls, columns=['sample1', 'sample2', 'braycurtis'])

bc_df.head()

Unnamed: 0,sample1,sample2,braycurtis
0,ERS194036,ERS194037,0.383489
1,ERS194036,ERS194038,0.504865
2,ERS194036,ERS194039,0.642923
3,ERS194036,ERS194040,0.643051
4,ERS194036,ERS194041,0.698617


After calculating Bray-Curtis dissimilarity, I added the coordinates for each sample:

In [16]:
# Site coordinate for both members of every pair. drop_duplicates keeps the
# first row per sample, the same row the previous per-sample filter selected
# with .iloc[0]. Building the lookup once and mapping whole columns avoids
# rescanning alldata twice per pair and avoids writing strings one cell at a
# time into new float (NaN) columns, which pandas 2.x warns about.
site_by_sample = alldata.drop_duplicates('sample').set_index('sample')['coord']

bc_df['coord1'] = bc_df['sample1'].map(site_by_sample)
bc_df['coord2'] = bc_df['sample2'].map(site_by_sample)

bc_df.head()

  bc_df.loc[i,'coord1'] = subdf.iloc[0]['coord']
  bc_df.loc[i,'coord2'] = subdf.iloc[0]['coord']


Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2
0,ERS194036,ERS194037,0.383489,"(47.4532, 0.5949)","(47.4532, 0.5949)"
1,ERS194036,ERS194038,0.504865,"(47.4532, 0.5949)","(47.4532, 0.5949)"
2,ERS194036,ERS194039,0.642923,"(47.4532, 0.5949)","(47.4532, 0.5949)"
3,ERS194036,ERS194040,0.643051,"(47.4532, 0.5949)","(47.4532, 0.5949)"
4,ERS194036,ERS194041,0.698617,"(47.4532, 0.5949)","(47.4532, 0.5949)"


In [17]:
# Translate each coordinate string into its 'Town, Country' label.
# Whole-column assignment replaces the per-row .loc writes (which pandas 2.x
# warns about when strings land in a fresh float column). Direct indexing of
# town_dict2 is kept so an unknown coordinate still raises KeyError instead
# of silently becoming NaN.
bc_df['loc1'] = [town_dict2[site] for site in bc_df['coord1']]
bc_df['loc2'] = [town_dict2[site] for site in bc_df['coord2']]

bc_df.head()

  bc_df.loc[i,'loc1'] = town_dict2[coord]
  bc_df.loc[i,'loc2'] = town_dict2[coord]


Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2,loc1,loc2
0,ERS194036,ERS194037,0.383489,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
1,ERS194036,ERS194038,0.504865,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
2,ERS194036,ERS194039,0.642923,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
3,ERS194036,ERS194040,0.643051,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
4,ERS194036,ERS194041,0.698617,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"


In [18]:
# Index a copy of the distance table by the concatenated site pair
# (loc1 immediately followed by loc2, no separator).
transit = (
    transit_df
    .assign(loc12=transit_df['loc1'] + transit_df['loc2'])
    .set_index('loc12')
)

# Build the same kind of key for every sample pair: the two site labels in
# sorted order, joined with no separator, so (A, B) and (B, A) share a key.
bc_locs = [
    ''.join(np.sort(site_pair))
    for site_pair in zip(bc_df['loc1'], bc_df['loc2'])
]

bc_df['loc12'] = bc_locs

bc_df.head()

Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2,loc1,loc2,loc12
0,ERS194036,ERS194037,0.383489,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
1,ERS194036,ERS194038,0.504865,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
2,ERS194036,ERS194039,0.642923,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
3,ERS194036,ERS194040,0.643051,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
4,ERS194036,ERS194041,0.698617,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"


I next calculated the average Bray-Curtis distance for each sample site (e.g. the mean Bray-Curtis dissimilarity of all pairwise combinations of samples from Angers and Espelette). I also calculated the mean within-site Bray-Curtis distance (e.g. the mean Bray-Curtis dissimilarity of all pairwise combinations of samples collected in Angers). I added these ecological distance measures to the dataframe containing geographic and public transit distances:

In [19]:
# Mean and population standard deviation (ddof=0) of the Bray-Curtis values
# for every site-pair key, visited in sorted key order. Keys not yet present
# in transit's index (the within-site pairs) are appended as new rows.
for pair_key, pair_bc in bc_df.groupby('loc12')['braycurtis']:
    transit.loc[pair_key, 'avg braycurtis'] = pair_bc.mean()
    transit.loc[pair_key, 'std braycurtis'] = pair_bc.std(ddof=0)

transit.head()

Unnamed: 0_level_0,loc1,loc2,dist (km),adj dist (au),avg braycurtis,std braycurtis
loc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Angers, FranceEspelette, France","Angers, France","Espelette, France",460.751877,1246.77154,0.718224,0.158016
"Cologne-Bonn, GermanyLangenbrand, Germany","Cologne-Bonn, Germany","Langenbrand, Germany",270.996044,4094.58763,0.662346,0.162988
"Angers, FranceLouan-Villegruis, France","Angers, France","Louan-Villegruis, France",251.659635,1139.709862,0.712397,0.154335
"Cologne-Bonn, GermanySévérac-le-Château, France","Cologne-Bonn, Germany","Sévérac-le-Château, France",799.990157,25814.522367,0.637353,0.158508
"Langenbrand, GermanyLouan-Villegruis, France","Langenbrand, Germany","Louan-Villegruis, France",378.26948,1652.25272,0.676966,0.165441


Because within-site distances were not part of the original distance table (they would all be zero), I set the geographic and transit distances for those rows to zero in the updated dataframe:

In [20]:
for pair_key in transit.index:
    # Rows that came from the distance table already have a string label.
    if isinstance(transit.loc[pair_key, 'loc1'], str):
        continue

    # The remaining rows have no loc1 yet. Their key is the same label twice
    # ('Town, CountryTown, Country'), so the first and last ', '-separated
    # pieces are the town and the country.
    key_parts = pair_key.split(', ')
    site_label = ', '.join([key_parts[0], key_parts[-1]])

    transit.loc[pair_key, 'loc1'] = site_label
    transit.loc[pair_key, 'loc2'] = site_label

    # A site is at zero distance from itself under both metrics.
    transit.loc[pair_key, 'dist (km)'] = 0
    transit.loc[pair_key, 'adj dist (au)'] = 0

transit.tail()

Unnamed: 0_level_0,loc1,loc2,dist (km),adj dist (au),avg braycurtis,std braycurtis
loc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Espelette, FranceEspelette, France","Espelette, France","Espelette, France",0.0,0.0,0.699556,0.174781
"Langenbrand, GermanyLangenbrand, Germany","Langenbrand, Germany","Langenbrand, Germany",0.0,0.0,0.61548,0.167082
"Louan-Villegruis, FranceLouan-Villegruis, France","Louan-Villegruis, France","Louan-Villegruis, France",0.0,0.0,0.681945,0.203802
"Nancy, FranceNancy, France","Nancy, France","Nancy, France",0.0,0.0,0.726954,0.184673
"Sévérac-le-Château, FranceSévérac-le-Château, France","Sévérac-le-Château, France","Sévérac-le-Château, France",0.0,0.0,0.613801,0.16625


Next, I plotted the Bray-Curtis distance against geographic and public transit adjusted distances:

In [38]:
# Styling shared by both distance panels.
bc_scatter_opts = dict(height=400, width=400, size=5, alpha=0.5, tools=['hover'])

# Mean Bray-Curtis dissimilarity per site pair against geographic distance,
# overlaid with error bars of one standard deviation.
dist_plot = hv.Scatter(
    transit,
    kdims=['dist (km)'],
    vdims=['avg braycurtis', 'loc1', 'loc2'],
).opts(xlim=(-90, 1300), color='blue', **bc_scatter_opts)

dist_error = hv.ErrorBars(list(zip(transit['dist (km)'],
                                   transit['avg braycurtis'],
                                   transit['std braycurtis'])))

dist_plot = dist_plot * dist_error

# Same plot against the public-transit-adjusted distance.
adj_plot = hv.Scatter(
    transit,
    kdims=['adj dist (au)'],
    vdims=['avg braycurtis', 'loc1', 'loc2'],
).opts(xlim=(-1000, 30000), color='tomato', **bc_scatter_opts)

adj_error = hv.ErrorBars(list(zip(transit['adj dist (au)'],
                                  transit['avg braycurtis'],
                                  transit['std braycurtis'])))

adj_plot = adj_plot * adj_error

# Display both panels side by side. Previously only dist_plot was the cell's
# final expression, so adj_plot was built but never rendered even though the
# surrounding text discusses both distance metrics.
dist_plot + adj_plot

I have not yet performed regression analysis on these data, but by eye there appears to be no correlation between Bray-Curtis dissimilarity and either of the two distance metrics. Even within the same site there is a huge diversity of distances between sample pairs, and this diversity is roughly the same regardless of how far apart two sites are. 


## PCA: Phylum only

I repeated my analyses above, but in this case only examined the top four most abundant Phyla. 

In [39]:
# Total reads per (Phylum, sample), then reshape to a sample x Phylum table.
phylum_reads = (
    alldata[['Phylum', 'sample', 'reads']]
    .groupby(['Phylum', 'sample'])['reads']
    .sum()
    .reset_index()
)

pca_df = phylum_reads.pivot(index='sample', columns='Phylum', values='reads')

pca_df.head()

Phylum,p__Bacteroidetes,p__Deferribacteres,p__Firmicutes,p__Proteobacteria
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ERS194036,0.609052,0.001926,0.384208,0.004815
ERS194037,0.718616,0.0,0.234538,0.046846
ERS194038,0.37234,0.000626,0.389549,0.237484
ERS194039,0.859312,0.003036,0.131579,0.006073
ERS194040,0.850349,0.00574,0.124231,0.01968


In [35]:
# Project every sample onto the first three principal components of the
# sample x Phylum read table.
pca_taxon = PCA(n_components=3)
pc_analysis = pca_taxon.fit_transform(pca_df)

# fit_transform keeps the row order of pca_df, so its index labels the rows.
pc_df = pd.DataFrame(pc_analysis, columns=['PC1', 'PC2', 'PC3'])
pc_df.insert(0, 'sample', pca_df.index)

# Attach each sample's site coordinate. drop_duplicates keeps the first row
# per sample, i.e. the same row the previous per-sample filter picked with
# .iloc[0]. One lookup plus a whole-column assignment replaces the row-by-row
# .loc writes, which rescanned alldata for every sample and wrote strings
# into a column that starts out as float NaN (pandas 2.x warns about that).
site_by_sample = alldata.drop_duplicates('sample').set_index('sample')['coord']
pc_df['coord'] = pc_df['sample'].map(site_by_sample)

pc_df.head()

  pc_df.loc[i,'coord'] = subdf.iloc[0]['coord']


Unnamed: 0,sample,PC1,PC2,PC3,coord
0,ERS194036,-0.145801,0.216901,-0.014926,"(47.4532, 0.5949)"
1,ERS194037,-0.248232,0.05665,-0.013127,"(47.4532, 0.5949)"
2,ERS194038,0.171264,0.119846,-0.030668,"(47.4532, 0.5949)"
3,ERS194039,-0.408608,-0.022347,-0.002799,"(47.4532, 0.5949)"
4,ERS194040,-0.395754,-0.034806,-0.000277,"(47.4532, 0.5949)"


Once again, no clear clusters emerged and no patterns were observable based on sample site:

In [36]:
# Styling common to both PCA panels; points are coloured by site coordinate.
pca_scatter_opts = dict(
    height=500,
    size=5,
    legend_position='right',
    color='coord',
    cmap='Category10',
    fontscale=1.2,
    tools=['hover'],
)

# PC1 vs PC2, legend hidden.
alltax12 = hv.Scatter(
    pc_df,
    kdims=['PC1'],
    vdims=['PC2', 'sample', 'coord'],
).opts(width=600, show_legend=False, **pca_scatter_opts)

# PC1 vs PC3, wider so the legend fits on the right.
alltax13 = hv.Scatter(
    pc_df,
    kdims=['PC1'],
    vdims=['PC3', 'sample', 'coord'],
).opts(width=750, **pca_scatter_opts)

alltax12 + alltax13

I then repeated the Bray-Curtis analysis as above:

In [40]:
# Bray-Curtis dissimilarity between the two Phylum-level read profiles of
# each sample pair (pairs are reused from the sample_pairs list built earlier).
bc_ls = [
    (first, second, braycurtis(pca_df.loc[first], pca_df.loc[second]))
    for first, second in sample_pairs
]

bc_df = pd.DataFrame(bc_ls, columns=['sample1', 'sample2', 'braycurtis'])

bc_df.head()

Unnamed: 0,sample1,sample2,braycurtis
0,ERS194036,ERS194037,0.151596
1,ERS194036,ERS194038,0.238011
2,ERS194036,ERS194039,0.252629
3,ERS194036,ERS194040,0.259977
4,ERS194036,ERS194041,0.295459


In [41]:
# Site coordinate for both members of every pair. drop_duplicates keeps the
# first row per sample, the same row the previous per-sample filter selected
# with .iloc[0]. Building the lookup once and mapping whole columns avoids
# rescanning alldata twice per pair and avoids writing strings one cell at a
# time into new float (NaN) columns, which pandas 2.x warns about.
site_by_sample = alldata.drop_duplicates('sample').set_index('sample')['coord']

bc_df['coord1'] = bc_df['sample1'].map(site_by_sample)
bc_df['coord2'] = bc_df['sample2'].map(site_by_sample)

bc_df.head()

  bc_df.loc[i,'coord1'] = subdf.iloc[0]['coord']
  bc_df.loc[i,'coord2'] = subdf.iloc[0]['coord']


Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2
0,ERS194036,ERS194037,0.151596,"(47.4532, 0.5949)","(47.4532, 0.5949)"
1,ERS194036,ERS194038,0.238011,"(47.4532, 0.5949)","(47.4532, 0.5949)"
2,ERS194036,ERS194039,0.252629,"(47.4532, 0.5949)","(47.4532, 0.5949)"
3,ERS194036,ERS194040,0.259977,"(47.4532, 0.5949)","(47.4532, 0.5949)"
4,ERS194036,ERS194041,0.295459,"(47.4532, 0.5949)","(47.4532, 0.5949)"


In [42]:
# Translate each coordinate string into its 'Town, Country' label.
# Whole-column assignment replaces the per-row .loc writes (which pandas 2.x
# warns about when strings land in a fresh float column). Direct indexing of
# town_dict2 is kept so an unknown coordinate still raises KeyError instead
# of silently becoming NaN.
bc_df['loc1'] = [town_dict2[site] for site in bc_df['coord1']]
bc_df['loc2'] = [town_dict2[site] for site in bc_df['coord2']]

bc_df.head()

  bc_df.loc[i,'loc1'] = town_dict2[coord]
  bc_df.loc[i,'loc2'] = town_dict2[coord]


Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2,loc1,loc2
0,ERS194036,ERS194037,0.151596,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
1,ERS194036,ERS194038,0.238011,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
2,ERS194036,ERS194039,0.252629,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
3,ERS194036,ERS194040,0.259977,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"
4,ERS194036,ERS194041,0.295459,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France"


In [43]:
# Index a fresh copy of the distance table by the concatenated site pair
# (loc1 immediately followed by loc2, no separator).
transit = (
    transit_df
    .assign(loc12=transit_df['loc1'] + transit_df['loc2'])
    .set_index('loc12')
)

# Build the same kind of key for every sample pair: the two site labels in
# sorted order, joined with no separator, so (A, B) and (B, A) share a key.
bc_locs = [
    ''.join(np.sort(site_pair))
    for site_pair in zip(bc_df['loc1'], bc_df['loc2'])
]

bc_df['loc12'] = bc_locs

bc_df.head()

Unnamed: 0,sample1,sample2,braycurtis,coord1,coord2,loc1,loc2,loc12
0,ERS194036,ERS194037,0.151596,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
1,ERS194036,ERS194038,0.238011,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
2,ERS194036,ERS194039,0.252629,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
3,ERS194036,ERS194040,0.259977,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"
4,ERS194036,ERS194041,0.295459,"(47.4532, 0.5949)","(47.4532, 0.5949)","Angers, France","Angers, France","Angers, FranceAngers, France"


In [44]:
# Mean and population standard deviation (ddof=0) of the Bray-Curtis values
# for every site-pair key, visited in sorted key order. Keys not yet present
# in transit's index (the within-site pairs) are appended as new rows.
for pair_key, pair_bc in bc_df.groupby('loc12')['braycurtis']:
    transit.loc[pair_key, 'avg braycurtis'] = pair_bc.mean()
    transit.loc[pair_key, 'std braycurtis'] = pair_bc.std(ddof=0)

transit.head()

Unnamed: 0_level_0,loc1,loc2,dist (km),adj dist (au),avg braycurtis,std braycurtis
loc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Angers, FranceEspelette, France","Angers, France","Espelette, France",460.751877,1246.77154,0.42519,0.213469
"Cologne-Bonn, GermanyLangenbrand, Germany","Cologne-Bonn, Germany","Langenbrand, Germany",270.996044,4094.58763,0.397672,0.21293
"Angers, FranceLouan-Villegruis, France","Angers, France","Louan-Villegruis, France",251.659635,1139.709862,0.428388,0.230024
"Cologne-Bonn, GermanySévérac-le-Château, France","Cologne-Bonn, Germany","Sévérac-le-Château, France",799.990157,25814.522367,0.393293,0.200863
"Langenbrand, GermanyLouan-Villegruis, France","Langenbrand, Germany","Louan-Villegruis, France",378.26948,1652.25272,0.376299,0.212464


In [45]:
for pair_key in transit.index:
    # Rows that came from the distance table already have a string label.
    if isinstance(transit.loc[pair_key, 'loc1'], str):
        continue

    # The remaining rows have no loc1 yet. Their key is the same label twice
    # ('Town, CountryTown, Country'), so the first and last ', '-separated
    # pieces are the town and the country.
    key_parts = pair_key.split(', ')
    site_label = ', '.join([key_parts[0], key_parts[-1]])

    transit.loc[pair_key, 'loc1'] = site_label
    transit.loc[pair_key, 'loc2'] = site_label

    # A site is at zero distance from itself under both metrics.
    transit.loc[pair_key, 'dist (km)'] = 0
    transit.loc[pair_key, 'adj dist (au)'] = 0

transit.tail()

Unnamed: 0_level_0,loc1,loc2,dist (km),adj dist (au),avg braycurtis,std braycurtis
loc12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"Espelette, FranceEspelette, France","Espelette, France","Espelette, France",0.0,0.0,0.448535,0.205724
"Langenbrand, GermanyLangenbrand, Germany","Langenbrand, Germany","Langenbrand, Germany",0.0,0.0,0.300602,0.145839
"Louan-Villegruis, FranceLouan-Villegruis, France","Louan-Villegruis, France","Louan-Villegruis, France",0.0,0.0,0.41992,0.251573
"Nancy, FranceNancy, France","Nancy, France","Nancy, France",0.0,0.0,0.47377,0.279929
"Sévérac-le-Château, FranceSévérac-le-Château, France","Sévérac-le-Château, France","Sévérac-le-Château, France",0.0,0.0,0.372103,0.183137


Examining on the Phylum level only produced essentially identical plots to the full taxonomic analysis, indicating that at least with this analysis there is no meaningful correlation between microbiome similarity and geographic proximity:

In [48]:
# Styling shared by both distance panels.
bc_scatter_opts = dict(height=400, width=400, size=5, alpha=0.5, tools=['hover'])

# Mean Phylum-level Bray-Curtis dissimilarity per site pair against
# geographic distance, overlaid with error bars of one standard deviation.
dist_plot = hv.Scatter(
    transit,
    kdims=['dist (km)'],
    vdims=['avg braycurtis', 'loc1', 'loc2'],
).opts(xlim=(-90, 1300), color='blue', **bc_scatter_opts)

dist_error = hv.ErrorBars(list(zip(transit['dist (km)'],
                                   transit['avg braycurtis'],
                                   transit['std braycurtis'])))

dist_plot = dist_plot * dist_error

# Same plot against the public-transit-adjusted distance.
adj_plot = hv.Scatter(
    transit,
    kdims=['adj dist (au)'],
    vdims=['avg braycurtis', 'loc1', 'loc2'],
).opts(xlim=(-1000, 30000), color='tomato', **bc_scatter_opts)

adj_error = hv.ErrorBars(list(zip(transit['adj dist (au)'],
                                  transit['avg braycurtis'],
                                  transit['std braycurtis'])))

adj_plot = adj_plot * adj_error

# Display both panels side by side. Previously only dist_plot was the cell's
# final expression, so adj_plot was built but never rendered.
dist_plot + adj_plot

## PCA of Bray-Curtis distances

In [49]:
# Square sample x sample matrix of Bray-Curtis values from bc_df.
# Self-pairs were never computed, so the diagonal is missing after the pivot
# and is filled with 0.
pcbc = bc_df.pivot(
    index='sample1',
    columns='sample2',
    values='braycurtis',
).fillna(0)

pcbc.head()

sample2,ERS194036,ERS194037,ERS194038,ERS194039,ERS194040,ERS194041,ERS194042,ERS194043,ERS194044,ERS194045,...,ERS194227,ERS194228,ERS194229,ERS194230,ERS194231,ERS194232,ERS194233,ERS194234,ERS194235,ERS194236
sample1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERS194036,0.0,0.151596,0.238011,0.252629,0.259977,0.295459,0.375217,0.142013,0.159647,0.278451,...,0.087393,0.065326,0.062066,0.311246,0.22578,0.289784,0.143828,0.317623,0.269999,0.526837
ERS194037,0.151596,0.0,0.346276,0.143732,0.137473,0.445608,0.526813,0.041672,0.008894,0.126855,...,0.223701,0.090627,0.089531,0.269215,0.074184,0.141364,0.043705,0.175562,0.325729,0.485281
ERS194038,0.238011,0.346276,0.0,0.489382,0.483122,0.289971,0.371176,0.378365,0.353731,0.387105,...,0.217381,0.268567,0.261056,0.266174,0.371388,0.343399,0.380646,0.521212,0.057876,0.294167
ERS194039,0.252629,0.143732,0.489382,0.0,0.016311,0.545719,0.627834,0.112541,0.135651,0.126799,...,0.337654,0.224062,0.228741,0.309988,0.11862,0.181839,0.111667,0.066105,0.469255,0.537741
ERS194040,0.259977,0.137473,0.483122,0.016311,0.0,0.550175,0.635182,0.119889,0.129391,0.113191,...,0.330907,0.215099,0.222481,0.296381,0.11236,0.168231,0.119015,0.061461,0.462995,0.531481


In [50]:
# Project every sample onto the first three principal components of the
# sample x sample Bray-Curtis matrix.
pca_braycurtis = PCA(n_components=3)
pc_analysis = pca_braycurtis.fit_transform(pcbc)

# fit_transform keeps the row order of pcbc, so its index labels the rows.
pc_df = pd.DataFrame(pc_analysis, columns=['PC1', 'PC2', 'PC3'])
pc_df.insert(0, 'sample', pcbc.index)

# Attach each sample's site coordinate. drop_duplicates keeps the first row
# per sample, i.e. the same row the previous per-sample filter picked with
# .iloc[0]. One lookup plus a whole-column assignment replaces the row-by-row
# .loc writes, which rescanned alldata for every sample and wrote strings
# into a column that starts out as float NaN (pandas 2.x warns about that).
site_by_sample = alldata.drop_duplicates('sample').set_index('sample')['coord']
pc_df['coord'] = pc_df['sample'].map(site_by_sample)

pc_df.head()

  pc_df.loc[i,'coord'] = subdf.iloc[0]['coord']


Unnamed: 0,sample,PC1,PC2,PC3,coord
0,ERS194036,-1.373617,1.388919,-0.262999,"(47.4532, 0.5949)"
1,ERS194037,-2.253196,0.37095,-0.187541,"(47.4532, 0.5949)"
2,ERS194038,0.680856,1.031166,-1.443229,"(47.4532, 0.5949)"
3,ERS194039,-2.472524,-0.376237,0.926693,"(47.4532, 0.5949)"
4,ERS194040,-2.490052,-0.408418,0.823816,"(47.4532, 0.5949)"


In [51]:
# Styling common to both panels; points are coloured by site coordinate.
bc_pca_opts = dict(
    height=500,
    size=5,
    alpha=0.5,
    legend_position='right',
    color='coord',
    cmap='Category10',
    tools=['hover'],
)

# PC1 vs PC2, legend hidden.
pc12 = hv.Scatter(
    pc_df,
    kdims=['PC1'],
    vdims=['PC2', 'sample', 'coord'],
).opts(width=550, show_legend=False, **bc_pca_opts)

# PC1 vs PC3, wider so the legend fits on the right.
pc13 = hv.Scatter(
    pc_df,
    kdims=['PC1'],
    vdims=['PC3', 'sample', 'coord'],
).opts(width=700, **bc_pca_opts)

pc12 + pc13

In [29]:
# Record the Python/IPython versions and the versions of the imported
# packages for reproducibility (needs the `watermark` IPython extension).
%load_ext watermark
%watermark -v --iversions

Python implementation: CPython
Python version       : 3.11.5
IPython version      : 8.15.0

bokeh    : 3.2.1
pandas   : 2.1.1
numpy    : 1.24.3
holoviews: 1.18.0

