# Initial Analysis
Data source: EBI Metagenomics study MGYS00000516 — https://www.ebi.ac.uk/metagenomics/studies/MGYS00000516#overview

## Is geographic proximity related to microbial content?

In [1]:
import pandas as pd
import numpy as np
from itertools import chain

import holoviews as hv
import bokeh.io

# Select Bokeh as the HoloViews plotting backend.
hv.extension('bokeh')
# Configure Bokeh to render its output inline in the notebook.
bokeh.io.output_notebook()

In [2]:
def ecdf(x):
    """Return the coordinates of an empirical CDF plot for ``x``.

    Parameters
    ----------
    x : array-like
        One-dimensional collection of observations.

    Returns
    -------
    tuple of numpy.ndarray
        ``(values, positions)`` where ``values`` is ``x`` sorted in ascending
        order and ``positions`` is ``len(x)`` evenly spaced numbers from 0 to 1
        inclusive.

    Notes
    -----
    The positions start at 0 and end at 1, so the smallest observation is
    plotted at 0 rather than at ``1 / len(x)`` as in the textbook ECDF.
    """
    sorted_values = np.sort(x)
    positions = np.linspace(0, 1, num=len(x))
    return sorted_values, positions

In [3]:
# Load the tab-separated taxonomy abundance table, remove the row labelled 0
# (the root entry) and use the '#SampleID' column as the row index.
taxonomy = (
    pd.read_csv('data/ERP001970_taxonomy_abundances_v2.0.tsv', sep='\t')
    .drop(index=0)
    .set_index('#SampleID')
)

taxonomy.head()

Unnamed: 0_level_0,ERR197719,ERR197720,ERR197721,ERR197722,ERR197723,ERR197724,ERR197725,ERR197726,ERR197727,ERR197728,...,ERR197910,ERR197911,ERR197912,ERR197913,ERR197914,ERR197915,ERR197916,ERR197917,ERR197918,ERR197919
#SampleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Root;k__Bacteria;p__Acidobacteria;c__Acidobacteria-6;o__iii1-15;f__;g__;s__,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Root;k__Bacteria;p__Acidobacteria;c__Acidobacteria-6;o__iii1-15;f__mb2424;g__;s__,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Root;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__ACK-M1;g__;s__,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Root;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Bogoriellaceae;g__Georgenia;s__,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Root;k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Brevibacteriaceae;g__Brevibacterium;s__,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Split every lineage string on ';' and discard the first element.
# Elements ending in '_' (e.g. 'f__', 'g__', 's__') carry no name and are
# dropped, as are empty elements such as those left by a trailing ';'
# (indexing the last character of an empty element would raise IndexError).
phylo = [
    [rank for rank in lineage.split(';')[1:] if rank and not rank.endswith('_')]
    for lineage in taxonomy.index
]

# Number of named ranks kept for each taxon.
lengths = [len(ranks) for ranks in phylo]

phylo[0]

['k__Bacteria', 'p__Acidobacteria', 'c__Acidobacteria-6', 'o__iii1-15']

In [6]:
# ECDF of the per-taxon rank counts stored in `lengths`.
lx, ly = ecdf(lengths)

rank_points = list(zip(lx, ly))
rank_plot_opts = dict(width=600, height=300, show_legend=False)

hv.Scatter(rank_points).opts(**rank_plot_opts)

In [4]:
# Look at distributions of taxa: for every run (column of `taxonomy`), pair the
# sorted counts with their ECDF positions and tag each pair with the run name.
ecdf_ls = (
    (run, count, position)
    for run in taxonomy.columns
    for count, position in zip(*ecdf(taxonomy[run]))
)

# Long-format table with one row per (run, sorted count).
ecdf_df = pd.DataFrame(ecdf_ls, columns=['Run', 'Counts per taxa', 'ECDF'])

ecdf_df.head()

Unnamed: 0,Run,Counts per taxa,ECDF
0,ERR197719,0,0.0
1,ERR197719,0,0.003425
2,ERR197719,0,0.006849
3,ERR197719,0,0.010274
4,ERR197719,0,0.013699


In [5]:
# ECDF of counts per taxon, coloured by run.
run_ecdf_opts = dict(
    width=600,
    height=300,
    alpha=0.5,
    color='Run',
    cmap='Category20b',
    show_legend=False,
)

hv.Scatter(
    ecdf_df,
    kdims=['Counts per taxa'],
    vdims=['ECDF', 'Run'],
).opts(**run_ecdf_opts)

Most taxa have counts of zero in most samples; very few show counts above zero. Zeros are excluded from the next plot so that the counts can be shown on a log scale.

In [6]:
# Keep only rows with a positive count so the x-axis can be log-scaled.
has_counts = ecdf_df['Counts per taxa'] > 0
nonzero_ecdf_df = ecdf_df[has_counts]

nonzero_ecdf_opts = dict(
    width=600,
    height=300,
    alpha=0.5,
    logx=True,
    color='Run',
    cmap='Category20b',
    show_legend=False,
)

hv.Scatter(
    nonzero_ecdf_df,
    kdims=['Counts per taxa'],
    vdims=['ECDF', 'Run'],
).opts(**nonzero_ecdf_opts)

Sum the counts of each taxon across all samples to see how many taxa are underrepresented everywhere.

In [7]:
# Row-wise totals of `taxonomy` (one value per row), then their ECDF.
taxon_totals = taxonomy.sum(axis=1)
x, y = ecdf(taxon_totals)

totals_plot_opts = dict(width=600, height=300, logx=True, show_legend=False)

hv.Scatter(list(zip(x, y))).opts(**totals_plot_opts)

In [8]:
# Group samples by location
geolocs = pd.read_csv('data/sampleLocs.csv')

# One list of 'run' values per distinct latitude, ordered by ascending latitude.
# A single groupby pass replaces one full-table scan per latitude, and rows
# with a missing latitude are skipped instead of adding empty groups.
# NOTE(review): location is identified by latitude alone; sites sharing a
# latitude would be merged into one group — confirm this is intended.
groups = [list(runs) for _, runs in geolocs.groupby('latitude')['run']]

len(groups)

8