In [2]:
import pandas as pd
import numpy as np
import glob
import os
import matplotlib.pyplot as plt
import re

from bokeh.io import show, output_file
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure
from bokeh.core.properties import value
from bokeh.palettes import Category20

from bokeh.io import export_png
import itertools  

import warnings
# Suppress every Python warning for the rest of the session
warnings.filterwarnings('ignore')

# Matplotlib defaults: wide figures with small tick labels
plt.rcParams.update({
    "figure.figsize": (20, 8),
    "xtick.labelsize": 5,
    "ytick.labelsize": 5,
})

# Display DataFrames without truncating rows or columns
pd.options.display.max_rows = None
pd.options.display.max_columns = None

## Transforming MetaPhlAn relative abundance data for plotting with Bokeh

In [5]:
!ls data

pool1_profile_known.txt  sample_df.csv		      species_taxa.csv
pool2_profile_known.txt  species_known_abundance.csv


In [25]:
def clean_merged_table(df,merged_file_name):
    '''Remove the file-name suffix that merging leaves on the sample column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged abundance table. Its column labels are rewritten in place.
    merged_file_name : str
        Text to strip from every column label, e.g. '_known_profiled_metagenome'.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``merged_file_name`` removed from its column labels.
    '''
    # regex=False: the suffix is a literal file-name fragment, so characters such as '.'
    # must not be read as regex metacharacters (pandas versions differ in their default).
    df.columns = df.columns.str.replace(merged_file_name, '', regex=False)
    return df

def get_taxa_columns(df,rank):
    '''Split the pipe-delimited ``ID`` column into one column per taxonomic rank.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an ``ID`` column holding lineage strings such as
        ``k__Bacteria|p__...|s__...``. Rows whose ID is exactly 'Other' are treated
        as the aggregate row.
    rank : str
        Deepest rank present in the IDs ('Kingdom' ... 'Strain'); it decides how many
        rank columns are named.

    Returns
    -------
    pandas.DataFrame
        One column per rank (prefixes such as ``s__`` removed via trim_taxa_names)
        plus an ``Otu`` column with sequential keys ``Otu0``, ``Otu1``, ...
    '''
    all_ranks = ["Kingdom","Phylum","Class","Order","Family","Genus","Species","Strain"]
    rank_depth = {name: depth for depth, name in enumerate(all_ranks, start=1)}
    # .get() returns None for an unrecognised rank, which selects all eight rank names.
    taxa_cols = all_ranks[0:rank_depth.get(rank)]

    df_taxa = df['ID'].str.split('|',expand=True)
    df_taxa.columns = taxa_cols
    for col in taxa_cols:
        df_taxa[col] = df_taxa[col].apply(trim_taxa_names)

    df_taxa['Otu'] = ["Otu"+str(i) for i in range(len(df))]

    # Only the aggregate row is relabelled. Blindly overwriting the last row would
    # rename a real species whenever no 'Other' row is present, and would fail on an
    # empty table.
    is_other = (df['ID'] == 'Other').values
    df_taxa.loc[is_other, taxa_cols] = 'Other'
    return df_taxa

def trim_taxa_names(x):
    '''Strip the leading rank prefix (e.g. ``s__``) from a single taxon label.

    The value is converted with ``str()`` first, so a missing value such as ``None``
    comes back as the string 'None'. Labels without a recognised prefix are returned
    unchanged.
    '''
    # 't' covers the strain-level prefix (t__), matching the "Strain" rank that
    # get_taxa_columns can produce; the other letters are kingdom ... species.
    return re.sub(r'^[kpcofgst]__', "", str(x))

def get_sample_cols(df):
    '''Return, in column order, the names in ``df`` that look like sample identifiers.

    A name qualifies when it contains at least one digit; names without a digit
    (for example 'ID' or 'Species') are skipped.
    '''
    has_digit = re.compile(r'^.*[0-9].*$')
    return [name for name in df if has_digit.match(name)]

def create_sample_df(abun_matrix):
    '''Build a per-sample metadata table from an abundance matrix.

    The result has one row per sample column found by get_sample_cols, with the
    columns 'Sample', 'Behaviour' (get_behaviour applied to each sample name) and a
    constant 'Type'.

    NOTE(review): this function is defined a second time at the end of this cell with
    the same logic; that later definition is the one in effect. One of the two
    should be removed.
    NOTE(review): get_behaviour is not defined in the visible part of this notebook -
    confirm it exists before calling.
    '''
    sample_cols=get_sample_cols(abun_matrix)
    sample_df=pd.DataFrame({'Sample':sample_cols})
    sample_df['Behaviour'] = sample_df['Sample'].apply(get_behaviour)
    # Add extra column to sample df so phyloseq ordination plots behave
    sample_df['Type'] = 'murine'
    return sample_df

def add_otu_primary_key(df):
    '''Attach an ``Otu`` column holding sequential keys ``Otu0``, ``Otu1``, ...

    The column is written onto ``df`` itself, and that same object is returned.
    '''
    df['Otu'] = ["Otu{}".format(position) for position in range(len(df))]
    return df
    
def create_other_group(df,thresh):
    '''Collapse low-abundance species into a single "Other" row.

    For every sample, values at or below ``thresh`` are set to 0 and their total is
    reported in an extra row whose ID is 'Other'. This shows how much abundance was
    left out of the plot legend. Rows with no value above 0 in any sample are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Abundance table with an ``ID`` column and numeric value columns. It is not
        modified, so the calling cell can be re-run safely.
    thresh : number
        Inclusive cut-off: a value ``<= thresh`` counts as low abundance.

    Returns
    -------
    pandas.DataFrame
        The sample columns (as found by get_sample_cols) followed by ``ID``, with a
        fresh 0..n-1 index and the 'Other' row last when it is non-zero.
    '''
    ids = df['ID'].to_list()
    # Never fold an already-aggregated row into the new total.
    is_species = df['ID'] != 'Other'

    # Per-column total of everything at or below the threshold.
    other_row = {'ID': 'Other'}
    for col in df.columns:
        if col != 'ID':
            below = (df[col] <= thresh) & is_species
            other_row[col] = df.loc[below, col].sum()

    sample_cols = get_sample_cols(df)
    # mask() leaves missing values untouched because NaN <= thresh is False.
    kept = df[sample_cols].mask(df[sample_cols] <= thresh, 0)
    kept['ID'] = ids

    # The 'Other' totals are appended after thresholding so that a small remainder
    # (sum <= thresh) is still reported instead of being zeroed out.
    other = pd.DataFrame([other_row], columns=sample_cols + ['ID'])
    combined = pd.concat([kept, other], ignore_index=True)

    has_abundance = (combined[sample_cols] > 0).any(axis=1)
    return combined[has_abundance].reset_index(drop=True)

def create_sample_df(abun_matrix):
    '''Build the sample metadata table for an abundance matrix.

    The result has one row per sample column (see get_sample_cols) with the columns
    'Sample', 'Behaviour' (get_behaviour applied to the sample name) and 'Type'.

    NOTE(review): a definition with the same name appears earlier in this cell; this
    later one is the definition in effect.
    NOTE(review): get_behaviour is not defined in the visible part of this notebook -
    confirm it exists before calling.
    '''
    return pd.DataFrame({'Sample': get_sample_cols(abun_matrix)}).assign(
        Behaviour=lambda samples: samples['Sample'].apply(get_behaviour),
        # Constant extra column so phyloseq ordination plots behave
        Type='murine',
    )

In [455]:
def plot_stacked_chart_by_taxa_rank(df,taxa_col,name_of_plot):
    '''Draw a stacked bar chart of relative abundance per sample with Bokeh.

    The chart is written to ``<name_of_plot>.html``, exported to
    ``<name_of_plot>.svg`` and then opened with ``show``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per taxon. ``taxa_col`` holds the taxon label and every other column
        is plotted as a sample. The frame is not modified.
    taxa_col : str
        Name of the column containing the taxon labels (used for the legend).
    name_of_plot : str
        Plot title; also the base name of the HTML and SVG output files.
    '''
    # export_svgs is not among the notebook's top-level imports, so import it here.
    from bokeh.io import export_svgs

    output_file(name_of_plot + ".html")

    # Custom 30-colour palette; it is cycled when there are more taxa than colours.
    hex_colours=['#cad315','#e4e978','#f2f4c0','#fcba03','#142850','#27496d','#00909e','#dae1e7','#f6f578','#f6d743','#649d66',
     '#06623b','#ffe6e6','#ffabe1','#a685e2','#6155a6','#ff4646','#ff8585','#ffb396','#fff5c0','#d0e8f2','#79a3b1',
     '#456268','#c7956d','#965d62','#fc8621','#d1c145','#d08752','#adce74','#93abd3']

    # set_index returns a new frame: the caller's DataFrame keeps its taxa column, so
    # the plotting cell can be re-run without rebuilding the matrix.
    abundance = df.set_index(taxa_col)

    samples = abundance.columns.values
    organisms = abundance.index.values

    colors = list(itertools.islice(itertools.cycle(hex_colours), len(organisms)))

    # One list of per-sample values for each organism, keyed by organism name.
    data = {'samples': list(samples)}
    for organism in organisms:
        data[organism] = list(abundance.loc[organism])
    source = ColumnDataSource(data=data)

    # NOTE(review): plot_height/plot_width and legend= are the older Bokeh spellings;
    # newer releases use height/width and legend_label - confirm against the
    # installed Bokeh version before changing.
    p = figure(x_range=samples, plot_height=800, plot_width=1100,  title="{}".format(name_of_plot),
           toolbar_location=None, tools="")

    p.vbar_stack(organisms, x='samples', width=0.9, source=source,legend=[value(x) for x in organisms], color=colors)
    p.xaxis.axis_label = 'Sample'
    p.yaxis.axis_label = 'Abundance %'
    p.legend.location = "top_right"
    p.legend.orientation = "vertical"
    p.xaxis.major_label_orientation = "vertical"
    p.title.text_font_size = '12pt'

    # Position the legend outside the plot area
    new_legend = p.legend[0]
    p.add_layout(new_legend, 'right')

    p.output_backend = "svg"
    export_svgs(p, filename=name_of_plot+".svg",height=800, width=1100)

    show(p)

In [456]:
# Load the two tab-separated MetaPhlAn profile tables, one per sequencing pool
pool_paths = ("data/pool1_profile_known.txt", "data/pool2_profile_known.txt")
df_k_1, df_k_2 = [pd.read_table(path, sep='\t') for path in pool_paths]

In [457]:
# Keep only species-level rows: IDs whose last pipe-delimited field starts with s__
species_pattern = r'\|s__[^|]*$'
df_k_1_species = df_k_1[df_k_1['ID'].str.contains(species_pattern)].reset_index(drop=True)
df_k_2_species = df_k_2[df_k_2['ID'].str.contains(species_pattern)].reset_index(drop=True)

In [458]:
# Outer join on ID so a species found in only one pool is still kept
df_known_species = df_k_1_species.merge(df_k_2_species, on=['ID'], how='outer')

In [459]:
# Dimensions of the merged table: (species rows, ID column + sample columns)
df_known_species.shape

(66, 38)

In [460]:
# Strip the '_known_profiled_metagenome' suffix from the sample column names.
# clean_merged_table returns the object it was given, so df_known and
# df_known_species refer to the same DataFrame from here on.
df_known=clean_merged_table(df_known_species,'_known_profiled_metagenome')
# The outer merge leaves NaN where a species is absent from one pool; treat that as 0 abundance
df_known.fillna(0,inplace=True)

In [461]:
# Preview the first rows of the cleaned, NaN-filled abundance table
df_known.head()

Unnamed: 0,ID,E15_S89,E19b_S88,E28_S87,E5b_S84,E7_S78,E8_S74,R21b_S76,R24_S82,R5_S75,R6_S81,S11b_S73,S11_S83,S13_S79,S22b_S90,S22_S77,S3b_S86,S4_S85,S9_S80,E12_S66,E1_S65,E23_S55,E25_S63,E30_S56,E3_S64,E9b_S70,R10b_S57,R10_S69,R14_S71,R21_S60,R26_S67,R29_S61,R2_S58,R6b_S54,R7b_S68,S20b_S59,S27_S72,S2b_S62
0,k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_pseudolongum,0.0,33.99356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.50706,0.0,11.21542,0.0,0.0,0.0,0.0,1.74936,46.93049,0.0,0.71209,19.9657,11.61847,0.0,0.0,0.0,70.28168,0.0,0.0,0.0,0.0,31.72024,0.0,10.6155
1,k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Eggerthellales|f__Eggerthellaceae|g__Adlercreutzia|s__Adlercreutzia_equolifaciens,0.01522,0.02269,0.00986,0.01337,0.02927,0.04169,0.0129,0.0,0.02503,0.05092,0.01569,0.03171,0.04299,0.01102,0.02144,0.01528,0.01784,0.03676,0.05423,0.03506,0.01153,0.01412,0.03051,0.0,0.01825,0.04533,0.01906,0.04023,0.04791,0.01381,0.04743,0.04462,0.00712,0.02831,0.06023,0.01183,0.02226
2,k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Eggerthellales|f__Eggerthellaceae|g__Asaccharobacter|s__Asaccharobacter_celatus,0.06825,0.02827,0.02306,0.02648,0.02673,0.03658,0.02223,0.00543,0.03252,0.06713,0.02811,0.04405,0.0396,0.01511,0.00854,0.01129,0.01096,0.06209,0.07194,0.0282,0.02227,0.02782,0.04186,0.0,0.01777,0.06119,0.03526,0.05094,0.0669,0.04167,0.03436,0.02893,0.02066,0.02386,0.06373,0.02512,0.0339
3,k__Bacteria|p__Actinobacteria|c__Coriobacteriia|o__Eggerthellales|f__Eggerthellaceae|g__Enterorhabdus|s__Enterorhabdus_caecimuris,1.68923,1.13757,0.90287,1.69362,1.55661,1.75809,0.8582,1.3841,1.53637,2.20989,1.07292,2.22013,1.54649,0.47359,0.81333,0.61152,0.96265,2.02247,2.72883,2.3316,0.94619,0.8633,1.41844,0.92618,0.69634,2.51668,1.84128,2.1304,3.44706,1.22694,1.74819,2.47755,1.68021,1.92587,2.75324,1.03309,2.21778
4,k__Bacteria|p__Bacteroidetes|c__Bacteroidia|o__Bacteroidales|f__Bacteroidaceae|g__Bacteroides|s__Bacteroides_caecimuris,1.47431,0.87626,2.41045,2.6891,2.44399,1.80747,1.19513,1.24464,2.71021,0.71312,0.44067,2.54004,0.36697,0.41677,0.77984,3.20017,4.43957,0.94188,2.52495,1.1351,0.44879,0.78088,0.90226,2.10273,0.36699,0.59477,3.23323,0.56874,0.53838,0.39267,2.09979,1.94689,0.6141,2.41562,1.27577,1.37661,2.30322


In [462]:
# Abundance cut-off in percent: values at or below it (the comparison in
# create_other_group is inclusive) are folded into the 'Other' group.
OTHER_ABUNDANCE_THRESHOLD = 5
abund_matrix = create_other_group(df_known, OTHER_ABUNDANCE_THRESHOLD)

In [463]:
# Number of rows remaining after thresholding
len(abund_matrix)

21

In [464]:
# Split each lineage ID into Kingdom ... Species columns (plus an Otu key)
species_taxa=get_taxa_columns(abund_matrix,'Species')

In [465]:
# Preview the first rows of the taxonomy table
species_taxa.head()

Unnamed: 0,Kingdom,Phylum,Class,Order,Family,Genus,Species,Otu
0,Bacteria,Actinobacteria,Actinobacteria,Bifidobacteriales,Bifidobacteriaceae,Bifidobacterium,Bifidobacterium_pseudolongum,Otu0
1,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides_uniformis,Otu1
2,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Muribaculaceae,Muribaculaceae_unclassified,Muribaculaceae_bacterium_DSM_103720,Otu2
3,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Muribaculaceae,Muribaculum,Muribaculum_intestinale,Otu3
4,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella_sp_MGM1,Otu4


In [466]:
# Species labels, in the same row order as abund_matrix
species = species_taxa['Species'].tolist()

In [467]:
# Drop the full lineage string; the short species label replaces it below.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
abund_matrix = abund_matrix.drop(columns='ID', errors='ignore')

In [468]:
# Label every row with its short species name
abund_matrix = abund_matrix.assign(Species=species)

In [469]:
# Sample columns of the final matrix (column names that contain a digit)
sample_cols=get_sample_cols(abund_matrix)
# Disabled: optional ordering of the rows by their values in the sample columns
#abund_matrix=abund_matrix.sort_values(by=sample_cols)

In [470]:
# Render the stacked bar chart; the title is also used as the base name of the HTML and SVG files
plot_stacked_chart_by_taxa_rank(abund_matrix,'Species','Species relative abundance')