# Analysis of varying trimming thresholds of microbial communities of known composition

Script for the analysis and figure generation for 515F/926R 16S and 18S mock communities

In [144]:
#Import libraries in python3 kernel
import pandas as pd
import seaborn as sns
import glob
import os
import sys
from pathlib import Path
!conda install --yes --prefix {sys.prefix} boto
import boto
import shutil
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from matplotlib.colors import LogNorm
import numpy as np
import skbio
#import fastcluster #this package makes skbio run faster clustermaps but can be tricky with missing values from pairwise comparisons
from functools import reduce
!conda install --yes --prefix {sys.prefix} biopython
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqUtils import GC
from collections import defaultdict
from collections import Counter
import statistics
import itertools as it
from scipy import stats
from matplotlib.ticker import FormatStrFormatter
from qiime2 import Artifact
import tempfile
import zipfile
import yaml
%matplotlib inline

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



## Load functions

__consolidate_tables__ creates a dataframe of all the merged feature tables and parameters

In [147]:
# Special thanks to Alex Manuele https://github.com/alexmanuele
def consolidate_tables(community):
    if community == "16S":
        comm_id, comm = '16S', '02-PROKs'
    if community == "18S":
        comm_id, comm = '18S','02-EUKs'

    table_list = glob.glob('{0}/*/03-DADA2d/table.qza'.format(comm+'/all_trims/'))
    print("Found all tables")

    dataframes = []  
    for table_path in table_list:
        with tempfile.TemporaryDirectory() as tempdir:
            #load table, dump contents to tempdir
            table = Artifact.load(table_path)
            #Make sure the tables are all FeatureFrequency type
            assert str(table.type) == 'FeatureTable[Frequency]', "{0}: Expected FeatureTable[Frequency], got {1}".format(table_path, table.type)
            Artifact.extract(table_path, tempdir)
            #get the provenance form the tempdir and format it for DF
            prov = '{0}/{1}/provenance/'.format(tempdir, table.uuid)
            action = yaml.load(open("{0}action/action.yaml".format(prov), 'r'), Loader=yaml.BaseLoader)
            paramlist = action['action']['parameters']
            paramlist.append({'table_uuid': "{}".format(table.uuid)})
            paramdict = {}
            for record in paramlist:
                paramdict.update(record)

            # Get the data into a dataframe
              #Biom data
            df = table.view(pd.DataFrame).unstack().reset_index()
            df.columns = ['feature_id', 'sample_name', 'feature_frequency']
            df['table_uuid'] = ["{}".format(table.uuid)] * df.shape[0]
              #param data
            pdf = pd.DataFrame.from_records([paramdict])
              #merge params into main df
            df = df.merge(pdf, on='table_uuid')


            #I like having these columns as the last three. Makes it more readable
            cols = df.columns.tolist()
            reorder = ['sample_name', 'feature_id', 'feature_frequency']
            for val in reorder:
                cols.append(cols.pop(cols.index(val)))
            df = df[cols]
            df['table_path'] = [table_path] * df.shape[0]
            dataframes.append(df)

    #Stick all the dataframes together
    #outputfile="merged_all_tables.tsv"
    df = pd.concat(dataframes)
    df.to_csv(comm+'/merged_all_tables.tsv', sep='\t', index=False)
    print("Success.")
    return df, comm, comm_id

In [148]:
df, comm, comm_id = consolidate_tables('16S')

Found all tables
Success.


__merge_metadata__ adds the metadata to the merged feature tables

In [153]:
def merge_metadata():
    #df = pd.read_csv('02-PROKs/'+'/merged_all_tables.tsv', sep='\t')

    tables = df[['sample_name', 'feature_id', 'feature_frequency']].copy()
    tables.rename(columns={'sample_name':'file'}, inplace=True)
    manifest = pd.read_csv('MANIFEST.tsv', sep='\t')
    manifest['file'] = [s.split('SPOT_USC_2/')[1] for s in manifest['absolute-filepath']]
    manifest['file'] = [s.split('.R')[0] for s in manifest['file']]
    manifest = manifest.drop(columns = ['absolute-filepath', 'direction'])
    manifest.drop_duplicates()
    merged = pd.merge(tables,manifest, on='file')
    merged = merged.drop(columns = ['file'])
    merged = merged.drop_duplicates() 
    print('Set up manifest ...')
    
    metadata = pd.read_csv('METADATA.tsv', sep='\t')
    merged = pd.merge(merged,metadata, on='sample-id')
    merged = merged.replace({'V2': '16S'}, regex=True)
    print('Set up metadata ...')
    
    merged.to_csv(comm+'/merged_asvs_metadata.tsv', sep = '\t')
    print('Saved merged_asvs_metadata.tsv')
    
    return merged

In [154]:
merged = merge_metadata()

Set up manifest ...
Set up metadata ...
Saved merged_asvs_metadata.tsv


__rename_move_taxonomy__ rename taxonomy

In [159]:
def rename_move_all_taxonomies():
    dr = comm+'/all_trims'
    if not os.path.isdir(comm+'/all_taxonomies'):
        for root, dirs, files in os.walk(dr): #rename all taxonomy.tsv by their trimlengths
            for file in files:
                if file == "taxonomy.tsv":
                    spl = root.split("/"); newname = spl[-6]; sup = ("/").join(spl[:-6])
                    shutil.copy(root+"/"+file, sup+"/"+newname+".tsv");# shutil.rmtree(root)
        files = glob.glob('{0}F*R*.tsv'.format(comm+'/all_trims/'))
        os.mkdir(comm+'/all_taxonomies')
        for file in files:
            shutil.move(file, comm+'/all_taxonomies/') #puts all tsvs in new directory with correct names 

In [160]:
rename_move_all_taxonomies()

__pick_metadata__ extracts the features according to the given metadata parameters

In [161]:
def pick_metadata(composition, runnumber, R='all', F='all'):
#make df of features/composition+run+comm

    composition = composition
    runnumber = runnumber
    R = R
    F = F

    files = glob.glob('{0}*.tsv'.format(comm+'/all_taxonomies/'))
    taxos = []
#    if not os.path.exists(path+composition):
#        os.mkdir(path+composition)
    for filename in files:
        tax = pd.read_csv(filename, sep='\t')
        tax['table_id'] = str(filename.split('/')[-1])
        tax["table_id"] = tax["table_id"].str.replace(".tsv", "")
        tax['Forward_trim'], tax['Reverse_trim'] = tax['table_id'].str.split('R', 1).str
        tax['Forward_trim'] = tax['Forward_trim'].map(lambda x: x.lstrip('F'))
        tax["Forward_trim"] = pd.to_numeric(tax["Forward_trim"])
        tax["Reverse_trim"] = pd.to_numeric(tax["Reverse_trim"])
        taxos.append(tax)
    print('Appended all taxonomies to taxos')
    taxos = pd.concat(taxos)
    taxos = taxos.rename(columns={"Feature ID": "feature_id"}, errors="raise")
    taxos.to_csv(comm+'/taxos.tsv', sep = '\t')
    separated = merged.merge(taxos, how='left', on='feature_id')
    separated = separated.drop_duplicates()
    separated = separated[separated["community"] == comm_id]
    separated = separated[separated["composition"] == composition]
    separated['run-number']= separated['run-number'].astype(str)
    separated = separated[separated["run-number"] == runnumber]
    separated['sum'] = separated.groupby(['table_id','sample-id'])['feature_frequency'].transform('sum')
    separated['ratio'] = separated['feature_frequency']/(separated['sum'])
    separated_taxonomies = separated.copy()
    #make a dictionary with keys for id-ing the taxon belonging to this sub-community
    separated_dic = pd.Series(separated.Taxon.values,separated.feature_id.values).to_dict()
    
    return composition, runnumber, R, F, separated_taxonomies, separated_dic, files, tax, separated, taxos

In [162]:
composition, runnumber, R, F, separated_taxonomies, separated_dic, files, tax, separated, taxos = pick_metadata('Staggered', '46')



Appended all taxonomies to taxos


__rename_taxonomies__ extracts zipped classification files from qiime2, renames them by the trimming length used, and moves them to a new folder

In [163]:
def pick_taxonomies():
#generate folder of split taxonomies by runnumber and composition
    # Directory
    directory = composition+runnumber
    # Parent Directory path
    parent_dir = comm+'/all_taxonomies'
    # Path
    path = os.path.join(parent_dir, directory)
    # Create the directory
    # 'GeeksForGeeks' in
    # '/home / User / Documents'
    os.mkdir(path)
    for filename in files:
        taxonomy = pd.read_csv(filename, sep='\t')
        taxonomy = taxonomy.rename(columns={"Feature ID": "feature_id"}, errors="raise")
        newz = taxonomy.merge(merged, how='left', on='feature_id')
        #new = newz.drop(['sample-id'], axis=1)
        new = newz.drop_duplicates()
        new = new[new["community"] == comm_id]
        new = new[new["composition"] == composition]
        new['run-number']= new['run-number'].astype(str)
        new = new[new["run-number"] == runnumber]
        new = new[new.feature_frequency != 0]
        new = new.rename(columns={"feature_id":"Feature ID"}, errors="raise")
        new = new[['Feature ID', 'Taxon', 'Confidence']].copy()
        new = new.drop_duplicates()
        d = 'all_taxonomies/'
        new.to_csv(filename.split(d)[0]+d+composition+runnumber+'/'+runnumber+filename.split(d)[1], sep = '\t') 
    
    return new

In [164]:
pick_taxonomies()

Unnamed: 0,Feature ID,Taxon,Confidence
923,863728e1cec6befd5ba02d15baef4c36,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,0.87939
1919,b8b0859fd6afcfc39c1a7e6a50415ca3,d__Bacteria; p__Proteobacteria; c__Alphaproteo...,0.988628
2892,bdf18e8980906003f3978e5ee578c9ea,d__Bacteria; p__Actinobacteriota; c__Acidimicr...,1.0
3326,b079d9bfa3d84b0bdc2c1398d247150e,d__Bacteria; p__Bacteroidota; c__Bacteroidia; ...,0.986491


__make_fasta__ extracts sequences from zipped qiime2 files, and makes new fasta file for the features from the given metadata parameters

In [167]:
def make_fasta():
    
    fastaoutfilename = comm+'/all_seqs/allfasta.fasta'
    
    if not os.path.isfile(fastaoutfilename):
        os.mkdir(comm+'/all_seqs')
        # Get list of all .qza
        repseqs = glob.glob('{0}/*/*/representative_sequences.qza'.format(comm+'/all_trims'), recursive=True)
        for repseq in repseqs:
            with zipfile.ZipFile(repseq, 'r') as zipObj:
                # Get a list of all archived file names from the zip
                listOfFileNames = zipObj.namelist()
                # Iterate over the file names
                for fileName in listOfFileNames:
                    # Check filename endswith fasta
                    if fileName.endswith('.fasta'):
                        # Extract a single file from zip
                        zipObj.extract(fileName, 'temp_fasta')
                        
    
        with open(fastaoutfilename, 'wb') as outfile:
            for filename in glob.glob('temp_fasta/*/*/*.fasta'):
                if filename == fastaoutfilename:
                    # don't want to copy the output into the output
                    continue
                with open(filename, 'rb') as readfile:
                    shutil.copyfileobj(readfile, outfile)
        shutil.rmtree('temp_fasta', ignore_errors=False, onerror=None)
    

    if R!='all':
        rallfs = separated[separated.Reverse_trim == R]
        separated_dic = pd.Series(rallfs.Taxon.values,rallfs.feature_id.values).to_dict()
    else:
        separated_dic = pd.Series(separated.Taxon.values, separated.feature_id.values).to_dict()
    if F!='all':
        fallrs = separated[separated.Forward_trim == F]
        separated_dic = pd.Series(fallrs.Taxon.values,fallrs.feature_id.values).to_dict()
    else:
        separated_dic = pd.Series(separated.Taxon.values, separated.feature_id.values).to_dict()

    fa = SeqIO.parse(comm+'/all_seqs/allfasta.fasta',
                 "fasta")
    seqs_i_want = [] #we'll put the good sequences here
    for record in fa: #a SeqRecord has the accession as record.id, usually.
        if record.id in separated_dic.keys(): #This is how you check if the accession is in the values of the dict
            seqs_i_want.append(record)
    #Now we can write the list of records to a fasta file. This will take care of the formatting etc
    with open(comm+'/all_seqs/R'+R+'F'+F+runnumber+composition+'.fasta', "w") as f:
        SeqIO.write(seqs_i_want, f, "fasta")
    
    return print('Saved selected sequences as '+comm+'/all_seqs/R'+R+'F'+F+runnumber+composition+'.fasta')

In [168]:
make_fasta()

Saved selected sequences as 02-PROKs/all_seqs/RallFall46Staggered.fasta


__make_tbd_hm__ makes a heatmap showing the taxonomic beta diversity of each trim length combination against the expected community. TBD is a dissimilarity index based on taxonomic trees between two samples where 1 is completely different, and 0 is the completely the same.

In [174]:
def make_tbd_hm(level=7):
    
    if not os.path.isdir('Bacaros_Beta'):
        ! git clone https://github.com/alexmanuele/Bacaros_Beta.git
    
    #Move the expected community to the taxonomies folder for comparison
    dest_dir = comm+'/all_taxonomies/'+composition+runnumber
    
    for file in glob.glob('in-silico-mocks/'+comm+'/'+composition+r'/*/data/taxonomy.tsv'):
        print(file)
        shutil.copy(file, os.path.join(dest_dir, 'expected.tsv'))
    
    tax_list = glob.glob('{0}/*.tsv'.format(comm+'/all_taxonomies/'+composition+runnumber))
    textfile = open("tax_comp.txt", "w")
    for element in tax_list:
        textfile.write(element + "\n")
    textfile.close()
    print("Saved all taxonomies list as tax_comp.txt")

    if not os.path.isdir(dest_dir+'/TBD1'):
        for i in range(1, 8):
            output_dir = 'TBD'+str(i)
            os.mkdir(output_dir)
            ! python Bacaros_Beta/run_beta.py --input tax_comp.txt --metric t --l $i --output $output_dir
            shutil.move(output_dir, dest_dir)
        
    bacaros_dm = pd.read_csv(dest_dir+'/TBD'+str(level)+'/tax_comp.csv')
    bacaros_dm = bacaros_dm.set_index('Unnamed: 0')
    bacaros_dm = 1  - bacaros_dm
    #bacaros_dm is a distance matrix of table X table
    my_pcoa = skbio.stats.ordination.pcoa(bacaros_dm.values)
    plt.scatter(my_pcoa.samples['PC1'],  my_pcoa.samples['PC2'])
    against_exp = bacaros_dm[['expected']].copy()
    against_exp = against_exp.reset_index().rename(columns={against_exp.index.name:'sample_name'})
    against_exp.drop(against_exp.index[against_exp['sample_name'] == 'expected'], inplace=True)
    against_exp['Forward_trim'] = [s.split('R')[0] for s in against_exp['sample_name']]
    against_exp['Forward_trim'] = [s.split('46F')[1] for s in against_exp['Forward_trim']]
    against_exp['Reverse_trim'] = [s.split('R')[1] for s in against_exp['sample_name']]
    against_exp["Forward_trim"] = pd.to_numeric(against_exp["Forward_trim"])
    against_exp["Reverse_trim"] = pd.to_numeric(against_exp["Reverse_trim"])
    against_exp["Forward_trim"].replace({0: 280}, inplace=True)
    against_exp["Reverse_trim"].replace({0: 290}, inplace=True)
    tohm = against_exp.pivot("Forward_trim", "Reverse_trim", "expected")
    tohm.rename({280: 'full'}, axis=0, inplace=True)
    tohm.rename({290: 'full'}, axis=1, inplace=True)
    ax = sns.heatmap(tohm, cmap=sns.color_palette("crest"), vmin=0, vmax=1)
    ax.invert_yaxis()
    plt.figure(figsize=(12,12))
    
    # get max and min values
    #print('The min value is {0} and the max value is {1}'.format(df.at[df.stack().index[np.argmin(df.values)]], 
    #                                                             minv = df.at[df.stack().index[np.argmax(df.values)]]))

    return (tohm, bacaros_dm)

In [175]:
make_tbd_hm(7)

in-silico-mocks/02-PROKs/Staggered/7fee7113-8e8a-459b-ae32-6d8381294d5c/data/taxonomy.tsv
Saved all taxonomies list as tax_comp.txt
Traceback (most recent call last):
  File "/Users/Diana/miniconda3/envs/qiime2-2020.111/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2898, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1675, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1683, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "Bacaros_Beta/run_beta.py", line 33, in <module>
    deltas, b = beta.calculate_beta(samples, L, metric)
  File "/Users/Diana/MOCK_ANA

Traceback (most recent call last):
  File "/Users/Diana/miniconda3/envs/qiime2-2020.111/lib/python3.6/site-packages/pandas/core/indexes/base.py", line 2898, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1675, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1683, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 0

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "Bacaros_Beta/run_beta.py", line 33, in <module>
    deltas, b = beta.calculate_beta(samples, L, metric)
  File "/Users/Diana/MOCK_ANALYSIS/Bacaros_Beta/beta/beta.py", line 114, in calculate_beta
    delta = dfunc(pair[0]['taxa'], pair[1]['taxa'], L)

FileNotFoundError: [Errno 2] No such file or directory: '02-PROKs/all_taxonomies/Staggered46/TBD7/tax_comp.csv'

__r2_plot__ makes a linear regression of the observed relative abundances of each combination of trim lengths against the expected, and plots each coefficient of determination in a heatmap

In [None]:
## to import the expected taxonomies and transofrm to ratios
def r2_plot():
    expected_silva = pd.read_csv(dest_dir+'/expected.tsv', sep='\t')
    expected_gg = pd.read_csv('in-silico-mocks/expected_all.tsv')
    expected = expected.rename(columns={'silva_taxonomy':'Taxon', 'sample-id': 'Replicate'})
    expected_even = expected[expected.mock_even_insilico != 0]
    expected_even = expected_even.drop(columns=['mock_staggered_insilico','taxonomy'])
    expected_even.reset_index(drop=True, inplace=True)
    expected_even['expected_ratio'] = expected_even['mock_even_insilico']/(expected_even['mock_even_insilico'].sum())
    expected_stagg = expected.drop(columns=['mock_even_insilico','taxonomy'])
    expected_stagg.reset_index(drop=True, inplace=True)
    expected_stagg['expected_ratio'] = expected_stagg['mock_staggered_insilico']/(expected_stagg['mock_staggered_insilico'].sum())
    
    return (expected_even, expected_stagg)

In [178]:
expected_silva = pd.read_csv(dest_dir+'/expected.tsv', sep='\t')

In [200]:
ftables = glob.glob('{0}/dereplicated-table.qza'.format('in-silico-mocks/'+comm+'/'+composition), recursive=True)

In [201]:
ftables

['in-silico-mocks/02-PROKs/Staggered/dereplicated-table.qza']

In [213]:
ftoutname = 'in-silico-mocks/'+comm+'/'+composition+'/feature_table.biom'

In [214]:
for ftable in ftables:
    with zipfile.ZipFile(ftable, 'r') as zipObj:
        # Get a list of all archived file names from the zip
        listOfFileNames = zipObj.namelist()
        # Iterate over the file names
        for fileName in listOfFileNames:
            # Check filename endswith fasta
            if fileName.endswith('.biom'):
                # Extract a single file from zip
                zipObj.extract(fileName, 'temp_biom')

In [212]:
with open(ftoutname, 'wb') as outfile:
    for filename in glob.glob('temp_biom/*/*/*.biom'):
        if filename == ftoutname:
            # don't want to copy the output into the output
            continue
            with open(filename, 'rb') as readfile:
                shutil.copyfileobj(readfile, outfile)
            shutil.rmtree('temp_fasta', ignore_errors=False, onerror=None)

In [183]:
expected_gg = pd.read_csv('in-silico-mocks/expected_all.tsv', sep='\t')

In [190]:
expected_gg = expected_gg.replace(0,np.nan)

In [191]:
expected_gg_16S_Staggered = expected_gg.loc[expected_gg['16S_Staggered'].notnull(), ['#OTU ID', '16S_Staggered', 'taxonomy']]

In [196]:
pd.set_option('display.max_colwidth', None)

In [197]:
expected_gg_16S_Staggered

Unnamed: 0,#OTU ID,16S_Staggered,taxonomy
16,863728e1cec6befd5ba02d15baef4c36,1400.0,D_0__Bacteria; D_1__Proteobacteria; D_2__Alphaproteobacteria; D_3__SAR11 clade; D_4__Clade I
17,bdf18e8980906003f3978e5ee578c9ea,700.0,D_0__Bacteria; D_1__Actinobacteria; D_2__Acidimicrobiia; D_3__Actinomarinales; D_4__Actinomarinaceae; D_5__Candidatus Actinomarina; D_6__uncultured marine bacterium
18,2970af3e30642198ba1dfa1f304f310b,400.0,D_0__Archaea; D_1__Thaumarchaeota; D_2__Nitrososphaeria; D_3__Nitrosopumilales; D_4__Nitrosopumilaceae; D_5__Candidatus Nitrosopumilus; Ambiguous_taxa
19,beb8cb17f8152e488b6bc95bfcdb484f,400.0,D_0__Bacteria; D_1__Actinobacteria; D_2__Acidimicrobiia; D_3__Actinomarinales; D_4__Actinomarinaceae; D_5__Candidatus Actinomarina; Ambiguous_taxa
20,86cd70d2450e3f44f4a7c543f53008ee,300.0,D_0__Bacteria; D_1__Cyanobacteria; D_2__Oxyphotobacteria; D_3__Synechococcales; D_4__Cyanobiaceae; D_5__Prochlorococcus MIT9313
21,f4ad315ec51d8bee23cb39f39bc8b019,200.0,D_0__Bacteria; D_1__Proteobacteria; D_2__Gammaproteobacteria; D_3__SAR86 clade; D_4__Rhodobacteraceae bacterium REDSEA-S29_B10; D_5__Rhodobacteraceae bacterium REDSEA-S29_B10; D_6__Rhodobacteraceae bacterium REDSEA-S29_B10
22,b8b0859fd6afcfc39c1a7e6a50415ca3,100.0,D_0__Bacteria; D_1__Proteobacteria; D_2__Alphaproteobacteria; D_3__Puniceispirillales; D_4__SAR116 clade; D_5__uncultured marine bacterium; D_6__uncultured marine bacterium
23,0b1c2b9ed02095f223b442c9bdd35006,100.0,D_0__Bacteria; D_1__Proteobacteria; D_2__Alphaproteobacteria; D_3__Rhodospirillales; D_4__AEGEAN-169 marine group; D_5__uncultured marine bacterium; D_6__uncultured marine bacterium
24,0b7809b1b04598cfc5bbf6264159dde9,80.0,D_0__Archaea; D_1__Euryarchaeota; D_2__Thermoplasmata; D_3__Marine Group II
25,20a0d2f150b3312772f5c00e54fd7baf,80.0,D_0__Bacteria; D_1__Bacteroidetes; D_2__Bacteroidia; D_3__Flavobacteriales; D_4__Flavobacteriaceae; D_5__NS2b marine group


In [None]:
xtrain = df.loc[df['Survive'].notnull(), ['Age','Fare', 'Group_Size','deck', 'Pclass', 'Title' ]]
xtrain

__get_fig_per_group__ makes a heatmap of the observed against expected relative abundances for given group names and colors ratios above the expected in blues, and below the expected in reds.

In [None]:
def get_fig_per_group(groupname, expectedratio, duplicate='mean'):
    neoceratium = separated[separated['Taxon'].str.contains(groupname)]
    neoceratium = neoceratium[neoceratium.feature_frequency !=0]
    neoceratium.rename(columns = {'sample-id':'sample_id'}, inplace=True)
    if duplicate!='mean':
        neoceratiumR1 = neoceratium[neoceratium.sample_id == 'R46-18S-'+duplicate]
    else:
        neoceratiumR1 = neoceratium.groupby(['Forward_trim','Reverse_trim'])[['ratio']].mean()
        neoceratiumR1 = neoceratiumR1.reset_index()
    neoceratiumR1["Forward_trim"] = pd.to_numeric(neoceratiumR1["Forward_trim"])
    neoceratiumR1["Reverse_trim"] = pd.to_numeric(neoceratiumR1["Reverse_trim"])
    neoceratiumR1["Forward_trim"].replace({0: 280}, inplace=True)
    neoceratiumR1["Reverse_trim"].replace({0: 290}, inplace=True)
    neoceratiumR1merged = neoceratiumR1.groupby(['Forward_trim','Reverse_trim'])[['ratio']].mean()
    neoceratiumR1merged = neoceratiumR1merged.reset_index()
    tohm = neoceratiumR1merged.pivot("Forward_trim", "Reverse_trim", "ratio")
    tohm.rename({280: 'full'}, axis=0, inplace=True)
    tohm.rename({290: 'full'}, axis=1, inplace=True)
    ax = sns.heatmap(tohm, cmap="Oranges_r")#, mask= (tohm < (expectedratio-(0.0005*expectedratio))) & (tohm > (expectedratio+(0.0005*expectedratio)))) #cmap=sns.color_palette("hls", 90)
    ax = sns.heatmap(tohm, mask=tohm <= expectedratio, cmap=sns.color_palette("GnBu", 5)) #square=True, annot=False, vmin=0, vmax=1, cbar=False, ax=ax)
    #ax = sns.heatmap(tohm, mask=tohm >= expectedratio, cmap=sns.color_palette("Oranges_r", 5)) #square=True, annot=False, vmin=0, vmax=1, cbar=False, ax=ax)
    ax.invert_yaxis()
    plt.figure(figsize=(12,12))
    ax.set(xlabel='Reverse trim length', ylabel='Forward trim length')
    fig = ax.get_figure()
    fig.savefig('ratio'+groupname+'.png', bbox_inches = "tight")
    return neoceratiumR1merged

__ttst__ runs one sample t tests between each taxonomic groups observed against expected relative abundances, and plots the results in a boxplot for a single trimming combination

In [None]:
#run 1 sample t test
#takes trimcombination in format F40R40
def ttst(trimcombination):
    compr_obs_exp = expstagg.merge(separated, how='outer', on='Taxon')
    compr_obs_exp = compr_obs_exp.rename(columns={'sample-id': 'Replicate'})
    compr_obs_exp=compr_obs_exp.fillna(0)
    compr_obs_exp["group"].replace({0: "False positive"}, inplace=True)
    df_with_groups = compr_obs_exp[compr_obs_exp.table_id == trimcombination]
    
    
    means = df_with_groups.groupby('group').mean()
    newsi=means[['expected_ratio', 'ratio']].copy()
    newsi.sort_values('expected_ratio', ascending = False)
    newsi['Difference'] = newsi['expected_ratio'] / newsi['ratio']
    newsi
    
    # generate a boxplot to see the data distribution by treatments. Using boxplot, we can easily detect the differences between different treatments
    ax = sns.boxplot(x='ratio', y='group', data=df_with_groups).set(
    xlabel='Relative abundance', 
    ylabel='Group'
    )
    #ax.tick_params(axis='x', labelrotation=90)
    plt.show()
    
    results = []
    for group in groups:    
        arr = df_with_groups[(df_with_groups['group'] == group)]['ratio'].values  #Filter the dataframe. 
        results.append({'group': group,
                        'ratios': arr}) #Make a single "record" containing the table id, replicate, and ratio array.
        r_gr = pd.DataFrame.from_records(results)
    
    y = expstagg['expected_ratio']
    for i in range(len(y[0:-1])):
        xi = r_gr.iloc[i]['ratios']
        yi = y[i]
        print(r_gr.iloc[i]['group'])
        print(stats.ttest_1samp(xi, yi))
        #but there are outliers
    return (r_gr, newsi)

## Run analyses and make figures

In [130]:
def get_figures(community, composition, runnumber, R='all', F='all',level=7):
    df, comm = consolidate_tables(community) #concat all feature tables and their pipeline parameters
    merged = merge_metada() #add the metadata
    separated_taxonomies, separated_dic = pick_metadata(composition, runnumber, R='all', F='all') #extract the features only from one community
    rename_taxonomies() #rename the taxonomy tsv files for taxonomic beta diversity
    make_fasta() #extract the sequences
    make_tbd_hm(level=7) #make TBD heatmap
    ## add function to find best trim length ranges (white box)
    ## add table of thresholds + line plot
    #r2_plot()
    ##run t tests for best TBD and best r2, make boxplot
    #get_fig_per_group()
    ## get sequence comparison
    ## get files for import to evolview --tree, abundance table, branch coloring and expected ratios
    
    return figs

In [None]:
get_figures('18S', 'Even', '46')