In [1]:
#Import libraries in python3 kernel
import pandas as pd
import seaborn as sns
import glob
import os
import sys
from pathlib import Path
#!conda install --yes --prefix {sys.prefix} boto
import boto
import shutil
#!conda install --yes --prefix {sys.prefix} tqdm
from tqdm.notebook import trange, tqdm
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.colors as colors
from matplotlib.colors import LogNorm
import numpy as np
import skbio
#import fastcluster #this package makes skbio run faster clustermaps but can be tricky with missing values from pairwise comparisons
from functools import reduce
#!conda install --yes --prefix {sys.prefix} biopython
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser
from Bio.SeqUtils import GC
from biom import load_table
from biom.table import Table
from collections import defaultdict
from collections import Counter
import statistics
import itertools as it
from scipy import stats
from matplotlib.ticker import FormatStrFormatter
import matplotlib.colors as mcolors
from qiime2 import Artifact
import tempfile
import zipfile
import yaml

#!conda install --yes --prefix {sys.prefix} -c etetoolkit ete3 
#!conda install -c bioconda seqkit
#pip install ete3
#conda install -c anaconda pyqt
#from ete3 import Tree, TreeStyle
%matplotlib inline

#Import libraries
#from matplotlib_venn import venn2, venn2_circles, venn2_unweighted
#from matplotlib_venn import venn3, venn3_circles
from matplotlib import pyplot as plt

In [5]:
# Special thanks to Alex Manuele https://github.com/alexmanuele
def consolidate_tables(community):
    
    if community == "16S":
        comm_id, comm = '16S', '02-PROKs'
        table_list = glob.glob('{0}/*/DADA2/table.qza'.format('/Users/Diana/Documents/escuela/phd/ch2/2014_trims/'+comm+'/all_trims'))
        print("Found all 16S tables")
    if community == "18S":
        comm_id, comm = '18S','02-EUKs'
        table_list = glob.glob('{0}/*/DADA2/table.qza'.format('/Users/Diana/Documents/escuela/phd/ch2/2014_trims/'+comm+'/all_trims'))
        print("Found all tables")
        
    dataframes = []  
    for table_path in table_list:
        with tempfile.TemporaryDirectory() as tempdir:
            #load table, dump contents to tempdir
            table = Artifact.load(table_path)
            #Make sure the tables are all FeatureFrequency type
            assert str(table.type) == 'FeatureTable[Frequency]', "{0}: Expected FeatureTable[Frequency], got {1}".format(table_path, table.type)
            Artifact.extract(table_path, tempdir)
            #get the provenance form the tempdir and format it for DF
            prov = '{0}/{1}/provenance/'.format(tempdir, table.uuid)
            action = yaml.load(open("{0}action/action.yaml".format(prov), 'r'), Loader=yaml.BaseLoader)
            paramlist = action['action']['parameters']
            paramlist.append({'table_uuid': "{}".format(table.uuid)})
            paramdict = {}
            for record in paramlist:
                paramdict.update(record)

            # Get the data into a dataframe
              #Biom data
            df = table.view(pd.DataFrame).unstack().reset_index()
            df.columns = ['feature_id', 'sample_name', 'feature_frequency']
            df['table_uuid'] = ["{}".format(table.uuid)] * df.shape[0]
              #param data
            pdf = pd.DataFrame.from_records([paramdict])
              #merge params into main df
            df = df.merge(pdf, on='table_uuid')
            

            #I like having these columns as the last three. Makes it more readable
            cols = df.columns.tolist()
            reorder = ['sample_name', 'feature_id', 'feature_frequency']
            for val in reorder:
                cols.append(cols.pop(cols.index(val)))
            df = df[cols]
            df['table_path'] = [table_path] * df.shape[0]
            dataframes.append(df)
            
            #Adding table_id, forward and reverse trim columns
            df['table_id'] = str(table_path.split('/')[-3]) #add a table_id column
            df[['forward_trim', 'reverse_trim']] = df['table_id'].str.split('R', 1).str
            df['forward_trim'] = df['forward_trim'].map(lambda x: x.lstrip('F'))
            df["forward_trim"] = pd.to_numeric(df["forward_trim"])
            df["reverse_trim"] = pd.to_numeric(df["reverse_trim"])

    #Stick all the dataframes together
    #outputfile="merged_all_tables.tsv"
    df = pd.concat(dataframes)
    df['sampleid'] = df['sample_name'].str.split('-S').str.get(0)
    df['sampleid'] = df['sampleid'].str.replace('-', '.')
    #df.to_csv(comm+'/merged_all_tables.tsv', sep='\t', index=False)
    print("Successfully saved all tables.")
    return df

In [3]:
def merge_metadata(df):
    all_md = pd.read_csv('/Users/Diana/Documents/escuela/phd/ch2/bedfordbasinTS/allmetadata.csv')

    tables = df[['sampleid', 'feature_id', 'feature_frequency']].copy()

    all_md['sampleid'] = all_md['sampleid'].str.replace('_', '.')
    merged = pd.merge(tables,all_md, on='sampleid', how='left') #all_md is the metadata file
    merged = merged[merged.feature_frequency != 0]
    print('Set up metadata ...')
    
    #merged.to_csv(comm+'/merged_asvs_metadata.tsv', sep = '\t')
    print('Saved merged_asvs_metadata.tsv')
    
    return merged

In [4]:
df = consolidate_tables('16S')

Found all 16S tables


  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
  df['forward_trim'], df['reverse_trim']

Successfully saved all tables.


In [None]:
merged = merge_metadata(df)