In [122]:
#for importing, formatting and data manipulation
import pandas as pd
import numpy as np

In [380]:
def import_docs(comm):
    #import your ASV biom table
    biom = pd.read_csv('ASVs_'+comm+'.csv')

    #import your metadata
    md = pd.read_csv('Metadata_OG.csv') #take the habit of avoiding spaces in file names

    #melt your ASV table to attach dna metadata to ASVs
    biom.rename(columns={'#OTU ID': 'feature_id'}, inplace=True)
    md.rename(columns={'ID': 'sample_id'}, inplace=True)


    biomelted = biom.melt(id_vars=['feature_id'], var_name='sample_id', value_name='feature_frequency')
    
    return biomelted, md

In [392]:
def make_defract(biomelted, md_SF):
    #remove rows where samples where not size fractionated, i.e. base/tray/top
    md_SF = md_SF[md_SF['SizeFraction2'].isin(['S', 'L'])]

    #make a new column of total [DNA] per sample that were size fractionated and need to be pooled
    md_SF['[DNAt]'] = md_SF.groupby(['ID3'])['Concentration'].transform('sum')

    #separate small and large size fraction
    sep_S = md_SF[md_SF.SizeFraction2 == 'S']
    sep_L = md_SF[md_SF.SizeFraction2 == 'L']

    #calculate DNA proportion per size fraction
    md_SF['DNApr'] = md_SF['Concentration']/md_SF['[DNAt]']

    #merge with separated on common columns to get corresponding rel. abundances
    #md_SF = md_SF[['sample_id', 'DNApr', '[DNAt]']].copy()
    merged = pd.merge(biomelted, md_SF, on=['sample_id'], how='left') #all_md is the metadata file

    #remove the ASVs with a null read count
    sepSLRA = merged[merged.feature_frequency != 0]

    #calculate corrected per sample ratio, and corrected feature frequency of de-fractionated samples
    sepSLRA['Newfeature_frequency'] = sepSLRA['feature_frequency'] * sepSLRA['DNApr']
    sepSLRA['Newff'] = sepSLRA.groupby(['feature_id', 'ID3'])['Newfeature_frequency'].transform('sum')

    #remove the rows where there was no size fractionation (base, tray, top..)
    sepSLRA = sepSLRA[sepSLRA['Newff'].notna()]

    #make a new id for the new combined samples
    sepSLRA['sampleid'] = sepSLRA['ID3'].astype(str) + "SL"

    #uncomment the line above if merging smallandlarge
    sepSLRA['SizeFraction'] = 'SL'

    #rename the columns
    sepSLRA.rename(columns={'feature_frequency':'old_feature_frequency'}, inplace=True)
    sepSLRA.rename(columns={'Newff':'feature_frequency'}, inplace=True)
    sepSLRA = sepSLRA.drop_duplicates()

    #recalculate ratios
    sepSLRA['Total'] = sepSLRA['feature_frequency'].groupby(sepSLRA['sampleid']).transform('sum')
    sepSLRA['ratio'] = sepSLRA['feature_frequency']/sepSLRA['Total'] #calculate the relative abundance of a feature (0-1 scale per sample)
    sepSLRA['nASVs'] = sepSLRA['feature_id'].groupby(sepSLRA['sampleid']).transform('nunique') #calculate the number of ASVs per sample

    sepSLRA = sepSLRA.drop_duplicates()

    #make a new biom table
    newbiom = sepSLRA[['sampleid', 'feature_id', 'feature_frequency']].copy()
    newbiom.drop_duplicates(inplace=True)
    newbiom = newbiom.pivot(index='feature_id', columns='sampleid', values='feature_frequency')
    newbiom = newbiom.fillna(0)
    
    #save outputs to csv
    newbiom.to_csv('newbiom_'+comm+'.csv')
    sepSLRA.to_csv('CombinedSL_metadata_'+comm+'.csv')
    
    return merged, sepSLRA, newbiom

In [387]:
#here specify which community it is, either 'bact' or 'chloro'
comm = 'chloro'
biomelted, md = import_docs(comm)

In [388]:
#sort the concentrations to check they are all positive
md.sort_values('Concentration')

Unnamed: 0,sample_id,Sample,Cruise,BA,BAA,Category,TOC,Station_ID,WaterColumn,SizeFraction,...,MSC,MSC_DOS,BioPaper_Grid,BioPaper_ID,Fig1,Concentration,Sample.1,SizeFraction3,WaterColumn3,ID3
297,CES22-S8L-BD-D7-Lb,CES22-S8L-BD-D7-Lb,CES22,No,No,BD,No,8L-BD,D7,Lb,...,No,Day 5-6,EASTERN,CE_8L-BD_D7_L,,-1.6,CES22-S8L-BD-D7-Lb,L,D7,CE_8L-BD_D7
198,CES22-S30L-BD-D8-Lb,CES22-S30L-BD-D8-Lb,CES22,No,No,BD,No,30L-BD,D8,Lb,...,No,Day 15,NORTHERN-BD,CE_30L-BD_D8_L,,0.9,CES22-S30L-BD-D8-Lb,L,D8,CE_30L-BD_D8
120,CES22-S15L-MSC-D1R-BASE,CES22-S15L-MSC-D1R-BASE,CES22,No,No,MSC,No,15L,D1R,BASE,...,Yes,Day 13,EASTERN-MSC,CE_15L_D1R_BASE,,1.0,CES22-S15L-MSC-D1R-BASE,,MSC-D1,CE_15L_D1R
299,CES22-S8L-BD-D8-Lb,CES22-S8L-BD-D8-Lb,CES22,No,No,BD,No,8L-BD,D8,Lb,...,No,Day 5-6,EASTERN,CE_8L-BD_D8_L,,1.8,CES22-S8L-BD-D8-Lb,L,D8,CE_8L-BD_D8
196,CES22-S30L-BD-D7-Lb,CES22-S30L-BD-D7-Lb,CES22,No,No,BD,No,30L-BD,D7,Lb,...,No,Day 15,NORTHERN-BD,CE_30L-BD_D7_L,,1.9,CES22-S30L-BD-D7-Lb,L,D7,CE_30L-BD_D7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,CES22-S9L-80mLb,CES22-S9L-80mLb,CES22,No,No,Station,No,9L,D3,Lb,...,No,Day 5-6,EASTERN,CE_9L_D3_L,,163.7,CES22-S9L-80mLb,L,D3,CE_9L_D3
186,CES22-S30L-25mLb,CES22-S30L-25mLb,CES22,No,No,Station,No,30L,D2,Lb,...,No,Day 15,NORTHERN,CE_30L_D2_L,,167.1,CES22-S30L-25mLb,L,D2,CE_30L_D2
231,CES22-S6L-5mLb,CES22-S6L-5mLb,CES22,No,No,Station,No,6L,D1,Lb,...,No,Day 4,EASTERN,CE_6L_D1_L,,169.0,CES22-S6L-5mLb,L,D1,CE_6L_D1
86,CES22-S12-BA1-Tfinal-LTF-Lb,CES22-S12-BA1-Tfinal-LTF-Lb,CES22,Yes,BA1,Bioassay,No,12L-BA1,,Lb,...,No,,EASTERN-BA,CE_12L-BA1_NA_L,,183.6,CES22-S12-BA1-Tfinal-LTF-Lb,L,,CE_BA1_Tfinal-LTF-b


In [389]:
#remove the samples with a negative DNA concentration because we can't calculate that new feature frequency
samples_to_exclude = ['CE_8L-BD_D7']

#exclude those samples
md_SF = md[~md['ID3'].isin(samples_to_exclude)]

In [393]:
merged, sepSLRA, newbiom = make_defract(biomelted, md)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  md_SF['[DNAt]'] = md_SF.groupby(['ID3'])['Concentration'].transform('sum')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  md_SF['DNApr'] = md_SF['Concentration']/md_SF['[DNAt]']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sepSLRA['Newfeature_frequency'] = sepSLRA['feature_frequency'] * sepSLRA['

In [394]:
#just check which samples didn't match between the metadata and the ASV table
df2 = merged.DNApr.isnull().groupby([merged['sample_id']]).sum().astype(int).reset_index(name='counts')
sep_mismatch = df2[df2.counts != 0]
sep_mismatch #it's only the top, base, ..

Unnamed: 0,sample_id,counts
126,CES22-S15L-MSC-D1R-BASE,391
127,CES22-S15L-MSC-D1R-TRAY,391
128,CES22-S15L-MSC-D2Y-BASE,391
129,CES22-S15L-MSC-D2Y-TRAY,391
130,CES22-S15L-MSC-D3B-BASE,391
131,CES22-S15L-MSC-D3B-TRAY,391
148,CES22-S28LDay1-MSC-D1R-TOP,391
149,CES22-S28LDay1-MSC-D1R-TRAY,391
150,CES22-S28LDay1-MSC-D2Y-TOP,391
151,CES22-S28LDay1-MSC-D2Y-TRAY,391
