In [None]:
#for importing, formatting and data manipulation
import pandas as pd
import numpy as np
import glob
import datetime
#from time import time
#from datetime import datetime
#from datetime import timedelta
import tempfile
from qiime2 import Artifact
import zipfile
import yaml
import json

#for plotting
import matplotlib, random
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib_venn import venn3, venn3_circles
from matplotlib.patches import Patch
import seaborn as sns
#sns.set(style="whitegrid")
import plotly.express as px
%matplotlib inline
from IPython.display import display
from upsetplot import plot
#import pyupset as pyu
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from pandas.plotting import register_matplotlib_converters
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
register_matplotlib_converters()
import scipy as sp
import statsmodels.api as sm

#for statistical analyses
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from skbio.diversity import alpha_diversity
from skbio.stats.distance import permanova
from skbio import DistanceMatrix
from scipy.spatial.distance import cdist
from skbio.stats.composition import clr
from skbio.stats.composition import alr
from skbio.stats.composition import ilr
from skbio.diversity.alpha import chao1
from skbio.stats.composition import ancom
import scipy.stats as stats
import statsmodels.api as sa
import statsmodels.formula.api as sfa
import scikit_posthocs as spPH

## Import and format metadata from lab, and BBMP

### Import lab metadata

In [None]:
def load_df(pattern='/Users/Diana/Documents/escuela/phd/ch2/bb_data/2022/METADATA.txt', sep='\t'):
    """Read every delimited metadata file matching ``pattern`` into one DataFrame.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern of the files to read. Defaults to the 2022 ``METADATA.txt``
        path this notebook was written against.
    sep : str, optional
        Field delimiter handed to ``pandas.read_csv`` (tab by default).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matched files with columns, then rows,
        that hold no value at all removed. Each file keeps its own row index.

    Raises
    ------
    ValueError
        If no file matches ``pattern``.
    """
    #sorted so the concatenation order does not depend on the filesystem
    filenames = sorted(glob.glob(pattern))
    if not filenames:
        raise ValueError('No metadata files matched: {0}'.format(pattern))

    frames = []
    for filename in filenames:
        frames.append(pd.read_csv(filename, sep=sep))
        print (filename)

    md = pd.concat(frames)

    #drop empty columns first, then empty rows (chained instead of inplace)
    return md.dropna(how='all', axis=1).dropna(how='all')

In [None]:
# Load the lab metadata with load_df(); the result is bound to `md`.
md = load_df()
#md = md[["sampleid", "[DNA]ng/ul", "A260/280"]].copy()

In [None]:
# Report the version of the Python interpreter running this kernel.
from platform import python_version

print(python_version())

Modified version of load_df() to accommodate the different metadata files specific to the size-fractionated samples.

In [None]:
def load_df(pattern='/Users/Diana/Documents/escuela/phd/ch2/bb_data/2022/*.csv'):
    """Read every CSV metadata file matching ``pattern`` into one DataFrame.

    The two verbose depth/size column headers are renamed to ``depth_code`` and
    ``size_code`` in each file before concatenation. Note that this definition
    reuses the name of the ``load_df`` defined earlier in the notebook and
    therefore replaces it once this cell has run.

    Parameters
    ----------
    pattern : str, optional
        Glob pattern of the CSV files to read. Defaults to every ``*.csv`` in
        the 2022 data folder this notebook was written against.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all matched files with columns, then rows,
        that hold no value at all removed. Each file keeps its own row index.

    Raises
    ------
    ValueError
        If no file matches ``pattern``.
    """
    column_names = {'Depth Code 1-A, 5-B, 10-C, 60-D': 'depth_code',
                    'Size Code 3um - L 0.2um - S': 'size_code'}

    #sorted so the concatenation order does not depend on the filesystem
    filenames = sorted(glob.glob(pattern))
    if not filenames:
        raise ValueError('No metadata files matched: {0}'.format(pattern))

    frames = []
    for filename in filenames:
        frames.append(pd.read_csv(filename).rename(columns=column_names))
        print (filename)

    md = pd.concat(frames)

    #drop empty columns first, then empty rows (chained instead of inplace)
    return md.dropna(how='all', axis=1).dropna(how='all')

In [None]:
#The initial version of this code (for the whole BB dataset) used this function; for the size fractions the metadata files were formatted slightly differently, so each file is loaded and cleaned separately in the cells below.
#mdsf = load_df()

In [None]:
#absorbance-per-sample table: read, strip fully-empty rows then fully-empty
#columns, and normalise pd.NA markers to np.nan
a260230 = (
    pd.read_csv("/Users/Diana/Documents/escuela/phd/ch2/bb_data/2022/a260230.csv")
    .dropna(how='all')
    .dropna(how='all', axis=1)
    .replace({pd.NA: np.nan})
)

In [None]:
#metadata of the non size-fractionated samples: read, strip fully-empty rows
#and columns, normalise pd.NA to np.nan, then strip empty rows once more
noSF = (
    pd.read_csv("/Users/Diana/Documents/escuela/phd/ch2/bb_data/2022/noSF.csv")
    .dropna(how='all')
    .dropna(how='all', axis=1)
    .replace({pd.NA: np.nan})
    .dropna(how='all')
)
#uncomment the line below to remove metadata columns
#noSF = noSF[["sampleid", "[DNA]ng/ul", "A260/280", "date"]]

In [None]:
#metadata of the size-fractionated samples: read, strip fully-empty rows and
#columns, normalise pd.NA to np.nan, and shorten the two verbose code headers
SF = (
    pd.read_csv("/Users/Diana/Documents/escuela/phd/ch2/bb_data/2022/SF.csv")
    .dropna(how='all')
    .dropna(how='all', axis=1)
    .replace({pd.NA: np.nan})
    .rename(columns={
        'Depth Code 1-A, 5-B, 10-C, 60-D': 'depth_code',
        'Size Code 3um - L 0.2um - S': 'size_code',
    })
)
#uncomment the line below to remove metadata columns
#SF = SF[["sampleid", "[DNA]ng/ul", "A260/280", "date"]]

#### Renumber dates

In [None]:
#lookup tables used when deriving date and depth columns

#three-letter month abbreviation -> month number (1-12)
month_dic = dict(zip(
    ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
    range(1, 13),
))

#three-letter month abbreviation -> season label
month_season = dict(
    Jan="Winter", Feb="Winter",
    Mar="Spring", Apr="Spring", May="Spring",
    Jun="Summer", Jul="Summer", Aug="Summer",
    Sep="Autumn", Oct="Autumn", Nov="Autumn",
    Dec="Winter",
)

#depth code letter (as found in sample ids) -> numeric depth
depth_num = dict(A=1, B=5, C=10, D=60, E=30)

In [None]:
def fill_dates(md):
    """Derive week, date, depth and size-fraction columns from ``sampleid`` and ``date``.

    Parameters
    ----------
    md : pandas.DataFrame
        Sample metadata with at least a ``sampleid`` and a ``date`` column.
        ``date`` is split on ``'-'``: the first piece is used as the day and the
        second is looked up in ``month_dic`` (assumes strings such as
        ``'05-Jan-22'`` - TODO confirm against the input files).

    Returns
    -------
    pandas.DataFrame
        A copy of ``md`` with these columns added or overwritten:
        ``weekn`` (numeric, parsed from ``sampleid`` when absent), ``date``
        (gaps filled within each ``sampleid``/``weekn`` group), ``month_name``,
        ``year`` (always the string ``'2022'``), ``month`` and ``day``
        (zero-padded strings), ``depth_code``, ``depth`` (via ``depth_num``),
        ``time_string`` (``'YYYY-MM-DD'``) and ``size_code`` (``'W'`` when the
        sample id carries no size letter).

    Raises
    ------
    ValueError
        If a row still has no usable date after filling, because the month or
        day cannot be converted to an integer.
    """
    #private copy: the caller's frame (possibly a filtered slice) is left untouched
    md = md.copy()

    if 'weekn' not in md:
        md["weekn"] = md["sampleid"].str.extract(r'\.([1-9][0-9]?)[A-E]', expand=False)
    md['weekn'] = pd.to_numeric(md['weekn'])
    #transform() returns a result aligned to the original row index; the former
    #groupby(...).apply(...) prepends the group keys on pandas >= 2.0, which
    #makes the column assignment fail
    #NOTE(review): grouping by sampleid as well as weekn only fills gaps between
    #rows sharing the same sample id - confirm this is the intended scope
    md['date'] = md.groupby(['sampleid','weekn'], sort=False)['date'].transform(lambda x: x.ffill().bfill())

    date_parts = md['date'].str.split('-')

    #add month name; every record is stamped with the 2022 sampling year
    md['month_name'] = date_parts.str[1]
    md['year'] = '2022'

    #month and day go through int to strip decimals, then become zero-padded
    #strings to match the date format in the dfo metadata
    md['month'] = pd.to_numeric(md['month_name'].map(month_dic)).apply(int).apply(str).str.zfill(2)
    md['day'] = pd.to_numeric(date_parts.str[0]).apply(int).apply(str).str.zfill(2)

    md["depth_code"] = md["sampleid"].str.extract(r'[1-9][0-9]?([A-E])', expand=False)
    md['depth'] = pd.to_numeric(md['depth_code'].map(depth_num))

    md['time_string'] = md[['year', 'month', 'day']].agg('-'.join, axis=1)

    #NOTE(review): [L-S] is a character range (L, M, ... S), not just 'L' or 'S'
    md["size_code"] = md["sampleid"].str.extract(r'[1-9][0-9]?[A-E]([L-S])', expand=False)
    md["size_code"] = md["size_code"].fillna('W')

    return md

In [None]:
# Add the derived date/depth/size columns to the size-fractionated metadata.
SF = fill_dates(SF)

In [None]:
# Keep only rows with weekn below 17, then add the derived date/depth/size columns.
noSF = noSF[noSF['weekn'] < 17]
noSF = fill_dates(noSF)

In [None]:
#verify which columns are shared between two dataframes
# np.intersect1d returns the sorted unique column names present in both frames.
a = np.intersect1d(SF.columns, noSF.columns)
a

In [None]:
# Coerce the shared measurement columns to numeric in both frames; values that
# cannot be parsed become NaN (errors='coerce').
# Note the trailing space in 'filtration_volume ': it is part of the column name.
# NOTE(review): axis=1 applies pd.to_numeric row by row - confirm that a
# column-wise conversion was not intended.
colsnum = ['A260/280', '[DNA]ng/ul', 'depth', 'elution_volume', 'filtration_volume ', 'weekn', 'year']
noSF[colsnum] = noSF[colsnum].apply(pd.to_numeric, errors='coerce', axis=1)
SF[colsnum] = SF[colsnum].apply(pd.to_numeric, errors='coerce', axis=1)

# Only SF's sampleid is cast to the pandas "string" dtype; noSF is left as read.
colstr = ['sampleid']
SF[colstr] = SF[colstr].astype("string")

# Normalise pd.NA markers to np.nan in both frames.
SF = SF.replace({pd.NA: np.nan})
noSF = noSF.replace({pd.NA: np.nan})

# Outer merge on the listed columns, so rows found in only one frame are kept.
# NOTE(review): the keys include float measurement columns ('A260/280',
# '[DNA]ng/ul'), so rows only pair up when those values are exactly equal.
mdsf = noSF.merge(SF, on=['A260/280', 'Extracted_By', 'Notes', '[DNA]ng/ul', 'date', 'day',
                           'depth', 'depth_code', 'elution_volume', 'extraction_date',
                           'filtration_volume ', 'month', 'month_name', 'sampleid',
                           'size_code', 'time_string', 'weekn', 'year'], how='outer')

In [None]:
# Column names shared by the merged metadata and the absorbance table.
a = np.intersect1d(mdsf.columns, a260230.columns)
a

In [None]:
#fill missing cell values with matching column name from other dataframe

a260230["weekn"] = a260230["sampleid"].str.extract(r'\.([1-9][0-9]?)[A-E]')
a260230['weekn'] = pd.to_numeric(a260230['weekn'])

#NOTE(review): DataFrame.fillna(other_frame) aligns on row index and column
#name, not on sampleid - confirm the frames share a meaningful row index
a260230 = a260230.fillna(noSF)
mdsf = mdsf.fillna(a260230)

a260230["depth_code"] = a260230["sampleid"].str.extract(r'[1-9][0-9]?([A-E])')
a260230['depth']= a260230['depth_code'].map(depth_num)
a260230['depth'] = pd.to_numeric(a260230['depth'])


mdsf2 = mdsf.merge(a260230, on=['A260/280', '[DNA]ng/ul', 'extraction_date', 'sampleid', 'weekn', 'depth_code','depth'], how='outer')

#the statements below change mdsf only after mdsf2 has been built, so they do
#not affect mdsf2
mdsf["weekn"] = mdsf["sampleid"].str.extract(r'\.([1-9][0-9]?)[A-E]')
mdsf['weekn'] = pd.to_numeric(mdsf['weekn'])
#transform() keeps the original row index; groupby(...).apply(...) prepends the
#group key on pandas >= 2.0 and the column assignment then fails
mdsf['date'] = mdsf.groupby(['weekn'], sort=False)['date'].transform(lambda x: x.ffill().bfill())

mdsf = mdsf[mdsf['weekn'] < 17]

In [None]:
# From here on `md` is an independent copy of the merged lab metadata (mdsf2).
md = mdsf2.copy()

In [None]:
#order by week then depth and forward-fill every remaining gap from the row above
#(.ffill() replaces the deprecated fillna(method='ffill'))
#NOTE(review): this fills all columns, so a missing measurement inherits the
#value of the previous sample in sort order - confirm that is intended
md = md.sort_values(by=['weekn', 'depth']).ffill()

In [None]:
# Write the formatted lab metadata to 'regardons.csv' in the working directory.
md.to_csv('regardons.csv')

### Import and manage BBMP data

#### Metadata __md__ is formatted. It contains 38 columns.
__md__ is the lab's metadata for sampling, extraction and sequencing. \
__dfo_md__ is BBMP remote sensing data (salinity, pH, temperature, density, ...) \
__bio_niskin__ is nutrient data \
Format __bio_niskin__ data to merge with __md__. __bio_niskin__ is 32 columns, including year, month, day, and depth. __dfo_md__ also has 32 columns, including year_time, month_time, day_time. To merge these data with __md__, we will change the time stamps columns to the same name, and generate a time_string column.

In [None]:
dfo_md = pd.read_csv("/Users/Diana/Documents/escuela/phd/ch2/bb_data/bbmp_aggregated_profiles.csv")
bio_niskin = pd.read_csv("/Users/Diana/Documents/escuela/phd/ch2/bb_data/BBMP_Data_2022.csv")#
#dfo_metadata_y14 = pd.read_csv("/Users/Diana/Documents/escuela/phd/bb_data/2019/data_export/trim-analysis/dfo_metadata_y14.tsv", sep='\t')

#keep 2022 only; .copy() so the column assignments below act on an independent
#frame rather than on a filtered slice of the file as read
bio_niskin = bio_niskin[bio_niskin.year==2022].copy()
#change to str to aggregate them into time_string
bio_niskin['year'] = bio_niskin['year'].apply(str)
#add leading zero to match date format in dfo metadata
bio_niskin['month'] = bio_niskin['month'].apply(str).str.zfill(2)
bio_niskin['day'] = bio_niskin['day'].apply(str).str.zfill(2)

bio_niskin['time_string'] = bio_niskin[['year', 'month', 'day']].agg('-'.join, axis=1)

#make a new column for time_string without the time
dfo_md = dfo_md[dfo_md.year_time==2022].copy()
dfo_md['time_string_time'] = dfo_md['time_string']
dfo_md['time_string'] = dfo_md['time_string'].str.split(' ').str[0]

#renaming columns to ensure correct merging
dfo_md = dfo_md.rename(columns={"depth":"bbmpdepth","pressure": "depth", "year_time": "year", "month_time": "month", "day_time": "day"})

#change to int to remove decimals from date columns
#NOTE(review): errors='ignore' is deprecated in recent pandas releases; rows
#that cannot be parsed are returned unchanged
cols = ['year', 'depth', 'month', 'day']
md[cols] = md[cols].apply(pd.to_numeric, errors='ignore', axis=1)
dfo_md[cols] = dfo_md[cols].apply(pd.to_numeric, errors='ignore', axis=1)
bio_niskin[cols] = bio_niskin[cols].apply(pd.to_numeric, errors='ignore', axis=1)

#drop empty columns, then empty rows
dfo_md = dfo_md.dropna(how='all', axis=1).dropna(how='all')
bio_niskin = bio_niskin.dropna(how='all', axis=1).dropna(how='all')

#make a season column; month names missing from month_season get ''
md['season'] = md['month_name'].map(month_season).fillna('')

The bio_niskin data records the exact sampling depths, whereas the BB sample data is restricted to depth categories. We therefore make a new column holding the depth category so that the two tables can be merged.

In [None]:
# Depth category values assigned to the bio_niskin rows in the cells below.
depths = np.array([1,5,10,60])

In [None]:
# Work on a copy so that bio_niskin itself is not modified by the next cells.
bio_niskin2= bio_niskin.copy()

In [None]:
# Repeat the four depth categories `length` times and store them as 'NewDepth'.
# NOTE(review): the right-hand side is a DataFrame with a fresh 0..N-1 index, so
# the assignment aligns on row labels rather than position - verify against
# bio_niskin2's index. The following cell assigns 'NewDepth' again.
length = 48 #number of weeks for the tile repeat
bio_niskin2['NewDepth'] = pd.DataFrame({'NewDepth': np.tile(depths, length)}) #tile depth categorical values

In [None]:
# Assign the depth categories by row position, cycling through `depths`
# (row 0 -> depths[0], row 1 -> depths[1], ..., wrapping every len(depths) rows).
# NOTE(review): this presumes the rows come in repeating depth order with no
# missing or extra casts - confirm against the 'depth' column.
bio_niskin2=bio_niskin2.assign(NewDepth=depths[np.arange(len(bio_niskin2)) % len(depths)])

In [None]:
#move the two depth columns to the far right so they can be compared by eye
cols_at_end = ['depth', 'NewDepth']
leading_columns = [name for name in bio_niskin2.columns if name not in cols_at_end]
trailing_columns = [name for name in cols_at_end if name in bio_niskin2.columns]
bio_niskin3 = bio_niskin2[leading_columns + trailing_columns]

In [None]:
#rename columns to ensure correct merging
# After this, 'depth' holds the depth category and 'truedepth' the original
# 'depth' values.
bio_niskin3.rename(columns={'depth': 'truedepth', 'NewDepth': 'depth'}, inplace=True)

In [None]:
#make merging columns to same type
bio_niskin3[cols] = bio_niskin3[cols].apply(pd.to_numeric, errors='ignore', axis=1)
md[cols] = md[cols].apply(pd.to_numeric, errors='ignore', axis=1)
#convert dfo_md's own key columns; this previously assigned md[cols] into
#dfo_md, overwriting its year/depth/month/day with values from md
dfo_md[cols] = dfo_md[cols].apply(pd.to_numeric, errors='ignore', axis=1)

In [None]:
# Column names shared by the lab metadata and the BBMP profile data.
a = np.intersect1d(md.columns, dfo_md.columns)
a

In [None]:
#preview column types to allow for merging
#pd.set_option('display.max_rows', 35)
#md.dtypes

In [None]:
#convert merging columns to same type
# Unparseable values become NaN (errors='coerce'); applied row by row (axis=1).
# Note: this rebinds `colsnum`, which an earlier cell used for a longer list.
colsnum = ['day', 'month']
dfo_md[colsnum] = dfo_md[colsnum].apply(pd.to_numeric, errors='coerce', axis=1)
md[colsnum] = md[colsnum].apply(pd.to_numeric, errors='coerce', axis=1)

In [None]:
#merging party
# `merged` keeps every row of md (left join); `allyears` keeps rows from both
# frames (outer join). Both joins use the same key columns.
# NOTE(review): the keys include measurement columns ('pH', 'sigmaTheta',
# 'theta'), so rows only pair up when those values are exactly equal in both
# frames - confirm this is intended.
merged = pd.merge(md, dfo_md, on=['day', 'depth', 'month', 'pH', 'sigmaTheta', 'theta',
                                  'time_string', 'year'], how="left")
allyears = pd.merge(md, dfo_md, on=['day', 'depth', 'month', 'pH', 'sigmaTheta', 'theta',
                                    'time_string', 'year'], how="outer")

In [None]:
# Keep only rows with weekn below 17.
merged = merged[merged['weekn'] < 17]

In [None]:
# Write the merged lab + BBMP profile table to 'regardons2.csv' in the working directory.
merged.to_csv('regardons2.csv')

In [None]:
#convert column type to numeric for merging
allyears[cols] = allyears[cols].apply(pd.to_numeric, errors='ignore', axis=1)

#merged = merged.drop(index=237) #delete a row with missing information
#no errors= argument here: a value that cannot be parsed raises
merged[cols] = merged[cols].apply(pd.to_numeric, axis=1)
bio_niskin3[cols] = bio_niskin3[cols].apply(pd.to_numeric, axis=1)

#add nutrient data
#uncomment the line below if  need access to metadata outside the 16weeks samples in 2022
#preall_md= pd.merge(allyears, bio_niskin3, on=["day", "month", "year", 'depth'], how="outer")
all_md = pd.merge(merged, bio_niskin3, on=["day", "month", "year", 'depth'], how="left")

#split dfs by depth
shallow_depths = [1, 5, 10]
shallow = all_md[all_md["depth"] < 30]
#shallow = shallow.groupby(['year', 'month', "day"]).mean().reset_index()
deep = all_md[all_md.depth == 60]

#split dfs by season
#numeric_only=True: all_md also holds text columns (sample ids, dates, notes),
#and pandas >= 2.0 raises instead of silently skipping them when averaging
year_season = all_md.groupby(by = ['year','season']).mean(numeric_only=True).reset_index()

Winter = year_season.loc[year_season['season'] == 'Winter',:]
Spring = year_season.loc[year_season['season'] == 'Spring',:]
Summer = year_season.loc[year_season['season'] == 'Summer',:]
Autumn = year_season.loc[year_season['season'] == 'Autumn',:]

#save output as csv
all_md.to_csv('allmetadata.csv')

In [None]:
# Rows whose depth code is 'A' (the code that depth_num maps to depth 1).
d1 = all_md[all_md.depth_code == 'A']

In [None]:
# Line plot of the "Chlorophyll A" column against week number for the depth-code-'A' rows.
sns.relplot(data=d1, x="weekn", y="Chlorophyll A", color="0.8", linewidth=.75, kind="line")

In [None]:
# Line plot of the "Phosphate" column against week number for the depth-code-'A' rows.
sns.relplot(data=d1, x="weekn", y="Phosphate", color="0.8", linewidth=.75, kind="line")

In [None]:
#find rows with null values at given column
# Here: rows of `merged` that have no 'depth' value.
emptynit = merged[merged['depth'].isna()]
emptynit

## Find missing data

In [None]:
# Rows of `merged` that have no 'temperature' value (rebinds `emptynit`).
emptynit = merged[merged['temperature'].isna()]

In [None]:
# Display the rows selected in the previous cell.
emptynit

## Plotting party

In [None]:
#plotly seasonal averages figure: one line trace per season, drawn in the
#order Winter, Spring, Summer, Autumn
fig2 = go.Figure()
template = "plotly_white"

season_traces = [
    ('Winter', Winter, '#838B8B'),
    ('Spring', Spring, '#FFB5C5'),
    ('Summer', Summer, '#87CEFF'),
    ('Autumn', Autumn, '#FF8000'),
]
for season_name, season_frame, season_color in season_traces:
    fig2.add_trace(go.Scatter(
        x=season_frame['year'],
        y=season_frame['temperature'],
        mode='lines',
        name=season_name,
        marker_color=season_color,
    ))

fig2.update_layout(
    height=800,
    xaxis_title="Years",
    yaxis_title='Temperature in degree',
    title_text='Average Temperature seasonwise over the years',
    template=template,
)

fig2.show()

In [None]:
#seaborn season averages plot
#x/y/hue are given by keyword: seaborn >= 0.12 no longer accepts the data
#vectors as positional arguments
sns.lineplot(data=year_season, x='year', y='temperature', hue='season');

In [None]:
# Switch matplotlib to the 'ggplot' style for the figures drawn after this cell.
plt.style.use('ggplot')

### Detect and plot anomalies in variables

In [None]:
def detect_anomalies(metadata, df, dpt, yr=all, month=all, random_state=0):
    """Flag and plot outlying values of one variable at one depth with an Isolation Forest.

    Parameters
    ----------
    metadata : str
        Name of the column in ``df`` to screen for anomalies.
    df : pandas.DataFrame
        Table with ``depth``, ``event_id``, ``year``, ``month`` and the
        ``metadata`` column.
    dpt : scalar
        Value of ``df.depth`` to keep.
    yr, month : collection, optional
        Years / months to keep (passed to ``Series.isin``). The built-in ``all``
        is the sentinel meaning "no filter".
    random_state : int or None, optional
        Seed handed to ``IsolationForest`` so that repeated runs flag the same
        points. Pass ``None`` for an unseeded model.

    Returns
    -------
    None
        The function only draws a figure: the selected values as a black line
        indexed by ``event_id`` with the flagged points overlaid in red.

    Raises
    ------
    ValueError
        If no non-null value is left after filtering.
    """
    sfd = df[df.depth == dpt]

    selection = sfd[['event_id', metadata, "year", "month"]].copy()
    selection = selection[selection[metadata].notna()]
    #identity test against the sentinel: `!=` would compare element-wise if an
    #array-like were passed
    if yr is not all:
        selection = selection[selection['year'].isin(yr)]
    if month is not all:
        selection = selection[selection['month'].isin(month)]

    mdcol_yr = selection.drop(columns=['year', "month"]).set_index('event_id')
    if mdcol_yr.empty:
        raise ValueError('No ' + str(metadata) + ' values left at depth ' + str(dpt)
                         + ' after applying the year/month filters')

    #modelling time: standardise, then fit with a 1 % expected outlier share
    outliers_fraction = float(.01)
    np_scaled = StandardScaler().fit_transform(mdcol_yr.values.reshape(-1, 1))
    data = pd.DataFrame(np_scaled)
    # train isolation forest
    model = IsolationForest(contamination=outliers_fraction, random_state=random_state)
    model.fit(data)

    #predict data: the rows labelled -1 are the ones plotted as anomalies
    mdcol_yr['anomaly'] = model.predict(data)

    # visualization
    fig, ax = plt.subplots(figsize=(10,6))
    anomalies = mdcol_yr.loc[mdcol_yr['anomaly'] == -1, [metadata]] #anomaly
    ax.plot(mdcol_yr.index, mdcol_yr[metadata], color='black', label = 'Normal')
    ax.scatter(anomalies.index, anomalies[metadata], color='red', label = 'Anomaly')
    ax.set_xlabel('event_id')
    ax.set_ylabel(metadata)
    #plt.axvline(36, ls='--')
    ax.legend()
    plt.show()
    #add axes names

In [None]:
# Temperature anomalies in bio_niskin3 at depth category 1, months 1-4 of 2022.
detect_anomalies('Temperature', bio_niskin3, 1, yr={2022}, month={1,2,3,4})

In [None]:
# Temperature anomalies in bio_niskin3 at depth category 5, months 1-4 of 2022.
detect_anomalies('Temperature', bio_niskin3, 5, yr={2022}, month={1,2,3,4})

In [None]:
# Temperature anomalies in bio_niskin3 at depth category 10, months 1-4 of 2022.
detect_anomalies('Temperature', bio_niskin3, 10, yr={2022}, month={1,2,3,4})

In [None]:
# Temperature anomalies in bio_niskin3 at depth category 60, months 1-4 of 2022.
detect_anomalies('Temperature', bio_niskin3, 60, yr={2022}, month={1,2,3,4})

## Add prokaryotic community

#### Upload functions

In [None]:
# Special thanks to Alex Manuele https://github.com/alexmanuele
def consolidate_tables(MG):
    """Load the QIIME 2 feature table(s) of one marker gene into a long-format DataFrame.

    Parameters
    ----------
    MG : str
        Marker gene label. ``'16S'`` selects the ``02-PROKs`` folder; any other
        value selects ``02-EUKs``.

    Returns
    -------
    (pandas.DataFrame, str)
        The DataFrame has one row per (sample, feature) pair, the parameters of
        the action recorded in each table's provenance as extra columns, and
        ``sample_name``, ``feature_id``, ``feature_frequency`` followed by
        ``table_path`` as the last columns. The second element is ``MG`` unchanged.

    Raises
    ------
    ValueError
        If no ``table.qza`` is found in the selected folder.
    AssertionError
        If a table is not of type ``FeatureTable[Frequency]``.
    """
    if MG == '16S':
        comm = '02-PROKs'
    else :
        comm = '02-EUKs'

    table_list = glob.glob('{0}/table.qza'.format('/Users/Diana/Documents/escuela/phd/size_fractions/BB22_size-fraction-comparison-analysed/to_transfer/'+comm))
    if not table_list:
        raise ValueError('No table.qza found for ' + MG + ' in ' + comm)
    print("Found all "+MG+" tables.")

    dataframes = []
    for table_path in table_list:
        with tempfile.TemporaryDirectory() as tempdir:
            #load table, dump contents to tempdir
            table = Artifact.load(table_path)
            #Make sure the tables are all FeatureFrequency type
            assert str(table.type) == 'FeatureTable[Frequency]', "{0}: Expected FeatureTable[Frequency], got {1}".format(table_path, table.type)
            Artifact.extract(table_path, tempdir)
            #get the provenance from the tempdir and format it for DF
            prov = '{0}/{1}/provenance/'.format(tempdir, table.uuid)
            #context manager so the file handle is closed before tempdir is removed
            with open("{0}action/action.yaml".format(prov), 'r') as action_file:
                action = yaml.load(action_file, Loader=yaml.BaseLoader)
            paramlist = action['action']['parameters']
            paramlist.append({'table_uuid': "{}".format(table.uuid)})
            #flatten the list of single-parameter records into one dict
            paramdict = {}
            for record in paramlist:
                paramdict.update(record)

            # Get the data into a dataframe
              #Biom data
            df = table.view(pd.DataFrame).unstack().reset_index()
            df.columns = ['feature_id', 'sample_name', 'feature_frequency']
            df['table_uuid'] = "{}".format(table.uuid)
              #param data
            pdf = pd.DataFrame.from_records([paramdict])
              #merge params into main df
            df = df.merge(pdf, on='table_uuid')

            #I like having these columns as the last three. Makes it more readable
            cols = df.columns.tolist()
            reorder = ['sample_name', 'feature_id', 'feature_frequency']
            for val in reorder:
                cols.append(cols.pop(cols.index(val)))
            df = df[cols]
            df['table_path'] = table_path
            df['sample_name'] = df['sample_name'].str.replace('-', '.', regex=False)
            dataframes.append(df)

            # Adding table_id, forward and reverse trim columns
            #df['table_id'] = str(table_path.split('/')[-3]) #add a table_id column
            #df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
            #df['forward_trim'] = df['forward_trim'].map(lambda x: x.lstrip('F'))
            #df["forward_trim"] = pd.to_numeric(df["forward_trim"])
            #df["reverse_trim"] = pd.to_numeric(df["reverse_trim"])

    #Stick all the dataframes together
    #outputfile="merged_all_tables.tsv"
    df = pd.concat(dataframes)
    #strip the '.S<1-999>?L001.' piece from the sample names
    #NOTE(review): the character before 'L001' is an unescaped '.', i.e. any character
    df['sample_name'] = df['sample_name'].str.replace(r'\.S([1-9]|[1-9][0-9]|[1-9][0-9][0-9]).L001\.','', regex=True)

    #df.to_csv(comm+'/merged_all_tables.tsv', sep='\t', index=False)
    #nothing is written to disk while the to_csv call above stays commented out
    print("Successfully saved all tables.")
    return df, MG

In [None]:
def merge_metadata(df):
    """Attach the sample metadata in the global ``all_md`` to a long-format feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with ``sample_name``, ``feature_id`` and
        ``feature_frequency`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (sample, feature) with non-zero frequency, carrying the
        ``all_md`` columns plus: ``size_code``, ``depth_code``, ``depth`` and
        ``weekn`` re-derived from ``sampleid``; ``Total`` (summed frequency per
        sample); ``ratio`` (frequency / Total); ``nASVs`` (rows per sample);
        ``weekdepth``; ``avg`` (mean ``nASVs`` over the rows sharing a
        ``weekdepth``); and ``diff`` (``nASVs - avg``).

    Notes
    -----
    Side effect: underscores in ``all_md['sampleid']`` are replaced by dots in
    the global frame itself.
    """
    #df = pd.read_csv('02-PROKs/'+'/merged_all_tables.tsv', sep='\t')

    tables = df[['sample_name', 'feature_id', 'feature_frequency']].rename(columns={'sample_name':'sampleid'})

    all_md['sampleid'] = all_md['sampleid'].str.replace('_', '.')
    merged = pd.merge(tables,all_md, on='sampleid', how='left') #all_md is the metadata file
    #.copy(): the columns added below must land on an independent frame, not on
    #a filtered slice of the merge result
    merged = merged[merged.feature_frequency != 0].copy()

    merged['year'] = 2022

    #NOTE(review): [L-S] is a character range (L, M, ... S), not just 'L' or 'S'
    merged["size_code"] = merged["sampleid"].str.extract(r'[1-9][0-9]?[A-E]([L-S])', expand=False)
    merged["size_code"] = merged["size_code"].fillna('W')
    merged["depth_code"] = merged["sampleid"].str.extract(r'[1-9][0-9]?([A-E])', expand=False)
    merged['depth'] = pd.to_numeric(merged['depth_code'].map(depth_num))
    merged["weekn"] = pd.to_numeric(merged["sampleid"].str.extract(r'\.([1-9][0-9]?)[A-E]', expand=False))
    #every row of a week gets the first non-null date seen for that week
    merged['date'] = merged.groupby('weekn')['date'].transform('first')

    per_sample = merged.groupby('sampleid')
    merged['Total'] = per_sample['feature_frequency'].transform('sum')
    merged['ratio'] = merged['feature_frequency']/merged['Total']
    merged['nASVs'] = per_sample['feature_id'].transform('count')
    merged['weekdepth'] = merged["weekn"].astype(str) + merged["depth"].astype(str)
    #row-wise mean: each sample contributes one nASVs value per retained feature row
    merged['avg'] = merged.groupby('weekdepth')['nASVs'].transform('mean')
    merged['diff'] = merged['nASVs'] - merged['avg']

    print('Set up metadata ...')

    #merged.to_csv(comm+'/merged_asvs_metadata.tsv', sep = '\t')
    #nothing is written to disk while the to_csv call above stays commented out
    print('Saved merged_asvs_metadata.tsv')

    return merged

In [None]:
def pick_metadata(merged, depth='all', size_fraction='both', year='all', R='all', F='all', txsubset = 'all', taxonomy_glob=None):
    """Attach taxonomy to the merged feature/metadata table and optionally subset it.

    Parameters
    ----------
    merged : pandas.DataFrame
        Table with at least ``feature_id``, ``depth`` and ``date`` columns.
    depth : scalar or 'all', optional
        Keep only rows whose ``depth`` equals this value.
    size_fraction : scalar or 'both', optional
        Keep only rows of this size fraction. The ``size_fraction`` column is
        used when present, otherwise ``size_code``.
    year, R, F, txsubset : optional
        Accepted for compatibility with existing calls; not used by this function.
    taxonomy_glob : str or None, optional
        Glob pattern of the ``taxonomy.tsv`` files to read. ``None`` uses the
        classifier outputs below the ``to_transfer`` analysis folder.

    Returns
    -------
    pandas.DataFrame
        ``merged`` left-joined with the taxonomy on ``feature_id``, with exact
        duplicate rows removed, the ``Taxon`` string split on ``'; '`` into
        ``Domain`` ... ``Species`` (missing ranks become ``'Unassigned'``) and a
        ``Month`` column taken from the second ``'-'``-separated piece of ``date``.

    Raises
    ------
    ValueError
        If no taxonomy file matches, or if a ``Taxon`` string holds more than
        seven ranks.
    """
    if taxonomy_glob is None:
        taxonomy_glob = '{0}/*/class/*/data/taxonomy.tsv'.format('/Users/Diana/Documents/escuela/phd/size_fractions/BB22_size-fraction-comparison-analysed/to_transfer')

    files = glob.glob(taxonomy_glob)
    if not files:
        raise ValueError('No taxonomy files matched: {0}'.format(taxonomy_glob))
    taxos = [pd.read_csv(filename, sep='\t') for filename in files]

    print('Appended all taxonomies to taxos')
    taxos = pd.concat(taxos)
    taxos = taxos.rename(columns={"Feature ID": "feature_id"}, errors="raise")
    taxos = taxos.drop_duplicates()

    separated = merged.merge(taxos, how='left', on='feature_id') #merged excludes features of frequency = 0
    separated = separated.drop_duplicates()

    if depth != 'all':
        separated = separated[separated["depth"] == depth]
    if size_fraction != 'both':
        #the other functions in this notebook call this column 'size_code'; fall
        #back to it when no 'size_fraction' column exists instead of failing
        size_column = 'size_fraction' if 'size_fraction' in separated.columns else 'size_code'
        separated = separated[separated[size_column] == size_fraction]

    #independent frame so the columns added below never land on a filtered slice
    separated = separated.copy()

    cols = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
    ranks = separated['Taxon'].str.split('; ', expand=True)
    if ranks.shape[1] > len(cols):
        raise ValueError('Taxon strings hold {0} ranks, expected at most {1}'.format(ranks.shape[1], len(cols)))
    #pad to seven columns: when no Taxon reaches species level the split yields
    #fewer columns than there are rank names
    ranks = ranks.reindex(columns=range(len(cols)))
    for position, col in enumerate(cols):
        separated[col] = ranks[position].fillna('Unassigned')

    separated['Month'] = separated['date'].str.split('-').str[1]

    #separated['total'] = separated.groupby(['table_id','sample-id'])['feature_frequency'].transform('sum')
    #separated['ratio'] = separated['feature_frequency']/(separated['total'])
    #separated_taxonomies = separated.copy()

    #make a dictionary with keys for id-ing the taxon belonging to this sub-community
    #separated_dic = pd.Series(separated.Taxon.values,separated.feature_id.values).to_dict()
    print('Saved separated by metadata dataframe.')

    return separated

In [None]:
def taxbarplot(separated, level, depth, topn, palette=None):
    """Plot the relative abundance of the most abundant taxa per size fraction and week.

    Parameters
    ----------
    separated : pandas.DataFrame
        Feature table with taxonomy and metadata; needs ``feature_id``,
        ``feature_frequency``, ``Taxon``, ``size_code``, ``depth``, ``weekn``,
        ``ratio`` and the ``level`` column.
    level : str
        Name of the taxonomic-rank column to aggregate on (e.g. ``'Genus'``).
    depth : scalar
        Value of ``separated.depth`` to plot.
    topn : int
        Number of taxa kept per size fraction; all others are pooled as ``'Other'``.
    palette : dict or None, optional
        Mapping of taxon name to colour for plotly. When ``None``, a
        notebook-level ``palette_dict`` is used if one is defined, otherwise
        plotly's default colours.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``phyld``: summed ``ratio`` per size fraction, week and taxon;
        ``top10d``: the rows at ``depth`` with non-top taxa relabelled ``'Other'``.

    Side effects
    ------------
    Writes ``<level>_<depth>16S_relab.csv`` (presence/absence of each taxon per
    size-fraction/depth set plus its summed frequency over all depths) to the
    working directory and displays the figure.
    """
    sfd=separated[separated.depth==depth]
    toptaxa = sfd[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', level]].copy()
    toptaxa = toptaxa.drop_duplicates()
    df_agg = toptaxa.groupby(['size_code',level, 'depth']).agg({'feature_frequency':'sum'})
    #the topn most abundant taxa within each size fraction
    topd = df_agg['feature_frequency'].groupby('size_code', group_keys=False).nlargest(topn)
    topd = topd.to_frame().reset_index()

    df_agg = df_agg.reset_index()
    df_agg['set_name'] = df_agg['size_code']+df_agg['depth'].astype(str)

    #summed frequency per taxon at the requested level; this was grouped by
    #'Genus' whatever the level, so the join below only matched for level='Genus'
    cumulab1 = separated.groupby(level).agg({'feature_frequency':'sum'})

    #presence (1) / absence (0) of each taxon in each size-fraction/depth set
    resultpivot = df_agg.pivot_table(index=level, columns='set_name', values='feature_frequency')
    resultpivot = resultpivot.fillna(0)
    resultpivot[resultpivot != 0] = 1
    tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
    tosave.to_csv(level+'_'+str(depth)+'16S_relab.csv')

    top10d_list = topd[level].unique()
    top10d = sfd.copy()
    top10d.loc[~top10d[level].isin(top10d_list), level] = 'Other' #isnot in top list
    phyld = top10d.groupby(['size_code','weekn', level])['ratio'].sum()
    phyld = phyld.reset_index()

    if palette is None:
        #fall back to a notebook-level palette_dict when one exists, rather
        #than raising NameError when it does not
        palette = globals().get('palette_dict')

    fig = px.bar(phyld, x="size_code", y="ratio", facet_col="weekn", color=level, labels={
                     "feature_frequency": "Relative abundance",
                     "size_code": "",
                     "weekn": "w"}, color_discrete_map=palette)
    fig.update_xaxes(type='category', dtick=1)
    fig.update_layout(
        title="Relative abundance of top " + str(topn) + " " + level + " observed at depth " + str(depth),
        yaxis_title="Relative abundance",
        xaxis_title="Size fraction",
        legend_title=level,
        font=dict(size=8)
    )

    fig.show()
    #fig.write_image("outputs/fig1.png")
    #fig.to_image(format="png")

    return phyld, top10d

In [None]:
def pcaplot(separated, depth, comm, columnperm, spc):
    """Ordinate samples from CLR-transformed feature counts and save three figures.

    Steps: pivot the long table to samples x features, replace zeros with a 0.1
    pseudocount, CLR-transform, compute Euclidean distances between samples, fit
    a 2-component PCA on that distance matrix, then save (1) the ordination
    scatter, (2) a bar chart of mean absolute loadings per metadata group and
    (3) a clustered heatmap of the distance matrix.

    Parameters
    ----------
    separated : pandas.DataFrame
        Long-format feature table with ``sampleid``, ``feature_id``,
        ``feature_frequency``, ``size_code``, ``depth``, ``weekn`` and ``date``.
    depth : scalar or 'all'
        Depth to analyse; ``'all'`` keeps every depth.
    comm : str
        ``'16S'`` writes below ``outputs/02-PROKs``, anything else below
        ``outputs/02-EUKs``.
    columnperm : str
        Only referenced by the commented-out PERMANOVA code; currently unused.
    spc : str
        Text inserted into the output file names.

    Returns
    -------
    (sklearn.decomposition.PCA, numpy.ndarray, pandas.DataFrame)
        The fitted PCA, the sample coordinates, and the pseudocount-filled
        samples x features table that was CLR-transformed.

    Side effects
    ------------
    Saves ``D<depth><spc>_PCAplot.png``, ``D<depth><spc>_PCAplot_brplot.png`` and
    ``D<depth><spc>_clustermap.png`` into the output folder, which must already
    exist, and changes the global seaborn theme.
    """
    if comm == '16S':
        folder = '02-PROKs'
    else:
        folder = '02-EUKs'

    #explicit copy in both cases: a 'Month' column is added below and must not
    #be written onto the caller's frame or a filtered slice of it
    if depth == 'all':
        df = separated.copy()
    else:
        df = separated[separated.depth==depth].copy()

    #sizecode palette codes ('SL' only gets a colour when it occurs in the input)
    sizecodes = ['S', 'L', 'W']
    if 'SL' in separated['size_code'].unique():
        sizecodes.append('SL')
    palette_colors = sns.color_palette()
    palette_dict = {sizecode: color for sizecode, color in zip(sizecodes, palette_colors)}
    dicsc = pd.Series(df.size_code.values,index=df.sampleid).to_dict()
    color_rows_sc = {k: palette_dict[v] for k, v in dicsc.items()}
    seriescr = pd.Series(color_rows_sc)

    #month palette code
    #NOTE(review): only Jan-May have a colour; a sample from another month
    #raises KeyError in the lookup below
    df['Month'] = df['date'].str.split('-').str[1]
    months = ['Jan', 'Feb', 'Mar', 'May', 'Apr']
    palette_colors = sns.color_palette("flare")
    palette_dict_month = {monthname: color for monthname, color in zip(months, palette_colors)}
    dic = pd.Series(df.Month.values,index=df.sampleid).to_dict()
    color_rows_month = {k: palette_dict_month[v] for k, v in dic.items()}
    seriesmonthcr = pd.Series(color_rows_month)

    dfcolors = pd.DataFrame({'Month': seriesmonthcr,'Size code':seriescr})

    topiv = df[['feature_id', 'feature_frequency', 'sampleid']].copy()
    topiv = topiv.drop_duplicates()

    #samples x features count table; absent features become 0, then every zero
    #is replaced by a 0.1 pseudocount so the log-ratio transform is defined
    sfdpiv= topiv.pivot(index='sampleid', columns='feature_id', values='feature_frequency')
    sfdpiv=sfdpiv.fillna(0)
    sfdclr=sfdpiv.mask(sfdpiv==0).fillna(0.1)
    clr_transformed_array = clr(sfdclr)
    samples = sfdpiv.index
    asvs = sfdpiv.columns

    #Creating the dataframe with the clr transformed data, and assigning the asv names
    clr_transformed = pd.DataFrame(clr_transformed_array, columns=asvs)
    #Assigning the sample names
    clr_transformed['samples'] = samples
    clr_transformed = clr_transformed.set_index('samples')

    #calculate distance matrix
    dist = cdist(clr_transformed, clr_transformed, 'euclid')
    distance_matrix = pd.DataFrame(dist, columns=samples)
    distance_matrix['samples'] = samples
    distance_matrix = distance_matrix.set_index('samples')

    #format for pca
    #only consumed by the commented-out PERMANOVA below; kept so the matrix is
    #still passed through the DistanceMatrix constructor
    dm = DistanceMatrix(distance_matrix)

    pca = PCA(n_components=2)
    pca_features = pca.fit_transform(distance_matrix)

    ####
    sns.set(rc={"figure.figsize":(4, 3)})
    sns.set_style("whitegrid", {'axes.grid' : False})
    plot_df = pd.DataFrame(data = pca_features, columns = ['dim1', 'dim2'], index = sfdpiv.index)
    #coordinates are rescaled by 1/1000 for plotting only
    plot_df['dim1'] = plot_df['dim1']/1000
    plot_df['dim2'] = plot_df['dim2']/1000

    #one metadata row per sample: df has one row per (sample, feature), and
    #merging it undeduplicated repeated every point once per feature
    extra_column = 'depth' if depth == 'all' else 'weekn'
    sample_meta = df[['sampleid', 'size_code', extra_column]].drop_duplicates()
    plot_df2 = pd.merge(plot_df.reset_index(), sample_meta, on='sampleid', how='left')

    ##divide into pre-post bloom
    def get_stage(weekNb):
        #weeks before 10 are 'Pre-bloom', week 10 onwards 'Bloom'; a missing
        #week number matches neither comparison and yields None
        if weekNb < 10:
            return 'Pre-bloom'
        if weekNb >= 10:
            return 'Bloom'
        return None

    if depth != 'all':
        plot_df2['Time'] = plot_df2['weekn'].apply(get_stage)

    plot_df2 = plot_df2.rename(columns={'size_code': 'Size code'})

    pc1v = round(pca.explained_variance_ratio_[0]*100)
    pc2v = round(pca.explained_variance_ratio_[1]*100)

    #plot_df2 = plot_df2.drop_duplicates()
    #dfperm = plot_df2.set_index('sampleid')

    #permanova2 = permanova(dm, dfperm, columnperm)
    #results = permanova2(999)

    #plot

    if depth == 'all':
        var2 = 'depth'
    else:
        var2 = 'Time'

    sns.set_style("white")
    ax=sns.scatterplot(x = 'dim1', y = 'dim2', hue= 'Size code', style=var2, data = plot_df2,
                       palette=palette_dict)#, size = 'Week_Group')#,palette=sns.color_palette("dark:salmon_r", as_cmap=True))
    plt.ylabel('PCo2 (' + str(pc2v) + '% variance explained)')
    plt.xlabel('PCo1 (' + str(pc1v) +'% variance explained)')
    ax.set_title('Depth ' + str(depth) + 'm', loc='left', weight='bold')
    plt.legend(frameon=False)
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    sns.despine()
    plt.savefig('outputs/'+folder+'/D'+str(depth)+spc+'_PCAplot.png', dpi=200, bbox_inches="tight")
    plt.clf()
    plt.cla()
    plt.close()

    print ( "Components = ", pca.n_components_ , ";\nTotal explained variance = ",
      round(pca.explained_variance_ratio_.sum(),5)  )

    print ("Components 1 and 2 are", pca.explained_variance_ratio_)

    # Retrieve Loadings (components x samples, because the PCA input columns
    # are the samples of the distance matrix)
    loadings = pca.components_

    # Summarize Loadings by Metadata Category
    column_position = {sample: position for position, sample in enumerate(distance_matrix.columns)}
    metadata_groups = plot_df2[var2].dropna().unique()
    metadata_contributions = {}

    for group in metadata_groups:
        group_variables = plot_df2.loc[plot_df2[var2] == group, 'sampleid']
        group_columns = [column_position[var] for var in group_variables]
        metadata_contributions[group] = np.abs(loadings[:, group_columns]).mean(axis=1)

    # Visual Representation: one bar per group next to each other for every
    # principal component (y = component, width = mean |loading|). The earlier
    # call passed the group label as the bar width and set no legend label.
    component_positions = np.arange(1, loadings.shape[0] + 1)
    bar_height = 0.8 / max(len(metadata_contributions), 1)
    for offset, (group, contributions) in enumerate(metadata_contributions.items()):
        plt.barh(component_positions + offset * bar_height, contributions,
                 height=bar_height, label=str(group))
    plt.yticks(component_positions + bar_height * (len(metadata_contributions) - 1) / 2,
               ['PC' + str(number) for number in component_positions])

    plt.ylabel('Principal Component')
    plt.xlabel('Average Loading Contribution')
    sns.despine()
    plt.legend(frameon=False)
    plt.savefig('outputs/'+folder+'/D'+str(depth)+spc+'_PCAplot_brplot.png', dpi=200, bbox_inches="tight")
    plt.clf()
    plt.cla()
    plt.close()

    ##clustermap
    ax = sns.clustermap(distance_matrix, method="complete", cmap='RdBu', annot=True,
               yticklabels=True, row_colors = dfcolors,
               annot_kws={"size": 7}, figsize=(15,12));

    handles1 = [Patch(facecolor=palette_dict_month[key]) for key in palette_dict_month]
    plt.legend(handles1, palette_dict_month, title='Month',
               bbox_to_anchor=(1, 1), bbox_transform=plt.gcf().transFigure, loc='upper left')

    plt.savefig('outputs/'+folder+'/D'+str(depth)+spc+'_clustermap.png', dpi=200, bbox_inches="tight")

    return pca, pca_features, sfdclr

In [None]:
def boxplot_depth(separated, comm, depth, ycolumn, yaxislabel='def'):
    """Plot richness per size fraction and test whether ASV counts differ between fractions.

    Three figures are written to ``outputs/<comm_id>/``: a box plot of ``ycolumn``
    per size fraction (``D<depth>_adboxplot.png``), a bar plot of the ``diff``
    column per week (``D<depth>_avgbarplot.png``) and a line plot of ``ycolumn``
    per week (``D<depth>_adlineplot.png``).

    Parameters
    ----------
    separated : pandas.DataFrame
        Needs the columns ``size_code``, ``nASVs``, ``weekn``, ``diff`` and
        ``ycolumn``. The frame is plotted as passed: it is NOT filtered by
        ``depth`` here, so the caller must pre-filter if a single depth is wanted.
    comm : str
        ``'16S'`` selects the ``02-PROKs`` output folder, anything else ``02-EUKs``.
    depth
        Only used to build the output file names.
    ycolumn : str
        Column plotted on the y axis of the box plot and the line plot.
    yaxislabel : str, optional
        Y-axis label. The default sentinel ``'def'`` now falls back to
        ``ycolumn`` instead of printing the literal text "def" on the axis.

    Returns
    -------
    tuple
        ``(anova, results)``: the ``anova_lm`` table of ``nASVs ~ C(size_code)``
        and the Holm-adjusted pairwise t-test matrix from ``scikit_posthocs``.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'

    # Resolve the placeholder default to something meaningful for the axis.
    if yaxislabel == 'def':
        yaxislabel = ycolumn

    sfd = separated.copy()

    # The statistics always use nASVs, independently of `ycolumn`.
    # NOTE(review): drop_duplicates on (size_code, nASVs) also collapses distinct
    # samples that happen to share the same richness — confirm this is intended.
    sfd_LM = sfd[['size_code', 'nASVs']].drop_duplicates()
    lm = sfa.ols('nASVs ~ C(size_code)', data=sfd_LM).fit()
    anova = sa.stats.anova_lm(lm)
    results = spPH.posthoc_ttest(sfd_LM, val_col='nASVs', group_col='size_code', p_adjust='holm')

    # Fixed colour per size code; 'SL' gets a fourth colour only when present.
    sizecodes = ['S', 'L', 'W']
    if 'SL' in separated['size_code'].unique():
        sizecodes.append('SL')
    palette_dict = dict(zip(sizecodes, sns.color_palette()))

    # Box plot of ycolumn per size fraction
    sns.set(rc={"figure.figsize":(4, 3)})
    sns.set_style("ticks")
    sns.boxplot(data=sfd, x="size_code", y=ycolumn, palette=palette_dict, order=sizecodes)
    sns.despine()
    plt.ylabel(yaxislabel, fontsize=20)
    plt.xlabel('Size fraction', fontsize=20)
    plt.savefig('outputs/'+comm_id+'/D'+str(depth)+'_adboxplot.png', dpi=200, bbox_inches="tight")
    plt.clf()
    plt.cla()
    plt.close()

    # Bar plot of the deviation from the weekly average ('diff' column)
    sns.set(rc={"figure.figsize":(7, 3)})
    sns.set_style("ticks")
    ax = sns.barplot(data=sfd, x="weekn", y="diff", hue="size_code", palette=palette_dict,
                     capsize=.15, errwidth=0.5)
    sns.despine()
    plt.ylabel('Number of ASVs relative to weekly average', fontsize=20)
    plt.xlabel('Week number', fontsize=20)
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    plt.savefig('outputs/'+comm_id+'/D'+str(depth)+'_avgbarplot.png', dpi=200, bbox_inches="tight")

    # The bar-plot figure is cleared (not closed) and reused for the line plot.
    plt.clf()
    sns.set(rc={"figure.figsize":(7, 3)})
    sns.set_style("ticks")
    ax = sns.lineplot(x="weekn", y=ycolumn, data=sfd, hue="size_code", palette=palette_dict)
    sns.despine()
    plt.ylabel(yaxislabel, fontsize=20)
    plt.xlabel('Week', fontsize=20)
    plt.legend(title='Size fraction')
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    plt.savefig('outputs/'+comm_id+'/D'+str(depth)+'_adlineplot.png', dpi=200, bbox_inches="tight")

    return anova, results

In [None]:
def upsetprep(comm, level, separated):
    """Write the per-depth presence/absence CSV and the JSON descriptor for an UpSet plot.

    For every depth in ``[1, 5, 10, 30, 60]`` the function writes

    * ``csvs/<comm_id>/<level>_d<depth>_relab.csv`` — one row per taxon at
      ``level``, one 0/1 column per ``size_code + depth`` set, and a final
      ``feature_frequency`` column;
    * ``json/<comm_id>/<level>_d<depth>.json`` — a descriptor pointing at the
      CSV on GitHub.

    The set-column range and the index of the abundance column in the JSON are
    derived from the number of set columns actually written, so the descriptor
    stays correct when a depth has more (e.g. an extra 'SL' fraction) or fewer
    than three size codes. With exactly three sets the output is unchanged.

    Parameters
    ----------
    comm : str
        ``'16S'`` selects the ``02-PROKs`` folder, anything else ``02-EUKs``.
    level : str
        Taxonomic column of ``separated`` used as the row identifier.
    separated : pandas.DataFrame
        Needs ``feature_id``, ``feature_frequency``, ``Taxon``, ``size_code``,
        ``depth``, ``weekn`` and ``level``.

    Returns
    -------
    None
        The target directories must already exist.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'
    depths = [1, 5, 10, 30, 60]

    # Total frequency per taxon over the WHOLE table (all depths, all size
    # fractions); the same totals are attached to every per-depth CSV.
    cumulab1 = separated.groupby([level]).agg({'feature_frequency': 'sum'})

    for d in depths:
        # --- CSV ---------------------------------------------------------
        sfd = separated[separated.depth == d]

        toptaxa = sfd[['feature_id', 'feature_frequency', 'Taxon', 'size_code',
                       'depth', 'weekn', level]].drop_duplicates()
        df_agg = (toptaxa.groupby(['size_code', level, 'depth'])
                         .agg({'feature_frequency': 'sum'})
                         .reset_index())
        df_agg['set_name'] = df_agg['size_code'] + df_agg['depth'].astype(str)

        resultpivot = df_agg.pivot_table(index=level, columns='set_name',
                                         values='feature_frequency').fillna(0)
        # binarise: any non-zero abundance means "present in this set"
        resultpivot[resultpivot != 0] = 1
        tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
        tosave.to_csv('csvs/'+comm_id+'/'+level+'_d'+str(d)+'_relab.csv')

        # --- JSON --------------------------------------------------------
        # CSV layout: column 0 = taxon, 1..n_sets = set membership,
        # n_sets + 1 = summed feature_frequency.
        n_sets = resultpivot.shape[1]
        data = {
            "file": "https://raw.githubusercontent.com/dianahaider/size_fractions/main/csvs/"+comm_id+'/'+level+'_d'+str(d)+'_relab.csv',
            "name": comm + level,
            "header": 0,
            "separator": ",",
            "skip": 0,
            "meta":[
                {"type":"id", "index":0, "name":"Name"},
                {"type":"integer", "index":n_sets + 1, "name":"Rel. ab."}
            ],
            "sets": [
                {"format": "binary", "start":1, "end": n_sets}
            ]
        }

        with open('json/'+comm_id+'/'+level+'_d'+str(d)+'.json', 'w') as f:
            json.dump(data, f)

In [None]:
def plot_per_fid(comm, separated, depth, fid):
    """Bar plot, for one feature at one depth, of its ratio minus the weekly mean ratio.

    For every (week, feature) pair at ``depth`` the mean of ``ratio`` over the
    rows of that pair is computed; the plotted value is each row's ``ratio``
    minus that mean, coloured by size fraction. The figure is saved to
    ``outputs/<comm_id>/D<depth><fid>.png`` and left open.

    Parameters
    ----------
    comm : str
        ``'16S'`` selects the ``02-PROKs`` folder, anything else ``02-EUKs``.
    separated : pandas.DataFrame
        Needs ``depth``, ``weekn``, ``feature_id``, ``ratio``, ``size_code``
        and ``Taxon``.
    depth
        Value matched against ``separated.depth``.
    fid : str
        Feature id to plot; also used verbatim in the file name.

    Raises
    ------
    IndexError
        If ``fid`` has no rows at ``depth`` (same exception type as before,
        now with an explicit message).
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'

    # Fixed colour per size code; 'SL' gets a fourth colour only when present.
    sizecodes = ['S', 'L', 'W']
    if 'SL' in separated['size_code'].unique():
        sizecodes.append('SL')
    palette_dict = dict(zip(sizecodes, sns.color_palette()))

    # Work on an explicit copy: the columns added below must not be written
    # onto a view of the caller's frame (SettingWithCopyWarning otherwise).
    sfd = separated[separated.depth == depth].copy()
    sfd['weekfid'] = sfd["weekn"].astype(str) + sfd["feature_id"].astype(str)
    sfd['avg_p_id'] = sfd['ratio'].groupby(sfd['weekfid']).transform('mean')
    sfd['diff_p_id'] = sfd['ratio'] - sfd['avg_p_id']

    sfd_f = sfd[sfd.feature_id == fid]
    if sfd_f.empty:
        raise IndexError('feature ' + str(fid) + ' has no rows at depth ' + str(depth))

    # The taxonomy string of the feature is used as the plot title.
    ttl = sfd_f['Taxon'].iloc[0]

    sns.set(rc={"figure.figsize":(7, 3)})
    ax = sns.barplot(data=sfd_f, x="weekn", y="diff_p_id", hue="size_code", palette=palette_dict)
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    plt.title(ttl)
    plt.ylabel('Ratio difference')
    plt.xlabel('Week number')
    plt.savefig('outputs/'+comm_id+'/D'+str(depth)+fid+'.png', dpi=200, bbox_inches="tight")

In [None]:
def run_ancom(separated, sfdclr, depth, ancomcol):
    """Run ANCOM for one depth and return the features whose null hypothesis is rejected.

    Parameters
    ----------
    separated : pandas.DataFrame
        Long table with ``depth``, ``sampleid``, ``ancomcol``, ``feature_id``
        and the taxonomy columns ``Confidence``, ``Taxon``, ``Phylum``,
        ``Class``, ``Family``, ``Genus``, ``Species``.
    sfdclr : pandas.DataFrame
        Sample-by-feature table handed to ``skbio.stats.composition.ancom``.
        NOTE(review): the name suggests CLR-transformed values, while skbio's
        ancom is documented to require strictly positive input — confirm what
        the caller passes.
    depth
        Value matched against ``separated.depth``.
    ancomcol : str
        Column of ``separated`` used as the ANCOM grouping.

    Returns
    -------
    tuple
        ``(DARejected_SC_taxonomy, prcentile)``: the rejected features joined to
        their taxonomy and sorted by ascending ``W``, and a copy of the second
        element returned by ``ancom``.
    """
    sfd = separated[separated.depth == depth]

    # one grouping value per sample
    df_ancom = sfd[['sampleid', ancomcol]].drop_duplicates().set_index('sampleid')

    results = ancom(table=sfdclr, grouping=df_ancom[ancomcol])

    DAresults = results[0].copy()
    DARejected_SC = DAresults.loc[DAresults['Reject null hypothesis'] == True]

    taxonomy = sfd[['feature_id', 'Confidence', 'Taxon', 'Phylum', 'Class',
                    'Family', 'Genus', 'Species']].drop_duplicates()
    DARejected_SC_taxonomy = pd.merge(DARejected_SC, taxonomy, on="feature_id", how="left")
    # sort_values returns a new frame; the result has to be kept for the
    # ordering to reach the caller (it was previously discarded).
    DARejected_SC_taxonomy = DARejected_SC_taxonomy.sort_values(by='W')

    prcentile = results[1].copy()

    return DARejected_SC_taxonomy, prcentile

In [None]:
subtitle = 'From Jan7 2022 to Apr27 2022'
def plot_stackedbar_(df, labels, colors, title, subtitle, level):
    """Draw a horizontal stacked bar chart with one bar per row of ``df``.

    Every column of ``df`` becomes one segment, stacked left to right in column
    order and coloured with ``colors[i]``. The x axis is ticked every 10 units
    and labelled 0%..100%; the y axis is inverted and labelled "Depth (m)".
    The figure is saved to
    ``outputs/<comm_id>/<level>alldepths_stacked_perc_weighted.png`` and shown.

    ``comm`` is not a parameter: it is read from the notebook's global
    namespace. ``subtitle`` is accepted but currently not drawn.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'

    fig, ax = plt.subplots(1, figsize=(8, 5))

    # stack each column to the right of the running total of the previous ones
    offset = [0] * len(df)
    for position, column in enumerate(df.columns.tolist()):
        ax.barh(df.index, df[column], left=offset, color=colors[position])
        offset = offset + df[column]

    ax.set_title(title, loc='left')
    ax.legend(labels, bbox_to_anchor=([1, 1, 0, 0]), ncol=1, frameon=False)

    # no frame around the plot
    for side in ('right', 'left', 'top', 'bottom'):
        ax.spines[side].set_visible(False)

    # percentage ticks on the x axis
    ax.set_xticks(np.arange(0, 110, 10))
    ax.set_xticklabels(['{}%'.format(value) for value in np.arange(0, 101, 10)])

    # tight y limits, dashed vertical grid, first row drawn at the top
    ax.set_ylim(-0.5, ax.get_yticks()[-1] + 0.5)
    ax.xaxis.grid(color='gray', linestyle='dashed')
    ax.invert_yaxis()
    ax.set_ylabel("Depth (m)")

    target = ''.join(['outputs/', comm_id, '/', level, 'alldepths_stacked_perc_weighted.png'])
    plt.savefig(target, dpi=200, bbox_inches="tight")
    plt.show()

In [None]:
subtitle = 'From Jan7 2022 to Apr27 2022'
def plot_stackedbar_p(df, labels, colors, title, subtitle, level):
    """Draw a horizontal stacked percentage bar chart, one bar per row of ``df``.

    Columns of ``df`` are stacked left to right in column order, using
    ``colors[i]`` for the i-th column and ``labels`` for the legend. The x axis
    carries ticks labelled 0%..100% in steps of 10, the y axis is inverted and
    labelled "Depth (m)". The figure is written to
    ``outputs/<comm_id>/<level>alldepths_stacked_perc_weighted.png`` and shown.

    ``comm`` is looked up in the notebook's globals, not passed in.
    ``subtitle`` is accepted but not rendered.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'

    fig, ax = plt.subplots(1, figsize=(8, 5))

    # running left edge of the next segment, per bar
    running_left = [0] * len(df)
    for color_index, field in enumerate(df.columns.tolist()):
        ax.barh(df.index, df[field], left=running_left, color=colors[color_index])
        running_left = running_left + df[field]

    ax.set_title(title, loc='left')
    ax.legend(labels, bbox_to_anchor=([1, 1, 0, 0]), ncol=1, frameon=False)

    # hide all four spines
    for edge in ('right', 'left', 'top', 'bottom'):
        ax.spines[edge].set_visible(False)

    # x ticks shown as percentages
    ax.set_xticks(np.arange(0, 110, 10))
    ax.set_xticklabels(['{}%'.format(pct) for pct in np.arange(0, 101, 10)])

    # limits, grid and orientation
    ax.set_ylim(-0.5, ax.get_yticks()[-1] + 0.5)
    ax.xaxis.grid(color='gray', linestyle='dashed')
    ax.invert_yaxis()
    ax.set_ylabel("Depth (m)")

    out_path = ''.join(['outputs/', comm_id, '/', level, 'alldepths_stacked_perc_weighted.png'])
    plt.savefig(out_path, dpi=200, bbox_inches="tight")
    plt.show()

In [None]:
subtitle = 'From Jan7 2022 to Apr27 2022'
def plot_stackedbar_p(df, labels, colors, title, subtitle, level):
    """Draw a horizontal stacked percentage bar chart, one bar per row of ``df``.

    NOTE(review): the preceding cell already defines ``plot_stackedbar_p``;
    this later definition is the one that stays bound after Run All. One of the
    two cells can probably be deleted.

    Columns of ``df`` are stacked left to right, coloured with ``colors[i]``;
    x ticks are labelled 0%..100%, the y axis is inverted and labelled
    "Depth (m)". The figure is saved to
    ``outputs/<comm_id>/<level>alldepths_stacked_perc_weighted.png`` and shown.
    ``comm`` comes from the notebook's globals; ``subtitle`` is not rendered.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'

    fig, ax = plt.subplots(1, figsize=(8, 5))

    # left edge of the next stacked segment for every bar
    stacked = [0] * len(df)
    for i, col in enumerate(df.columns.tolist()):
        ax.barh(df.index, df[col], left=stacked, color=colors[i])
        stacked = stacked + df[col]

    ax.set_title(title, loc='left')
    ax.legend(labels, bbox_to_anchor=([1, 1, 0, 0]), ncol=1, frameon=False)

    # frameless plot
    for spine_name in ('right', 'left', 'top', 'bottom'):
        ax.spines[spine_name].set_visible(False)

    # percentage tick labels
    ax.set_xticks(np.arange(0, 110, 10))
    ax.set_xticklabels(['{}%'.format(step) for step in np.arange(0, 101, 10)])

    ax.set_ylim(-0.5, ax.get_yticks()[-1] + 0.5)
    ax.xaxis.grid(color='gray', linestyle='dashed')
    ax.invert_yaxis()
    ax.set_ylabel("Depth (m)")

    destination = ''.join(['outputs/', comm_id, '/', level, 'alldepths_stacked_perc_weighted.png'])
    plt.savefig(destination, dpi=200, bbox_inches="tight")
    plt.show()

In [None]:
def calcperc_defrac(comm, separated, level):
    """Per depth, split read abundance between size-fractionated, whole-water and de-fractionated samples.

    Taxa at ``level`` are classified from the L (large), S (small) and W (whole
    water) columns of the per-depth abundance pivot:

    * ``SF``   — taxa with no reads in W (L only, S only, or L and S);
    * ``NSF``  — taxa with reads in W only;
    * ``Both`` — taxa with reads in W and in at least one of L / S;
    * ``DFr``  — reads carried by the 'SL' column.

    A Venn diagram of the seven L/S/W regions is saved per depth to
    ``outputs/<comm_id>/D<depth><level>_venn.png``.

    Parameters
    ----------
    comm : str
        ``'16S'`` selects the ``02-PROKs`` folder, anything else ``02-EUKs``.
    separated : pandas.DataFrame
        Needs ``depth``, ``size_code``, ``feature_frequency``, ``Taxon``,
        ``weekn`` and ``level``.
    level : str
        Taxonomic column used to aggregate.

    Returns
    -------
    tuple
        ``(dfplot, level, dfplot_unweighted)``. ``dfplot`` is indexed by depth
        (as string) and holds percentages of the total abundance of the pivot
        (all size-code columns, 'SL' included). ``dfplot_unweighted`` has the
        columns ``Depth``, ``SF``, ``NSF``, ``Both`` holding taxon COUNTS.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'
    depths = [1, 5, 10, 30, 60]

    weighted_rows = []
    # Previously written to without ever being created (NameError unless a
    # global of that name happened to exist); it is now built locally.
    unweighted_rows = []

    for depth in depths:
        sfd = separated[separated.depth == depth]
        toptaxa = sfd[[level, 'feature_frequency', 'Taxon', 'size_code', 'weekn']].drop_duplicates()
        df_agg = (toptaxa.groupby(['size_code', level])
                         .agg({'feature_frequency': 'sum'})
                         .reset_index())
        resultpivot = df_agg.pivot_table(index=level, columns='size_code',
                                         values='feature_frequency').fillna(0)
        # A size code absent at this depth means zero reads, not a KeyError.
        for code in ('L', 'S', 'W', 'SL'):
            if code not in resultpivot.columns:
                resultpivot[code] = 0.0

        df = resultpivot[['L', 'S', 'W']]
        in_L, in_S, in_W = df['L'] != 0, df['S'] != 0, df['W'] != 0

        Sonly = df[~in_L & ~in_W]
        Wonly = df[~in_L & ~in_S]
        Lonly = df[~in_S & ~in_W]
        LW = df[~in_S & in_W & in_L]
        LS = df[~in_W & in_S & in_L]
        SW = df[in_W & in_S & ~in_L]
        LSW = df[~(df == 0).any(axis=1)]
        DFr = resultpivot.loc[resultpivot['SL'] != 0, ['SL']]

        # denominator: every read in the pivot, 'SL' column included
        total = resultpivot.to_numpy().sum()

        weighted_rows.append({
            'Depth': str(depth),
            'SF': pd.concat([Lonly, LS, Sonly]).to_numpy().sum() / total * 100,
            'NSF': Wonly.to_numpy().sum() / total * 100,
            'Both': pd.concat([LW, LSW, SW]).to_numpy().sum() / total * 100,
            'DFr': DFr.to_numpy().sum() / total * 100,
        })
        unweighted_rows.append({
            'Depth': depth,
            'SF': len(Lonly) + len(Sonly) + len(LS),
            'NSF': len(Wonly),
            'Both': len(LW) + len(SW) + len(LSW),
        })

        # venn3 subset order: A only, B only, A&B, C only, A&C, B&C, A&B&C
        # NOTE(review): the size ranges in the labels ('3-02μm', '<0.22μm')
        # look mistyped — confirm before publication.
        venn3(subsets = (len(Lonly), len(Sonly), len(LS), len(Wonly), len(LW), len(SW), len(LSW)),
              set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5)
        plt.savefig("outputs/"+comm_id+"/D"+str(depth)+level+"_venn.png")
        plt.clf()
        plt.cla()
        plt.close()

    dfplot = pd.DataFrame(weighted_rows, columns=['Depth', 'SF', 'NSF', 'Both', 'DFr']).set_index('Depth')
    dfplot_unweighted = pd.DataFrame(unweighted_rows, columns=['Depth', 'SF', 'NSF', 'Both'])

    return dfplot, level, dfplot_unweighted

In [None]:
def calcperc_defrac_unweighted(comm, separated, level):
    """Per depth, percentage of TAXA (not reads) that are size-fractionated, whole-water only, or both.

    Taxa at ``level`` are classified from the L (large), S (small) and W (whole
    water) columns of the per-depth abundance pivot; any other size-code column
    (e.g. 'SL') is ignored for the classification, but every row of the pivot
    counts in the denominator.

    A Venn diagram of the seven L/S/W regions is saved per depth to
    ``outputs/<comm_id>/D<depth><level>_venn.png``.

    Parameters
    ----------
    comm : str
        ``'16S'`` selects the ``02-PROKs`` folder, anything else ``02-EUKs``.
    separated : pandas.DataFrame
        Needs ``depth``, ``size_code``, ``feature_frequency``, ``Taxon``,
        ``weekn`` and ``level``.
    level : str
        Taxonomic column used to aggregate.

    Returns
    -------
    tuple
        ``(dfplot, level, dfplot_unweighted)``. ``dfplot`` is indexed by depth
        (as string) with ``SF``, ``NSF``, ``Both`` in percent of taxa.
        ``dfplot_unweighted`` keeps ``Depth`` as a column; its ``Both`` is
        computed as the remainder to 100.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'
    depths = [1, 5, 10, 30, 60]

    percent_rows = []
    remainder_rows = []

    for depth in depths:
        sfd = separated[separated.depth == depth]
        toptaxa = sfd[[level, 'feature_frequency', 'Taxon', 'size_code', 'weekn']].drop_duplicates()
        df_agg = (toptaxa.groupby(['size_code', level])
                         .agg({'feature_frequency': 'sum'})
                         .reset_index())
        resultpivot = df_agg.pivot_table(index=level, columns='size_code',
                                         values='feature_frequency').fillna(0)

        # A size fraction absent at this depth counts as zero reads everywhere.
        df = resultpivot.reindex(columns=['L', 'S', 'W'], fill_value=0)
        in_L, in_S, in_W = df['L'] != 0, df['S'] != 0, df['W'] != 0

        Sonly = df[~in_L & ~in_W]
        Wonly = df[~in_L & ~in_S]
        Lonly = df[~in_S & ~in_W]
        LW = df[~in_S & in_W & in_L]
        LS = df[~in_W & in_S & in_L]
        SW = df[in_W & in_S & ~in_L]
        LSW = df[~(df == 0).any(axis=1)]

        # denominator: number of taxa observed at this depth
        total = len(resultpivot)

        Sonly_value = len(Sonly) / total * 100
        Lonly_value = len(Lonly) / total * 100
        LS_value = len(LS) / total * 100
        Wonly_value = len(Wonly) / total * 100

        percent_rows.append({
            'Depth': str(depth),
            'SF': len(pd.concat([Lonly, LS, Sonly])) / total * 100,
            'NSF': Wonly_value,
            'Both': len(pd.concat([LW, LSW, SW])) / total * 100,
        })
        remainder_rows.append({
            'Depth': depth,
            'SF': Lonly_value + Sonly_value + LS_value,
            'NSF': Wonly_value,
            'Both': 100 - (Lonly_value + Sonly_value + LS_value + Wonly_value),
        })

        # venn3 subset order: A only, B only, A&B, C only, A&C, B&C, A&B&C
        # NOTE(review): the size ranges in the labels ('3-02μm', '<0.22μm')
        # look mistyped — confirm before publication.
        venn3(subsets = (len(Lonly), len(Sonly), len(LS), len(Wonly), len(LW), len(SW), len(LSW)),
              set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5)
        plt.savefig("outputs/"+comm_id+"/D"+str(depth)+level+"_venn.png")
        plt.clf()
        plt.cla()
        plt.close()

    dfplot = pd.DataFrame(percent_rows, columns=['Depth', 'SF', 'NSF', 'Both']).set_index('Depth')
    dfplot_unweighted = pd.DataFrame(remainder_rows, columns=['Depth', 'SF', 'NSF', 'Both'])

    return dfplot, level, dfplot_unweighted

In [None]:
def calcperc(comm, separated, level):
    """Per depth, percentage of read abundance in size-fractionated-only, whole-water-only and shared taxa.

    Taxa at ``level`` are classified from the L (large), S (small) and W (whole
    water) columns of the per-depth abundance pivot:

    * ``SF``   — taxa with no reads in W (L only, S only, or L and S);
    * ``NSF``  — taxa with reads in W only;
    * ``Both`` — taxa with reads in W and in at least one of L / S.

    NOTE(review): the pivot keeps every size code present in ``separated``. If
    an extra code such as 'SL' is passed in, it enters the denominator and the
    "present everywhere" (LSW) test — confirm the caller filters it out.

    A Venn diagram of the seven L/S/W regions is saved per depth to
    ``outputs/<comm_id>/D<depth><level>_venn.png``.

    Parameters
    ----------
    comm : str
        ``'16S'`` selects the ``02-PROKs`` folder, anything else ``02-EUKs``.
    separated : pandas.DataFrame
        Needs ``depth``, ``size_code``, ``feature_frequency``, ``Taxon``,
        ``weekn`` and ``level``.
    level : str
        Taxonomic column used to aggregate.

    Returns
    -------
    tuple
        ``(dfplot, level)`` with ``dfplot`` indexed by depth (as string).
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'
    depths = [1, 5, 10, 30, 60]

    rows = []

    for depth in depths:
        sfd = separated[separated.depth == depth]
        toptaxa = sfd[[level, 'feature_frequency', 'Taxon', 'size_code', 'weekn']].drop_duplicates()
        df_agg = (toptaxa.groupby(['size_code', level])
                         .agg({'feature_frequency': 'sum'})
                         .reset_index())
        df = df_agg.pivot_table(index=level, columns='size_code',
                                values='feature_frequency').fillna(0)
        # A size fraction absent at this depth means zero reads, not a KeyError.
        for code in ('L', 'S', 'W'):
            if code not in df.columns:
                df[code] = 0.0

        in_L, in_S, in_W = df['L'] != 0, df['S'] != 0, df['W'] != 0

        Sonly = df[~in_L & ~in_W]
        Wonly = df[~in_L & ~in_S]
        Lonly = df[~in_S & ~in_W]
        LW = df[~in_S & in_W & in_L]
        LS = df[~in_W & in_S & in_L]
        SW = df[in_W & in_S & ~in_L]
        LSW = df[~(df == 0).any(axis=1)]

        total = df.to_numpy().sum()

        rows.append({
            'Depth': str(depth),
            'SF': pd.concat([Lonly, LS, Sonly]).to_numpy().sum() / total * 100,
            'NSF': Wonly.to_numpy().sum() / total * 100,
            'Both': pd.concat([LW, LSW, SW]).to_numpy().sum() / total * 100,
        })

        # venn3 subset order: A only, B only, A&B, C only, A&C, B&C, A&B&C
        # NOTE(review): the size ranges in the labels ('3-02μm', '<0.22μm')
        # look mistyped — confirm before publication.
        venn3(subsets = (len(Lonly), len(Sonly), len(LS), len(Wonly), len(LW), len(SW), len(LSW)),
              set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5)
        plt.savefig("outputs/"+comm_id+"/D"+str(depth)+level+"_venn.png")
        plt.clf()
        plt.cla()
        plt.close()

    dfplot = pd.DataFrame(rows, columns=['Depth', 'SF', 'NSF', 'Both']).set_index('Depth')

    return dfplot, level

In [None]:
def calcperc_SLNSF(comm, separated, level):
    """Per depth, percentage of read abundance in taxa exclusive to S, to L, to L+S, or to whole water.

    Taxa at ``level`` are classified from the L (large), S (small) and W (whole
    water) columns of the per-depth abundance pivot:

    * ``Sonly`` — no reads in L or W;
    * ``Lonly`` — no reads in S or W;
    * ``LS``    — reads in both L and S, none in W;
    * ``NSF``   — no reads in L or S (whole water only).

    A Venn diagram of the seven L/S/W regions is saved per depth to
    ``outputs/<comm_id>/D<depth><level>_venn.png``.

    Parameters
    ----------
    comm : str
        ``'16S'`` selects the ``02-PROKs`` folder, anything else ``02-EUKs``.
    separated : pandas.DataFrame
        Needs ``depth``, ``size_code``, ``feature_frequency``, ``Taxon``,
        ``weekn`` and ``level``.
    level : str
        Taxonomic column used to aggregate.

    Returns
    -------
    tuple
        ``(dfplot, dfplot_normalized, level)``. ``dfplot`` holds percentages of
        the total abundance at each depth. ``dfplot_normalized`` rescales every
        depth so its four categories add up to 100. It used to divide all
        depths by the category total of the LAST depth only, which left every
        other row mis-scaled.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'
    depths = [1, 5, 10, 30, 60]

    rows = []

    for depth in depths:
        sfd = separated[separated.depth == depth]
        toptaxa = sfd[[level, 'feature_frequency', 'Taxon', 'size_code', 'weekn']].drop_duplicates()
        df_agg = (toptaxa.groupby(['size_code', level])
                         .agg({'feature_frequency': 'sum'})
                         .reset_index())
        df = df_agg.pivot_table(index=level, columns='size_code',
                                values='feature_frequency').fillna(0)
        # A size fraction absent at this depth means zero reads, not a KeyError.
        for code in ('L', 'S', 'W'):
            if code not in df.columns:
                df[code] = 0.0

        in_L, in_S, in_W = df['L'] != 0, df['S'] != 0, df['W'] != 0

        Sonly = df[~in_L & ~in_W]
        Wonly = df[~in_L & ~in_S]
        Lonly = df[~in_S & ~in_W]
        LW = df[~in_S & in_W & in_L]
        LS = df[~in_W & in_S & in_L]
        SW = df[in_W & in_S & ~in_L]
        LSW = df[~(df == 0).any(axis=1)]

        total = df.to_numpy().sum()

        rows.append({
            'Depth': str(depth),
            'Sonly': Sonly.to_numpy().sum() / total * 100,
            'Lonly': Lonly.to_numpy().sum() / total * 100,
            'LS': LS.to_numpy().sum() / total * 100,
            'NSF': Wonly.to_numpy().sum() / total * 100,
        })

        # venn3 subset order: A only, B only, A&B, C only, A&C, B&C, A&B&C
        # NOTE(review): the size ranges in the labels ('3-02μm', '<0.22μm')
        # look mistyped — confirm before publication.
        venn3(subsets = (len(Lonly), len(Sonly), len(LS), len(Wonly), len(LW), len(SW), len(LSW)),
              set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5)
        plt.savefig("outputs/"+comm_id+"/D"+str(depth)+level+"_venn.png")
        plt.clf()
        plt.cla()
        plt.close()

    dfplot = pd.DataFrame(rows, columns=['Depth', 'Sonly', 'Lonly', 'LS', 'NSF']).set_index('Depth')
    # per-depth rescaling: each row is divided by its own category total
    dfplot_normalized = dfplot.div(dfplot.sum(axis=1), axis=0) * 100

    return dfplot, dfplot_normalized, level

In [None]:
def calcperc_LSW(comm, separated, level):
    """Per depth, percentage of read abundance in the taxa detected in whole water, by overlap with L / S.

    Taxa at ``level`` are classified from the L (large), S (small) and W (whole
    water) columns of the per-depth abundance pivot:

    * ``NSF`` — reads in W only;
    * ``LW``  — reads in L and W, none in S;
    * ``SW``  — reads in S and W, none in L;
    * ``LSW`` — non-zero in every column of the pivot.

    A Venn diagram of the seven L/S/W regions is saved per depth to
    ``outputs/<comm_id>/D<depth><level>_venn.png``.

    Parameters
    ----------
    comm : str
        ``'16S'`` selects the ``02-PROKs`` folder, anything else ``02-EUKs``.
    separated : pandas.DataFrame
        Needs ``depth``, ``size_code``, ``feature_frequency``, ``Taxon``,
        ``weekn`` and ``level``.
    level : str
        Taxonomic column used to aggregate.

    Returns
    -------
    tuple
        ``(dfplot, dfplot_normalized, level)``. ``dfplot`` holds percentages of
        the total abundance at each depth. ``dfplot_normalized`` rescales every
        depth so its four plotted categories add up to 100.
        NOTE(review): the previous code divided all depths by
        ``Sonly + Lonly + LS + Wonly`` of the LAST depth — categories that are
        not the ones reported here. Per-row rescaling is assumed to be the
        intent; confirm.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'
    depths = [1, 5, 10, 30, 60]

    rows = []

    for depth in depths:
        sfd = separated[separated.depth == depth]
        toptaxa = sfd[[level, 'feature_frequency', 'Taxon', 'size_code', 'weekn']].drop_duplicates()
        df_agg = (toptaxa.groupby(['size_code', level])
                         .agg({'feature_frequency': 'sum'})
                         .reset_index())
        df = df_agg.pivot_table(index=level, columns='size_code',
                                values='feature_frequency').fillna(0)
        # A size fraction absent at this depth means zero reads, not a KeyError.
        for code in ('L', 'S', 'W'):
            if code not in df.columns:
                df[code] = 0.0

        in_L, in_S, in_W = df['L'] != 0, df['S'] != 0, df['W'] != 0

        Sonly = df[~in_L & ~in_W]
        Wonly = df[~in_L & ~in_S]
        Lonly = df[~in_S & ~in_W]
        LW = df[~in_S & in_W & in_L]
        LS = df[~in_W & in_S & in_L]
        SW = df[in_W & in_S & ~in_L]
        LSW = df[~(df == 0).any(axis=1)]

        total = df.to_numpy().sum()

        rows.append({
            'Depth': str(depth),
            'NSF': Wonly.to_numpy().sum() / total * 100,
            'LW': LW.to_numpy().sum() / total * 100,
            'SW': SW.to_numpy().sum() / total * 100,
            'LSW': LSW.to_numpy().sum() / total * 100,
        })

        # venn3 subset order: A only, B only, A&B, C only, A&C, B&C, A&B&C
        # NOTE(review): the size ranges in the labels ('3-02μm', '<0.22μm')
        # look mistyped — confirm before publication.
        venn3(subsets = (len(Lonly), len(Sonly), len(LS), len(Wonly), len(LW), len(SW), len(LSW)),
              set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5)
        plt.savefig("outputs/"+comm_id+"/D"+str(depth)+level+"_venn.png")
        plt.clf()
        plt.cla()
        plt.close()

    dfplot = pd.DataFrame(rows, columns=['Depth', 'NSF', 'LW', 'SW', 'LSW']).set_index('Depth')
    # per-depth rescaling: each row is divided by its own category total
    dfplot_normalized = dfplot.div(dfplot.sum(axis=1), axis=0) * 100

    return dfplot, dfplot_normalized, level

In [None]:
def calcperc_LS_W(comm, separated, level):
    """Per depth, percentage of read abundance in whole-water-only taxa and in taxa shared with exactly one fraction.

    Taxa at ``level`` are classified from the L (large), S (small) and W (whole
    water) columns of the per-depth abundance pivot:

    * ``NSF`` — reads in W only;
    * ``LW``  — reads in L and W, none in S;
    * ``SW``  — reads in S and W, none in L.

    A Venn diagram of the seven L/S/W regions is saved per depth to
    ``outputs/<comm_id>/D<depth><level>_venn.png``.

    Parameters
    ----------
    comm : str
        ``'16S'`` selects the ``02-PROKs`` folder, anything else ``02-EUKs``.
    separated : pandas.DataFrame
        Needs ``depth``, ``size_code``, ``feature_frequency``, ``Taxon``,
        ``weekn`` and ``level``.
    level : str
        Taxonomic column used to aggregate.

    Returns
    -------
    tuple
        ``(dfplot, dfplot_normalized, level)``. ``dfplot`` holds percentages of
        the total abundance at each depth. ``dfplot_normalized`` rescales every
        depth so its three plotted categories add up to 100.
        NOTE(review): the previous code divided all depths by
        ``Sonly + Lonly + LS + Wonly`` of the LAST depth — categories that are
        not the ones reported here. Per-row rescaling is assumed to be the
        intent; confirm.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'
    depths = [1, 5, 10, 30, 60]

    rows = []

    for depth in depths:
        sfd = separated[separated.depth == depth]
        toptaxa = sfd[[level, 'feature_frequency', 'Taxon', 'size_code', 'weekn']].drop_duplicates()
        df_agg = (toptaxa.groupby(['size_code', level])
                         .agg({'feature_frequency': 'sum'})
                         .reset_index())
        df = df_agg.pivot_table(index=level, columns='size_code',
                                values='feature_frequency').fillna(0)
        # A size fraction absent at this depth means zero reads, not a KeyError.
        for code in ('L', 'S', 'W'):
            if code not in df.columns:
                df[code] = 0.0

        in_L, in_S, in_W = df['L'] != 0, df['S'] != 0, df['W'] != 0

        Sonly = df[~in_L & ~in_W]
        Wonly = df[~in_L & ~in_S]
        Lonly = df[~in_S & ~in_W]
        LW = df[~in_S & in_W & in_L]
        LS = df[~in_W & in_S & in_L]
        SW = df[in_W & in_S & ~in_L]
        LSW = df[~(df == 0).any(axis=1)]

        total = df.to_numpy().sum()

        rows.append({
            'Depth': str(depth),
            'NSF': Wonly.to_numpy().sum() / total * 100,
            'LW': LW.to_numpy().sum() / total * 100,
            'SW': SW.to_numpy().sum() / total * 100,
        })

        # venn3 subset order: A only, B only, A&B, C only, A&C, B&C, A&B&C
        # NOTE(review): the size ranges in the labels ('3-02μm', '<0.22μm')
        # look mistyped — confirm before publication.
        venn3(subsets = (len(Lonly), len(Sonly), len(LS), len(Wonly), len(LW), len(SW), len(LSW)),
              set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5)
        plt.savefig("outputs/"+comm_id+"/D"+str(depth)+level+"_venn.png")
        plt.clf()
        plt.cla()
        plt.close()

    dfplot = pd.DataFrame(rows, columns=['Depth', 'NSF', 'LW', 'SW']).set_index('Depth')
    # per-depth rescaling: each row is divided by its own category total
    dfplot_normalized = dfplot.div(dfplot.sum(axis=1), axis=0) * 100

    return dfplot, dfplot_normalized, level

In [None]:
#subtitle = 'From Jan7 2022 to Apr27 2022'
def plot_stackedbar_p_SLNSF(df, labels, colors, title, subtitle, level, xmax=110, xtick=10):
    """Draw a horizontal stacked percentage bar chart and save it under a name derived from ``df``.

    Columns of ``df`` are stacked left to right in column order, coloured with
    ``colors[i]``. The figure is saved to
    ``outputs/<comm_id>/<level>alldepths_stacked_perc_weighted<df_name>.png``
    where ``<df_name>`` is the name of the notebook-level variable bound to
    ``df``. Previously that name was overwritten by the column loop variable,
    so the suffix was always the LAST column name.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per bar, one column per stacked segment.
    labels, colors : sequence
        Legend labels and per-column colours.
    title : str
        Left-aligned plot title.
    subtitle
        Accepted for signature compatibility; not rendered.
    level : str
        Used in the output file name.
    xmax, xtick : int, optional
        X ticks are placed at ``np.arange(0, xmax, xtick)`` and labelled in percent.

    ``comm`` is read from the notebook's global namespace, not passed in.
    """
    comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'

    fields = df.columns.tolist()

    # Name of the global variable that refers to `df`. IPython's output-cache
    # aliases (`_`, `__`, `_12`, ...) are skipped, and an empty suffix is used
    # when `df` is not bound to any global (instead of raising IndexError).
    df_name = next((key for key, value in list(globals().items())
                    if value is df and not key.startswith('_')), '')

    # figure and axis
    fig, ax = plt.subplots(1, figsize=(8, 5))
    # plot bars, each column starting where the previous ones end
    left = len(df) * [0]
    for idx, field in enumerate(fields):
        plt.barh(df.index, df[field], left = left, color=colors[idx])
        left = left + df[field]
    # title
    plt.title(title, loc='left')
    # legend
    plt.legend(labels, bbox_to_anchor=([1, 1, 0, 0]), ncol=1, frameon=False)
    # remove spines
    for side in ('right', 'left', 'top', 'bottom'):
        ax.spines[side].set_visible(False)
    # format x ticks
    xticks = np.arange(0,xmax,xtick)
    xlabels = ['{}%'.format(i) for i in xticks]
    plt.xticks(xticks, xlabels)
    # adjust limits and draw grid lines
    plt.ylim(-0.5, ax.get_yticks()[-1] + 0.5)
    ax.xaxis.grid(color='gray', linestyle='dashed')
    plt.gca().invert_yaxis()
    plt.ylabel("Depth (m)")
    plt.savefig('outputs/'+comm_id+'/'+level+'alldepths_stacked_perc_weighted'+df_name+'.png', dpi=200, bbox_inches="tight")
    plt.show()

Data prep

In [None]:
# Build the feature table for the 16S amplicon. `comm` is kept as a notebook
# global: the plot_stackedbar_* helpers read it instead of taking it as an argument.
# NOTE(review): consolidate_tables is defined outside this section; its exact
# output is not visible here.
df, comm = consolidate_tables('16S')

In [None]:
#only if needed redefine the metadata file used to create merged
#all_md = md.copy()

In [None]:
# Presumably joins the sample metadata onto the feature table — merge_metadata
# is defined outside this section; TODO confirm.
merged = merge_metadata(df)

In [None]:
# `separated` is the long table used by every analysis below.
# NOTE(review): pick_metadata is defined outside this section; which columns it
# keeps is not visible here.
separated = pick_metadata(merged)

In [None]:
#remove chloroplast and cyanobacteria from 16S
# One mask covers both patterns so the count below reports every removed
# feature (it used to count only the rows matching "Cyanobacteria").
# na=False keeps rows whose Taxon is missing instead of failing on a NaN mask.
is_phototroph = separated.Taxon.str.contains("Cyanobacteria|Chloroplast", na=False)
chloroplast = separated[is_phototroph]
separated = separated[~is_phototroph]
#count the number of features removed
chloroplast['feature_id'].nunique()

In [None]:
# Renumber the rows 0..n-1 and discard the old index, which has gaps after the
# taxon filtering in the previous cell.
separated = separated.reset_index(drop=True)

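Run the following cell to build `newseparated`: the original small (S), large (L) and whole-water (W) samples plus one de-fractionated "SL" sample per week and depth, obtained by combining the small and large size fractions weighted by their share of the extracted DNA.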

In [None]:
#make sure all size codes are indicated
# The size letter follows <week number><depth code A-E>; only L and S are size
# fractions (the old class [L-S] also accepted M, N, O, P, Q and R).
# Sample ids without a size letter are whole-water samples.
all_md["size_code"] = all_md["sampleid"].str.extract(r'[1-9][0-9]?[A-E]([LS])', expand=False)
all_md["size_code"] = all_md["size_code"].fillna('W')

#only keep values from weeks 1 to 16
# explicit copy: columns are added below and must not be written onto a view of all_md
sep_SL = all_md[all_md.size_code != "W"].copy()
sep_SL = sep_SL.drop(sep_SL[sep_SL.weekn > 16].index)

#sum [DNA] of small and large size fractions
sep_SL['[DNAt]'] = sep_SL.groupby(['weekn', 'depth'])['[DNA]ng/ul'].transform('sum')

#calculate DNA proportion per size fraction
sep_SL['DNApr'] = sep_SL['[DNA]ng/ul']/sep_SL['[DNAt]']

#merge with separated on common columns to get corresponding rel. abundances
sep_SL = sep_SL[['sampleid', 'DNApr', '[DNAt]']].copy()
sepSLRA = pd.merge(separated, sep_SL, on=['sampleid'], how='left') #all_md is the metadata file

#exclude ASVs from the whole water
# The mask is taken from the merged frame itself; borrowing it from `separated`
# only worked while both frames shared exactly the same index.
sep_SLRA = sepSLRA[sepSLRA.size_code != "W"].copy()

#calculate corrected per sample ratio, and corrected feature frequency of de-fractionated samples
# NOTE(review): S/L samples from weeks > 16 have no DNApr after the left merge,
# so their weighted frequency is NaN and sums to 0 — confirm they should be kept.
sep_SLRA['Newfeature_frequency'] = sep_SLRA['feature_frequency'] * sep_SLRA['DNApr']
sep_SLRA['Newff'] = sep_SLRA.groupby(['feature_id', 'weekn', 'depth'])['Newfeature_frequency'].transform('sum')


#sep_SLRA = sep_SLRA.drop(['sampleid', 'size_code'], axis=1)
sep_SLRA['sampleid'] = "BB22." + sep_SLRA['weekn'].astype(str) + sep_SLRA['depth_code'] + "SL"

#uncomment the line below if keeping small and large original sample
#sep_SLRA['size_code'] = sep_SLRA['size_code'] + '-DFr'

#comment out the line below if keeping small and large original samples
sep_SLRA['size_code'] = 'SL'

#drop unnecessary columns which might raise merging conflicts
# NOTE(review): DNApr stays in the frame and differs between the S and L rows of
# the same feature, so drop_duplicates below may keep two rows per feature and
# inflate 'Total' — confirm.
sep_SLRA = sep_SLRA.drop(['feature_frequency', 'Total', 'ratio', 'nASVs', 'weekdepth', 'avg',
                          'diff', 'extraction_date', '[DNA]ng/ul', 'A260/280', 'A260/230',
                          'Newfeature_frequency'], axis=1)
sep_SLRA = sep_SLRA.rename(columns={'Newff':'feature_frequency'})
sep_SLRA = sep_SLRA.drop_duplicates()

#recalculate ratios
sep_SLRA['Total'] = sep_SLRA['feature_frequency'].groupby(sep_SLRA['sampleid']).transform('sum')
sep_SLRA['ratio'] = sep_SLRA['feature_frequency']/sep_SLRA['Total']
sep_SLRA['nASVs'] = sep_SLRA['feature_id'].groupby(sep_SLRA['sampleid']).transform('nunique')

sep_SLRA = sep_SLRA.drop_duplicates()

#make new df depending on plotting needs
sep_WO = separated[separated.size_code == "W"]
sep_WO = sep_WO.drop_duplicates()

sep_S = separated[separated.size_code == "S"]
sep_L = separated[separated.size_code == "L"]


sep_WO = sep_WO.reset_index(drop=True)
sep_SLRA = sep_SLRA.reset_index(drop=True)

#newseparated = pd.concat([sep_SLRA.reset_index(drop=True), sep_WO.reset_index(drop=True)], axis=0).reset_index(drop=True)
newseparated = pd.concat([sep_SLRA, sep_WO, sep_L, sep_S], ignore_index=True)

# richness of every sample relative to the mean richness of its week and depth
newseparated['weekdepth'] = newseparated["weekn"].astype(str) + newseparated["depth"].astype(str)
newseparated['avg'] = newseparated['nASVs'].groupby(newseparated['weekdepth']).transform('mean')
newseparated['diff'] = newseparated['nASVs'] - newseparated['avg']

### Richness analysis

In [None]:
sfd1 = separated[['sampleid','size_code', 'weekn', 'nASVs', 'depth']].copy()

In [None]:
#group the dataframe with all features to obtain either the mean or std of number of features per size fraction
#select the numeric columns explicitly: sfd1 also holds the string column 'sampleid',
#which makes a bare .std() raise on recent pandas versions
sfd1.groupby(['size_code'])[['weekn', 'nASVs', 'depth']].std()

In [None]:
sfd1.describe()

In [None]:
#run the visualisations for alpha diversity and run pairwise t-tests between size fractions for richness values
anova, results = boxplot_depth(separated, comm, 60, 'nASVs', 'Number of ASVs')

In [None]:
results

## Simple regression analysis between pairs of size fractions

In [None]:
#data prep
# output sub-folder depends on the marker gene being analysed
comm_id = '02-PROKs' if comm == '16S' else '02-EUKs'

depth = 1

# keep a single depth, then spread the relative abundances into one column per size code
d1 = newseparated[newseparated['depth'] == depth]
forpl = d1.loc[:, ['ratio', 'feature_id', 'sampleid', 'weekn', 'depth', 'size_code', 'Phylum', 'Family']].copy()
slwplot = (
    forpl
    .pivot_table(index=["feature_id", "depth", 'weekn','Phylum', 'Family'], columns="size_code", values='ratio')
    .fillna(0)
    .reset_index()
)

In [None]:
q75, q25 = np.percentile(slwplot['L'], [75 ,25])
iqr = q75 - q25

In [None]:
def outlier(df, numeric_cols=('L', 'S', 'SL', 'W'), whisker=1.5):
    """Blank out IQR outliers column by column.

    For every column in ``numeric_cols`` the first and third quartiles are
    computed, and any value further than ``whisker`` * IQR below Q1 or above Q3
    is replaced by NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the columns to screen. It is not modified.
    numeric_cols : sequence of str, optional
        Columns to screen. Defaults to the four size-code columns used in this
        notebook, so existing ``outlier(df)`` calls behave as before.
    whisker : float, optional
        Multiple of the IQR that defines the fences (1.5 by default).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the outlying cells set to NaN.
    """
    cols = list(numeric_cols)
    new_df = df.copy()
    values = new_df[cols]

    # DataFrame.quantile skips missing values, so one NaN in a column no longer
    # turns both fences into NaN (which silently disabled detection for that column).
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_limit = q1 - (whisker * iqr)
    upper_limit = q3 + (whisker * iqr)

    # fences are per column, so compare along the column axis
    mask = values.lt(lower_limit, axis='columns') | values.gt(upper_limit, axis='columns')
    new_df[cols] = values.mask(mask)
    return new_df

In [None]:
newslw = outlier(slwplot)

In [None]:
newslw

In [None]:
newslw['L'].isna().sum()

In [None]:
#rerun the regression by removing the 4 pathogens found in weeks 10 and 11
slwplot.sort_values('W')

In [None]:
# Row labels of the four observations excluded before re-running the regression.
# NOTE(review): these labels are only valid for the community/depth that was
# inspected when they were written down — re-check after changing `comm` or `depth`.
# errors='ignore' keeps the cell re-runnable once the rows are already gone.
slwplot = slwplot.drop(index=[35, 36, 2353, 2200], errors='ignore')

slwplot.sort_values('W')

In [None]:
# feature ids removed from the regression table
excluded_features = [
    '4cb5c79abe0a3b611671b1f356a86f19',
    '51d96e96a3350beedece3878d6d0b3e7',
    '015600cdcdfdc7333d251c9c9160eb72',
]
slwplot = slwplot.loc[~slwplot['feature_id'].isin(excluded_features)]

In [None]:
# Simple linear regression of one size-code column on another.
# NOTE(review): the response is the 'S' column, but the fitted-values plot further
# down labels the y axis "Defractionated" — confirm whether 'SL' was intended here.
Y = slwplot['S']
X = slwplot['W']

# add the intercept column expected by statsmodels
X = sm.add_constant(X)

#fit the model
# missing='drop' discards observations with NaN in either variable
model = sm.OLS(Y, X, missing='drop')
model_result = model.fit()
model_result.summary()

In [None]:
sns.histplot(model_result.resid);

In [None]:
mu, std = stats.norm.fit(model_result.resid)
mu, std

In [None]:
fig, ax = plt.subplots()
# plot the residuals
sns.histplot(x=model_result.resid, ax=ax, stat="density", linewidth=0, kde=True)
ax.set(title="Distribution of residuals", xlabel="residual")

# plot corresponding normal curve
xmin, xmax = plt.xlim() # the maximum x values from the histogram above
x = np.linspace(xmin, xmax, 100) # generate some x values
p = stats.norm.pdf(x, mu, std) # calculate the y values for the normal curve
sns.lineplot(x=x, y=p, color="orange", ax=ax)
plt.show()

In [None]:
sns.boxplot(x=model_result.resid, showmeans=True);

In [None]:
sm.qqplot(model_result.resid, line='s');

In [None]:
fig, ax = plt.subplots()
fig.set_size_inches(6, 5)

sns.set_style('white')
fig = sm.graphics.plot_fit(model_result,1, vlines=False, ax=ax)
ax.set_ylabel("Defractionated")
ax.set_xlabel("Whole")
ax.set_title("Fitted values linear regression")

plt.savefig('outputs/'+comm_id+'/asv_'+str(depth)+'_scatter_RL.png', dpi=200, bbox_inches="tight")

plt.show()

In [None]:
model_result.fittedvalues

In [None]:
Y_max = Y.max()
Y_min = Y.min()

plt.figure(figsize=(5, 4))


ax = sns.scatterplot(x=model_result.fittedvalues, y=Y)
ax.set(ylim=(Y_min, Y_max))
ax.set(xlim=(Y_min, Y_max))
ax.set_xlabel("Predicted value")
ax.set_ylabel("Observed value")

X_ref = Y_ref = np.linspace(Y_min, Y_max, 100)
plt.plot(X_ref, Y_ref, color='red', linewidth=1)

plt.savefig('outputs/'+comm_id+'/asv_'+str(depth)+'_scatterRL.png', dpi=200, bbox_inches="tight")

plt.show()

Investigate the temporal pattern of the outliers through all depths

In [None]:
slwplot.sort_values('W')

In [None]:
slwplot.loc[slwplot['feature_id'] == '75ceeaa937c64399438614ca3706cf2a']

In [None]:
supsel = newseparated.loc[newseparated['feature_id'] == '75ceeaa937c64399438614ca3706cf2a'].sort_values('feature_frequency')
supsel['depth'].unique()

In [None]:
supsel.sort_values('weekn')

In [None]:
sns.scatterplot(data = supsel, x = 'weekn', y = 'ratio', hue='depth')

In [None]:
#draw interactive plotly to identify outliers
# (plotly.express is already imported as px in the import cell; the leftover
#  `df = px.data.tips()` from the plotly example was removed because it
#  overwrote the notebook-level `df` with an unrelated demo dataset)
fig = px.scatter(slwplot, x="W", y="SL", color="weekn", trendline="ols")
fig.show()

# one fitted OLS result per trendline drawn
results = px.get_trendline_results(fig)
print(results)

#results.query("weekn ==  and Phylum == ").px_fit_results.iloc[0].summary()

### Calculate log2 fold change per feature at the phylum level of abundance between size fractions to identify which taxonomic group is driving the simple regression off x=y

In [None]:
#add pseudo count for log-calculations and zero divisions
# The pseudocount is applied to temporary series instead of being written back
# into slwplot['SL'] / slwplot['W']: re-running the cell no longer stacks
# pseudocounts, and exact-zero tests on those columns keep working.
PSEUDOCOUNT = 0.0000001
sl_adj = slwplot['SL'] + PSEUDOCOUNT
w_adj = slwplot['W'] + PSEUDOCOUNT

#calculate log2 fold change
# 'OR' holds the relative difference (W - SL) / SL; the column name is kept for downstream cells
slwplot['OR'] = (w_adj - sl_adj) / sl_adj
slwplot['fold_change'] = w_adj / sl_adj
slwplot['log2_fold_change'] = np.log2(slwplot['fold_change'])

In [None]:
#make dual plot of log2FC and mean relative abundances side by side including error bars in the plots
data = slwplot[['log2_fold_change','Phylum', 'W', 'SL']].copy()

# Remove only the literal "p__" rank prefix. str.lstrip('p__') treats its argument
# as a *set of characters* and would also eat leading 'p' / '_' characters that
# belong to the name itself.
data['Phylum'] = data['Phylum'].str.replace(r'^p__', '', regex=True)

# Group by phylum and take the means and standard deviations for each group
data['avg_W'] = data['W'].groupby(data['Phylum']).transform('mean')
data['std_W'] = data['W'].groupby(data['Phylum']).transform('std')
data['avg_SL'] = data['SL'].groupby(data['Phylum']).transform('mean')
data['std_SL'] = data['SL'].groupby(data['Phylum']).transform('std')
data['means'] = data['log2_fold_change'].groupby(data['Phylum']).transform('mean')
data['stds'] = data['log2_fold_change'].groupby(data['Phylum']).transform('std')

data['positive'] = data['means'] > 0

# The summary columns are constant within a phylum, so one row per phylum carries
# everything the bars need; this avoids drawing thousands of identical, overlapping bars.
per_phylum = data.drop_duplicates(subset='Phylum')

fig, axes = plt.subplots(ncols=2, sharey=True, figsize=(8, 10))

# left panel: mean log2 fold change (green = higher in whole water, red = lower)
axes[0].barh(per_phylum['Phylum'], per_phylum['means'],
         xerr = per_phylum['stds'],
         error_kw=dict(lw=0.5, capsize=1, capthick=0.5),
         color=per_phylum['positive'].map({True: 'g', False: 'r'}).tolist())

# right panel: mean relative abundance in the whole-water samples
axes[1].barh(per_phylum['Phylum'], per_phylum['avg_W'],
            xerr = per_phylum['std_W'],
         error_kw=dict(lw=0.5, capsize=1, capthick=0.5))

#axes[2].barh(data['Phylum'], data['avg_SL'],
#            xerr = data['std_SL'],
#         error_kw=dict(lw=0.5, capsize=1, capthick=0.5))

# the y axis is shared, so inverting it on the current axes flips both panels
plt.gca().invert_yaxis()


plt.savefig('outputs/'+comm_id+'/log2foldchange_d'+str(depth)+'.png', bbox_inches='tight', dpi=300)

#plt.show()

In [None]:
#bar plot of log2FC per phylum without error bars
data = slwplot[['log2_fold_change','Phylum']].copy()
# strip the literal "p__" prefix only (str.lstrip would strip any leading 'p'/'_' characters)
data['Phylum'] = data['Phylum'].str.replace(r'^p__', '', regex=True)

# Group by index labels and take the means and standard deviations for each group
#data['means'] = data['log2_fold_change'].groupby(data['Phylum']).transform('mean')
#data['stds'] = data['log2_fold_change'].groupby(data['Phylum']).transform('std')

data['positive'] = data['log2_fold_change'] > 0

# one bar per feature row, stacked on its phylum label (bars of a phylum overlap)
plt.figure(figsize=(6,11))
plt.barh(data['Phylum'], data['log2_fold_change'],
         color=data.positive.map({True: 'g', False: 'r'}))
plt.gca().invert_yaxis()

plt.savefig('outputs/'+comm_id+'/log2foldchange_d'+str(depth)+'_noerr.png', bbox_inches='tight', dpi=300)

#plt.show()

In [None]:
# Features absent from the whole-water sample.
# The fold-change cell may have added a 1e-7 pseudocount to 'W', after which an
# exact `== 0` test never matches; comparing against the pseudocount covers both
# the raw and the pseudocounted column.
PSEUDOCOUNT = 0.0000001
not_in_W = slwplot.loc[slwplot['W'] <= PSEUDOCOUNT]
#not_in_W['feature_id'].nunique()
# ...of those, the ones that are also absent from the large fraction
not_in_W_from_S = not_in_W.loc[not_in_W['L'] == 0]
#not_in_W_from_S['feature_id'].nunique()

#calculate the percentage of features coming from the small size fraction that are absent from the whole
n_absent = not_in_W['feature_id'].nunique()
# guard against 0/0 when every feature is present in the whole water
not_in_W_from_S['feature_id'].nunique()/n_absent if n_absent else float('nan')

In [None]:
sns.scatterplot(data=not_in_W, x='S', y='SL', hue='weekn')

### run lmplot for each pair of sets

In [None]:
#loglog plot
# One whole-water vs de-fractionated scatter per sampling depth.
# NOTE: the loop reuses the notebook-level names `depth`, `d1`, `forpl` and
# `slwplot`; after it finishes they refer to the last depth (60).
depths = [1,5,10,30,60]

for depth in depths:
    d1 = newseparated.loc[newseparated['depth'] == depth]
    forpl = d1[['ratio', 'feature_id', 'sampleid', 'weekn', 'depth', 'size_code', 'Phylum']].copy()
    slwplot = forpl.pivot_table(index=["feature_id", "depth", 'weekn', 'Phylum'], columns="size_code", values='ratio').fillna(0)
    slwplot = slwplot.reset_index()
    sns.set_style("white")

    #slwplot = slwplot.loc[slwplot['weekn'] = 10]
    #slwplot = slwplot.loc[slwplot['weekn'] = 11]

    slwplot = slwplot.rename(columns={"depth": "Depth"})
    slwplot["weekn"] = pd.to_numeric(slwplot["weekn"])
    # `palette` is ignored by seaborn when no `hue` is given, so the points were
    # drawn in the default colour; `color` is the argument that makes them black.
    g = sns.scatterplot(x="W", y="SL", data=slwplot, color='black')#, hue='Phylum', alpha=0.6) #, hue="weekn");

    #uncomment for log-log
    #g.set(xscale="log", yscale="log")
    g.set_ylabel("Defractionated",fontsize=15)
    g.set_xlabel("Unfractionated",fontsize=15)
    g.tick_params(labelsize=12)
    #g.legend(loc='center left', bbox_to_anchor=(1.25, 0.5), ncol=3)
    plt.legend([],[], frameon=False)

    #plt.savefig('outputs/lmplot_'+comm+str(depth)+'WSL.png', bbox_inches='tight', dpi=300)

    plt.show()

### Phylogenetic analysis: taxonomic bar plots of relative abundance per depth and size fraction

In [None]:
#get a list of top taxa to provide the palette for the visualisation
toptaxa = newseparated[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', 'Phylum']].copy()
toptaxa = toptaxa.drop_duplicates()
# 'sum' (string) instead of the builtin: passing builtin callables to .agg is deprecated in pandas
df_agg = toptaxa.groupby(['size_code','Phylum', 'depth']).agg({'feature_frequency':'sum'})
# ten most abundant phyla within every size code / depth combination
topd = df_agg['feature_frequency'].groupby(['size_code', 'depth'], group_keys=False).nlargest(10)
topd = topd.to_frame()
topd = topd.reset_index()
listoftop = topd['Phylum'].unique()

#set a palette for the toptaxa
hex_colors_dic = {}
rgb_colors_dic = {}
hex_colors_only = []
# loop variable renamed from `hex`, which shadowed the builtin for the rest of the notebook
for color_name, hex_code in matplotlib.colors.cnames.items():
    hex_colors_only.append(hex_code)
    hex_colors_dic[color_name] = hex_code
    rgb_colors_dic[color_name] = matplotlib.colors.to_rgb(hex_code)

# NOTE(review): zip() stops at the shorter input, so any taxon beyond the length
# of the Plasma colour list gets no entry in the palette.
palette_dict = {taxon: color for taxon, color in zip(listoftop, px.colors.sequential.Plasma)}

In [None]:
phyld, top10d = taxbarplot(newseparated, 'Phylum', 1, 5)

#### Top taxon longitudinal analysis

In [None]:
# newseparated.loc[(newseparated['weekn']==12)&
#                  (newseparated['depth']==10)]
#                & (newseparated['size_code']=='W')]

In [None]:
sfd=newseparated
toptaxa = sfd[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', 'Class']].copy()
toptaxa = toptaxa.drop_duplicates()
df_agg = toptaxa.groupby(['size_code','Class', 'depth', 'weekn']).agg({'feature_frequency':sum})
topd = df_agg['feature_frequency'].groupby(['size_code', 'depth','weekn'], group_keys=False).nlargest(1)
topd = topd.to_frame()
topd = topd.reset_index()

In [None]:
topd.loc[topd['weekn'] == 12]

In [None]:
#list the unique top taxa
topd['Class'].unique()

In [None]:
type_dic = {
    #"c__Cyanobacteriia": 1,
    "c__OM190": 3,
    "c__Bacteroidia": 2,
    "c__Gammaproteobacteria": 5,
    "c__Alphaproteobacteria": 4}

In [None]:
#make a community-type column
# Map each top class to its numeric code. Classes missing from type_dic become NaN
# (previously an empty string, which broke the later astype(float) on the pivot).
topd['comm_type'] = topd['Class'].map(type_dic)

topd

In [None]:
topd["sc_weekn"] = topd["depth"].astype(str) + topd["size_code"]
topd

In [None]:
topd = topd.sort_values(['depth', 'size_code'])

In [None]:
topd.loc[topd['sc_weekn'] == '10W']

In [None]:
topd

In [None]:
topdlist = topd['sc_weekn'].tolist()

def uniqlist(seq):
    """Return the items of ``seq`` once each, in order of first appearance."""
    # dict keys are unique and keep insertion order
    return list(dict.fromkeys(seq))

mylist = uniqlist(topdlist)

In [None]:
glue = topd.pivot(index="sc_weekn", columns="weekn", values="comm_type")
glue = glue.reindex(mylist)
glue = glue[glue.columns].astype(float)

In [None]:
from matplotlib.colors import ListedColormap
cmap_dict = {#1: '#77AADD',
             2: '#EEDD88', 3: '#99DDFF', 4: '#BBCC33', 5:'#DDDDDD'}
cmap = ListedColormap([cmap_dict[i] for i in range(2,6,1)])

In [None]:
sns.set_style('ticks')
plt.figure(figsize=(5, 5))


# xticklabels=4 labels every 4th column with its own column name. Calling
# ax.set_xticks() after the heatmap only moved the tick positions and kept the
# old label list, so ticks ended up on cell edges with the wrong week labels.
ax = sns.heatmap(glue, fmt='f', yticklabels=True, xticklabels=4, linewidths=.5, cmap=cmap)
# dashed separators every four rows
# NOTE(review): assumes four size-code rows per depth — confirm against `glue`
for boundary in (4, 8, 12, 16):
    ax.axhline(boundary, ls='--')

plt.savefig('outputs/heatmap_'+comm+'top1clasno_chloroplast.png', bbox_inches='tight', dpi=300)

plt.show()

#### Longitudinal analysis of top 3 taxa

Run these lines only for the 18S rRNA analysis

In [None]:
#18S has many more classes as top values to
top518s = topd['Class'].value_counts()[:4].index.tolist()

In [None]:
topd.loc[~topd["Class"].isin(top518s), "Class"] = "Other"

In [None]:
newdf = topd.groupby(['size_code', 'depth','weekn'])['Class'].apply(lambda x: list(set(x)))
newdf = newdf.reset_index()

In [None]:
result = newdf.copy()

In [None]:
# Sort the class names inside each list, but keep the size_code / depth / weekn
# columns: later cells copy `result` into `topd` and need those keys to build
# the row labels and the pivot.
result = newdf.assign(Class=newdf['Class'].apply(sorted))

In [None]:
result.Class.apply(tuple).unique()

In [None]:
topd['Class'].unique()

In [None]:
type_dic = {
    'c__Cryptophyceae':1, 'c__Dinophyceae':2, 'c__Prymnesiophyceae':3,
       'c__Mediophyceae':4, 'c__Monogononta':5, 'c__Tentaculata':6,
       'c__Maxillopoda':7, 'c__Thecofilosea':8, 'Unassigned':9, 'c__Insecta':10,
       'c__MAST-2':11, 'c__Hydrozoa':12, 'c__Syndiniales':13,
       'c__Intramacronucleata':14, 'c__Pucciniomycetes':15,
       'c__Mamiellophyceae':16, 'c__Raphidophyceae':17, 'c__MAST-1A':18,
       'c__Bicosoecida':19, 'c__Polychaeta':20, 'c__Tremellomycetes':21,
       'c__Embryophyta':22, 'c__Dothideomycetes':23, 'c__Incertae_Sedis':24,
       'c__MAST-7A':25
}

In [None]:
type_dic = {'Other':5, 'c__Dinophyceae':1, 'c__Mediophyceae':2,
       'c__Intramacronucleata':3, 'c__Syndiniales':4}

In [None]:
result['liststring'] = result['Class'].apply(lambda x: ','.join(map(str, x)))

In [None]:
result['liststring'].unique()

In [None]:
#use this dic if looking at community type by 3 top taxa
type_dic = {'c__Bacteroidia,c__Cyanobacteriia,c__OM190':1,
       'c__Gammaproteobacteria,c__Bacteroidia,c__Cyanobacteriia':2,
       'c__Alphaproteobacteria,c__Bacteroidia,c__Cyanobacteriia':3,
       'c__Bacteroidia,c__Cyanobacteriia,c__Planctomycetes':4,
       'c__Bacteroidia,c__Alphaproteobacteria,c__OM190':5,
       'c__Bacteroidia,c__OM190,c__Planctomycetes':6,
       'c__Gammaproteobacteria,c__Bacteroidia,c__OM190':7,
       'c__Gammaproteobacteria,c__Bacteroidia,c__Planctomycetes':8,
       'c__Cyanobacteriia,c__OM190,c__Planctomycetes':9,
       'c__Alphaproteobacteria,c__Gammaproteobacteria,c__Cyanobacteriia':10,
       'c__Gammaproteobacteria,c__Bacteroidia,c__Alphaproteobacteria':11,
       'c__Bacteroidia,c__Alphaproteobacteria,c__Cyanobacteriia':12,
       'c__Nitrososphaeria,c__Gammaproteobacteria,c__Bacteroidia':13,
       'c__Gammaproteobacteria,c__Alphaproteobacteria':14,
       'c__Cyanobacteriia':15}

In [None]:
#make a community-type column
# Classes missing from type_dic become NaN instead of an empty string, so the
# column stays numeric for the heatmap pivot.
topd['comm_type'] = topd['Class'].map(type_dic)

topd

In [None]:
# Look up the community-type code of each comma-joined class combination.
# Combinations missing from type_dic become NaN rather than an empty string.
result['comm_type'] = result['liststring'].map(type_dic)

result

In [None]:
topd = result.copy()

In [None]:
topd["sc_weekn"] = topd["depth"].astype(str) + topd["size_code"]
topd

In [None]:
topd = topd.sort_values(['depth', 'size_code'])

In [None]:
topdlist = topd['sc_weekn'].tolist()

def uniqlist(seq):
    """Return the items of ``seq`` once each, in order of first appearance.

    NOTE(review): this is a second definition of the helper that already exists
    earlier in the notebook; the later one silently replaces the earlier one.
    """
    seen = set()
    # bound method cached in a local name
    seen_add = seen.add
    # set.add returns None, so `seen_add(x)` records x while evaluating as falsy
    return [x for x in seq if not (x in seen or seen_add(x))]

mylist = uniqlist(topdlist)

In [None]:
glue = topd.pivot(index="sc_weekn", columns="weekn", values="comm_type")
glue = glue.reindex(mylist)
glue = glue[glue.columns].astype(float)

In [None]:
 # Candidate colour palette (kept as a note; the bare tuple with a trailing '.' was a SyntaxError):
 # '#77AADD', '#EE8866', '#EEDD88', '#FFAABB', '#99DDFF', '#44BB99', '#BBCC33', '#AAAA00', '#DDDDDD'

In [None]:
from matplotlib.colors import ListedColormap
cmap_dict = {1: '#77AADD', 2: '#EEDD88', 3: '#99DDFF', 4: '#BBCC33', 5:'#DDDDDD'}
cmap = ListedColormap([cmap_dict[i] for i in range(1,6,1)])

In [None]:
cmap_dict ={1:'#125A56', 2:'#00767B', 3:'#238F9D', 4:'#42A7C6', 5:'#60BCE9',
            6:'#9DCCEF', 7:'#C6DBED', 8:'#DEE6E7', 9:'#ECEADA', 10:'#F0E6B2',
            11:'#F9D576', 12:'#FFB954', 13:'#FD9A44', 14:'#F57634', 15:'#E94C1F'}#, '#D11807', '#A01813'.
cmap = ListedColormap([cmap_dict[i] for i in range(1,16,1)])

In [None]:
sns.set_style('ticks')
plt.figure(figsize=(5, 5))


# xticklabels=4 labels every 4th column with its own column name. Calling
# ax.set_xticks() after the heatmap only moved the tick positions and kept the
# old label list, so ticks ended up on cell edges with the wrong week labels.
ax = sns.heatmap(glue, fmt='f', yticklabels=True, xticklabels=4, linewidths=.5, cmap=cmap)
# dashed separators every four rows
# NOTE(review): assumes four size-code rows per depth — confirm against `glue`
for boundary in (4, 8, 12, 16):
    ax.axhline(boundary, ls='--')


plt.savefig('outputs/heatmap_'+comm+'top1class_reducde.png', bbox_inches='tight', dpi=300)

plt.show()

In [None]:
    sfd=separated[separated.depth==depth]
    toptaxa = sfd[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', level]].copy()
    toptaxa = toptaxa.drop_duplicates()
    df_agg = toptaxa.groupby(['size_code',level, 'depth']).agg({'feature_frequency':sum})
    topd = df_agg['feature_frequency'].groupby('size_code', group_keys=False).nlargest(topn)
    topd = topd.to_frame()
    topd = topd.reset_index()


    df_agg = df_agg.reset_index()
    df_agg['set_name'] = df_agg['size_code']+df_agg['depth'].astype(str)
    
    cumulab = separated[['feature_frequency', 'depth', 'size_code', 'Genus']].copy()
    cumulab1 = cumulab.groupby(['Genus']).agg({'feature_frequency':sum})

    resultpivot = df_agg.pivot_table(index=level, columns='set_name', values='feature_frequency')
    resultpivot = resultpivot.fillna(0)
    resultpivot[resultpivot != 0] = 1
    tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
    tosave.to_csv(level+'_'+str(depth)+'16S_relab.csv')
    
    top10d_list = topd[level].unique()
    top10d = sfd.copy()
    top10d.loc[~top10d[level].isin(top10d_list), level] = 'Other' #isnot in top list
    phyld = top10d.groupby(['size_code','weekn', level])['ratio'].sum()
    phyld = phyld.reset_index()


    fig = px.bar(phyld, x="size_code", y="ratio", facet_col="weekn", color=level, labels={
                     "feature_frequency": "Relative abundance",
                     "size_code": "",
                     "weekn": "w"}, color_discrete_map=palette_dict)
    fig.update_xaxes(type='category', dtick=1)
    fig.update_layout(
        title="Relative abundance of top 10" + level + 'observed at Depth' + str(depth),
        yaxis_title="Relative abundance",
        xaxis_title="Size fraction",
        legend_title=level,
        font=dict(size=8)
    )

    fig.show()
    #fig.write_image("outputs/fig1.png")
    #fig.to_image(format="png")

In [None]:
phyld, top10d = taxbarplot(newseparated, 'Class', 5, 5)

In [None]:
plot_df2 = plot_df2.drop_duplicates()

In [None]:
plot_df3 = plot_df2.set_index('sampleid')
plot_df3

In [None]:
permanova2 = permanova(dm, plot_df3, 'Size code')
permanova2

In [None]:
distance_matrix2 = distance_matrix.reset_index()
idedup = distance_matrix2['samples'].to_list()
dm = DistanceMatrix(distance_matrix, ids=idedup)
df123 = dm.to_data_frame()

In [None]:
df123.to_csv('distance_matrix_5m16s.tsv', sep='\t')

In [None]:
plot_df2.to_csv('METADATAtiny.txt', sep='\t')

In [None]:
pca, pca_features, sfdclr = pcaplot(newseparated, 5, comm, 'Size code', 'DFr')

In [None]:
newmetadata = newseparated[['sampleid', 'weekn', 'size_code', 'depth', 'depth_code', 'month_name']].copy()
newmetadata = newmetadata.drop_duplicates()
newmetadata.to_csv('METADATA.tsv', sep='\t')

In [None]:
pca_features

In [None]:
from skbio.stats.distance import permanova

In [None]:
# Taxonomic level used by the shared-feature plots below, and the label shown in their titles.
level = 'feature_id'
# NOTE: `id` shadows the builtin, but the plotting cells below build their titles from it
if level == 'feature_id':
    id = 'ASV'
else:
    id = level

# was misspelled `subtitile`, while every plotting call below passes `subtitle`
subtitle = 'subtitle'

In [None]:
dfplot, level = calcperc(comm, separated, level)
# variables
labels = ['S ∩ W', 'L ∩ W','W ∩ (S ∩ L)']
colors = ['#1D2F6F', '#8390FA', '#C49CD3']
title = 'Weighted proportion of shared '+id+' between size fractionated and non size fractionated samples'

plot_stackedbar_p(dfplot, labels, colors, title, subtitle, level)

In [None]:
dfplot, level, dfplot_unweighted = calcperc_defrac_unweighted(comm, newseparated, level)
# variables
labels = ['Size fractionated samples', 'Not size fractionated samples','Both']
colors = ['#1D2F6F', '#8390FA', '#6EAF46']
title = 'Unweighted proportion of shared '+id+' between size fractionated and non size fractionated samples'

plot_stackedbar_p(dfplot, labels, colors, title, subtitle, level)

In [None]:
dfplot, level = calcperc_defrac(comm, newseparated, level)
# variables
labels = ['SF samples', 'NSF samples','Both', 'DFr']
colors = ['#1D2F6F', '#8390FA', '#6EAF46', '#de282e']
title = 'Weighted proportion of shared '+id+' between size fractionated and non size fractionated samples'

plot_stackedbar_p(dfplot, labels, colors, title, subtitle, level)

In [None]:
dfplotSLNSF, dfplot_normalized, level = calcperc_SLNSF(comm, separated, level)
# variables
labels = ['S', 'L','S ∩ L', 'W']
colors = ['#976BE5','#E56BE5','#FF96FF', '#6EAF46']
title = 'Weighted proportion of shared '+ id +' between size fractionated and non size fractionated samples'

plot_stackedbar_p_SLNSF(dfplotSLNSF, labels, colors, title, subtitle, level, 30.1, 5)

In [None]:
dfplotLSW, dfplot_normalized, level = calcperc_LSW(comm, separated, level)
#variables
labels = ['NSF samples', 'Large and whole','Small and whole', 'Large, small, and whole']
colors = ['#8390FA', '#FF0000', '#DAD746', '#E89618']
title = 'Weighted proportion of shared '+ id +' between size fractionated and non size fractionated samples'

plot_stackedbar_p_SLNSF(dfplotLSW, labels, colors, title, subtitle, level)

In [None]:
dfplotLS_W, dfplot_normalized, level = calcperc_LS_W(comm, separated, level)
#variables
labels = ['NSF samples', 'Large and whole','Small and whole']
colors = ['#8390FA', '#FF0000', '#DAD746']
title = 'Weighted proportion of shared '+ id +' between size fractionated and non size fractionated samples'

plot_stackedbar_p_SLNSF(dfplotLS_W, labels, colors, title, subtitle, level, 45.1, 5)

In [None]:
newseparated

In [None]:
# Venn diagram of features shared between the large, small and whole-water samples (all depths pooled).
level = 'feature_id'

toptaxa = separated[[level, 'feature_frequency', 'Taxon', 'size_code', 'weekn']].copy()

toptaxa = toptaxa.drop_duplicates()
# 'sum' (string) instead of the builtin: passing builtin callables to .agg is deprecated in pandas
df_agg = toptaxa.groupby(['size_code',level]).agg({'feature_frequency':'sum'})

# one row per feature, one column per size code; 0 means "not observed in that fraction"
df_agg = df_agg.reset_index()
resultpivot = df_agg.pivot_table(index=level, columns='size_code', values='feature_frequency')
resultpivot = resultpivot.fillna(0)

df1 = resultpivot.copy()

# NOTE(review): this reuses the generic notebook-level name `df`
df = resultpivot[['L', 'S', 'W']].copy()
Sonly = df[(df['L'] == 0) & (df['W'] == 0)]
Wonly = df[(df['L'] == 0) & (df['S'] == 0)]
Lonly = df[(df['S'] == 0) & (df['W'] == 0)]
LW = df[(df['S'] == 0) & (df['W'] != 0) & (df['L'] != 0)]
LS = df[(df['W'] == 0) & (df['S'] != 0) & (df['L'] != 0)]
SW = df[(df['W'] != 0) & (df['S'] != 0) & (df['L'] == 0)]
LSW = df[~(df == 0).any(axis=1)]

# venn3 subset order: (A only, B only, A∩B, C only, A∩C, B∩C, A∩B∩C) with A=L, B=S, C=W
# NOTE(review): the size ranges in the labels ('3-02μm', '<0.22μm') look like typos — confirm the intended cut-offs
c = venn3(subsets = (len(Lonly), len(Sonly), len(LS), len(Wonly), len(LW), len(SW), len(LSW)),
          set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'),
          #set_colors=('#E56BE5', '#976BE5', '#6EAF46'),
          set_colors=('#E56BE5', '#976BE5', '#6EAF46'), alpha = 1);


# save next to the other figures of the community being analysed; the folder was
# hard-coded to 02-EUKs, so 16S runs wrote into the eukaryote output directory
plt.savefig("outputs/"+comm_id+"/D"+level+"_vennall.png", dpi=200, bbox_inches="tight")

**Another idea:** run ANCOM separately on the size-fractionated (SF) and the non-size-fractionated (NSF) samples, grouping by time, month, or some other metadata column, and then compare the number and taxonomy of the differentially abundant taxa recovered.
Are we recovering the same differentially abundant taxa from (1) the SF samples and (2) the NSF samples?

In [None]:
pca, pca_features, sfdclr, dm, plot_df2, df, distance_matrix = pcaplot(newseparated, 1, comm, 'Size code')

## 18S ANCOM PER DEPTH

In [None]:
pca1_18, pca_features1_18, sfdclr1_18 = pcaplot(newseparated, 1, comm, 'Size code')
DARejected_SC_taxonomy1_18, prcentile1_18 = run_ancom(newseparated, sfdclr1_18, 1, 'size_code')

In [None]:
prcentile1_18

In [None]:
DARejected_SC_taxonomy1_18.sort_values(by='W')

In [None]:
# PCA + ANCOM on size code for the 5 m samples.
# Label argument spelled 'Size code' to match every other pcaplot call (it was 'Size Code' here).
pca5_18, pca_features5_18, sfdclr5_18 = pcaplot(newseparated, 5, '18S', 'Size code')
DARejected_SC_taxonomy5_18, prcentile5_18 = run_ancom(newseparated, sfdclr5_18, 5, 'size_code')

In [None]:
DARejected_SC_taxonomy5_18.sort_values(by='W')

In [None]:
pca10_18, pca_features10_18, sfdclr10_18 = pcaplot(newseparated, 10, '18S', 'Size code')
DARejected_SC_taxonomy10_18, prcentile10_18 = run_ancom(newseparated, sfdclr10_18, 10, 'size_code')

In [None]:
DARejected_SC_taxonomy10_18.sort_values(by='W')

In [None]:
pca30_18, pca_features30_18, sfdclr30_18 = pcaplot(newseparated, 30, '18S', 'Size code')
DARejected_SC_taxonomy30_18, prcentile30_18 = run_ancom(newseparated, sfdclr30_18, 30, 'size_code')

In [None]:
DARejected_SC_taxonomy30_18.sort_values(by='W')

In [None]:
# PCA + ANCOM on size code for the 60 m samples.
# Label argument spelled 'Size code' to match every other pcaplot call (it was 'size code' here).
pca60_18, pca_features60_18, sfdclr60_18 = pcaplot(newseparated, 60, '18S', 'Size code')
DARejected_SC_taxonomy60_18, prcentile60_18 = run_ancom(newseparated, sfdclr60_18, 60, 'size_code')

In [None]:
DARejected_SC_taxonomy60_18.sort_values(by='W')

## 16S ANCOM PER DEPTH

In [None]:
newseparated.loc[newseparated['feature_id'] == '51d96e96a3350beedece3878d6d0b3e7']

In [None]:
df1 = newseparated[newseparated.size_code != 'S']
df1

In [None]:
df1 = df1[df1.size_code != 'L']
df1

In [None]:
# NOTE(review): this overwrites `newseparated` from the generic name `df`, which several
# earlier cells reassign to different objects — the result depends on execution order.
# Confirm which frame is meant (the cells just above filter `newseparated` into `df1`).
newseparated = df[df.size_code != 'S']

In [None]:
newseparated = newseparated[newseparated.depth == 1]

In [None]:
newbiom = newseparated.pivot_table(index="feature_id", columns="sampleid", values="feature_frequency")

In [None]:
newbiom = newbiom.fillna(0)

In [None]:
newbiom.to_csv('newbiomdepth1.tsv', sep="\t") 

In [None]:
pca1_16, pca_features1_16, sfdclr1_16 = pcaplot(df1, 1, '16S', 'Size code')
DARejected_SC_taxonomy1_16, prcentile1_16 = run_ancom(df1, sfdclr1_16, 1, 'weekn')

In [None]:
prcentile1_16 = prcentile1_16.reset_index()

In [None]:
prcentile1_16.loc[prcentile1_16['feature_id'] == '80c73848b68ff95bd030fccfce011294']

In [None]:
DARejected_SC_taxonomy1_16.sort_values(by='W')

In [None]:
pca5_16, pca_features5_16, sfdclr5_16 = pcaplot(newseparated, 5, '16S', 'Size code')
DARejected_SC_taxonomy5_16, prcentile5_16 = run_ancom(newseparated, sfdclr5_16, 5, 'size_code')

In [None]:
DARejected_SC_taxonomy5_16.sort_values(by='W')

In [None]:
pca10_16, pca_features10_16, sfdclr10_16 = pcaplot(newseparated, 10, '16S', 'Size code')
DARejected_SC_taxonomy10_16, prcentile10_16 = run_ancom(newseparated, sfdclr10_16, 10, 'size_code')

In [None]:
DARejected_SC_taxonomy10_16.sort_values(by='W')

In [None]:
pca30_16, pca_features30_16, sfdclr30_16 = pcaplot(newseparated, 30, '16S', 'Size code')
DARejected_SC_taxonomy30_16, prcentile30_16 = run_ancom(newseparated, sfdclr30_16, 30, 'size_code')

In [None]:
DARejected_SC_taxonomy30_16.sort_values(by='W')

In [None]:
pca60_16, pca_features60_16, sfdclr60_16 = pcaplot(newseparated, 60, '16S', 'Size code')
DARejected_SC_taxonomy60_16, prcentile60__16 = run_ancom(newseparated, sfdclr60_16, 60, 'size_code')

In [None]:
DARejected_SC_taxonomy60_16.sort_values(by='W')

TODO: run a clustermap of the top 10 taxa at each depth and color the rows by depth, month, and size code.

In [None]:
prcentile60__16

In [None]:
pca, pca_features, sfdclr = pcaplot(separated, 1, '16S')
DARejected_month_taxonomy_16_1, prcentile = run_ancom(sfdclr, 1, 'Month')

In [None]:
plot_per_fid('16S', separated, 60, '50a0c3221c68046dfc96e032aff1ccd8')

In [None]:
plot_df2.sort_values('dim1')

### Upset plot data prep

In [None]:
upsetprep('16S', 'Genus')

In [None]:
#if we want to make an upset plot of all depths?

frames = sfd1, sfd5, sfd10, sfd30, sfd60
result = pd.concat(frames)
resultpivot = result.pivot_table(index='Genus', columns='set_name', values='feature_frequency')
resultpivot = resultpivot.fillna(0)
resultpivot[resultpivot != 0] = 1
tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
tosave.to_csv('genus_all16S_relab.csv')

### Venn diagrams

In [None]:
#Depth 1 all 16S at genus level
#N=84
# Subset sizes are typed in by hand, in venn3 order:
# (A only, B only, A∩B, C only, A∩C, B∩C, A∩B∩C) with A=Large, B=Small, C=Whole water.
venn3(subsets = (27, 11, 4, 7, 6, 5, 22), set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5);
#venn3(subsets = (Lonly, Sonly, LS, Wonly, LW, SW, LSW), set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5);

plt.title("1m depth") 
# NOTE(review): the comment above says 16S, but the figure is written to the 02-EUKs folder — confirm the target directory
plt.savefig("outputs/02-EUKs/D1_genus_venn.png")
plt.show()

In [None]:
lm = sfa.ols('nASVs ~ C(size_code)', data=sfd_LM).fit()
anova = sa.stats.anova_lm(lm)
spPH.posthoc_ttest(sfd_LM, val_col='nASVs', group_col='size_code', p_adjust='holm')

In [None]:
sfd_S = sfd[['size_code', 'nASVs', 'weekn']].copy()
sfd_S = sfd_S.drop_duplicates()
sdfpv = sfd_S.pivot(index='weekn', columns='size_code', values='nASVs')
fvalue, pvalue = stats.f_oneway(sdfpv['L'], sdfpv['S'], sdfpv['W'])

In [None]:
sfd_LM = sfd[['size_code', 'nASVs']].copy()
sfd_LM = sfd_LM.drop_duplicates()
lm = sfa.ols('nASVs ~ C(size_code)', data=sfd_LM).fit()
anova = sa.stats.anova_lm(lm)
spPH.posthoc_ttest(sfd_LM, val_col='nASVs', group_col='size_code', p_adjust='holm')

In [None]:
dffff = spPH.posthoc_ttest(sfd_LM, val_col='nASVs', group_col='size_code', p_adjust='holm')
dffff

In [None]:
sfd_S = sfd_S.drop_duplicates()

In [None]:
sfd_S = sfd_S.set_index('weekn')

In [None]:
sfd_sdf = sfd_S.stack().to_frame().reset_index()

In [None]:
sdfpv = sfd_S.pivot(index='weekn', columns='size_code', values='nASVs')

In [None]:
sdfpv

In [None]:
fvalue, pvalue = stats.f_oneway(sdfpv['L'], sdfpv['S'], sdfpv['W'])

In [None]:
pvalue

In [None]:
plot_stackedbar_p(dfplot, labels, colors, title, subtitle)

In [None]:
plt.savefig('outputs/'+comm_id+'/D'+str(depth)+'_adlineplot.png', dpi=200, bbox_inches="tight")

In [None]:
df_grouped.describe()

In [None]:
##make new category of S+L

In [None]:
# Build "de-fractionated" samples: weight each size-fractionated sample's feature
# frequencies by its share of the DNA in that week/depth, sum them per feature,
# and combine the result with the whole-water (W) rows of `separated`.
# Inputs from earlier cells: `all_md` (gets/updates a 'size_code' column in place)
# and `separated`. Outputs: sep_SLRA, sep_WO, sep_L, sep_S, newseparated.

#make sure all size codes are indicated; ids without a matching letter are labelled 'W'
# NOTE(review): `[L-S]` is a character range (L, M, ..., S), not only "L" or "S" —
# confirm no sample id carries another letter of that range in this position.
all_md["size_code"] = all_md["sampleid"].str.extract(r'[1-9][0-9]?[A-E]([L-S])')
all_md["size_code"] = all_md["size_code"].fillna('W')

#keep the size-fractionated samples (size_code other than W) and drop weeks after week 16
sep_SL = all_md[all_md.size_code != "W"]
sep_SL = sep_SL.drop(sep_SL[sep_SL.weekn > 16].index)

#total [DNA] over the size fractions of each week/depth
sep_SL['[DNAt]'] = sep_SL.groupby(['weekn', 'depth'])['[DNA]ng/ul'].transform('sum')

#DNA proportion contributed by each size fraction
sep_SL['DNApr'] = sep_SL['[DNA]ng/ul']/sep_SL['[DNAt]']

#attach the proportions to the per-feature table via the shared sampleid column
sep_SL = sep_SL[['sampleid', 'DNApr', '[DNAt]']].copy()
sepSLRA = pd.merge(separated, sep_SL, on=['sampleid'], how='left')

#exclude ASVs from the whole water
# The mask is built from the merged frame itself: a mask taken from `separated`
# only works while both frames happen to share the same index. The .copy() makes
# the column assignments below write to an independent frame, not a slice.
sep_SLRA = sepSLRA.loc[sepSLRA['size_code'] != "W"].copy()

#DNA-weighted feature frequency, summed per feature within each week/depth
sep_SLRA['Newfeature_frequency'] = sep_SLRA['feature_frequency'] * sep_SLRA['DNApr']
sep_SLRA['Newff'] = sep_SLRA.groupby(['feature_id', 'weekn', 'depth'])['Newfeature_frequency'].transform('sum')


#sep_SLRA = sep_SLRA.drop(['sampleid', 'size_code'], axis=1)
#sep_SLRA['sampleid'] = "BB22." + sep_SLRA['weekn'].astype(str) + sep_SLRA['depth_code'] + "SL"

#uncomment the line below if keeping small and large original sample
#sep_SLRA['size_code'] = sep_SLRA['size_code'] + '-DFr'

#uncomment the line below if merging smallandlarge
#sep_SLRA['size_code'] = 'SL'

#discard the per-sample columns carried over from `separated`; they are recomputed below
sep_SLRA = sep_SLRA.drop(['feature_frequency', 'Total', 'ratio', 'nASVs', 'weekdepth', 'avg',
                          'diff', 'extraction_date', '[DNA]ng/ul', 'A260/280', 'A260/230',
                          'Newfeature_frequency'], axis=1)
sep_SLRA = sep_SLRA.rename(columns={'Newff':'feature_frequency'})
sep_SLRA = sep_SLRA.drop_duplicates()

#recompute per-sample total, relative abundance and number of ASVs
sep_SLRA['Total'] = sep_SLRA['feature_frequency'].groupby(sep_SLRA['sampleid']).transform('sum')
sep_SLRA['ratio'] = sep_SLRA['feature_frequency']/sep_SLRA['Total']
sep_SLRA['nASVs'] = sep_SLRA['feature_id'].groupby(sep_SLRA['sampleid']).transform('nunique')

sep_SLRA = sep_SLRA.drop_duplicates()

#original (uncorrected) rows of each size code
sep_WO = separated[separated.size_code == "W"]
sep_L = separated[separated.size_code == "L"]
sep_S = separated[separated.size_code == "S"]

#corrected size-fraction rows stacked on top of the whole-water rows
newseparated = pd.concat([sep_SLRA, sep_WO], ignore_index=True)

#difference between each row's nASVs and the mean nASVs of its week/depth
newseparated['weekdepth'] = newseparated["weekn"].astype(str) + newseparated["depth"].astype(str)
newseparated['avg'] = newseparated['nASVs'].groupby(newseparated['weekdepth']).transform('mean')
newseparated['diff'] = newseparated['nASVs'] - newseparated['avg']

PERMANOVA post-hoc results from R, plotted as bar charts

In [None]:
# Load the post-hoc results table from the R_results folder.
permresu = pd.read_csv(filepath_or_buffer='R_results/post_hoc_results.csv')

In [None]:
# Combined label: depth (as text) immediately followed by the pair name.
depth_as_text = permresu["depth"].astype(str)
permresu["depth_pairs"] = depth_as_text + permresu["pairs"]

In [None]:
# Horizontal bars of adjusted p-values per pair: one panel per 'comm', bars
# coloured by depth, drawn with log=True.
# Note: `ax` holds the grid object returned by catplot, not a matplotlib Axes.
ax = sns.catplot(
    data=permresu,
    kind="bar",
    x="p.adjusted",
    y="pairs",
    hue="depth",
    col="comm",
    palette="Greys",
    log=True,
    height=4,
    aspect=1.3,
)
#ax.set(xlim=(0, 0.10))

# red reference line at p = 0.05 in every panel
ax.refline(x=0.05, color='red')

plt.savefig('outputs/perm_pvalues_logged.png', dpi=300, bbox_inches='tight')

plt.show()

In [None]:
# NOTE(review): this cell repeats the preceding plot cell and writes the same PNG again.
# Horizontal bars of adjusted p-values per pair: one panel per 'comm', bars
# coloured by depth, drawn with log=True.
ax = sns.catplot(
    data=permresu,
    kind="bar",
    x="p.adjusted",
    y="pairs",
    hue="depth",
    col="comm",
    palette="Greys",
    log=True,
    height=4,
    aspect=1.3,
)
#ax.set(xlim=(0, 0.10))

# red reference line at p = 0.05 in every panel
ax.refline(x=0.05, color='red')

plt.savefig('outputs/perm_pvalues_logged.png', dpi=300, bbox_inches='tight')

plt.show()