In [1]:
#for importing, formatting and data manipulation
import pandas as pd
import numpy as np
import glob
import datetime
#from time import time
#from datetime import datetime
#from datetime import timedelta
import tempfile
from qiime2 import Artifact
import zipfile
import yaml

#for plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib_venn import venn3, venn3_circles
import seaborn as sns
#sns.set(style="whitegrid")
import plotly.express as px
%matplotlib inline
from IPython.display import display
from upsetplot import plot
#import pyupset as pyu
pd.set_option('display.max_rows', 15)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from pandas.plotting import register_matplotlib_converters
from mpl_toolkits.mplot3d import Axes3D
import plotly.graph_objects as go
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
register_matplotlib_converters()

#for statistical analyses
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from skbio.diversity import alpha_diversity
from skbio.stats.distance import permanova
from skbio import DistanceMatrix
from scipy.spatial.distance import cdist
from skbio.stats.composition import clr
from skbio.stats.composition import alr
from skbio.stats.composition import ilr
from skbio.diversity.alpha import chao1

## Import and format metadata from lab, and BBMP

### Import lab metadata

In [2]:
def load_df():
    
    filenames = glob.glob('/Users/Diana/Documents/escuela/phd/ch2/bb_data/20**/METADATA.txt')
    #load all metadata and concatenate them into one dataframe
    md = []
    for filename in filenames:
        df = pd.read_csv(filename, sep='\t')
        md.append(df)
        print (filename)
    
    md = pd.concat(md)
    
    #drop empty columns and rows
    md.dropna(how='all', axis=1, inplace=True) #empty cols
    md.dropna(how='all', inplace=True) #empty rows
    
    return md

In [3]:
md = load_df()

/Users/Diana/Documents/escuela/phd/ch2/bb_data/2014/METADATA.txt
/Users/Diana/Documents/escuela/phd/ch2/bb_data/2022/METADATA.txt
/Users/Diana/Documents/escuela/phd/ch2/bb_data/2015/METADATA.txt
/Users/Diana/Documents/escuela/phd/ch2/bb_data/2017/METADATA.txt
/Users/Diana/Documents/escuela/phd/ch2/bb_data/2019/METADATA.txt
/Users/Diana/Documents/escuela/phd/ch2/bb_data/2021/METADATA.txt
/Users/Diana/Documents/escuela/phd/ch2/bb_data/2020/METADATA.txt
/Users/Diana/Documents/escuela/phd/ch2/bb_data/2018/METADATA.txt
/Users/Diana/Documents/escuela/phd/ch2/bb_data/2016/METADATA.txt


#### Renumber dates

In [4]:
#create a dictionary for months
month_dic = {
    "Jan": 1,
    "Feb": 2,
    "Mar": 3,
    "Apr": 4,
    "May": 5,
    "Jun": 6,
    "Jul": 7,
    "Aug": 8,
    "Sep": 9,
    "Oct": 10,
    "Nov": 11,
    "Dec": 12
}
month_season = {
    "Jan": "Winter",
    "Feb": "Winter",
    "Mar": "Spring",
    "Apr": "Spring",
    "May": "Spring",
    "Jun": "Summer",
    "Jul": "Summer",
    "Aug": "Summer",
    "Sep": "Autumn",
    "Oct": "Autumn",
    "Nov": "Autumn",
    "Dec": "Winter"
}
depth_num = {
    "A": 1,
    "B": 5,
    "C": 10,
    "D": 60,
    "E": 30
}

In [5]:
#add month to a new column
md['month_name'] = md['date'].str.split('-').str[1]

#add month number
md['month']= md['month_name'].map(month_dic)

#add day number
md['day'] = md['date'].str.split('-').str[0]
md[["year", "month", "day"]] = md[["year", "month", "day"]].apply(pd.to_numeric)

#remove symbol for better handling of data
#md.rename(columns={"Week#": "Weekn"}, inplace=True)
#md.rename(columns={"Depth": "depth"}, inplace=True) #to match dfo

#change to int to remove decimals from date columns
md.year = md.year.apply(int)
md.depth = md.depth.apply(int)
md.weekn = md.weekn.apply(int)

#change to str to aggregate them into time_string to match dfos formatting of the date
md.year = md.year.apply(str)
md.month = md.month.apply(str)
md.day = md.day.apply(str)
#add leading zero to match date format in dfo metadata
md['month'] = md['month'].str.zfill(2)
md['day'] = md['day'].str.zfill(2)

#add leading zero to match date format in dfo metadata
md['month'] = md['month'].str.zfill(2)
md['day'] = md['day'].str.zfill(2)

md['time_string'] = md[['year', 'month', 'day']].agg('-'.join, axis=1)

### Import and manage BBMP data

#### Metadata __md__ is formatted. It contains 38 columns.
__md__ is the lab's metadata for sampling, extraction and sequencing. \
__dfo_md__ is BBMP remote sensing data (salinity, pH, temperature, density..) \
__bio_niskin__ is nutrient data \
Format __bio_niskin__ data to merge with __md__. __bio_niskin__ is 32 columns, including year, month, day, and depth. __dfo_md__ also has 32 columns, including year_time, month_time, day_time. To merge these data with __md__, we will change the time stamps columns to the same name, and generate a time_string column.

In [6]:
dfo_md = pd.read_csv("/Users/Diana/Documents/escuela/phd/ch2/bb_data/bbmp_aggregated_profiles.csv")
bio_niskin = pd.read_csv("/Users/Diana/Documents/escuela/phd/ch2/bb_data/BBMP_Data_2022.csv")#
#dfo_metadata_y14 = pd.read_csv("/Users/Diana/Documents/escuela/phd/bb_data/2019/data_export/trim-analysis/dfo_metadata_y14.tsv", sep='\t')

#change to str to aggregate them into time_string
bio_niskin.year = bio_niskin.year.apply(str)
bio_niskin.month = bio_niskin.month.apply(str)
bio_niskin.day = bio_niskin.day.apply(str)
#add leading zero to match date format in dfo metadata
bio_niskin['month'] = bio_niskin['month'].str.zfill(2)
bio_niskin['day'] = bio_niskin['day'].str.zfill(2)

bio_niskin['time_string'] = bio_niskin[['year', 'month', 'day']].agg('-'.join, axis=1)

#make a new column for time_string without the time
dfo_md['time_string_time'] = dfo_md['time_string']
dfo_md['time_string'] = dfo_md['time_string'].str.split(' ').str[0]

#renaming columns to ensure correct merging
dfo_md.rename(columns={"depth":"bbmpdepth","pressure": "depth", "year_time": "year", "month_time": "month", "day_time": "day"}, inplace=True)

#change to int to remove decimals from date columns
cols = ['year', 'depth', 'month', 'day']
md[cols] = md[cols].apply(pd.to_numeric, errors='ignore', axis=1)
dfo_md[cols] = dfo_md[cols].apply(pd.to_numeric, errors='ignore', axis=1)
bio_niskin[cols] = bio_niskin[cols].apply(pd.to_numeric, errors='ignore', axis=1)

#make a season column
md['season'] = ''

for month, season in month_season.items():
    md.loc[md['month_name'] == month, 'season'] = season

#merging party
merged = pd.merge(md, dfo_md, on=["year", "month", "depth", "day"], how="left")
allyears = pd.merge(md, dfo_md, on=["year", "month", "depth", "day"], how="outer")

#add nutrient data
preall_md= pd.merge(allyears, bio_niskin, on=["day", "month", "year", 'depth'], how="outer")
all_md = pd.merge(merged, bio_niskin, on=["day", "month", "year", 'depth'], how="left")

#split dfs by depth
shallow_depths = [1, 5, 10]
shallow = all_md[all_md["depth"] < 30]
#shallow = shallow.groupby(['year', 'month', "day"]).mean().reset_index()
deep = all_md[all_md.depth == 60]

#split dfs by season
year_season = preall_md.groupby(by = ['year','season']).mean().reset_index()

Winter = year_season.loc[year_season['season'] == 'Winter',:]
Spring = year_season.loc[year_season['season'] == 'Spring',:]
Summer = year_season.loc[year_season['season'] == 'Summer',:]
Autumn = year_season.loc[year_season['season'] == 'Autumn',:]

#save output as csv
all_md.to_csv('allmetadata.csv')

  merged = pd.merge(md, dfo_md, on=["year", "month", "depth", "day"], how="left")
  allyears = pd.merge(md, dfo_md, on=["year", "month", "depth", "day"], how="outer")
  all_md = pd.merge(merged, bio_niskin, on=["day", "month", "year", 'depth'], how="left")
  year_season = preall_md.groupby(by = ['year','season']).mean().reset_index()


## Find missing data

In [7]:
emptynit = all_md[all_md['Nitrate'].isna()]

## Plotting party

In [None]:
#plotly seasonal averages figure
fig2 = go.Figure()
for template in ["plotly_white"]:
    fig2.add_trace(go.Scatter(x=Winter['year'], y=Winter['temperature'],
                    mode='lines',
                    name='Winter',
                    marker_color='#838B8B'))
    fig2.add_trace(go.Scatter(x=Spring['year'], y=Spring['temperature'],
                    mode='lines',
                    name='Spring',
                    marker_color='#FFB5C5'))
    fig2.add_trace(go.Scatter(x=Summer['year'], y=Summer['temperature'],
                    mode='lines',
                    name='Summer',
                    marker_color='#87CEFF'))
    fig2.add_trace(go.Scatter(x=Autumn['year'], y=Autumn['temperature'],
                    mode='lines',
                    name='Autumn',
                    marker_color='#FF8000'))
    fig2.update_layout(
    height=800,
    xaxis_title="Years",
    yaxis_title='Temperature in degree',
    title_text='Average Temperature seasonwise over the years',
    template=template)

fig2.show()

In [None]:
#seaborn season averages plot
sns.lineplot(year_season['year'],year_season['temperature'], hue =year_season["season"])

In [None]:
plt.style.use('ggplot')

### Detect and plot anomalies in variables

In [None]:
def detect_anomalies(metadata, depth, yr=all, month=all):
    md_col = depth[['weekn', metadata, "year", "month"]].copy()
    md_col = md_col[md_col[metadata].notna()]
    if yr != all:
        #mdcol_yr = md_col[md_col.Year == yr]
        mdcol_yr = md_col[md_col['year'].isin(yr)]
    else: 
        mdcol_yr = md_col
        
    if month != all:
        #mdcol_yr = mdcol_yr[mdcol_yr.Month == month]
        mdcol_yr = mdcol_yr[mdcol_yr['month'].isin(month)]
    
    mdcol_yr = mdcol_yr.drop(columns=['year', "month"])
    mdcol_yr = mdcol_yr.set_index(['weekn'])
    
    #modelling time
    outliers_fraction = float(.01)
    scaler = StandardScaler()
    np_scaled = scaler.fit_transform(mdcol_yr.values.reshape(-1, 1))
    data = pd.DataFrame(np_scaled)
    # train isolation forest
    model =  IsolationForest(contamination=outliers_fraction)
    model.fit(data)
    
    #predict data
    mdcol_yr['anomaly'] = model.predict(data)
    
    
    # visualization
    fig, ax = plt.subplots(figsize=(10,6))
    a = mdcol_yr.loc[mdcol_yr['anomaly'] == -1, [metadata]] #anomaly
    ax.plot(mdcol_yr.index, mdcol_yr[metadata], color='black', label = 'Normal')
    ax.scatter(a.index,a[metadata], color='red', label = 'Anomaly')
    plt.axvline(36, ls='--')
    plt.legend()
    plt.show();
    #add axes names

In [None]:
detect_anomalies('Phosphate', shallow, yr={2022}, month={1,2,3})

## Add prokaryotic community

In [8]:
# Special thanks to Alex Manuele https://github.com/alexmanuele
def consolidate_tables(MG):
    if MG == '16S':
        comm = '02-PROKs'
    else :
        comm = '02-EUKs'
        
    table_list = glob.glob('{0}/table.qza'.format('/Users/Diana/Documents/escuela/phd/size_fractions/BB22_size-fraction-comparison-analysed/to_transfer/'+comm))
    print("Found all "+MG+" tables.")

        
    dataframes = []  
    for table_path in table_list:
        with tempfile.TemporaryDirectory() as tempdir:
            #load table, dump contents to tempdir
            table = Artifact.load(table_path)
            #Make sure the tables are all FeatureFrequency type
            assert str(table.type) == 'FeatureTable[Frequency]', "{0}: Expected FeatureTable[Frequency], got {1}".format(table_path, table.type)
            Artifact.extract(table_path, tempdir)
            #get the provenance form the tempdir and format it for DF
            prov = '{0}/{1}/provenance/'.format(tempdir, table.uuid)
            action = yaml.load(open("{0}action/action.yaml".format(prov), 'r'), Loader=yaml.BaseLoader)
            paramlist = action['action']['parameters']
            paramlist.append({'table_uuid': "{}".format(table.uuid)})
            paramdict = {}
            for record in paramlist:
                paramdict.update(record)

            # Get the data into a dataframe
              #Biom data
            df = table.view(pd.DataFrame).unstack().reset_index()
            df.columns = ['feature_id', 'sample_name', 'feature_frequency']
            df['table_uuid'] = ["{}".format(table.uuid)] * df.shape[0]
              #param data
            pdf = pd.DataFrame.from_records([paramdict])
              #merge params into main df
            df = df.merge(pdf, on='table_uuid')
            

            #I like having these columns as the last three. Makes it more readable
            cols = df.columns.tolist()
            reorder = ['sample_name', 'feature_id', 'feature_frequency']
            for val in reorder:
                cols.append(cols.pop(cols.index(val)))
            df = df[cols]
            df['table_path'] = [table_path] * df.shape[0]
            df['sample_name'] = df['sample_name'].str.replace('-', '.')
            dataframes.append(df)
            
            # Adding table_id, forward and reverse trim columns
            #df['table_id'] = str(table_path.split('/')[-3]) #add a table_id column
            #df['forward_trim'], df['reverse_trim'] = df['table_id'].str.split('R', 1).str
            #df['forward_trim'] = df['forward_trim'].map(lambda x: x.lstrip('F'))
            #df["forward_trim"] = pd.to_numeric(df["forward_trim"])
            #df["reverse_trim"] = pd.to_numeric(df["reverse_trim"])

    #Stick all the dataframes together
    #outputfile="merged_all_tables.tsv"
    df = pd.concat(dataframes)
    df['sample_name'] = df['sample_name'].str.replace(r'\.S([1-9]|[1-9][0-9]|[1-9][0-9][0-9]).L001\.','', regex=True)
    
    #df.to_csv(comm+'/merged_all_tables.tsv', sep='\t', index=False)
    print("Successfully saved all tables.")
    return df

In [9]:
def merge_metadata(df):
    #df = pd.read_csv('02-PROKs/'+'/merged_all_tables.tsv', sep='\t')

    tables = df[['sample_name', 'feature_id', 'feature_frequency']].copy()
    tables.rename(columns={'sample_name':'sampleid'}, inplace=True)

    all_md['sampleid'] = all_md['sampleid'].str.replace('_', '.')
    merged = pd.merge(tables,all_md, on='sampleid', how='left') #all_md is the metadata file
    merged = merged[merged.feature_frequency != 0]
    
    merged['year'] = 2022
    merged["size_code"] = ''

    merged["size_code"] = merged["sampleid"].str.extract(r'[1-9][0-9]?[A-E]([L-S])')
    merged["size_code"] = merged["size_code"].fillna('W')
    merged["depth_code"] = merged["sampleid"].str.extract(r'[1-9][0-9]?([A-E])')
    merged['depth']= merged['depth_code'].map(depth_num)
    merged["weekn"] = merged["sampleid"].str.extract(r'\.([1-9][0-9]?)[A-E]')
    merged['weekn'] = pd.to_numeric(merged['weekn'])
    merged['depth'] = pd.to_numeric(merged['depth'])
    merged['date'] = merged.groupby('weekn', as_index=False)['date'].transform('first')
    
    merged['Total'] = merged['feature_frequency'].groupby(merged['sampleid']).transform('sum')
    merged['ratio'] = merged['feature_frequency']/merged['Total']
    merged['nASVs'] = merged['feature_id'].groupby(merged['sampleid']).transform('count')
    merged['weekdepth'] = merged["weekn"].astype(str) + merged["depth"].astype(str)
    merged['avg'] = merged['nASVs'].groupby(merged['weekdepth']).transform('mean')
    merged['diff'] = merged['nASVs'] - merged['avg']

    print('Set up metadata ...')
    
    #merged.to_csv(comm+'/merged_asvs_metadata.tsv', sep = '\t')
    print('Saved merged_asvs_metadata.tsv')
    
    return merged

In [10]:
def pick_metadata(merged, depth='all', size_fraction='both', year='all', R='all', F='all', txsubset = 'all'):
#make df of features/composition+run+comm

    depth = depth
    year = year
    size_fraction = size_fraction
    txsubset = txsubset
        
    files = glob.glob('{0}/*/class/*/data/taxonomy.tsv'.format('/Users/Diana/Documents/escuela/phd/size_fractions/BB22_size-fraction-comparison-analysed/to_transfer'))
    taxos = []
#    if not os.path.exists(path+composition):
#        os.mkdir(path+composition)
    for filename in files:
        tax = pd.read_csv(filename, sep='\t')
        taxos.append(tax)
        
    print('Appended all taxonomies to taxos')
    taxos = pd.concat(taxos)
    taxos = taxos.rename(columns={"Feature ID": "feature_id"}, errors="raise")
    taxos = taxos.drop_duplicates()

    separated = merged.merge(taxos, how='left', on='feature_id') #merged excludes features of frequency = 0
    separated = separated.drop_duplicates()
    
    if depth != 'all':
        separated = separated[separated["depth"] == depth]
    if size_fraction != 'both':
        separated = separated[separated["size_fraction"] == size_fraction]

    separated[['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']] = separated['Taxon'].str.split('; ', expand=True)
    cols = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
    for col in cols:
        separated[col] = separated[col].fillna('Unassigned')
    
    #separated['total'] = separated.groupby(['table_id','sample-id'])['feature_frequency'].transform('sum')
    #separated['ratio'] = separated['feature_frequency']/(separated['total'])
    #separated_taxonomies = separated.copy()
    
    #make a dictionary with keys for id-ing the taxon belonging to this sub-community
    #separated_dic = pd.Series(separated.Taxon.values,separated.feature_id.values).to_dict()
    print('Saved separated by metadata dataframe.')
    
    return separated

In [None]:
def investigate_one_feature(separated, depth='all', size_fraction='both', feature_id):
    copy_of=copy_of[copy_of.feature==feature_id]


In [None]:
def plot_depth(separated, depth='all', size_fraction='both', txsubset = 'all'):
    if depth != 'all':
    sfd=separated[separated.depth==depth]


In [11]:
df = consolidate_tables('18S')

Found all 18S tables.
Successfully saved all tables.


In [12]:
merged = merge_metadata(df)

Set up metadata ...
Saved merged_asvs_metadata.tsv


In [13]:
separated = pick_metadata(merged)

Appended all taxonomies to taxos
Saved separated by metadata dataframe.


In [None]:
copy_of = separated.copy()

In [14]:
sizecodes = ['S', 'L', 'W']
palette_colors = sns.color_palette()
palette_dict = {sizecode: color for sizecode, color in zip(sizecodes, palette_colors)}

In [None]:
ax = sns.lineplot(data=sfd1, x ='weekn', y = 'feature_frequency',
                  hue='size_code', 
                  palette=palette_dict,
                  legend='full', lw=3)

ax.xaxis.set_major_locator(ticker.MultipleLocator(4))
plt.legend(bbox_to_anchor=(1, 1))
plt.ylabel('Relative abundance')
plt.xlabel('Week number')
plt.show()

In [15]:
sfd1=separated[separated.depth==1]
sfd5=separated[separated.depth==5]
sfd10=separated[separated.depth==10]
sfd30=separated[separated.depth==30]
sfd60=separated[separated.depth==60]

In [None]:
plt.subplot(511)
ax=sns.lineplot(x = "weekn", y = "feature_frequency", data=sfd1, hue="size_code", palette=palette_dict)
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
ax.xaxis.set_major_locator(ticker.MultipleLocator(4))
plt.legend(bbox_to_anchor=(1, 1))
plt.ylabel('1m')
plt.xlabel('')


plt.subplot(512)
ax1=sns.lineplot(x = "weekn", y = "feature_frequency", data=sfd5, hue="size_code", palette=palette_dict)
ax1.xaxis.set_major_locator(ticker.MultipleLocator(4))
plt.ylabel('5m')
plt.xlabel('')
ax1.get_legend().remove()

plt.subplot(513)
ax2=sns.lineplot(x = "weekn", y = "feature_frequency", data=sfd10, hue="size_code", palette=palette_dict)
ax2.xaxis.set_major_locator(ticker.MultipleLocator(4))
plt.ylabel('10m')
plt.xlabel('')
ax2.get_legend().remove()

plt.subplot(514)
ax3=sns.lineplot(x = "weekn", y = "feature_frequency", data=sfd30, hue="size_code", palette=palette_dict)
ax3.xaxis.set_major_locator(ticker.MultipleLocator(4))
plt.ylabel('30m')
plt.xlabel('')
ax3.get_legend().remove()


plt.subplot(515)
ax4=sns.lineplot(x = "weekn", y = "feature_frequency", data=sfd60, hue="size_code", palette=palette_dict)
ax4.xaxis.set_major_locator(ticker.MultipleLocator(4))
plt.ylabel('60m')
plt.xlabel('')
ax4.get_legend().remove()

plt.savefig('18S_depth_profile_alltaxa.png')

### Upset plot data prep

In [16]:
cumulab = separated[['feature_frequency', 'depth', 'size_code', 'Genus']].copy()
cumulab1 = cumulab.groupby(['Genus']).agg({'feature_frequency':sum})

In [17]:
def taxbarplot(separated, level, depth, topn): #separated is the df, #level is a string of taxonomic level column name, depth is an integer
    sfd=separated[separated.depth==depth]
    toptaxa = sfd[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', level]].copy()
    toptaxa = toptaxa.drop_duplicates()
    df_agg = toptaxa.groupby(['size_code',level, 'depth']).agg({'feature_frequency':sum})
    topd = df_agg['feature_frequency'].groupby('size_code', group_keys=False).nlargest(topn)
    topd = topd.to_frame()
    topd = topd.reset_index()

    df_agg = df_agg.reset_index()
    df_agg['set_name'] = df_agg['size_code']+df_agg['depth'].astype(str)

    resultpivot = df_agg.pivot_table(index=level, columns='set_name', values='feature_frequency')
    resultpivot = resultpivot.fillna(0)
    resultpivot[resultpivot != 0] = 1
    tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
    tosave.to_csv(level+'_'+str(depth)+'16S_relab.csv')
    
    top10d_list = topd[level].unique()
    top10d = sfd.copy()
    top10d.loc[~top10d[level].isin(top10d_list), level] = 'Other' #isnot in top list
    phyld = top10d.groupby(['size_code','weekn', level])['ratio'].sum()
    phyld = phyld.reset_index()


    fig = px.bar(phyld, x="size_code", y="ratio", facet_col="weekn", color=level, labels={
                     "feature_frequency": "Relative abundance",
                     "size_code": "",
                     "weekn": "w"})# color_discrete_map=colors_dict)
    fig.update_xaxes(type='category', dtick=1)
    fig.update_layout(
        title="Relative abundance of top 10" + level + 'observed at Depth' + str(depth),
        yaxis_title="Relative abundance",
        xaxis_title="Size fraction",
        legend_title=level,
        font=dict(size=8)
    )

    fig.show()
    #fig.write_image("outputs/fig1.png")
    #fig.to_image(format="png")

In [22]:
taxbarplot(separated, 'Phylum', 60, 10)

In [27]:
toptaxa = sfd1[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', 'Genus' ]].copy()
toptaxa = toptaxa.drop_duplicates()
df_agg = toptaxa.groupby(['size_code','Genus', 'depth']).agg({'feature_frequency':sum})
topd1 = df_agg['feature_frequency'].groupby('size_code', group_keys=False).nlargest(10)
topd1 = topd1.to_frame()
topd1 = topd1.reset_index()

df_agg = df_agg.reset_index()
df_agg['set_name'] = df_agg['size_code']+df_agg['depth'].astype(str)

resultpivot = df_agg.pivot_table(index='Genus', columns='set_name', values='feature_frequency')
resultpivot = resultpivot.fillna(0)
resultpivot[resultpivot != 0] = 1
tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
tosave.to_csv('Genus_d118S_relab.csv')

In [28]:
toptaxa = sfd5[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', 'Genus']].copy()
toptaxa = toptaxa.drop_duplicates()
df_agg = toptaxa.groupby(['size_code','Genus', 'depth']).agg({'feature_frequency':sum})
topd5 = df_agg['feature_frequency'].groupby('size_code', group_keys=False).nlargest(10)
topd5 = topd5.to_frame()
topd5 = topd5.reset_index()
topd5['set_name'] = topd5['size_code']+topd5['depth'].astype(str)

df_agg = df_agg.reset_index()
df_agg['set_name'] = df_agg['size_code']+df_agg['depth'].astype(str)

resultpivot = df_agg.pivot_table(index='Genus', columns='set_name', values='feature_frequency')
resultpivot = resultpivot.fillna(0)
resultpivot[resultpivot != 0] = 1
tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
tosave.to_csv('Genus_d518S_relab.csv')

In [29]:
toptaxa = sfd10[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', 'Genus' ]].copy()
toptaxa = toptaxa.drop_duplicates()
df_agg = toptaxa.groupby(['size_code','Genus', 'depth']).agg({'feature_frequency':sum})
topd10 = df_agg['feature_frequency'].groupby('size_code', group_keys=False).nlargest(10)
topd10 = topd10.to_frame()
topd10 = topd10.reset_index()
topd10['set_name'] = topd10['size_code']+topd10['depth'].astype(str)

df_agg = df_agg.reset_index()
df_agg['set_name'] = df_agg['size_code']+df_agg['depth'].astype(str)

resultpivot = df_agg.pivot_table(index='Genus', columns='set_name', values='feature_frequency')
resultpivot = resultpivot.fillna(0)
resultpivot[resultpivot != 0] = 1
tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
tosave.to_csv('Genus_d1018S_relab.csv')

In [33]:
toptaxa = sfd30[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', 'Genus' ]].copy()
toptaxa = toptaxa.drop_duplicates()
df_agg = toptaxa.groupby(['size_code','Genus', 'depth']).agg({'feature_frequency':sum})
topd30 = df_agg['feature_frequency'].groupby('size_code', group_keys=False).nlargest(10)
topd30 = topd30.to_frame()
topd30 = topd30.reset_index()
topd30['set_name'] = topd30['size_code']+topd30['depth'].astype(str)

df_agg = df_agg.reset_index()
df_agg['set_name'] = df_agg['size_code']+df_agg['depth'].astype(str)

resultpivot = df_agg.pivot_table(index='Genus', columns='set_name', values='feature_frequency')
resultpivot = resultpivot.fillna(0)
resultpivot[resultpivot != 0] = 1
tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
tosave.to_csv('Genus_d3018S_relab.csv')

In [34]:
toptaxa = sfd60[['feature_id', 'feature_frequency', 'Taxon', 'size_code', 'depth','weekn', 'Genus' ]].copy()
toptaxa = toptaxa.drop_duplicates()
df_agg = toptaxa.groupby(['size_code','Genus', 'depth']).agg({'feature_frequency':sum})
topd60 = df_agg['feature_frequency'].groupby('size_code', group_keys=False).nlargest(10)
topd60 = topd60.to_frame()
topd60 = topd60.reset_index()
topd60['set_name'] = topd60['size_code']+topd60['depth'].astype(str)

df_agg = df_agg.reset_index()
df_agg['set_name'] = df_agg['size_code']+df_agg['depth'].astype(str)

resultpivot = df_agg.pivot_table(index='Genus', columns='set_name', values='feature_frequency')
resultpivot = resultpivot.fillna(0)
resultpivot[resultpivot != 0] = 1
tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
tosave.to_csv('Genus_d6018S_relab.csv')

In [None]:
frames = topd1, topd5, topd10, topd30, topd60

In [None]:
frames = sfd1, sfd5, sfd10, sfd30, sfd60

In [35]:
sfd1['set_name'] = sfd1['size_code']+sfd1['depth'].astype(str)
sfd5['set_name'] = sfd5['size_code']+sfd5['depth'].astype(str)
sfd10['set_name'] = sfd10['size_code']+sfd10['depth'].astype(str)
sfd30['set_name'] = sfd30['size_code']+sfd30['depth'].astype(str)
sfd60['set_name'] = sfd60['size_code']+sfd60['depth'].astype(str)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

In [None]:
result = pd.concat(frames)

In [None]:
resultpivot = result.pivot_table(index='Genus', columns='set_name', values='feature_frequency')
resultpivot = resultpivot.fillna(0)
resultpivot[resultpivot != 0] = 1
tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)
tosave.to_csv('genus_all16S_relab.csv')

In [None]:
tosave = pd.merge(resultpivot, cumulab1, left_index=True, right_index=True)

In [None]:
tosave.to_csv('genus_all16S_relab.csv')

### Venn diagrams

In [None]:
#Depth 1 all 16S at genus level
venn3(subsets = (115, 68, 33, 77, 47, 85, 229), set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5);
plt.title("1m depth") 
plt.savefig("venn_diagram/02-PROKs/D1_genus.png")
plt.show()

In [None]:
#Depth 5 all 16S at genus level
venn3(subsets = (119, 90, 56, 49, 39, 64, 235), set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5);

plt.title("5m depth") 
plt.savefig("venn_diagram/02-PROKs/D5_genus.png")
plt.show()

In [None]:
#Depth 10 all 16S at genus level
v=venn3(subsets = (131, 72, 45, 43, 53, 62, 248), set_labels = ('Large >3μm', 'Small 3-0.2μm', 'Whole water <0.22μm'), alpha = 0.5);
c=venn3_circles(subsets = (131, 72, 45, 43, 53, 62, 248))#, linestyle='dashed', linewidth=1, color="grey")
#c[0].set_lw(8.0)
c[2].set_ls('dotted')
#c[0].set_color('skyblue')
plt.title("10m depth") 
plt.savefig("venn_diagram/02-PROKs/D10_genus.png")
plt.show()

In [None]:
#Depth 30 all 16S at genus level
#N=542
venn3(subsets = (54, 62, 18, 80, 42, 88, 198), set_labels = ('Large >3μm', 'Small 3-02.μm', 'Whole water <0.22μm'), alpha = 0.5);

plt.title("30m depth") 
plt.savefig("venn_diagram/02-PROKs/D30_genus.png")
plt.show()

In [None]:
#Depth 60 all 16S at genus level
#N=559
venn3(subsets = (89, 70, 38, 35, 29, 54, 244), set_labels = ('Large >3μm', 'Small 3-02μm', 'Whole water <0.22μm'), alpha = 0.5);

plt.title("60m depth") 
plt.savefig("venn_diagram/02-PROKs/D60_genus.png")
plt.show()

### taxonomy bar plots

In [None]:
top10d1_list = topd1['Phylum'].unique()
top10d1 = sfd1.copy()
top10d1.loc[~top10d1['Phylum'].isin(top10d1_list), 'Phylum'] = 'Other' #isnot in top list
phyld1 = top10d1.groupby(['size_code','weekn', 'Phylum'])['ratio'].sum()
phyld1=phyld1.reset_index()

In [None]:
top10d5_list = topd5['Phylum'].unique()
top10d5 = sfd5.copy()
top10d5.loc[~top10d5['Phylum'].isin(top10d5_list), 'Phylum'] = 'Other' #isnot in top list
phyld5 = top10d5.groupby(['size_code','weekn', 'Phylum'])['ratio'].sum()
phyld5 = phyld5.reset_index()

In [None]:
top10d10_list = topd10['Phylum'].unique()
top10d10 = sfd10.copy()
top10d10.loc[~top10d10['Phylum'].isin(top10d10_list), 'Phylum'] = 'Other' #isnot in top list
phyld10 = top10d10.groupby(['size_code','weekn', 'Phylum'])['ratio'].sum()
phyld10 = phyld10.reset_index()

In [None]:
top10d30_list = topd30['Phylum'].unique()
top10d30 = sfd30.copy()
top10d30.loc[~top10d30['Phylum'].isin(top10d30_list), 'Phylum'] = 'Other' #isnot in top list
phyld30 = top10d30.groupby(['size_code','weekn', 'Phylum'])['ratio'].sum()
phyld30 = phyld30.reset_index()

In [None]:
top10d60_list = topd60['Phylum'].unique()
top10d60 = sfd60.copy()
top10d60.loc[~top10d60['Phylum'].isin(top10d60_list), 'Phylum'] = 'Other' #isnot in top list
phyld60 = top10d60.groupby(['size_code','weekn', 'Phylum'])['ratio'].sum()
phyld60 = phyld60.reset_index()

In [None]:
joinedlist = [*top10d1_list, *top10d5_list, *top10d10_list, *top10d30_list, *top10d60_list]
joinedlist.append("Other")

joinedlist = list(OrderedDict.fromkeys(joinedlist))

colors_dict = {taxon: color for taxon, color in zip(joinedlist, palette_colors)}

In [None]:
from collections import OrderedDict
joinedlist = list(OrderedDict.fromkeys(joinedlist))

In [None]:
joinedlist = list(OrderedDict.fromkeys(joinedlist))

In [None]:
colors_dict = {taxon: color for taxon, color in zip(joinedlist, palette_colors)}

In [None]:
import plotly.express as px

fig = px.bar(phyld5, x="size_code", y="ratio", facet_col="weekn", color="Phylum", labels={
                     "feature_frequency": "Relative abundance",
                     "size_code": "",
                     "weekn": "w"}, color_discrete_map=colors_dict)
fig.update_xaxes(type='category', dtick=1)
fig.update_layout(
    title="Relative abundance of top 10 Phylum observed at Depth 5m",
    yaxis_title="Relative abundance",
    xaxis_title="Size fraction",
    legend_title="Phylum",
    font=dict(size=8)
)

fig.show()

In [None]:
colors_dict

In [None]:
plotadiv = separated[['sampleid', 'nasvs', 'weekn', 'size_code', 'depth']].copy()
null = plotadiv.loc[plotadiv['feature_count'] == 0]

In [None]:
sns.barplot(data=sfd1, x="weekn", y="diff", hue="size_code", palette=palette_dict)#, hue="size_code")
plt.ylabel('Number of ASVs relative to weekly average')
plt.xlabel('Week number')
plt.savefig('outputs/02-PROKs/D1_avgbarplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.barplot(data=sfd5, x="weekn", y="diff", hue="size_code", palette=palette_dict)#, hue="size_code")
plt.ylabel('Number of ASVs relative to weekly average')
plt.xlabel('Week number')
plt.savefig('outputs/02-PROKs/D5_avgbarplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.barplot(data=sfd10, x="weekn", y="diff", hue="size_code", palette=palette_dict)#, hue="size_code")
plt.ylabel('Number of ASVs relative to weekly average')
plt.xlabel('Week number')
plt.savefig('outputs/02-PROKs/D10_avgbarplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.barplot(data=sfd30, x="weekn", y="diff", hue="size_code", palette=palette_dict)#, hue="size_code")
plt.ylabel('Number of ASVs relative to weekly average')
plt.xlabel('Week number')
ax.get_legend().remove()
plt.savefig('outputs/02-PROKs/D30_avgbarplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.barplot(data=sfd1, x="weekn", y="diff", hue="size_code", palette=palette_dict)#, hue="size_code")
plt.ylabel('Number of ASVs relative to weekly average')
plt.xlabel('Week number')
plt.savefig('outputs/02-PROKs/D1_avgbarplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(4, 3)})
sns.boxplot(data=sfd1, x="size_code", y="nASVs", palette=palette_dict, order=['S', 'L', 'W'])#, hue="size_code")
plt.ylabel('Number of ASVs')
plt.xlabel('Size fraction')
plt.savefig('outputs/02-PROKs/D1_adboxplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(4, 3)})
sns.boxplot(data=sfd5, x="size_code", y="nASVs", palette=palette_dict, order=['S', 'L', 'W'])#, hue="size_code")
plt.ylabel('Number of ASVs')
plt.xlabel('Size fraction')
plt.savefig('outputs/02-PROKs/D5_adboxplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(4, 3)})
sns.boxplot(data=sfd10, x="size_code", y="nASVs", palette=palette_dict, order=['S', 'L', 'W'])#, hue="size_code")
plt.ylabel('Number of ASVs')
plt.xlabel('Size fraction')
plt.savefig('outputs/02-PROKs/D10_adboxplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(4, 3)})
sns.boxplot(data=sfd30, x="size_code", y="nASVs", palette=palette_dict, order=['S', 'L', 'W'])#, hue="size_code")
plt.ylabel('Number of ASVs')
plt.xlabel('Size fraction')
plt.savefig('outputs/02-PROKs/D30_adboxplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(4, 3)})
sns.boxplot(data=sfd60, x="size_code", y="nASVs", palette=palette_dict, order=['S', 'L', 'W'])#, hue="size_code")
plt.ylabel('Number of ASVs')
plt.xlabel('Size fraction')
plt.savefig('outputs/02-PROKs/D60_adboxplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(7, 3)})
ax=sns.lineplot(x = "weekn", y = "nASVs", data=sfd1, hue="size_code", palette=palette_dict)
plt.ylabel('Number of ASVs')
plt.xlabel('Week')
plt.legend(title='Size fraction')
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
plt.savefig('outputs/02-PROKs/D1_adlineplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(7, 3)})
ax=sns.lineplot(x = "weekn", y = "nASVs", data=sfd5, hue="size_code", palette=palette_dict)
plt.ylabel('Number of ASVs')
plt.xlabel('Week')
plt.legend(title='Size fraction')
ax.get_legend().remove()
plt.savefig('outputs/02-PROKs/D5_adlineplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(7, 3)})
ax=sns.lineplot(x = "weekn", y = "nASVs", data=sfd10, hue="size_code", palette=palette_dict)
plt.ylabel('Number of ASVs')
plt.xlabel('Week')
plt.legend(title='Size fraction')
ax.get_legend().remove()
plt.savefig('outputs/02-PROKs/D10_adlineplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(7, 3)})
ax=sns.lineplot(x = "weekn", y = "nASVs", data=sfd30, hue="size_code", palette=palette_dict)
plt.ylabel('Number of ASVs')
plt.xlabel('Week')
plt.legend(title='Size fraction')
ax.get_legend().remove()
plt.savefig('outputs/02-PROKs/D30_adlineplot.png', dpi=200, bbox_inches="tight")

In [None]:
sns.set(rc={"figure.figsize":(7, 3)})
ax=sns.lineplot(x = "weekn", y = "nASVs", data=sfd60, hue="size_code", palette=palette_dict)
plt.ylabel('Number of ASVs')
plt.xlabel('Week')
plt.legend(title='Size fraction')
ax.get_legend().remove()
plt.savefig('outputs/02-PROKs/D60_adlineplot.png', dpi=200, bbox_inches="tight")

### Beta diversity analysis

In [None]:
def pcaplot(separated, depth, comm):
    
    if comm == '16S':
        folder = '02-PROKs'
    df=separated[separated.depth==depth]
    
    sfdpiv= df.pivot(index='sampleid', columns='feature_id', values='feature_frequency')
    sfdpiv=sfdpiv.fillna(0)
    sfdclr=sfdpiv.mask(sfdpiv==0).fillna(0.1)
    clr_transformed_array = clr(sfdclr)
    samples = sfdpiv.index
    asvs = sfdpiv.columns
    
    #Creating the dataframe with the clr transformed data, and assigning the sample names
    clr_transformed = pd.DataFrame(clr_transformed_array, columns=asvs)
    #Assigning the asv names
    clr_transformed['samples'] = samples
    clr_transformed = clr_transformed.set_index('samples')
    clr_transformed.head()

    #calculate distance matrix
    dist = cdist(clr_transformed, clr_transformed, 'euclid')
    distance_matrix = pd.DataFrame(dist, columns=samples)
    distance_matrix['samples'] = samples
    distance_matrix = distance_matrix.set_index('samples')

    #format for pca
    dm = DistanceMatrix(distance_matrix)

    pca = PCA(n_components=2)
    components = pca.fit_transform(distance_matrix)

    sns.set(rc={"figure.figsize":(4, 3)})
    sns.set_style("whitegrid", {'axes.grid' : False})
    plot_df = pd.DataFrame(data = components, columns = ['dim1', 'dim2'], index = sfdpiv.index)
    plot_df['dim1'] = plot_df['dim1']/1000
    plot_df['dim2'] = plot_df['dim2']/1000
    plot_df2 = pd.merge(plot_df,df[['sampleid','size_code','weekn']],on='sampleid', how='left')
    #plot
    ax=sns.scatterplot(x = 'dim1', y = 'dim2', size = 'weekn', hue= 'size_code', style="size_code", data = plot_df2,palette=palette_dict) #,palette=sns.color_palette("dark:salmon_r", as_cmap=True))
    plt.ylabel('Component 2')
    plt.xlabel('Component 1')
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
    plt.savefig('outputs/'+folder+'/D'+str(depth)+'_PCAplot.png', dpi=200, bbox_inches="tight")

In [None]:
pcaplot(separated, 60, '16S')

In [None]:
null = plotadiv.loc[plotadiv['feature_count'] == 0]

In [None]:
null = null[['year','weekn','size_code','depth']].copy()

In [None]:
null = null.drop_duplicates()

In [None]:
null = null.reset_index(drop=True)

In [None]:
null.to_csv('missingsamples.csv', index=False)

In [None]:
notsizefractionated = plotadiv.loc[plotadiv['year'] < 2018]

In [None]:
sizefractionated = plotadiv.loc[plotadiv['year'] >= 2018]

In [None]:
nullnsf = notsizefractionated.loc[notsizefractionated['feature_count'] == 0]
nullnsf = nullnsf.drop_duplicates()

In [None]:
nullnsf

In [None]:
nullsf = sizefractionated.loc[sizefractionated['feature_count'] == 0]
nullsf = nullsf.drop_duplicates()

In [None]:
nullsf

In [None]:
sns.histplot(
    nullnsf,
    x="weekn", hue="year",
    multiple="stack",
    #palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
)
plt.savefig('figures/missingsamplesNSF.png', format="png")

In [None]:
sns.histplot(
    nullsf,
    x="weekn", hue="year",
    multiple="stack",
    #palette="light:m_r",
    edgecolor=".3",
    linewidth=.5,
)
plt.savefig('figures/missingsamplesSF.png', format="png")

In [None]:
nonulladiv = plotadiv.loc[plotadiv['feature_count'] != 0]

In [None]:
allyears_aphotic = copy_of.loc[copy_of['depth'] == 60]

In [None]:
allyearsaphoSEP = allyears_aphotic.loc[allyears_aphotic['month'] == 9]

In [None]:
allshallow = copy_of.loc[copy_of['depth'] != 60]

In [None]:
allyearshallowSEP = allyearshallow.loc[allyearshallow['month'] == 9]

In [None]:
allyears = allyears_aphotic[['sampleid','feature_id','feature_frequency']].copy()

In [None]:
allyears = allyears.drop_duplicates()

In [None]:
allyears= allyears.pivot(index='sampleid', columns='feature_id', values='feature_frequency')
allyears=allyears.fillna(0)

In [None]:
allyears.rename(index={'BB10.20BL':'BB19.20BL'},inplace=True)

In [None]:
allyears.to_csv('deepdfs.csv', index_label='sampleid')

In [None]:
allyearshallow = allyearshallowSEP.copy()

In [None]:
allyearshallow.set_index('sampleid', inplace=True)

In [None]:
allyearshallow = allyearshallow.drop(columns=['feature_id', 'feature_frequency',
                                              'Taxon', 'Confidence', 'Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species'])

In [None]:
allyearshallow = allyearshallow.drop_duplicates()

In [None]:
#some samples contain duplicates because two scans were done; groupby sample and keep mean
allyearshallow = allyearshallow.groupby(by=["sampleid"]).mean()

In [None]:
#forward fill missing values
allyearshallow = allyearshallow.fillna(method='ffill')

In [None]:
allyearshallow = allyearshallow.dropna(axis=1, how='all')

In [None]:
allyearshallow.rename(index={'BB10.20BL':'BB19.20BL'},inplace=True)

In [None]:
allyearshallow.to_csv('env_data_deep.csv', index_label='sampleid')

In [None]:
envindex = allyearshallow.index.tolist()

In [None]:
speindex = allyears.index.tolist()

In [None]:
temp3 = []
for element in envindex:
    if element not in speindex:
        temp3.append(element)
print(temp3)

In [None]:
temp3 = []
for element in speindex:
    if element not in envindex:
        temp3.append(element)
print(temp3)

In [None]:
allyearshallow["ratio_fid"] = allyearshallow["feature_frequency"] / allyearshallow["feature_count"]

In [None]:
allyearshallown19 = allyearshallow.loc[allyearshallow['year'] != 2019]

In [None]:
allyearshallow = allyearshallow[allyearshallow['ratio_fid'].notna()]

In [None]:
allyearshallow19 = allyearshallow.loc[allyearshallow['year'] == 2019]

In [None]:
allyearshallow19 = allyearshallow19[allyearshallow19['ratio_fid'].notna()]

In [None]:
top14 = allyearshallow.loc[allyearshallow['year'] == 2014]
top14 = top14.nlargest(5, "ratio_fid")

In [None]:
top15 = allyearshallow.loc[allyearshallow['year'] == 2015]
top15 = top15.nlargest(5, "ratio_fid")

In [None]:
top16 = allyearshallow.loc[allyearshallow['year'] == 2016]
top16 = top16.nlargest(5, "ratio_fid")

In [None]:
top17 = allyearshallow.loc[allyearshallow['year'] == 2017]
top17 = top17.nlargest(5, "ratio_fid")

In [None]:
top18 = allyearshallow.loc[allyearshallow['year'] == 2018]
top18 = top18.nlargest(5, "ratio_fid")

In [None]:
top19 = allyearshallow.loc[allyearshallow['year'] == 2019]
top19 = top19.nlargest(5, "ratio_fid")

In [None]:
top20 = allyearshallow.loc[allyearshallow['year'] == 2020]
top20 = top20.nlargest(5, "ratio_fid")

In [None]:
top21 = allyearshallow.loc[allyearshallow['year'] == 2021]
top21 = top21.nlargest(5, "ratio_fid")

In [None]:
result = pd.concat([top14, top15, top16, top17, top18, top19, top20, top21], axis=0)

In [None]:
ax=sns.barplot(x = 'year', y = 'ratio_fid', hue = 'Family', data = result,
            edgecolor = 'w')
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))

plt.show()

In [None]:
allyears

In [None]:
sfd1piv= sfd1.pivot(index='sampleid', columns='feature_id', values='feature_frequency')
sfd1piv=sfd1piv.fillna(0)
sfd1clr=sfd1piv.mask(sfd1piv==0).fillna(0.1)
clr_transformed_array = clr(sfd1clr)
samples = sfd1piv.index
asvs = sfd1piv.columns
#Creating the dataframe with the clr transformed data, and assigning the sample names
clr_transformed = pd.DataFrame(clr_transformed_array, columns=asvs)
#Assigning the asv names
clr_transformed['samples'] = samples
clr_transformed = clr_transformed.set_index('samples')
clr_transformed.head()

#calculate distance matrix
dist = cdist(clr_transformed, clr_transformed, 'euclid')
distance_matrix = pd.DataFrame(dist, columns=samples)
distance_matrix['samples'] = samples
distance_matrix = distance_matrix.set_index('samples')

#format for pca
dm = DistanceMatrix(distance_matrix)

pca = PCA(n_components=2)
components = pca.fit_transform(distance_matrix)

fig = px.scatter(components, x=0, y=1, color=sfd1['weekn'], symbol=sfd1['size_code'])
fig.show()

In [None]:
dm = DistanceMatrix(distance_matrix)

In [None]:
permanova(dm, grouping=allyears_sc['size_code'])

In [None]:
permanova(dm, grouping=allyearshallow['Nitrite'])

In [None]:
from skbio.stats.composition import ancom

In [None]:
allyears_sc = allyears_aphotic[['sampleid','size_code']].copy()

In [None]:
allyears_sc = allyears_sc.drop_duplicates()

In [None]:
allyears_sc=allyears_sc.set_index('sampleid')

In [None]:
results = ancom(table=allyearsclr, grouping=allyears_sc['size_code'])

In [None]:
prcentile = results[1].copy()

In [None]:
prcentile.iloc[:, prcentile.columns.get_level_values(1)=='W']

In [None]:
idx = pd.IndexSlice
prcentile.loc[:,idx[:,'W'] == 0.1]

In [None]:
prcentile.query('W == 0.1')

In [None]:
prcentile.loc[prcentile.index == '2997f1920aefbb91632b549f2538b73b']

In [None]:
DAresults = results[0].copy()

In [None]:
DARejected_year = DAresults.loc[DAresults['Reject null hypothesis'] == True]

In [None]:
DARejected_year.sort_values(by=['W'])

In [None]:
taxonomy = copy_of[['feature_id', 'Confidence', 'Taxon', 'Phylum', 'Class', 'Family', 'Genus', 'Species']].copy()

In [None]:
taxonomy = taxonomy.drop_duplicates()

In [None]:
DARejected_year_taxonomy = pd.merge(DARejected_year, taxonomy, on="feature_id", how="left")

In [None]:
DARejected_year_taxonomy.sort_values(by='W')

In [None]:
copy_of['avgNitrite'] = (
    copy_of.groupby(["weekn"])['Nitrite'].mean())

In [None]:
allyearshallow = allyearshallow.sort_values('sampleid')

In [None]:
allyearshallow['weektotaln'] = np.arange(allyearshallow.shape[0])

In [None]:
allyearshallow[ '4WksRollAv' ] = allyearshallow.Nitrite.rolling(4).mean()

In [None]:
df[“Error”] = df[“Forecast”] — df[“Demand”]
m = df[“Error”].mean()
s = df[“Error”].std()
from scipy.stats import norm
limit_high = norm.ppf(0.99,m,s)+df[“Forecast”]
limit_low = norm.ppf(0.01,m,s)+df[“Forecast”]
df[“Updated”] = df[“Demand”].clip(lower=limit_low,upper=limit_high)
print(df)

In [None]:
preall_md_photic = preall_md.loc[preall_md['depth'] == 60]

In [None]:
nitriteonly = preall_md_photic[['weekn', 'year', 'Phosphate']].copy().drop_duplicates().dropna()

In [None]:
avgs = nitriteonly.groupby('weekn').mean()

In [None]:
avgs = avgs.drop(columns=['year'])

In [None]:
nitriteonly = nitriteonly.groupby(['weekn','year']).mean()

In [None]:
nitriteonly = nitriteonly.reset_index()

In [None]:
nitriteonly = pd.merge(nitriteonly, avgs, on="weekn")

In [None]:
nitriteonly

In [None]:
nitriteonly["Error"] = nitriteonly["Phosphate_y"]-nitriteonly["Phosphate_x"]

In [None]:
m = nitriteonly["Error"].mean()

In [None]:
s = nitriteonly["Error"].mean()

In [None]:
from scipy.stats import norm

In [None]:
limit_high = norm.ppf(0.99,m,s)+nitriteonly["Phosphate_y"]
limit_low = norm.ppf(0.01,m,s)+nitriteonly["Phosphate_y"]
nitriteonly["Updated"] = nitriteonly["Phosphate_x"].clip(lower=limit_low,upper=limit_high)
print(nitriteonly)

In [None]:
nitriteonly = nitriteonly.sort_values(["year", "weekn"]).reset_index()

In [None]:
nitriteonly = nitriteonly.reset_index()

In [None]:
nitriteonly.loc[nitriteonly['year'] == 2020]

In [None]:
# set figure size
plt.figure( figsize = ( 12, 5))
  
# plot a simple time series plot
# using seaborn.lineplot()
sns.lineplot( x = 'level_0',
             y = 'Phosphate_x',
             data = nitriteonly,
             label = 'Phopshate')

sns.lineplot( x = 'level_0',
             y = 'Phosphate_y',
             data = nitriteonly,
             label = 'Phopshate Forecast')

plt.axvline(51, ls='--')
plt.axvline(92, ls='--')
plt.axvline(135, ls='--')
plt.axvline(185, ls='--')
plt.axvline(234, ls='--')
plt.axvline(269, ls='-')
plt.axvline(281, ls='--')
plt.axvline(300, ls='--')
  
plt.xlabel('Weeks')
plt.ylabel('[Phosphate]')

In [None]:
autoplot(tsclean(nitriteonly), series="clean", color='red', lwd=0.9)+
autolayer(nitriteonly, series="original", color='gray', lwd=1)+
geom_point(data = tsoutliers(nitriteonly) %>% as.data.frame(), aes(x='level_0', y='Nitrite_x'), col='blue')+labs(x = "Day", y = "Gold price ($US)")

In [None]:
pca = PCA(n_components=2)
components = pca.fit_transform(distance_matrix)

fig = px.scatter(components, x=0, y=1, color=allyearshallow['weekn'], symbol=allyearshallow['year'])
fig.show()

In [None]:
pca = PCA(n_components=2)
components = pca.fit_transform(allyears)

fig = px.scatter(components, x=0, y=1,color=allyearshallow['weekn'],symbol=allyearshallow['year'])
fig.show()

In [None]:
sns.lineplot(x = "weekn", y = "PON", data=allyearshallow, hue="year")
plt.show()

In [None]:
grouped_data = sales_data.groupby('ProductID')
sales_data['AvgRevenuePerUnit'] = grouped_data['Revenue'].transform('mean') / grouped_data['Quantity'].transform('mean')

In [None]:
allyearshallow['year'] = pd.Categorical(allyearshallow['year'])

In [None]:
allyearshallow = allyearshallow.reset_index()

In [None]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(np.array(allyears))
plot_df = pd.DataFrame(data = principalComponents, columns = ['dim1', 'dim2'], index = allyears.index)

In [None]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(np.array(allyears))
plot_df = pd.DataFrame(data = principalComponents, columns = ['dim1', 'dim2'], index = allyears.index)
plot_df['dim1'] = plot_df['dim1']/1000
plot_df['dim2'] = plot_df['dim2']/1000
plot_df2 = pd.merge(plot_df,allyearshallow[['sampleid','depth']],on='sampleid', how='left')
sns.scatterplot(x = 'dim1', y = 'dim2', hue = 'depth', data = plot_df2)

In [None]:
sns.scatterplot(x = 'dim1', y = 'dim2', data = plot_df)

In [None]:
plot_df2 = pd.merge(plot_df,allyearshallow[['sampleid','depth']],on='sampleid', how='left')
sns.scatterplot(x = 'dim1', y = 'dim2', hue = 'depth', data = plot_df2)

In [None]:
plot_df3 = pd.merge(plot_df,allyearshallow[['sampleid','weekn']],on='sampleid', how='left')
sns.scatterplot(x = 'dim1', y = 'dim2', hue = 'weekn', data = plot_df3)

In [None]:
from sklearn.manifold import TSNE
tsne = TSNE(metric = 'jaccard')
embeddings = tsne.fit_transform(allyears)
plot_df = pd.DataFrame(data = embeddings, columns = ['dim1', 'dim2'], index = allyears.index)

In [None]:
sns.scatterplot(x = 'dim1', y = 'dim2', data = plot_df)

In [None]:
plot_df2 = pd.merge(plot_df,copy_of[['sampleid','year']],on='sampleid', how='left')
sns.scatterplot(x = 'dim1', y = 'dim2', hue = 'year', data = plot_df2)

In [None]:
plot_df3 = pd.merge(plot_df,copy_of[['sampleid','weekn']],on='sampleid', how='left')
sns.scatterplot(x = 'dim1', y = 'dim2', hue = 'weekn', data = plot_df3)

In [None]:
plot_df4 = pd.merge(plot_df,copy_of[['sampleid','depth']],on='sampleid', how='left')
sns.scatterplot(x = 'dim1', y = 'dim2', hue = 'depth', data = plot_df4)

In [None]:
conda install -c conda-forge umap-learn

In [None]:
import umap
reducer = umap.UMAP(n_components = 2, n_neighbors = 15, metric = 'jaccard', random_state = 0)
embeddings = reducer.fit_transform(allyears)
plot_df = pd.DataFrame(data = embeddings, columns = ['dim1', 'dim2'], index = allyears.index)

In [None]:
copy_of

In [None]:
y2014

In [None]:
onlyphyl = copy_of.filter(['year','Phylum','season'], axis=1)

In [None]:
onlyphyl=onlyphyl.drop_duplicates()

In [None]:
onlyphyl.pivot(index='Phylum', columns='season'

In [None]:
forupset=onlyphyl.groupby(['season', 'Phylum']).count().unstack().fillna(0)

In [None]:
forupset.where(forupset <= 1, 1, inplace=True)

In [None]:
forupset.to_csv('upsetplots.csv', index_label='weekn')

In [None]:
forupset

In [None]:
sns.lineplot(data=y2014, x="month", y="temperature", hue="depth")

In [None]:
sns.lineplot(data=y2015, x="month", y="temperature", hue="depth")

In [None]:
salinity = shallow[['Weekn', 'temperature', "Year"]].copy()

In [None]:
salinity = salinity[salinity['temperature'].notna()]

In [None]:
salinity2014 = salinity[salinity.Year == 2019]
salinity2014 = salinity2014.drop(columns=['Year'])
salinity2014 = salinity2014.set_index(['Weekn'])

In [None]:
outliers_fraction = float(.01)
scaler = StandardScaler()
np_scaled = scaler.fit_transform(salinity2014.values.reshape(-1, 1))
data = pd.DataFrame(np_scaled)
# train isolation forest
model =  IsolationForest(contamination=outliers_fraction)
model.fit(data)

In [None]:
salinity2014

In [None]:
salinity2014['anomaly'] = model.predict(data)

In [None]:
# visualization
fig, ax = plt.subplots(figsize=(10,6))
a = salinity2014.loc[salinity2014['anomaly'] == -1, ['salinity']] #anomaly
ax.plot(salinity2014.index, salinity2014['salinity'], color='black', label = 'Normal')
ax.scatter(a.index,a['salinity'], color='red', label = 'Anomaly')
plt.legend()
plt.show();

In [None]:
fig = px.line(deep, x="Weekn", y="salinity", color='Year')
fig.show()

In [None]:
fig = px.line(deep, x="Weekn", y="Nitrate", color='Year')
fig.show()

In [None]:
all_md

In [None]:
print(all_md['month'].to_string(index=False))

In [None]:
all_md

In [None]:
all_md.Year.unique()

In [None]:
fig = px.line(depth1, x="Weekn", y="temperature", color='Year', )
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
fig.show()

In [None]:
for index, row in dfo_md.iterrows():
    dfo_md['week_n'] = datetime.date(dfo_md['year_time'], dfo_md['month_time'], dfo_md['day_time']).isocalendar()[1]

In [None]:
df

In [None]:
all_md.columns.values 

In [None]:
df2 = all_md[['temperature', 'Chlorophyll A', 'Nitrate', 'Phosphate', 'Silicate', 'Salinity', 'Year']]

In [None]:
df2

In [None]:
bio_niskin[bio_niskin.isna().any(axis=1)]

In [None]:
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler

df = df2
features = ['pH','Temperature', 'temperature', 'Chlorophyll A', 'Nitrate', 'Phosphate', 'Silicate', 'Salinity']
X = df[features]

pca = PCA(n_components=2)
components = pca.fit_transform(X)

loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

fig = px.scatter(components, x=0, y=1, color=df['Year'])

for i, feature in enumerate(features):
    fig.add_annotation(
        ax=0, ay=0,
        axref="x", ayref="y",
        x=loadings[i, 0],
        y=loadings[i, 1],
        showarrow=True,
        arrowsize=2,
        arrowhead=2,
        xanchor="right",
        yanchor="top"
    )
    fig.add_annotation(
        x=loadings[i, 0],
        y=loadings[i, 1],
        ax=0, ay=0,
        xanchor="center",
        yanchor="bottom",
        text=feature,
        yshift=5,
    )
fig.show()

In [None]:
#restrict to only years of interest 2019
dfo_md_14 = dfo_md[dfo_md.year_time == 2014]
dfo_md_14.loc[:,'year'] = "BB14" #add this column to match with metadata for QIIME2

In [None]:
dfo_14 = dfo_md_14.copy()

In [None]:
#restrict to only pressures 1,5,10,60
dfo_pressure_1 = dfo_14[dfo_14.pressure == 1.0]
dfo_pressure_1.loc[:,'depth_code'] = "A"

dfo_pressure_5 = dfo_14[dfo_14.pressure == 5.0]
dfo_pressure_5.loc[:,'depth_code'] = "B"

dfo_pressure_10 = dfo_14[dfo_14.pressure == 10.0]
dfo_pressure_10.loc[:,'depth_code'] = "C"

dfo_pressure_60 = dfo_14[dfo_14.pressure == 60.0]
dfo_pressure_60.loc[:,'depth_code'] = "D"

In [None]:
dfo_14 = pd.concat([dfo_pressure_1,dfo_pressure_5,dfo_pressure_10,dfo_pressure_60])

In [None]:
dfo_14 = dfo_14.sort_values(by="time_string")
dfo_14

In [None]:
dfo_14.shape

In [None]:
dfo_19 = dfo_19.reset_index()

In [None]:
dfo_19.loc[dfo_19['month_time'] == 2]

In [None]:
dfo_14.columns

In [None]:
dfo_19.loc[26.5] = 'Nan', 'Nan', 2019, 2, 20, 51, 'Nan', 60.0, 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan','Nan', 'Nan', 'Nan', 'Nan', 'Nan','Nan', 'Nan', 'Nan', 'Nan', 'Nan','Nan', 'Nan', 'Nan', 'Nan', 'Nan','BB19', 'D' #insert a column for the missing sample

In [None]:
dfo_14 = dfo_14.sort_index().reset_index(drop=True)

In [None]:
#add week number
week = np.arange(2,52)
week = np.repeat(week, 4)

In [None]:
dfo_14['week'] = week

In [None]:
dfo_14

In [None]:
#how to join them:

#We'll make a new list to be our new column.
new_labels = []

#we can iterate through rows with this.
for _, row in dfo_14.iterrows():
    #we can get the column value of each row by indexing on the column name
    year = row['year']
    depth_code = row['depth_code']
    week = row['week']
    #we use string formatting here to make a new label
    new_label = "{0}-{1}{2}".format(year, week, depth_code)
    #add the new label to the list
    new_labels.append(new_label)
#this will make a new column in the df called 'NewLabel' with the values
dfo_14['name'] = new_labels
dfo_14

In [None]:
#how to join them:

#We'll make a new list to be our new column.
new_labels = []

#we can iterate through rows with this.
for _, row in bb_md.iterrows():
    #we can get the column value of each row by indexing on the column name
    depth_code = row['Depth_Code']
    week = row['Week#']
    #we use string formatting here to make a new label
    new_label = "BB19-{1}{2}".format(year, week, depth_code)
    #add the new label to the list
    new_labels.append(new_label)
#this will make a new column in the df called 'NewLabel' with the values
bb_md['name'] = new_labels
bb_md

In [None]:
dfo_14 = dfo_metadata_y14.copy()

In [None]:
dfo_14.to_csv('dfo_metadata_y14.csv', index=False)

In [None]:
#merge 2 metadata files
metadata_bb = pd.merge(bb_md, dfo_14, on = 'name',how='outer')
metadata_bb

In [None]:
sns.relplot(data=metadata_bb, x="week", y="temperature", hue='pressure')

In [None]:
metadata_bb2 = metadata_bb.fillna(0)

In [None]:
metadata_bb2["week"] = pd.to_numeric(metadata_bb2["week"])

In [None]:
metadata_bb2["temperature"] = pd.to_numeric(metadata_bb2["temperature"])

In [None]:
sns.set_style("whitegrid", {'axes.grid' : False})
g = sns.relplot(
    data=metadata_bb2, kind="line",
    x="week", y="temperature", hue="pressure", height = 8, aspect = 1.25
)
g.set(xticks=np.arange(0,52,5))
plt.axvline(36, ls='--')

In [None]:
metadata_bb

In [None]:
biomtable19 = pd.read_csv("/Users/Diana/Documents/escuela/phd/bb_data/2019/dada2_output_270210_exported/table_from_biom.txt", sep='\t')
classification = pd.read_csv("/Users/Diana/Documents/escuela/phd/bb_data/2019/taxa_270210/taxonomy.tsv", sep='\t')

In [None]:
biomtable14 = pd.read_csv("/Users/Diana/Documents/escuela/phd/bb_data/2014/dada2_output_exported/feature-table.tsv", sep='\t')
classification14 = pd.read_csv("/Users/Diana/Documents/escuela/phd/bb_data/2014/taxa/taxonomy.tsv", sep='\t')

In [None]:
biomtable14.rename({'#OTU ID': 'Feature ID'}, axis=1, inplace=True)

In [None]:
biomtable14

In [None]:
meltedbiom = pd.melt(frame=biomtable14, id_vars = 'Feature ID', var_name = 'sampleid', value_name = 'Rel Abun')

In [None]:
mergedmelty = pd.merge(meltedbiom, classification, on = 'Feature ID',how='inner')

In [None]:
merged = mergedmelty.set_index('Taxon')

In [None]:
merged = merged.drop(columns=['Confidence', 'Feature ID'])

In [None]:
merged

In [None]:
merged.loc[:,'Total'] = merged.sum(axis=1)
merged

In [None]:
groupedbytaxon = merged.groupby(['Taxon']).sum()

In [None]:
groupformelty = groupedbytaxon.reset_index()

In [None]:
totals = groupformelty[['Taxon', 'Total']]

In [None]:
top20 = groupedbytaxon.nlargest(20,'Total')

In [None]:
top5 = groupedbytaxon.nlargest(5, 'Total')

In [None]:
rare5 =  groupedbytaxon.nsmallest(5, 'Total')

In [None]:
rare5.index

In [None]:
top20 = top20.drop(columns=['Total'])

In [None]:
top20T = top20.T
top20T['sampleid'] = top20T.index

In [None]:
top20T.columns

In [None]:
df_long.pivot_table(index=["sampleid"], 
                    columns='class', 
                    values='grade')

In [None]:
df_long.pivot_table(index=["student", "school"], 
                    columns='class', 
                    values='grade')

In [None]:
withmeta = pd.merge(top20T, metadata_bb, on = 'sampleid',how='inner')

In [None]:
withmetamelty = pd.merge(mergedmelty, metadata_bb, on='sampleid', how='inner')

In [None]:
withmetamelty = pd.merge (withmetamelty, totals, on = 'Taxon', how='inner')

In [None]:
withmetamelty

In [None]:
top20 = ['d__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Chitinophagales; f__Saprospiraceae; g__Lewinella; s__uncultured_marine',
       'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Planktomarina',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Oceanospirillales; f__Nitrincolaceae; g__uncultured; s__uncultured_marine',
       'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__SAR11_clade; f__Clade_I; g__Clade_Ia',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Thiomicrospirales; f__Thioglobaceae',
       'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Amylibacter',
       'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Crocinitomicaceae; g__Fluviicola',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Flavobacteriaceae; g__uncultured; s__uncultured_marine',
       'd__Bacteria; p__Planctomycetota; c__OM190; o__OM190; f__OM190; g__OM190; s__marine_metagenome',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Thiomicrospirales; f__Thioglobaceae; g__SUP05_cluster',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Cryomorphaceae; g__uncultured',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Flavobacteriaceae',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Flavobacteriaceae; g__Tenacibaculum',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Flavobacteriaceae; g__Ulvibacter; s__uncultured_marine',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Flavobacteriaceae; g__NS5_marine_group',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Flavobacteriaceae; g__Polaribacter',
       'd__Bacteria; p__Nitrospinota; c__Nitrospinia; o__Nitrospinales; f__Nitrospinaceae; g__LS-NOB',
       'd__Bacteria; p__Planctomycetota; c__Planctomycetes; o__Planctomycetales; f__Rubinisphaeraceae; g__Fuerstia; s__uncultured_bacterium',
       'd__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Flavobacteriales; f__Flavobacteriaceae; g__NS3a_marine_group']


In [None]:
top5 = ['d__Bacteria; p__Bacteroidota; c__Bacteroidia; o__Chitinophagales; f__Saprospiraceae; g__Lewinella; s__uncultured_marine',
       'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhodobacterales; f__Rhodobacteraceae; g__Planktomarina',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Oceanospirillales; f__Nitrincolaceae; g__uncultured; s__uncultured_marine',
       'd__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__SAR11_clade; f__Clade_I; g__Clade_Ia',
       'd__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Thiomicrospirales; f__Thioglobaceae']

In [None]:
mdtop20 = withmetamelty[withmetamelty['Taxon'].isin(top20)]

In [None]:
mdtop5 = withmetamelty[withmetamelty['Taxon'].isin(top5)]

In [None]:
sns.set_style("whitegrid", {'axes.grid' : False})
g = sns.relplot(
    data=mdtop5, kind="line",
    x="week", y="Rel Abun", hue="Taxon", height = 8, aspect = 1.25
)
g.set(xticks=np.arange(0,52,5))
plt.axvline(36, ls='--')

In [None]:
mdtop5

In [None]:
metadata_bb

In [None]:
mdtop5.pressure.unique()

In [None]:
mdtop5['pressure'] = mdtop5['pressure'].replace('nan', 0.0)

In [None]:
mdtop5_dA = mdtop5[mdtop5.depth_code == 'A']

In [None]:
fig = px.line(mdtop5_dA, x="week", y="Rel Abun", color='Taxon')
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
fig.show()

In [None]:
fig = px.scatter(mdtop5_dA, x="week", y="temperature", color='Season')
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
fig.show()

In [None]:
fig = px.scatter(mdtop5_dA, x="week", y="oxygen", color='Season')
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
fig.show()

In [None]:
mdtop5_dA

In [None]:
withmetamelty.to_csv('w_md_melted.csv', index=False)

In [None]:
withmeta.columns