# PATHWAY ABUNDANCE MANIPULATION

This file focuses only on calculating relative pathway abundance, adding metadata, and cleaning up the data. 

This pipeline was last edited by Yu Han Daisy Wang on 28 August 2023.

## step 0: load all relevant/needed packages

In [29]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib import font_manager

---
## IF YOU'VE **ALREADY RUN** THIS PIPELINE AND HAVE DATA THAT'S ALREADY CLEANED UP, START HERE
Use the following bit to load all of your data that's already been cleaned up

---
## IF YOU **HAVEN'T ALREADY RUN** THIS PIPELINE/YOU WANT TO CHANGE SOMETHING ABOUT THE DATA PROCESSING, START HERE
This starts the data processing from scratch!

### load all necessary data for starting from scratch

In [3]:
# bowtie hit summaries, compiled across all pathways
compiled_bt_hit_summaries_allpathways = pd.read_csv(
    "compiled_bt_hit_summaries_allpathways.csv"
)

# gene lengths, indexed by gene name
allpathways_gene_catalogue_seqlengths = pd.read_csv(
    "allpathways_gene_catalogue_seqlengths.csv"
).set_index("gene")

# gene information: the first column of the CSV is discarded and the table
# is indexed by its "strain_gene" column
allpathways_gene_info = (
    pd.read_csv("allpathways_genesInCatalogue_long.csv")
    .iloc[:, 1:]
    .set_index("strain_gene")
)

### gene length correction (new, using formula proposed by RC, step 1 of it)

`gene_length_correction_new` performs the first half of the gene length correction process. Specifically, it does $hits\;of\;gene \cdot length(gene)$. 

For reference, the full formula for our new gene length correction method is given as follows: $$\frac{hits\;of\;gene \cdot length(gene)}{hits\;of\;rplB \cdot length(rplB)}$$

In [30]:
def gene_length_correction_new(gene_catalogue_seqlengths, compiled_bt_hit_summaries):

    gene_length_df = gene_catalogue_seqlengths

    df = compiled_bt_hit_summaries.drop(["pathway"], axis=1).set_index("read_accession")

    new = df.copy()

    # first replace the values in new with the gene length of that gene
    for gene in compiled_bt_hit_summaries:
        
        if gene in gene_length_df.index:
        
            gene_length = gene_length_df.loc[gene].at["length"]

            new[gene] = [gene_length] * len(new)

    # equivalent to hits of gene * length(gene)
    new = df.multiply(new)

    return new

### running the code for gene length correction

In [31]:
# first half of the length correction (hits * gene length) on the all-pathway hit table
compiled_bt_hit_summaries_all_pathways_length_corrected = gene_length_correction_new(
    allpathways_gene_catalogue_seqlengths,
    compiled_bt_hit_summaries_allpathways,
)

### useful setup for calculating pathway abundances

There's not much being done here, just some pure setup stuff that can be useful later.

In [32]:
# One row per pathway: (raw pathway name, abbreviation, pathway length).
# Both lookup dictionaries below are derived from this single table so the
# two can never disagree about which abbreviations exist.
_PATHWAY_TABLE = (
    ("acetylCoA_buk", "Ace (buk)", 6),
    ("acetylCoA_but", "Ace (but)", 5),
    ("aminobutyrate_buk", "4-Ami (buk)", 6),
    ("aminobutyrate_but", "4-Ami (but)", 5),
    ("glutarate_buk", "Glu (buk)", 6),
    ("glutarate_but", "Glu (but)", 5),
    ("lysine", "Lys", 7),
    ("sodium-pumping decarboxylase", "SP", 4),
    ("Wood-Werkman Cycle", "WWC", 4),
    ("acrylate pathway", "Acr", 3),
    ("propanediol pathway", "Pro", 5),
)

# raw pathway name -> abbreviation
abbreviations_dict = {raw_name: short for raw_name, short, _ in _PATHWAY_TABLE}

# abbreviation -> pathway length
pathway_length_dict = {short: length for _, short, length in _PATHWAY_TABLE}

### calculating pathway abundances

The function `add_genes_in_catalogue` merges information about genes onto the hit table.

In [33]:
def add_genes_in_catalogue(hit_table, gene_info, gene_catalogue_seqlengths):
    """Transpose the hit table and attach gene annotations and gene lengths.

    The hit table is transposed and left-merged on its index with
    ``gene_info`` and then with ``gene_catalogue_seqlengths``. The columns
    ``strain``, ``pathway``, ``gene``, ``strain_pathway`` and ``length`` are
    moved to positions 0-4; every other column keeps its relative order.
    """
    annotated = hit_table.transpose().merge(
        gene_info, how="left", left_index=True, right_index=True
    )

    # purely cosmetic: pull the annotation columns to the front, in this order
    for position, column in enumerate(("strain", "pathway", "gene", "strain_pathway")):
        annotated.insert(position, column, annotated.pop(column))

    annotated = annotated.merge(
        gene_catalogue_seqlengths, how="left", left_index=True, right_index=True
    )

    # the length column goes right after the four annotation columns
    annotated.insert(4, "length", annotated.pop("length"))

    return annotated

In [34]:
# attach gene annotations and gene lengths to the length-corrected hit table
final_hit_table = add_genes_in_catalogue(
    compiled_bt_hit_summaries_all_pathways_length_corrected,
    allpathways_gene_info,
    allpathways_gene_catalogue_seqlengths,
)

### part 2 of length correction: dividing by number of hits for rplB

In [35]:
def length_correction_denominator(hit_table):
    """Divide every sample column by that sample's summed housekeeping signal.

    This is the second half of the length correction: each sample column is
    divided by the sum, over all rows whose ``strain_pathway`` is
    ``"housekeeping"``, of that same column.

    Parameters
    ----------
    hit_table : pandas.DataFrame
        One row per gene, with the annotation columns ``strain``, ``pathway``,
        ``gene``, ``strain_pathway`` and ``length``; every other column is
        treated as a sample.

    Returns
    -------
    pandas.DataFrame
        A corrected copy. ``hit_table`` itself is left untouched (the earlier
        version bound ``hit_table.copy`` without calling it and therefore
        modified the caller's table in place).

    Raises
    ------
    KeyError
        If no row is labelled ``"housekeeping"``.
    """
    not_samples = ["strain", "pathway", "gene", "strain_pathway", "length"]
    sample_columns = [column for column in hit_table.columns if column not in not_samples]

    housekeeping_rows = hit_table["strain_pathway"] == "housekeeping"
    if not housekeeping_rows.any():
        raise KeyError("housekeeping")

    # per-sample denominator of the length correction
    rplB_sum = hit_table.loc[housekeeping_rows, sample_columns].sum()

    new = hit_table.copy()

    if sample_columns:
        # divide all sample columns at once, aligned on the sample names
        new[sample_columns] = new[sample_columns].div(rplB_sum, axis="columns")

    return new

In [36]:
# second half of the length correction: divide by the housekeeping (rplB) signal
final_hit_table = length_correction_denominator(
    final_hit_table
)

### this step is to normalise by proportion for the unknown samples

The formula for this normalisation is given as follows. Let $x$ be the abundance assigned to an ambiguous (mixed) group, let $S_1, \dots, S_n$ be the abundances of the $n$ known pathways that the group could belong to, and let $S_k$ be the known pathway we are correcting. The ambiguous abundance is split in proportion to the known abundances, and the share is added to what $S_k$ already has:

$$actual\;amount\;of\;S_k = S_k + \frac{S_k}{\sum_{i=1}^{n} S_i} \cdot x$$

A function to perform this normalisation is given below.

In [37]:
def normalise_unknown(df):
    """Redistribute ambiguous pathway abundance onto the candidate pathways.

    Rows whose label contains ``" or "`` (for example
    ``"acetylCoA_buk or lysine"``) hold abundance that could belong to any of
    the pathways named in the label. For every sample, that abundance is split
    between the named pathways in proportion to their own abundance in the
    sample and added on top of it.

    Parameters
    ----------
    df : pandas.DataFrame
        Pathways as the index, samples as the columns, numeric values. Every
        pathway named in an ambiguous label must itself be a row of ``df``.

    Returns
    -------
    pandas.DataFrame
        A float copy of ``df`` with the ambiguous rows removed. ``df`` is not
        modified.

    Raises
    ------
    KeyError
        If an ambiguous label names a pathway that is not a row of ``df``.

    Notes
    -----
    Compared with the earlier cell-by-cell loop:

    * a zero in one ambiguous row no longer stops processing of the remaining
      ambiguous rows for that sample (``break`` behaved like that);
    * shares coming from several ambiguous rows that name the same pathway
      accumulate instead of overwriting one another;
    * a candidate pathway with zero abundance no longer raises ``KeyError``
      in three-way groups, it simply receives a zero share;
    * values are written directly into the result rather than through a
      chained ``.loc[...].at[...]`` assignment;
    * candidates are read from the label itself (split on ``" or "``), which
      gives exactly the same lists as the former hard-coded dictionary.

    NOTE(review): as before, nothing is redistributed when fewer than two
    candidates are non-zero in a sample, so that ambiguous abundance is
    dropped. Confirm this is intended.
    """
    ambiguous_pathways = [pathway for pathway in df.index if " or " in pathway]

    # work on a float copy so fractional shares can always be stored
    new = df.astype(float)

    for pathway in ambiguous_pathways:

        components = pathway.split(" or ")

        unknown_hits = df.loc[pathway]        # one value per sample
        component_hits = df.loc[components]   # candidates x samples

        # redistribute only where there is something to split and more than
        # one candidate is actually present in the sample
        contributing = (component_hits != 0).sum(axis=0)
        eligible = (unknown_hits != 0) & (contributing > 1)

        # zero-valued candidates add nothing to the denominator
        denominator = component_hits.sum(axis=0, skipna=False)

        share = component_hits.div(denominator, axis="columns").mul(unknown_hits, axis="columns")

        # also clears the 0/0 results of samples where every candidate is zero
        share.loc[:, ~eligible] = 0.0

        new.loc[components] = new.loc[components] + share

    return new.drop(ambiguous_pathways)

Now, actually apply the `normalise_unknown` function to the actual data.

First, perform some preprocessing of the data

In [38]:
# total corrected abundance per strain_pathway, keeping only the sample columns
# and leaving out the housekeeping row
overall_pathway_group = (
    final_hit_table.groupby(["strain_pathway"])
    .sum()
    .drop(columns=["strain", "pathway", "gene", "length"])
    .drop(index=["housekeeping"])
)

Then, actually implement everything. The code for parsing out the unknown proportions is not very well written, so this section might take a long time to run, as long as just under 10 minutes. For context, the last time I tried to run this chunk, it took me 7 mins and 31 seconds.

In [39]:
# redistribute the ambiguous groups, then flip the table so samples are rows
normalised_unknown_overall_pathway_group = normalise_unknown(overall_pathway_group).T

# purely aesthetic: fix the column order so that the butyrate pathways and the
# propionate pathways sit next to each other
normalised_unknown_overall_pathway_group = normalised_unknown_overall_pathway_group.reindex(
    columns=[
        "acetylCoA_buk",
        "acetylCoA_but",
        "aminobutyrate_buk",
        "aminobutyrate_but",
        "glutarate_buk",
        "glutarate_but",
        "lysine",
        "sodium-pumping decarboxylase",
        "Wood-Werkman Cycle",
        "acrylate pathway",
        "propanediol pathway",
    ]
)

### fixing column names to the abbreviations

this next step just changes things from the raw names to the abbreviations established earlier. It does this by looping through the `abbreviations_dict` established earlier. 

In [40]:
# swap every raw pathway name for its abbreviation in one in-place rename;
# the lookup still fails loudly (KeyError) for a column with no abbreviation
normalised_unknown_overall_pathway_group.rename(
    columns={
        raw_name: abbreviations_dict[raw_name]
        for raw_name in normalised_unknown_overall_pathway_group.columns
    },
    inplace=True,
)

### normalisation with respect to the length of each pathway

In [41]:
# divide each pathway's abundance by the length of that pathway; pathways
# that are not columns of the table are skipped
for abbreviation, pathway_length in pathway_length_dict.items():

    if abbreviation in normalised_unknown_overall_pathway_group.columns:

        normalised_unknown_overall_pathway_group[abbreviation] /= pathway_length

### now add what the actual final pathways are

This part simply goes through the index of the given table to figure out what each pathway produces (propionate, `PROP`, or butyrate, `BTR`), then adds this result to the table as a new first column.

The input table should have the **pathways** as the index, and the **samples** as the columns.

In [42]:
def find_pathway(df):
    """Label every pathway row with the product it leads to.

    Parameters
    ----------
    df : pandas.DataFrame
        Abbreviated pathway names as the index, samples as the columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with a new first column ``"overall_pathway"`` holding
        ``"PROP"`` for propionate pathways and ``"BTR"`` for butyrate pathways.

    Raises
    ------
    ValueError
        If the index contains a label that is in neither pathway list. Before,
        such a label produced a length-mismatch error from the insert that did
        not say which label was at fault.
    """
    prop_pathways = ["SP", "WWC", "Pro", "Acr"]

    butyrate_pathways = ["Ace (buk)", "4-Ami (buk)", "Lys", "Glu (but)", "Glu (buk)", "Ace (but)", "4-Ami (but)"]

    product_of = {path: "PROP" for path in prop_pathways}
    product_of.update({path: "BTR" for path in butyrate_pathways})

    unrecognised = [path for path in df.index if path not in product_of]
    if unrecognised:
        raise ValueError(f"pathways with no known product: {unrecognised}")

    new = df.copy()

    new.insert(0, "overall_pathway", [product_of[path] for path in new.index])

    return new

Apply the function `find_pathway` to the table `normalised_unknown_overall_pathway_group`

In [43]:
# label each pathway as PROP or BTR, then total the abundances per label
prop_but_groupby = (
    find_pathway(normalised_unknown_overall_pathway_group.transpose())
    .groupby(["overall_pathway"])
    .sum()
)

### add the stuff about metadata + species 

This code is taken directly from August Burton, who I believe might have first gotten this code from Rebecca Christesen (RC)? Either way, all this code does is get you one final table that you need to care about (`relab2`), which contains taxonomical information for what I believe to be most, if not all, samples from the curated metagenomics dataset. 

In [44]:
# Load curated microbiome data
# The original data is provided in R. Only the abundance data is used here, read
# from CSV files in the subfolder data_curated_microbiome. The abundance levels
# were extracted from metagenomics using https://github.com/biobakery/MetaPhlAn.

# big table with the abundance of different species across all samples of the
# data collection; the unnamed first column becomes "tax_identifier"
relab = pd.read_csv("data_curated_microbiome/relabundance.csv").rename(
    columns={"Unnamed: 0": "tax_identifier"}
)

# information about samples; the unnamed first column becomes "sample"
colnames = pd.read_csv("data_curated_microbiome/relabundance_colData.csv").rename(
    columns={"Unnamed: 0": "sample"}
)

# information about the different species detected in the different samples
rownames = pd.read_csv("data_curated_microbiome/relabundance_rowData.csv").rename(
    columns={"Unnamed: 0": "tax_identifier"}
)

# add species information to the major data table (used for groupby analysis
# later on): rows = species, columns = samples + species info
relab2 = pd.merge(relab, rownames, on="tax_identifier", how="inner")

  colnames=pd.read_csv("data_curated_microbiome/relabundance_colData.csv")


### Adding metadata into it all

This adds metadata to both the versions where we've grouped by prop vs btr already (`with_metadata_prop_but_normalised`), or we're still at the pathway level (`with_metadata_overall_group_normalised`). 

In [45]:
# sample metadata, keyed by NCBI accession
metadata = pd.read_csv(
    "data_curated_microbiome/nayfach_asnicar_hmp_metadata.csv"
).set_index("NCBI_accession")

# samples as rows, PROP/BTR totals as columns
temp = prop_but_groupby.T

# keep every sample of the abundance table, with metadata attached where available
with_metadata_prop_but_normalised = pd.merge(
    temp, metadata, how="left", left_index=True, right_index=True
)

with_metadata_prop_but_normalised.to_csv(
    "output/with_metadata_prop_but_normalised.csv"
)

In [46]:
# same as above, but at the level of the individual pathways
temp = normalised_unknown_overall_pathway_group

with_metadata_overall_group_normalised = pd.merge(
    temp, metadata, how="left", left_index=True, right_index=True
)

with_metadata_overall_group_normalised.to_csv(
    "output/with_metadata_overall_group_normalised.csv"
)

## **RELOAD DATA HERE! RELOAD THE FIRST VERSION OF DATA METADATA!!**

In [47]:
# Reload the two tables saved by the metadata step.
# NOTE(review): both are re-indexed by the column pandas names "Unnamed: 0.1"
# on reload; confirm that this is the intended sample identifier column.
with_metadata_prop_but_normalised = pd.read_csv("output/with_metadata_prop_but_normalised.csv").set_index("Unnamed: 0.1")

# the path now carries the ".csv" extension the file is saved with
with_metadata_overall_group_normalised = pd.read_csv("output/with_metadata_overall_group_normalised.csv").set_index("Unnamed: 0.1")

### now lets normalise for percentage

Both of these functions simply convert everything from regular stuff to percentages.

`percentageNormaliseOne` does normalisation correctly if there is no metadata attached. It requires the given data to have the **pathways** as the index, and the **samples** as the columns.

`percentageNormaliseTwo` does normalisation correctly only if there is metadata attached. It requires the given data to have the **samples** as the index, and **pathways** as the columns. It also accounts for the case of dividing by 0, in which it just fills in `np.nan`.

In [48]:
def percentageNormaliseOne(df):
    """Convert each sample's values to fractions of that sample's total.

    Parameters
    ----------
    df : pandas.DataFrame
        Pathways as the index, samples as the columns, no metadata attached.

    Returns
    -------
    pandas.DataFrame
        A new table of the same shape in which every value has been divided
        by the sum of its column. ``df`` is not modified.
    """
    summed = df.sum(axis=0)

    # one aligned division for the whole table instead of assigning row by row
    return df.div(summed, axis="columns")

def percentageNormaliseTwo(df, pathways):
    """Add a ``<pathway>_percent`` column for each listed pathway.

    Each new column holds the pathway's value divided by the row-wise sum of
    all listed pathways. Results that are not finite (division by zero) are
    stored as ``np.nan``.

    Parameters
    ----------
    df : pandas.DataFrame
        Samples as the index; must contain one column per entry of
        ``pathways``. Other (metadata) columns are left alone. The new
        columns are added to ``df`` itself, as before.
    pathways : list of str
        Names of the columns to normalise. Any number of pathways is
        accepted; the denominator used to be the sum of the first two only.

    Returns
    -------
    pandas.DataFrame
        The transpose of ``df`` after the percent columns have been added.
    """
    # skipna=False so that a missing value makes the total missing too
    summed = df[list(pathways)].sum(axis=1, skipna=False)

    for pathway in pathways:

        percent = df[pathway].div(summed)

        df[pathway + "_percent"] = percent.where(np.isfinite(percent), np.nan)

    return df.transpose()

### Percentage normalisation in action

The following applies the percentage normalisation function to all of the actual data. 

Note: here, I included these two metrics: `BTR % - PROP %` and `BTR - PROP`. I highly advise against using these metrics, as I don't think that they add anything of value to our analysis. The metrics are as follows:
- `BTR % - PROP %` is the percentage of BTR pathways minus the percentage of PROP pathways. 
- `BTR - PROP` is the raw relative abundance of BTR pathways minus the raw relative abundance of PROP pathways.

Otherwise, the percentages are listed under the `BTR_percent` and `PROP_percent` columns.

In [49]:
# percentageNormaliseTwo returns the table transposed, so the pathway and
# percent fields are rows here and can be addressed with .loc
with_metadata_prop_but_percent_change = percentageNormaliseTwo(
    with_metadata_prop_but_normalised, ["BTR", "PROP"]
)

# the two difference metrics: (new row label, minuend row, subtrahend row)
for label, minuend, subtrahend in (
    ("BTR % - PROP %", "BTR_percent", "PROP_percent"),
    ("BTR - PROP", "BTR", "PROP"),
):
    with_metadata_prop_but_percent_change.loc[label] = (
        with_metadata_prop_but_percent_change.loc[minuend]
        - with_metadata_prop_but_percent_change.loc[subtrahend]
    )

# back to samples as rows
with_metadata_prop_but_percent_change = with_metadata_prop_but_percent_change.T

This adds my own binning for age. I felt like the way that binning was done for the data wasn't very useful, so I redid the binning according to NIH style guidelines (https://www.nih.gov/nih-style-guide/age).

In [50]:
age_category_list = []

# Go through every sample's age (and infant age) and bin it accordingly.
# The two columns are walked in step with zip, so each age is paired with the
# infant_age of the same row by position. The earlier version looked the
# infant age up with an integer counter on a label-indexed Series, which only
# works through pandas' deprecated positional fallback.
for age, infant_age in zip(
    with_metadata_prop_but_percent_change["age"],
    with_metadata_prop_but_percent_change["infant_age"],
):

    if age <= 1:
        # a missing infant_age compares False and so falls into "Infant"
        if infant_age <= 30:
            age_category_list.append("Newborn")
        else:
            age_category_list.append("Infant")

    elif age <= 12: age_category_list.append("Child")

    elif age < 18: age_category_list.append("Adolescent")

    elif age < 65: age_category_list.append("Adult")

    elif age >= 65: age_category_list.append("Older Adult")

    # only reached when the age is missing (every comparison above is False)
    else: age_category_list.append(np.nan)

# update the age_category column with the correct values
with_metadata_prop_but_percent_change["age_category"] = age_category_list

# fix the "infant_age" column name to be "Infant Age (days)" instead
with_metadata_prop_but_percent_change = with_metadata_prop_but_percent_change.rename(columns={"infant_age": "Infant Age (days)"})

now save everything

In [51]:
# write the PROP/BTR table with percentages, difference metrics and age bins to disk
with_metadata_prop_but_percent_change.to_csv(
    "output/with_metadata_prop_but_percent_change.csv"
)

## RELOAD THE DATA HERE

In [53]:
# read the saved table back in and index it by its "Unnamed: 0.1" column
with_metadata_prop_but_percent_change = pd.read_csv(
    "output/with_metadata_prop_but_percent_change.csv"
).set_index("Unnamed: 0.1")

In [54]:
# samples whose disease field is "healthy", at pathway level and at PROP/BTR level
healthy_overall_group_noramlised = with_metadata_overall_group_normalised.loc[with_metadata_overall_group_normalised['disease'] == "healthy"]

healthy_prop_but_noramlised = with_metadata_prop_but_percent_change.loc[with_metadata_prop_but_percent_change['disease'] == "healthy"]

# healthy samples aged 18 or over
adult_healthy_overall_group_normalised = healthy_overall_group_noramlised.loc[healthy_overall_group_noramlised["age"] >= 18]

# Built from the healthy subset, mirroring the pathway-level table above.
# It used to be filtered from the full table, which kept adults of every
# disease status in a table that is used as the healthy-adult set afterwards.
adult_health_prop_but_normalised = healthy_prop_but_noramlised.loc[healthy_prop_but_noramlised["age"] >= 18]

### add the taxonomical information in

This code is taken directly from August Burton, who I believe might have first gotten this code from Rebecca Christesen (RC)? Either way, all this code does is get you one final table that you need to care about (`relab2`), which contains taxonomical information for what I believe to be most, if not all, samples from the curated metagenomics dataset. 

In [55]:
# Load curated microbiome data
# The original data is provided in R. Only the abundance data is used here, read
# from CSV files in the subfolder data_curated_microbiome. The abundance levels
# were extracted from metagenomics using https://github.com/biobakery/MetaPhlAn.

# big table with the abundance of different species across all samples of the
# data collection; the unnamed first column becomes "tax_identifier"
relab = pd.read_csv("data_curated_microbiome/relabundance.csv").rename(
    columns={"Unnamed: 0": "tax_identifier"}
)

# information about samples; the unnamed first column becomes "sample"
colnames = pd.read_csv("data_curated_microbiome/relabundance_colData.csv").rename(
    columns={"Unnamed: 0": "sample"}
)

# information about the different species detected in the different samples
rownames = pd.read_csv("data_curated_microbiome/relabundance_rowData.csv").rename(
    columns={"Unnamed: 0": "tax_identifier"}
)

# add species information to the major data table (used for groupby analysis
# later on): rows = species, columns = samples + species info
relab2 = pd.merge(relab, rownames, on="tax_identifier", how="inner")

  colnames=pd.read_csv("data_curated_microbiome/relabundance_colData.csv")


In [56]:
# this takes only the phylum level data, as currently that's what we care the most about
# however, please feel free to modify this as needed/as you would like
# (totals per phylum, with the remaining taxonomy columns removed)
phylum_df = (
    relab2.groupby("phylum")
    .sum()
    .drop(columns=["superkingdom", "class", "order", "family", "genus", "species", "tax_identifier"])
)

# samples as rows, phyla as columns
temp = phylum_df.T

# metadata without the listed fields, keyed by the unnamed first column of the CSV
metadata = (
    pd.read_csv("data_curated_microbiome/nayfach_asnicar_hmp_metadata.csv")
    .drop(columns=["infant_age", "age_category", "gender", "country", "non_westernized", "BMI"])
    .set_index("Unnamed: 0")
)

# every metadata row, with phylum abundances attached where available,
# re-keyed by NCBI accession
with_phylum = (
    pd.merge(metadata, temp, how="left", left_index=True, right_index=True)
    .reset_index()
    .set_index("NCBI_accession")
)

with_phylum.to_csv("output/with_phylum.csv")

# get only Bacteroidetes and Firmicutes (Firmicutes first), phyla as rows
bacteroidetes_vs_firmicutes = with_phylum[["Firmicutes", "Bacteroidetes"]].T

# the same two phyla, restricted to healthy samples aged 18 or over
adult_healthy_bacteroidetes_vs_firmicutes = with_phylum.loc[
    (with_phylum["age"] >= 18) & (with_phylum["disease"] == "healthy"),
    ["Firmicutes", "Bacteroidetes"],
].T

adult_healthy_bacteroidetes_vs_firmicutes

In [14]:
# healthy adult samples as rows; any row with a missing value (including a
# missing accession) is dropped before the accession becomes the index again
valid_samples = (
    adult_healthy_bacteroidetes_vs_firmicutes.T
    .reset_index()
    .dropna()
    .set_index("NCBI_accession")
)

In [18]:
# only the two abundance columns are normalised
temp = adult_health_prop_but_normalised.loc[:, ["BTR", "PROP"]]

# percentageNormaliseOne wants pathways as rows and samples as columns
adult_health_prop_but_percent = percentageNormaliseOne(temp.T)

In [23]:
# BTR/PROP fractions per sample, with the phylum abundances of the valid samples attached
temp_merged = pd.merge(
    adult_health_prop_but_percent.T,
    valid_samples,
    how="left",
    left_index=True,
    right_index=True,
)

# adds Firmicutes_percent / Bacteroidetes_percent; the result comes back transposed
temp_merged = percentageNormaliseTwo(temp_merged, ["Firmicutes", "Bacteroidetes"])

# samples as rows again, ordered by their Firmicutes fraction
temp_merged = temp_merged.T.sort_values(by="Firmicutes_percent")

sorted_adult_healthy_bacteroidetes_vs_firmicutes_percent = temp_merged[
    ["Firmicutes_percent", "Bacteroidetes_percent"]
].T

sorted_adult_healthy_prop_but_percent = temp_merged[["BTR", "PROP"]].T

## random plots testing time

### custom fonts because why not I'm annoying like that

In [22]:
# register every font file found under the local "Hind" directory
font_dirs = ["Hind"]
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

for font_path in font_files:
    font_manager.fontManager.addfont(font_path)

# make Hind the font family used for plots
plt.rcParams["font.family"] = "Hind"

### defining functions to make some basic stacked bar plots

In [None]:
def stackedBarDF(overall_pathway_df):
    """Rebuild the transposed table as a plain DataFrame ready for plotting.

    The input is transposed, its columns are copied out as lists into a new
    DataFrame indexed by the transposed table's index values, and missing
    values are replaced with 0.
    """
    by_sample = overall_pathway_df.transpose()

    column_values = {label: by_sample[label].tolist() for label in by_sample}

    plottingDF = pd.DataFrame(column_values, index=list(by_sample.index.values))

    return plottingDF.fillna(0)

def plotStackedBar(stackedBarDF, name="overall pathway groupby", x_name="samples"):
    """Draw a stacked bar chart of the table, one viridis colour per column.

    ``name`` is used as the plot title and ``x_name`` as the x-axis label; the
    y-axis label is fixed to "relative pathway abundance" and x ticks are
    switched off.

    Side effects: matplotlib's rcParams are first reset to their defaults and
    then the font size (33), figure dpi (600) and font family ("Hind") are
    set. These are global settings and stay in force after the call.

    Returns the value of ``.legend(...)`` called on the plot, i.e. the legend
    rather than the axes.
    """

    # start from matplotlib's default settings so earlier plots don't leak in
    mpl.rcParams.update(mpl.rcParamsDefault)
    # one colour per column, evenly spaced along the viridis colour map
    n = len(stackedBarDF.columns)
    colors = plt.cm.viridis(np.linspace(0, 1, n))

    plt.rcParams.update({'font.size': 33})
    # NOTE(review): this runs before the plot below is created, so it acts on
    # whatever figure is current at this point -- confirm that is intended
    plt.tight_layout()
    plt.rcParams['figure.dpi']=600
    plt.rcParams['font.family'] = "Hind"
    
    # the legend is anchored just outside the right edge, vertically centred
    return stackedBarDF.plot(kind="bar", stacked=True, color=colors, figsize=(25,7), xlabel=x_name, ylabel="relative pathway abundance", title=name, xticks=([])).legend(loc="center left", bbox_to_anchor=(1, 0.5))
    
def sortedStackedBar(stackedBarDF, sortBy, auto_name=False, x_name="samples"):
    """Sort the table by ``sortBy`` and draw it with ``plotStackedBar``.

    The title is "Sorted by abundance of <sortBy>" unless ``auto_name`` is
    true, in which case an empty title is passed on. Returns whatever
    ``plotStackedBar`` returns.
    """
    if auto_name:
        name = ""
    else:
        name = "Sorted by abundance of " + sortBy

    ordered = stackedBarDF.sort_values(by=sortBy)

    return plotStackedBar(ordered, name, x_name)

### defining stuff for scatter plots

In [None]:
def plotScatter(scatterDF, x_data, y_data):
    """Show a scatter plot of column ``y_data`` against column ``x_data``.

    Sets the global font size (23), figure dpi (600) and the seaborn
    "darkgrid" style before plotting, then displays the figure. Returns
    nothing.

    NOTE(review): the title always says "1203 healthy adult samples",
    whatever table is passed in -- confirm the number for other inputs.
    """

    plt.rcParams.update({'font.size': 23})
    # NOTE(review): called before the plot is created, so it acts on the
    # figure that is current at this point -- confirm intent
    plt.tight_layout()
    plt.rcParams['figure.dpi']=600
    sns.set_style("darkgrid")

    name = "Proportions of " + x_data + " producers versus\n" + y_data + " producers for 1203 healthy adult samples"

    # the unused `filename` and `fig` locals of the earlier version are gone
    scatterDF.plot(kind="scatter", x=x_data, y=y_data, figsize=(15,6),colormap="viridis", xlabel="relative abundance of " + x_data, ylabel="relative abundance of " + y_data, title=name)

    plt.show()

def plotScatterContinuous(scatterDF, x_data, y_data, c_data):
    """Scatter ``y_data`` against ``x_data``, coloured by the column ``c_data``.

    Rows with a missing ``c_data`` value are left out of the plot (the
    caller's table is not changed). Sets the global font size (20), figure
    dpi (600) and the seaborn "darkgrid" style. Returns nothing.
    """

    scatterDF = scatterDF.dropna(subset=[c_data], axis=0)

    plt.rcParams.update({'font.size': 20})
    # NOTE(review): called before the plot is created, so it acts on the
    # figure that is current at this point -- confirm intent
    plt.tight_layout()
    plt.rcParams['figure.dpi']=600
    sns.set_style("darkgrid")

    name = x_data + " vs " + y_data + " with respect to " + c_data

    # the unused `fig` local of the earlier version is gone
    scatterDF.plot(kind="scatter", x=x_data, y=y_data, c=c_data, colormap="viridis", alpha=0.7, figsize=(15,5), xlabel="relative abundance of " + x_data, ylabel="relative abundance of " + y_data, title=name)

def plotScatterDiscrete(scatterDF, x_data, y_data, c_data):
    """Show a seaborn scatter plot of ``y_data`` against ``x_data``, with the
    points coloured by the column ``c_data`` (passed as ``hue``).

    Resets seaborn's defaults, then sets the "darkgrid" style, font size 20,
    figure dpi 600 and a palette taken from ``wesanderson``. Displays the
    figure and returns nothing.
    """

    name = x_data + " vs " + y_data + " with respect to " + c_data

    sns.reset_defaults()
    sns.set_style("darkgrid")
    plt.rcParams.update({'font.size': 20})

    plt.rcParams['figure.dpi']=600
    plt.figure(figsize=(14, 5))

    # NOTE(review): `wesanderson` is not imported in the import cell of this
    # notebook; unless it is imported elsewhere this line raises NameError
    sns.set_palette(wesanderson.film_palette("darjeeling"))

    sns.scatterplot(data=scatterDF,x=x_data, y=y_data, hue=c_data, alpha=0.5).set(xlabel="relative abundance of " + x_data, ylabel="relative abundance of " + y_data, title=name)
    plt.show()

### defining stuff for line/scatter plots

In [None]:
def plotScatterLine(df, x_data, y_data, name):
    """Draw a seaborn ``regplot`` of ``y_data`` against ``x_data`` with a
    lowess fit and jittered x values, titled ``name``.

    Resets seaborn's defaults, then sets the "darkgrid" style, font size 53,
    the "Hind" font family, figure dpi 600 and the viridis palette, and opens
    a new 31x10 figure. Returns the result of the ``.set(...)`` call on the
    plot.

    NOTE(review): the y-axis label is always "Relative Abundance of BTR",
    regardless of ``y_data`` -- confirm this is intended.
    """
    
    sns.reset_defaults()
    sns.set_style("darkgrid")
    plt.rcParams.update({'font.size': 53})
    plt.rcParams['font.family'] = "Hind"
    plt.rcParams['figure.dpi']=600
    sns.set_palette("viridis")
    plt.figure(figsize=(31, 10))
    
    return sns.regplot(data=df, x=x_data, y=y_data, x_jitter=.05, lowess=True, scatter_kws={"s": 500, 'alpha': 0.3}).set(title=name, ylabel="Relative Abundance of BTR")

def plotStrip(df, x_data, y_data, c_data, order_list, name):
    """Draw a seaborn strip plot of ``y_data`` per ``x_data`` category, split
    (dodged) by ``c_data``, with the categories in the order ``order_list``.

    Resets seaborn's defaults, then sets the "darkgrid" style, font size 53,
    the "Hind" font family, figure dpi 600 and the viridis palette, and opens
    a new 31x10 figure. Returns nothing.

    NOTE(review): ``name`` is accepted but not used anywhere in the body, so
    the plot gets no title -- confirm whether a title was intended.
    """
    
    sns.reset_defaults()
    sns.set_style("darkgrid")
    plt.rcParams.update({'font.size': 53})
    plt.rcParams['font.family'] = "Hind"
    plt.rcParams['figure.dpi']=600
    sns.set_palette("viridis")
    plt.figure(figsize=(31, 10))
    
    # the index is turned into an ordinary column before plotting
    temp = df.reset_index()

    # same palette as set above; this second call repeats it
    sns.set_palette("viridis")

    sns.stripplot(data=temp, x=x_data, y=y_data, hue=c_data, order=order_list, dodge=True, alpha=0.4, s=15)

### defining stuff for violin plots

In [None]:
def violinDF(df, category):
    """Reshape the table to long form for violin plots.

    The "butyrate" and "propionate" columns are melted into a "variable"
    column and a "Relative Abundance" column, keyed by the "Unnamed: 0.1"
    index, and the ``category`` column of ``df`` is attached to every row.
    """
    long_form = (
        df.reset_index()
        .melt(id_vars="Unnamed: 0.1", value_vars=["butyrate", "propionate"])
        .rename(columns={"value": "Relative Abundance"})
        .set_index("Unnamed: 0.1")
    )

    category_column = df[[category]]

    return long_form.merge(category_column, how="left", left_index=True, right_index=True)


def plotViolin(df, x_data, y_data, c_data, order_list, split_option, name):
    """Draw a seaborn violin plot of ``y_data`` per ``x_data`` category,
    coloured by ``c_data``, with the categories in the order ``order_list``
    and the title ``name``.

    ``split_option`` is passed straight through as seaborn's ``split``
    argument. Resets seaborn's defaults, then sets the "darkgrid" style, font
    size 53, the "Hind" font family, figure dpi 600 and the reversed viridis
    palette, and opens a new 31x10 figure. Returns nothing.
    """
    
    sns.reset_defaults()
    sns.set_style("darkgrid")
    plt.rcParams.update({'font.size': 53})
    plt.rcParams['font.family'] = "Hind"
    plt.rcParams['figure.dpi']=600
    plt.figure(figsize=(31, 10))
    sns.set_palette("viridis_r")

    # NOTE(review): `bw` may be deprecated in newer seaborn releases in favour
    # of `bw_method` -- check against the installed version
    sns.violinplot(data=df, x=x_data, y=y_data, hue=c_data, order=order_list, cut=0, inner="stick", split=split_option, bw=0.2).set(title=name)