# PATHWAY ABUNDANCE MANIPULATION

This file focuses only on calculating relative pathway abundance, adding metadata, and cleaning up the data. 

This pipeline was last edited by Yu Han Daisy Wang on 28 August 2023.

## step 0: load all relevant/needed packages

In [15]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from matplotlib import font_manager

---
# **RELOAD DATA HERE!**
---


In [16]:
# Reload the tables written by the earlier steps of the pipeline.
# The three metadata tables are keyed by the "Unnamed: 0.1" column of their CSV.
with_metadata_prop_but_normalised = pd.read_csv(
    "output/with_metadata_prop_but_normalised.csv"
).set_index("Unnamed: 0.1")

with_metadata_overall_group_normalised = pd.read_csv(
    "output/with_metadata_overall_group_normalised.csv"
).set_index("Unnamed: 0.1")

with_metadata_prop_but_percent_change = pd.read_csv(
    "output/with_metadata_prop_but_percent_change.csv"
).set_index("Unnamed: 0.1")

# The phylum table is keyed by the NCBI accession of each sample.
with_phylum = pd.read_csv(
    "output/with_phylum.csv"
).set_index("NCBI_accession")

### now let's normalise for percentage

Both of these functions convert raw abundances into proportions of a total. The results are fractions between 0 and 1; multiply by 100 if you need actual percentages.

`percentageNormaliseOne` does normalisation correctly if there is no metadata attached. It requires the given data to have the **pathways** as the index, and the **samples** as the columns.

`percentageNormaliseTwo` does normalisation correctly only if there is metadata attached. It requires the given data to have the **samples** as the index, and **pathways** as the columns. It also accounts for the case of dividing by 0, in which it just fills in `np.nan`.

In [17]:
def percentageNormaliseOne(df):
    """
    Scale every column of the dataframe so that it sums to 1.

    df = dataframe you wish to work with, with the pathways as the index
         and the samples as the columns (no metadata attached)

    Returns a new dataframe with the same labels in which each value has been
    divided by the total of its column. The result therefore holds fractions,
    not values multiplied by 100. `df` itself is not modified. Division by a
    column total of 0 follows the normal pandas rules (NaN / inf).
    """

    column_totals = df.sum(axis=0)

    # A single vectorised division replaces the old row-by-row assignment into
    # a copy of `df`; that assignment tried to store floats into the copy's
    # original dtype, which breaks (or warns) for integer-valued input.
    return df.div(column_totals, axis="columns")

def percentageNormaliseTwo(df, pathways):
    """
    Add one "<pathway>%" column per requested pathway, holding that pathway's
    share of the row-wise total over all requested pathways.

    df = dataframe you wish to work with, with the samples as the index and
         the pathways (plus any metadata) as the columns
    pathways = the pathways that you wish to consider in this normalisation

    The new columns are added to `df` in place (as before), and the transposed
    dataframe is returned. Any share that is not finite (for example when the
    row total is 0) is stored as np.nan.
    """

    pathways = list(pathways)

    # The denominator is the total over *every* requested pathway. Previously
    # only the first two entries were added together, which gave the wrong
    # total for three or more pathways and failed outright for a single one.
    # skipna=False keeps the old behaviour of a missing value making the
    # whole total missing.
    summed = df[pathways].sum(axis=1, skipna=False)

    for pathway in pathways:

        pathway_name = pathway + "%"

        df[pathway_name] = df[pathway].div(summed)

        # inf (x / 0) is replaced by NaN; 0 / 0 is already NaN
        df.loc[~np.isfinite(df[pathway_name]), pathway_name] = np.nan

    return df.transpose()

## Some more last minute data wrangling!

### Here's some stuff about healthy adults

In [18]:
# Healthy samples only (any age).
healthy_overall_group_noramlised = with_metadata_overall_group_normalised.loc[
    with_metadata_overall_group_normalised['disease'] == "healthy"
]

# NOTE(review): the prop/but subsets are built from the *percent change* table
# even though their names say "normalised" - confirm this is the intended source.
healthy_prop_but_noramlised = with_metadata_prop_but_percent_change.loc[
    with_metadata_prop_but_percent_change['disease'] == "healthy"
]

# Healthy samples aged 18 or over.
adult_healthy_overall_group_normalised = healthy_overall_group_noramlised.loc[
    healthy_overall_group_noramlised["age"] >= 18
]

# Built from the healthy subset, mirroring the overall-group table above.
# Previously this filtered the full table by age only, so non-healthy adults
# were included and `healthy_prop_but_noramlised` was never used.
adult_health_prop_but_normalised = healthy_prop_but_noramlised.loc[
    healthy_prop_but_noramlised["age"] >= 18
]

### Here's some stuff about bacteroidetes vs firmicutes

`bacteroidetes_vs_firmicutes` gives you the abundance of bacteroidetes vs the abundance of firmicutes for ALL samples

`adult_healthy_bacteroidetes_vs_firmicutes` gives you the abundance of bacteroidetes vs the abundance of firmicutes for ALL HEALTHY ADULT samples

In [19]:
# Phyla end up as the rows (Firmicutes first) and samples as the columns.
bacteroidetes_vs_firmicutes = (
    with_phylum[["Bacteroidetes", "Firmicutes"]]
    .reindex(["Firmicutes", "Bacteroidetes"], axis="columns")
    .transpose()
)

# Same table, restricted to samples that are both adult (age >= 18) and healthy.
adult_healthy_bacteroidetes_vs_firmicutes = with_phylum[["Bacteroidetes", "Firmicutes", "age", "disease"]]

is_adult = adult_healthy_bacteroidetes_vs_firmicutes["age"] >= 18
is_healthy = adult_healthy_bacteroidetes_vs_firmicutes["disease"] == "healthy"

adult_healthy_bacteroidetes_vs_firmicutes = (
    adult_healthy_bacteroidetes_vs_firmicutes.loc[is_adult & is_healthy]
    .drop(["age", "disease"], axis=1)
    .reindex(["Firmicutes", "Bacteroidetes"], axis="columns")
    .transpose()
)

# display the result in the notebook
adult_healthy_bacteroidetes_vs_firmicutes

NCBI_accession,ERR4330026,ERR4330027,ERR4330028,ERR4330029,ERR4330030,ERR4330031,ERR4330032,ERR4330033,ERR4330034,ERR4330035,...,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7,NaN.8,NaN.9
Firmicutes,27.71216,41.3705,43.70317,33.88667,59.64222,43.99187,31.2887,64.9387,47.43638,19.93554,...,16.59618,10.50896,14.46574,24.15066,31.21463,44.78141,36.312,59.3616,68.22818,35.96867
Bacteroidetes,64.86127,54.68318,34.64965,61.13726,35.83284,48.15292,63.60274,32.86029,36.27032,68.64813,...,80.57261,85.14478,83.6576,75.27369,67.96384,48.82434,59.20388,36.2418,31.06421,59.77691


---
# PLOTTING TIME
---

### This next part just sets up custom fonts given the files for the fonts
I currently have the font set to Hind (https://fonts.google.com/specimen/Hind), but feel free to switch it to anything else. Feel free to also just ignore the custom fonts and use default fonts.

`font_dirs = ["Hind"]`: set `Hind` to the name/path of the folder that you end up using to store all the files for your fonts. The fonts should all be `.ttf` files.

In [8]:
# folder(s) that hold the font files
font_dirs = ["Hind"]
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)

# register each font file that was found with matplotlib's font manager
for font_path in font_files:
    font_manager.fontManager.addfont(font_path)

# make the registered family the default for every plot
plt.rcParams.update({'font.family': "Hind"})

### defining functions to make some basic stacked bar plots

`stackedBarDF` takes a dataframe and makes it suitable for making stacked bar plots

`plotStackedBar` takes a data frame and plots it as a stacked bar plot. It simply takes the given parameters and plots the graph as is.

`sortedStackedBar` takes a data frame, sorts it by the given value (`sortBy`), then plugs all of that information into `plotStackedBar`.

In [20]:
def stackedBarDF(overall_pathway_df):
    """
    Turn a dataframe into the layout used for the stacked bar plots.

    overall_pathway_df = dataframe you wish to use

    The dataframe is transposed (its columns become the rows of the result),
    rebuilt from plain lists of values, and every missing value is replaced
    with 0.
    """
    flipped = overall_pathway_df.transpose()

    # one list of values per column of the transposed table
    values_by_column = {label: flipped[label].tolist() for label in flipped}

    rebuilt = pd.DataFrame(values_by_column, index=list(flipped.index.values))

    return rebuilt.fillna(0)

def plotStackedBar(stackedBarDF, name="overall pathway groupby", x_name="samples"):
    """
    Plot the given dataframe as a stacked bar chart, exactly as it is given.

    stackedBarDF = dataframe you wish to work with (one bar per row, one
                   stacked segment per column)
    name = title of graph
    x_name = the label that you'd like on the x axis

    Returns the legend object that is placed to the right of the plot.
    Note that this resets matplotlib's rcParams to their defaults before
    applying the font size, dpi and font family used for these charts.
    """

    mpl.rcParams.update(mpl.rcParamsDefault)

    # one viridis colour per column, spread evenly over the colour map
    n = len(stackedBarDF.columns)
    colors = plt.cm.viridis(np.linspace(0, 1, n))

    plt.rcParams.update({'font.size': 33})
    # NOTE: this runs before the chart below is created, so it acts on
    # whichever figure is current at that moment.
    plt.tight_layout()
    plt.rcParams['figure.dpi']=600
    plt.rcParams['font.family'] = "Hind"

    # `x_name` is the axis label parameter; the old code referenced an
    # undefined `x_data` here, which raised a NameError on every call.
    axes = stackedBarDF.plot(
        kind="bar",
        stacked=True,
        color=colors,
        figsize=(25,7),
        xlabel=x_name,
        ylabel="relative pathway abundance",
        title=name,
        xticks=([]),
    )

    return axes.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    
def sortedStackedBar(stackedBarDF, sortBy, auto_name=False, x_name="samples"):
    """
    Sort the dataframe by one column, then draw it with `plotStackedBar`.

    stackedBarDF = dataframe you wish to work with
    sortBy = what column you wish to sort the data by
    auto_name = when this is False (the default) the title is
                "Sorted by abundance of " + sortBy; when it is anything
                truthy the graph gets an empty title instead
    x_name = the label that you'd like on the axis

    Returns whatever `plotStackedBar` returns.
    """

    if auto_name:
        name = ""
    else:
        name = "Sorted by abundance of " + sortBy

    ordered = stackedBarDF.sort_values(by=sortBy)

    return plotStackedBar(ordered, name, x_name)

### defining stuff for scatter plots

`plotScatter` simply plots a basic scatter graph for the x variable vs y variable.

`plotScatterContinuous` plots a scatter graph for the x variable against the y variable, but using some **continuous** variable to create variations in the hue of the dots.

`plotScatterDiscrete` plots a scatter graph for the x variable against the y variable, but using some **discrete** variable to create variations in the hue of the dots.

In [21]:
def plotScatter(df, x_data, y_data):
    """
    Draw and show a basic scatter plot of one column against another.

    df = the dataframe you wish to work with
    x_data = the name of the column you want on the x axis
    y_data = the name of the column you want on the y axis

    Nothing is returned; the plot is shown with plt.show().
    """

    plt.rcParams['font.size'] = 23
    plt.tight_layout()
    plt.rcParams.update({'figure.dpi': 600})
    sns.set_style("darkgrid")

    # this title can be modified to fit whatever you need
    name = "".join([
        "Proportions of ",
        x_data,
        " producers versus\n",
        y_data,
        " producers for 1203 healthy adult samples",
    ])

    x_label = "relative abundance of " + x_data
    y_label = "relative abundance of " + y_data

    df.plot(
        kind="scatter",
        x=x_data,
        y=y_data,
        figsize=(15,6),
        colormap="viridis",
        xlabel=x_label,
        ylabel=y_label,
        title=name,
    )

    plt.show()

def plotScatterContinuous(df, x_data, y_data, c_data):
    """
    Draw a scatter plot whose dots are coloured by a continuous column.

    df = the dataframe you wish to work with
    x_data = the name of the column you want on the x axis
    y_data = the name of the column you want on the y axis
    c_data = the name of the column you want to use to assign colors of the dots by, has to be continuous (age?)

    Rows with no value in the c_data column are left out. Nothing is returned
    and plt.show() is not called here.
    """

    # only rows that have a value to colour by can be drawn
    coloured_rows = df.dropna(subset=[c_data], axis=0)

    plt.rcParams['font.size'] = 20
    plt.tight_layout()
    plt.rcParams.update({'figure.dpi': 600})
    sns.set_style("darkgrid")

    name = "".join([x_data, " vs ", y_data, " with respect to ", c_data])

    coloured_rows.plot(
        kind="scatter",
        x=x_data,
        y=y_data,
        c=c_data,
        colormap="viridis",
        alpha=0.7,
        figsize=(15,5),
        xlabel="relative abundance of " + x_data,
        ylabel="relative abundance of " + y_data,
        title=name,
    )

def plotScatterDiscrete(df, x_data, y_data, c_data):
    """
    Draw and show a scatter plot whose dots are coloured by a discrete column.

    df = the dataframe you wish to work with
    x_data = the name of the column you want on the x axis
    y_data = the name of the column you want on the y axis
    c_data = the name of the column you want to use to assign colors of the dots by, has to be discrete (feeding practices?)

    Nothing is returned; the plot is shown with plt.show().
    """

    name = "".join([x_data, " vs ", y_data, " with respect to ", c_data])

    sns.reset_defaults()
    sns.set_style("darkgrid")
    plt.rcParams['font.size'] = 20

    plt.rcParams.update({'figure.dpi': 600})
    plt.figure(figsize=(14, 5))

    # NOTE(review): `wesanderson` is not imported in the cells visible here -
    # make sure it is imported somewhere before this function is called.
    sns.set_palette(wesanderson.film_palette("darjeeling"))

    axes = sns.scatterplot(data=df, x=x_data, y=y_data, hue=c_data, alpha=0.5)
    axes.set(
        xlabel="relative abundance of " + x_data,
        ylabel="relative abundance of " + y_data,
        title=name,
    )
    plt.show()

### defining stuff for line/scatter plots

`plotScatterLine` plots a scatter plot with a regression line. It's currently been set to plot a lowess curve, but this can be easily changed as needed. For more details on how to change that, this may be a helpful resource: https://seaborn.pydata.org/generated/seaborn.regplot.html

`plotStrip` plots a strip plot. For each value in the x_data, it splits it into different categories based on the c_data. This difference is shown by a change in color. For more details on how to customise this, this may be a helpful resource: https://seaborn.pydata.org/generated/seaborn.stripplot.html

In [22]:
def plotScatterLine(df, x_data, y_data, name, y_label="Relative Abundance of BTR"):
    """
    Draw a scatter plot with a lowess curve through it.

    df = the dataframe that you want to work with
    x_data = the name of the column you want on the x axis
    y_data = the name of the column you want on the y axis
    name = the title of the graph
    y_label = the label for the y axis. This used to be fixed to
              "Relative Abundance of BTR" no matter which column was plotted;
              that text is still the default, so existing calls look the same.

    Returns the result of setting the title and y label on the plot's axes.
    """

    sns.reset_defaults()
    sns.set_style("darkgrid")
    plt.rcParams.update({'font.size': 53})
    plt.rcParams['font.family'] = "Hind"
    plt.rcParams['figure.dpi']=600
    sns.set_palette("viridis")
    plt.figure(figsize=(31, 10))

    # a little horizontal jitter and see-through dots keep overlapping samples visible
    axes = sns.regplot(
        data=df,
        x=x_data,
        y=y_data,
        x_jitter=.05,
        lowess=True,
        scatter_kws={"s": 500, 'alpha': 0.3},
    )

    return axes.set(title=name, ylabel=y_label)

def plotStrip(df, x_data, y_data, c_data, order_list, name):
    """
    Draw a strip plot, with each x category split into differently coloured
    groups.

    df = the dataframe that you want to work with
    x_data = the name of the column you want on the x axis
    y_data = the name of the column you want on the y axis
    c_data = the name of the column you want to use to seperate each category by/change the colors by (prop vs but?)
    order_list = the order in which you want the items in the x axis on
    name = title of the graph

    Nothing is returned and plt.show() is not called here.
    """

    sns.reset_defaults()
    sns.set_style("darkgrid")
    plt.rcParams.update({'font.size': 53})
    plt.rcParams['font.family'] = "Hind"
    plt.rcParams['figure.dpi']=600
    sns.set_palette("viridis")
    plt.figure(figsize=(31, 10))

    # the index becomes a regular column before the data is handed to seaborn
    temp = df.reset_index()

    axes = sns.stripplot(data=temp, x=x_data, y=y_data, hue=c_data, order=order_list, dodge=True, alpha=0.4, s=15)

    # `name` was accepted but never used before, so the graph had no title
    axes.set(title=name)

### defining stuff for violin plots

`violinDF` converts a data frame into the data format required for using the seaborn violin plot function. The key thing is that the data gets processed into long form data. 

`plotViolin` plots a split violin plot using the given dataframe. Useful reading: https://seaborn.pydata.org/generated/seaborn.violinplot.html
- `cut = 0` means that the density is not extended past the range where the data exists
- `inner="stick"` means that each individual observation is drawn as a line inside the plot (use `inner="quartile"` if you want the quartiles instead)
- `bw` is the bandwidth. tldr: higher bandwidth = more smoothing, lower bandwidth = less smoothing

In [23]:
def violinDF(df, category):
    """
    df = the dataframe that you wish to plot
    category = name of the column containing the metadata tags that you want to bin your data for
    for example, for me, category might have been "feeding practices"
    """

    temp = pd.melt(df.reset_index(), id_vars="Unnamed: 0.1", value_vars=["butyrate", "propionate"])

    temp = temp.set_index("Unnamed: 0.1").rename(columns={"value": "Relative Abundance"})
    
    temp1 = df[[category]]

    return temp.merge(temp1, how="left", left_index=True, right_index=True)

# split_option decides whether the two hue groups share one (split) violin
def plotViolin(df, x_data, y_data, c_data, order_list, split_option, name):
    """
    Draw a violin plot of y_data for each x_data category, coloured by c_data.

    df = the dataframe that you wish to plot
    x_data = the name of the column that you wish to use for the x axis
    y_data = the name of the column that you wish to use for the y axis
    c_data = the name of the column that you wish to split the data by (in my case, it was propionate vs butyrate)
    order_list = the order in which you want the items on the x axis
    split_option = are we plotting a split violin plot or not? input True or False
    name = title of the graph

    Nothing is returned and plt.show() is not called here.
    """

    sns.reset_defaults()
    sns.set_style("darkgrid")
    plt.rcParams['font.size'] = 53
    plt.rcParams.update({'font.family': "Hind"})
    plt.rcParams.update({'figure.dpi': 600})
    plt.figure(figsize=(31, 10))
    sns.set_palette("viridis_r")

    axes = sns.violinplot(
        data=df,
        x=x_data,
        y=y_data,
        hue=c_data,
        order=order_list,
        cut=0,
        inner="stick",
        split=split_option,
        bw=0.2,
    )
    axes.set(title=name)

In [13]:
# print the docstring of plotViolin as a quick reminder of its parameters
help(plotViolin)

Help on function plotViolin in module __main__:

plotViolin(df, x_data, y_data, c_data, order_list, split_option, name)
    df = the dataframe that you wish to plot
    x_data = the name of the column that you wish to use for the x axis
    y_data = the name of the column that you wish to use for the y axis
    c_data = the name of the column that you wish to split the data by (in my case, it was propionate vs butyrate)
    split_option = are we plotting a split violin plot or not? input True or False
    name = title of the graph



In [None]:
# NOTE(review): `normalised_unknown_overall_pathway_group` is not created in
# the cells visible here - it has to exist before this cell is run.
sort_acetylCoA_but = sortedStackedBar(
    stackedBarDF=normalised_unknown_overall_pathway_group,
    sortBy="Ace (but)",
)

### making normalised percentage bar charts?

In [None]:
# NOTE(review): no function called `percentageNormalise` is defined in the
# cells visible here (only `percentageNormaliseOne` / `percentageNormaliseTwo`),
# and `normalised_unknown_overall_pathway_group` is not created here either -
# confirm which normalisation function and which table were intended.
overall_pathway_normalised_group_percent = percentageNormalise(normalised_unknown_overall_pathway_group)

# unsorted_normalised_final_graph_percent = plotStackedBar(stackedBarDF(overall_pathway_normalised_group_percent), "unsorted, noramlised, overall pathway group")

In [None]:
# `sortedStackedBar` has no `percent` parameter, so passing percent=True made
# this call fail with a TypeError; the argument has been dropped.
# NOTE(review): this sorts by a "butyrate_percent" column - confirm that the
# table really has a column with that name.
sort_but_normalised = sortedStackedBar(
    stackedBarDF=adult_health_prop_but_normalised,
    sortBy="butyrate_percent",
    auto_name="title",
    x_name="1973 samples across the Asnicar, Nayfach, and HMP 2019 studies",
)

plt.show()

In [None]:
# keep only the two pathway columns and give them their short names
temp = adult_health_prop_but_normalised[["butyrate", "propionate"]].rename(
    columns={"butyrate": "BTR", "propionate": "PROP"}
)

# the normalisation expects pathways as rows, so transpose on the way in ...
adult_health_prop_but_percent = percentageNormaliseOne(temp.transpose())

# ... and back again (samples as rows) for plotting
adult_healthy_sort_but_normalised = sortedStackedBar(
    adult_health_prop_but_percent.transpose(),
    sortBy="BTR",
    auto_name="title",
    x_name="1506 healthy adult samples",
)

plt.show()

In [None]:
# NOTE(review): `sorted_adult_healthy_prop_but_percent` is not created in the
# cells visible here - it has to exist before this cell is run.
sorted_adult_healthy_prop_but_percent_graph = sortedStackedBar(
    sorted_adult_healthy_prop_but_percent.transpose(),
    sortBy="BTR",
    auto_name="title",
    x_name="1098 healthy adult samples",
)

In [None]:
# samples as rows, with the Firmicutes column placed before Bacteroidetes
temp = sorted_adult_healthy_bacteroidetes_vs_firmicutes_percent.transpose()
temp = temp.reindex(columns=["Firmicutes", "Bacteroidetes"])

sorted_adult_healthy_bacteroidetes_vs_firmicutes_percent_graph = plotStackedBar(
    temp,
    name="Bacteroidetes Vs. Firmicutes, sorted by proportions of BTR",
    x_name="1098 healthy adult samples",
)
plt.show()

In [None]:
# stacked bars ordered by the abundance of the "acetylCoA_buk" column
sorted_acetylCoA_buk_normalised_percent = sortedStackedBar(
    stackedBarDF=overall_pathway_normalised_group_percent,
    sortBy="acetylCoA_buk",
)

In [None]:
# `sortedStackedBar` does not accept a `title` argument (the old call raised a
# TypeError), so the table is sorted here and handed straight to
# `plotStackedBar`, which does take a custom title.
sorted_acetylCoA_but_normalised_percent = plotStackedBar(
    overall_pathway_normalised_group_percent.sort_values(by="Ace (but)"),
    "Pathway Variations in Healthy Individuals",
    "1973 samples across the Asnicar, Nayfach, and HMP 2019 studies",
)

In [None]:
# NOTE(review): `pathway_length_dict` is not created in the cells visible
# here; its keys are used as the list of pathway columns to keep.
pathway_columns = list(pathway_length_dict.keys())
temp = adult_healthy_overall_group_normalised[pathway_columns]

# pathways as rows for the normalisation, samples as rows again for plotting
adult_healthy_overall_group_percent = percentageNormaliseOne(temp.transpose())

adult_healthy_sort_acetylCoA_normalised = sortedStackedBar(
    adult_healthy_overall_group_percent.transpose(),
    sortBy="Ace (but)",
    auto_name="title",
    x_name="1506 healthy adult samples",
)

plt.show()

In [None]:
# stacked bars ordered by the abundance of the "propanediol pathway" column
sorted_propanediol_normalised_percent = sortedStackedBar(
    stackedBarDF=overall_pathway_normalised_group_percent,
    sortBy="propanediol pathway",
)

In [None]:
# phyla are the rows here, so each sample (column) is scaled to sum to 1
phylum_percentage_df = percentageNormaliseOne(bacteroidetes_vs_firmicutes)

# stackedBarDF flips the table so that every sample becomes one bar
phylum_bars = stackedBarDF(phylum_percentage_df)

phylum_distribution_unsorted = plotStackedBar(
    phylum_bars,
    name="unsorted, noramlised, bacteroidetes vs firmicutes",
)

plt.show()

In [None]:
# Two fixes compared with the old call:
#  - the keyword is `auto_name`, not `autoname` (the old spelling raised a TypeError)
#  - `phylum_percentage_df` has the phyla as rows, so it is transposed first;
#    sorting by "Firmicutes" needs that to be a column
sorted_bacteroidetes_normalised_percent = sortedStackedBar(
    phylum_percentage_df.transpose(),
    "Firmicutes",
    auto_name=True,
    x_name="3584 samples",
)
plt.show()

In [None]:
# scale each sample to sum to 1, then put the samples back on the rows
adult_healthy_phylum_percentage_df = percentageNormaliseOne(adult_healthy_bacteroidetes_vs_firmicutes).transpose()

# `sortedStackedBar` has no `percent` parameter, so passing percent=False made
# this call fail with a TypeError; the argument has been dropped.
adult_healthy_sorted_bacteroidetes_normalised_percent = sortedStackedBar(
    adult_healthy_phylum_percentage_df,
    "Firmicutes",
    x_name="1506 healthy adult samples",
)
plt.show()

### scatter plots

In [None]:
# butyrate against propionate for the adult table
pathway_columns = ["butyrate", "propionate"]
temp = adult_health_prop_but_normalised[pathway_columns]

healthy_adult_prop_vs_but_graph = plotScatter(df=temp, x_data="butyrate", y_data="propionate")
plt.show()

In [None]:
# NOTE(review): `prop_but_groupby` is not created in the cells visible here.
# It is transposed so that butyrate / propionate become columns.
prop_vs_but = prop_but_groupby.transpose()

prop_vs_but_graph = plotScatter(df=prop_vs_but, x_data="butyrate", y_data="propionate")
plt.show()

In [None]:
# butyrate against propionate, with the dots coloured by age
age_graph = plotScatterContinuous(
    df=with_metadata_prop_but_normalised,
    x_data="butyrate",
    y_data="propionate",
    c_data="age",
)
plt.show()

In [None]:
# `plotScatterContinuous` needs the column to colour by as its fourth
# argument; it was missing (TypeError). "age" matches the name of this graph.
# NOTE(review): `with_metadata_prop_but_percent` is not created in the cells
# visible here (the reloaded table is `with_metadata_prop_but_percent_change`)
# - confirm which table is intended.
age_graph_percent = plotScatterContinuous(with_metadata_prop_but_percent, "butyrate", "propionate", "age")

In [None]:
# same graph as before: butyrate against propionate, coloured by age
age_graph = plotScatterContinuous(
    df=with_metadata_prop_but_normalised,
    x_data="butyrate",
    y_data="propionate",
    c_data="age",
)
plt.show()

In [None]:
# `bacteroidetes_vs_firmicutes` only holds the two phylum rows (samples are its
# columns), so it has no "butyrate", "propionate" or "age" columns to plot.
# The phylum table itself has one row per sample with the two phylum columns
# and "age", so that is what gets plotted here.
# NOTE(review): confirm that Bacteroidetes vs Firmicutes coloured by age is
# the graph that was intended.
age_graph_phyla = plotScatterContinuous(with_phylum, "Bacteroidetes", "Firmicutes", "age")
plt.show()

In [None]:
# butyrate against propionate, with the dots coloured by feeding practice
# NOTE(review): other cells use a column called "feeding_practice" (with an
# underscore) - confirm that "feeding practice" exists in this table.
feeding_practice_graph = plotScatterDiscrete(with_metadata_prop_but_normalised, "butyrate", "propionate", "feeding practice")

## line graphs

In [None]:
name = "Relative Abundance of BTR Pathways\nVs. Infant Age (days)"

# scatter of butyrate against infant age with a lowess curve through it
g = plotScatterLine(
    df=with_metadata_prop_but_percent_change,
    x_data="Infant Age (days)",
    y_data="butyrate",
    name=name,
)

# rows that have both values; only the (disabled) annotation below uses this
age_and_butyrate = ["Infant Age (days)", "butyrate"]
df = with_metadata_prop_but_percent_change[age_and_butyrate].dropna()

# disabled: annotate the plot with the Pearson r and p value
# def annotate(data, **kws):
#     r, p = sp.stats.pearsonr(df['Infant Age (days)'], df['butyrate'])
#     ax = plt.gca()
#     ax.text(.05, .8, 'r={:.2f}, p={:.2g}'.format(r, p),
#             transform=ax.transAxes)

# g.map_dataframe(annotate)

plt.show()

In [None]:
# long-form table: one row per (sample, pathway) with the sample's age category
age_violin_df = violinDF(with_metadata_prop_but_percent_change, "age_category")

age_violin_df = age_violin_df.rename(columns={"age_category": "Age", "variable": "Pathway"})

# display labels for the x axis (the "\n" puts the age range on its own line)
adjusted_names_dict = {
    "Newborn": "Newborn\n≤ 1 month",
    "Infant": "Infant\n(1 month to 1 year)",
    "Child": "Child\n(1 to 12 years)",
    "Adolescent": "Adolescent\n(13 to 17 years)",
    "Adult": "Adult\n≥ 18 years",
    "Older Adult": "Older Adult\n≥ 65",
    np.nan: np.nan
}

adjusted_pathways_dict = {
    "butyrate": "BTR",
    "propionate": "PROP"
}

# Missing age categories are passed through as NaN explicitly. Looking NaN up
# in the dict only works when the value is the very same object as np.nan
# (NaN never compares equal to itself), so the old plain lookup could raise a
# KeyError for missing values. Unknown, non-missing labels still raise.
adjusted_names = [
    adjusted_names_dict[curr] if pd.notna(curr) else np.nan
    for curr in age_violin_df.Age
]

adjusted_pathways = [adjusted_pathways_dict[curr] for curr in age_violin_df.Pathway]


age_violin_df["Age"] = adjusted_names

age_violin_df["Pathway"] = adjusted_pathways

In [None]:
title = "Relative Abundance of BTR and PROP Pathways Vs. Age"

# split violins (BTR vs PROP) for the age groups listed in `order_list`
plotViolin(
    df=age_violin_df,
    x_data="Age",
    y_data="Relative Abundance",
    c_data="Pathway",
    order_list=[
        "Newborn\n≤ 1 month",
        "Infant\n(1 month to 1 year)",
        "Adult\n≥ 18 years",
        "Older Adult\n≥ 65",
    ],
    split_option=True,
    name=title,
)
plt.show()

In [None]:
title = "Relative Abundance of BTR and PROP Pathways Vs. Age"

# strip plot of the same data, for the age groups listed in `order_list`
plotStrip(
    df=age_violin_df,
    x_data="Age",
    y_data="Relative Abundance",
    c_data="Pathway",
    order_list=[
        "Newborn\n≤ 1 month",
        "Infant\n(1 month to 1 year)",
        "Adult\n≥ 18 years",
        "Older Adult\n≥ 65",
    ],
    name=title,
)
plt.show()

#### data preprocessing before I plot the graphs for feeding practices

In [None]:
# long-form table: one row per (sample, pathway) with the sample's feeding practice
feeding_practice_df = violinDF(with_metadata_prop_but_percent_change, "feeding_practice")

# display labels for the x axis (the "\n" breaks the label over several lines)
feeding_practice_names = {
    "exclusively_breastfeeding": "Exclusively\nBreastfeeding",
    "mixed_feeding": "Mixed\nFeeding",
    "any_breastfeeding": "Any\nBreastfeeding",
    "no_breastfeeding": "No\nBreastfeeding",
    "exclusively_formula_feeding": "Exclusively\nFormula\nFeeding",
    np.nan: np.nan
}

# Missing feeding practices are passed through as NaN explicitly. Looking NaN
# up in the dict only works when the value is the very same object as np.nan
# (NaN never compares equal to itself), so the old plain lookup could raise a
# KeyError for missing values. Unknown, non-missing labels still raise.
adjusted_names = [
    feeding_practice_names[curr] if pd.notna(curr) else np.nan
    for curr in feeding_practice_df.feeding_practice
]

feeding_practice_df["feeding_practice"] = adjusted_names

adjusted_pathways_dict = {
    "butyrate": "BTR",
    "propionate": "PROP"
}

adjusted_pathways = [adjusted_pathways_dict[curr] for curr in feeding_practice_df.variable]

feeding_practice_df["variable"] = adjusted_pathways


feeding_practice_df = feeding_practice_df.rename(columns={"feeding_practice": "Feeding Practice"}).rename(columns={"variable": "Pathway"})

In [None]:
title = "Relative Abundance of BTR and PROP Pathways Vs. Feeding Practices"

# split violins (BTR vs PROP) for each feeding practice, in the order given
plotViolin(
    df=feeding_practice_df,
    x_data="Feeding Practice",
    y_data="Relative Abundance",
    c_data="Pathway",
    order_list=[
        "Exclusively\nBreastfeeding",
        "Mixed\nFeeding",
        "Any\nBreastfeeding",
        "No\nBreastfeeding",
        "Exclusively\nFormula\nFeeding",
    ],
    split_option=True,
    name=title,
)

plt.show()

In [None]:
title = "Relative Abundance of BTR and PROP Pathways Vs. Feeding Practices"

# strip plot of the same data, for each feeding practice in the order given
plotStrip(
    df=feeding_practice_df,
    x_data="Feeding Practice",
    y_data="Relative Abundance",
    c_data="Pathway",
    order_list=[
        "Exclusively\nBreastfeeding",
        "Mixed\nFeeding",
        "Any\nBreastfeeding",
        "No\nBreastfeeding",
        "Exclusively\nFormula\nFeeding",
    ],
    name=title,
)

plt.show()