In [None]:
# Import the libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans, k_means
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from plotly import graph_objects
from plotly import express
from plotly import figure_factory

In [None]:
# Load the workbook; rows 0-2 of the sheet are read together as a three-level column header.
podo_data = pd.read_excel(
    "C4C3-Podo-NewHeader.xlsx",
    header=[0, 1, 2],
)
podo_data.head()

In [None]:
# Take an independent (deep) copy of the loaded frame to work on, and report its size.
podo_df = podo_data.copy(deep=True)
print("Podo dataset:", podo_df.shape)

In [None]:
# Summarise every column: its three header levels, its missing-value count and its dtype.
# The NaN count is taken from the column itself (`podo_df[col]`). The previous form,
# `podo_df[col[0]][col[1]].isnull().sum()[0]`, always reported the first column found
# under the (level 0, level 1) pair and relied on positional `Series[0]` access on a
# label-indexed Series, which pandas has deprecated.
for col in podo_df.columns:
    print(col[0], "/-/", col[1], "/-/", col[2], " - & Number of NaN:", podo_df[col].isnull().sum(), "- & dtype:", podo_df[col].dtypes)
    print("----------------")

In [None]:
# Row filters, applied one after another. Each column is addressed by its full
# (level 0, level 1, level 2) header key.

# 1) keep rows that have a protein name
podo_df = podo_df.loc[podo_df[("Accession", "Protein names", "Total_Inf_Accession_Protein.names")].notna()]

# 2) keep rows that have phospho site information
podo_df = podo_df.loc[podo_df[("Phospho ID Metrics", "Phospho Sites", "Phos_Inf_Phospho.ID.Metrics_Phosphosites")].notna()]

# 3) keep rows flagged True in the "Phosphopeptide Present" column
podo_df = podo_df.loc[podo_df[("No Header", "Phosphopeptide Present", "Both_Phosphopeptide.Present")].eq(True)]

# 4) keep rows flagged True in the "Human" database column
podo_df = podo_df.loc[podo_df[("Database", "Human", "Total_Database_Human")].eq(True)]

podo_df.shape

In [None]:
# Repeat the per-column summary (header levels, missing-value count, dtype) on the current frame.
# The NaN count is taken from the column itself (`podo_df[col]`). The previous form,
# `podo_df[col[0]][col[1]].isnull().sum()[0]`, always reported the first column found
# under the (level 0, level 1) pair and relied on positional `Series[0]` access on a
# label-indexed Series, which pandas has deprecated.
for col in podo_df.columns:
    print(col[0], "/-/", col[1], "/-/", col[2], " - & Number of NaN:", podo_df[col].isnull().sum(), "- & dtype:", podo_df[col].dtypes)
    print("----------------")

## Volcano Plot

In [None]:
def volcano_dataset_prep(index_col, t_test, log_fc, fdr_col, heatmap_data = None, index_name = "Total_Inf_Accession_Protein.names"):
    """
    Collects the columns required for making a volcano plot into one dataframe.

    The inputs are joined side by side (aligned on their row index), the column
    called `index_name` becomes the index, and every row that still contains a
    missing value is dropped.

    Parameters
    ----------
    index_col: Pandas series (or single-column dataframe) holding index values; its column must be named `index_name`
    t_test: Pandas series holding t test values
    log_fc: Pandas series holding log fc values
    fdr_col: Pandas series holding fdr values
    heatmap_data: Optional, pandas dataframe that can be used to create heatmap later
    index_name: Optional, name of the column to use as the index (defaults to "Total_Inf_Accession_Protein.names")

    Returns
    ----------
    A new pandas dataframe for volcano plot, indexed by `index_name` and free of NaN rows

    Raises
    ----------
    KeyError if no column named `index_name` exists after joining the inputs
    """
    # Leave out inputs that were not supplied (heatmap_data defaults to None).
    parts = [part for part in (index_col, t_test, log_fc, fdr_col, heatmap_data) if part is not None]
    dataset = pd.concat(parts, axis = 1)
    dataset = dataset.set_index(index_name)
    dataset = dataset.dropna()
    return dataset

def x_y_axes(data):
    """
    Extracts the x values, y values and hover labels for a volcano plot

    Parameters
    ----------
    data: Pandas dataframe prepared by volcano_dataset_prep. The first column whose
          name contains "LogFC" and the first column whose name contains "Test" are used

    Returns
    ----------
    x_values: Numpy array, values of the LogFC column, to be the values on x axis
    y_values: Numpy array, negative log 10 of the t test column values, to be the values on the y axis
    hover: List, the index labels of data (the protein names), used to identify the points on the plot
    """
    # Pick the first matching column for each axis (IndexError if there is none).
    logfc_name = [name for name in data.columns if "LogFC" in name][0]
    ttest_name = [name for name in data.columns if "Test" in name][0]

    fold_changes = data[logfc_name].values
    neg_log_p = -np.log10(data[ttest_name].values)
    labels = [label for label in data.index]
    return fold_changes, neg_log_p, labels

def volcano_plot(x_val, y_val, hover_list, title, fc_cutoff = 1, p_cutoff = 0.05):
    """
    Draws an interactive volcano plot with fold-change and p-value cut-off lines

    Parameters
    ----------
    x_val: Sequence of LogFC values, to be the values on x axis
    y_val: Sequence of negative log 10 t test values, to be the values on the y axis
    hover_list: List, containing the names of the proteins, shown as hover text for each point
    title: String, used for the title of the plot
    fc_cutoff: Optional, vertical red lines are drawn at -fc_cutoff and +fc_cutoff (default 1)
    p_cutoff: Optional, p value whose negative log 10 is drawn as the horizontal red line (default 0.05)

    Returns
    ----------
    None. The figure is displayed with fig.show()

    Raises
    ----------
    ValueError if x_val or y_val is empty (min/max of an empty sequence)
    """
    x_minimum = min(x_val)
    x_maximum = max(x_val)
    y_minimum = min(y_val)
    y_maximum = max(y_val)
    # The y axis is on a -log10 scale, so the p-value cut-off is transformed the same way.
    p_val_cut = np.log10(p_cutoff) * -1

    fig = graph_objects.Figure()
    fig.add_trace(graph_objects.Scatter(x = x_val, y = y_val, mode = 'markers', hovertext = hover_list))

    # Vertical fold-change cut-offs, extended slightly beyond the data range.
    for x_cut in (-fc_cutoff, fc_cutoff):
        fig.add_shape(type = 'line', x0 = x_cut, y0 = y_minimum - 0.2, x1 = x_cut, y1 = y_maximum + 0.2, line = dict(color = 'Red',), xref = 'x', yref = 'y')

    # Horizontal p-value cut-off, extended slightly beyond the data range.
    fig.add_shape(type = 'line', x0 = x_minimum - 0.1, y0 = p_val_cut, x1 = x_maximum + 0.1, y1 = p_val_cut, line = dict(color = 'Red',), xref = 'x', yref = 'y')

    fig.update_layout(title = title, xaxis_title = "Log2 Fold Change", yaxis_title = "P-Value (-Log10)")
    fig.show()

In [None]:
# Separate the parts required for calculations

# NOTE(review): this cell originally read from `ge_df`, which no earlier cell defines,
# so it raised NameError on Restart & Run All. The filtered `podo_df` is used instead;
# it is already indexed with the same "No Header" and "Accession" header keys in the
# cells above. Confirm this is the intended source frame.
# NOTE(review): the key "Phospho Adjusted Log2 Normalised Abundanxe" is kept verbatim;
# verify the spelling against the spreadsheet header.
first_ge_df = podo_df[["No Header", "Accession" ,"Stats - C3a/C", "Stats - C4a/C", "Phospho Adjusted Log2 Normalised Abundanxe"]]
first_ge_df = first_ge_df[first_ge_df["No Header"]["Phosphopeptide Present"]["Both_Phosphopeptide.Present"] == True]

In [None]:
# Discard every row that has a missing value in any of the selected columns
first_ge = first_ge_df.dropna()

# Protein names, with the row labels reset to 0..n-1 so they line up with the stats below
index = first_ge["Accession"]["Protein names"].reset_index(drop = True)

# C3a/C statistics: non-numeric entries become NaN, values are cast to float,
# and the row labels are reset to 0..n-1
c3_logfc = (
    first_ge["Stats - C3a/C"]["Adjusted Phosphopeptide Stats LogFC"]
    .apply(pd.to_numeric, errors = "coerce")
    .astype(float)
    .reset_index(drop = True)
)
c3_ttest = (
    first_ge["Stats - C3a/C"]["Adjusted Phosphopeptide Stats T Test"]
    .apply(pd.to_numeric, errors = "coerce")
    .astype(float)
    .reset_index(drop = True)
)
c3_fdr = (
    first_ge["Stats - C3a/C"]["Adjusted Phosphopeptide Stats FDR"]
    .apply(pd.to_numeric, errors = "coerce")
    .astype(float)
    .reset_index(drop = True)
)

In [None]:
# Assemble the C3a/C volcano table, derive the plot inputs from it and draw the plot
c3_dataset = volcano_dataset_prep(index_col = index, t_test = c3_ttest, log_fc = c3_logfc, fdr_col = c3_fdr)
c3_dataset = c3_dataset.dropna()
x, y, hover = x_y_axes(data = c3_dataset)
volcano_plot(x_val = x, y_val = y, hover_list = hover, title = "C3/C Adjusted Phospho Abundance")

In [None]:
# Give the statistics columns short labels (in place) and name the index
c3_dataset.rename(
    columns = {
        "Phos_Stats.Adj_C3a.C_T.Test": "C3 T Test",
        "Phos_Stats.Adj_C3a.C_LogFC": "C3 LogFC",
        "Phos_Stats.Adj_C3a.C_FDR": "C3 FDR",
    },
    inplace = True,
)
c3_dataset.index.name = "Protein Names"

# "P Value" holds -log10 of the t test column; `p` is the 0.05 cut-off on the same scale
c3_dataset["P Value"] = -np.log10(c3_dataset["C3 T Test"])
p = -np.log10(0.05)

# Rows whose LogFC is above 1 or below -1 and whose "P Value" is at or above the cut-off
volcano_list = c3_dataset[(c3_dataset["C3 LogFC"].abs() > 1) & (c3_dataset["P Value"] >= p)]
volcano_list

In [None]:
# Build the volcano table again, this time joined with the heatmap columns.
# NOTE(review): `heatmap_part` is not defined in any of the cells above — confirm where it is created.
heatmap_c3_dataset = volcano_dataset_prep(index, c3_ttest, c3_logfc, c3_fdr, heatmap_data = heatmap_part)
heatmap_c3_dataset = heatmap_c3_dataset.rename(
    columns = {
        "Phos_Stats.Adj_C3a.C_T.Test": "C3 T Test",
        "Phos_Stats.Adj_C3a.C_LogFC": "C3 LogFC",
        "Phos_Stats.Adj_C3a.C_FDR": "C3 FDR",
    }
)

# Insert -log10 of the t test column as "P Value" at column position 3
p_val = -np.log10(heatmap_c3_dataset["C3 T Test"])
heatmap_c3_dataset.insert(loc = 3, column = "P Value", value = p_val)
heatmap_c3_dataset.index.name = "Protein Names"
heatmap_c3_dataset