In [None]:
# Import the libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans, k_means
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from plotly import graph_objects
from plotly import express
from plotly import figure_factory

In [None]:
# Load the workbook. The first three spreadsheet rows are read as a three-level
# column header, so each column is addressed by a (level 0, level 1, level 2) key.
podo_data = pd.read_excel("NewHeader.xlsx", header = [0, 1, 2])
# Preview the first rows of the loaded frame.
podo_data.head()

In [None]:
# Work on a copy so the frame loaded from the workbook stays untouched.
df = podo_data.copy()
# `df` is the only working copy defined in the notebook, so report its shape
# (rows, columns) right after the copy is made.
print("dataset:", df.shape)

In [None]:
# Per-column overview of the working frame: the three header levels,
# the number of missing values and the dtype.
for col in df.columns:
    # `col` is the full three-level key, so df[col] is exactly this column.
    # Counting NaN on it directly avoids reporting the count of whichever column
    # happens to come first inside the (level 0, level 1) group.
    print(col[0], "/-/", col[1], "/-/", col[2], " - & Number of NaN:", df[col].isnull().sum(), "- & dtype:", df[col].dtypes)
    print("----------------")

In [None]:
# Columns that must hold a value for a row to be kept.
required_cols = [
    ("Accession", "Protein names", "Total_Inf_Accession_Protein.names"),
    ("Phospho ID Metrics", "Phospho Sites", "Phos_Inf_Phospho.ID.Metrics_Phosphosites"),
]
# Flag columns that must be True for a row to be kept.
flag_cols = [
    ("No Header", "Phosphopeptide Present", "Both_Phosphopeptide.Present"),
    ("Database", "Human", "Total_Database_Human"),
]

# Apply the filters one after another, in the order listed above.
for key in required_cols:
    df = df[df[key].notna()]
for key in flag_cols:
    df = df[df[key] == True]

df.shape

In [None]:
# Per-column overview of the filtered frame: the three header levels,
# the number of missing values and the dtype.
for col in df.columns:
    # `col` is the full three-level key, so df[col] is exactly this column.
    # Counting NaN on it directly avoids reporting the count of whichever column
    # happens to come first inside the (level 0, level 1) group.
    print(col[0], "/-/", col[1], "/-/", col[2], " - & Number of NaN:", df[col].isnull().sum(), "- & dtype:", df[col].dtypes)
    print("----------------")

## Volcano Plot

In [None]:
def volcano_dataset_prep(index_col, t_test, log_fc, fdr_col, heatmap_data = None):
    """
    Joins the columns required for a volcano plot into one table.

    Parameters
    ----------
    index_col: Pandas object holding the index values; it must provide a column
               named "Total_Inf_Accession_Protein.names"
    t_test: Pandas object holding t test values
    log_fc: Pandas object holding log fc values
    fdr_col: Pandas object holding fdr values
    heatmap_data: Optional, pandas dataframe whose columns are appended so the
                  result can also be used to create a heatmap later


    Returns
    ----------
    A new pandas dataframe indexed by "Total_Inf_Accession_Protein.names",
    with every row that contains a NaN removed
    """
    # A None entry (heatmap_data left at its default) is skipped by pd.concat.
    pieces = [index_col, t_test, log_fc, fdr_col, heatmap_data]
    combined = pd.concat(pieces, axis = 1)
    return combined.set_index("Total_Inf_Accession_Protein.names").dropna()

def x_y_axes(data):
    """
    Extracts the x values, y values and hover labels for a volcano plot

    Parameters
    ----------
    data: Pandas dataframe prepared by volcano_dataset_prep; it needs a column whose
          name contains "LogFC" and one whose name contains "Test"

    Returns
    ----------
    x_values: Array, values of the first column whose name contains "LogFC"
    y_values: Array, negative log 10 of the first column whose name contains "Test"
    hover: List, the index entries of data, used to label the points on the plot
    """
    fold_change_cols = [name for name in data.columns if "LogFC" in name]
    p_value_cols = [name for name in data.columns if "Test" in name]

    # Only the first match of each kind is used.
    x_values = data[fold_change_cols[0]].values
    y_values = -1 * np.log10(data[p_value_cols[0]].values)
    hover = [label for label in data.index]
    return x_values, y_values, hover

def volcano_plot(x_val, y_val, hover_list, title):
    """
    Draws an annotated volcano plot with fold-change and p-value guide lines

    Parameters
    ----------
    x_val: Sequence of log fold change values, plotted on the x axis
    y_val: Sequence of negative log 10 p values, plotted on the y axis
    hover_list: List of names shown when hovering over the points
    title: String, used for the title of the plot

    Returns
    ----------
    None, the figure is displayed with fig.show()

    """
    x_low, x_high = min(x_val), max(x_val)
    y_low, y_high = min(y_val), max(y_val)
    # p = 0.05 expressed on the -log10 scale of the y axis.
    p_val_cut = -1 * np.log10(0.05)

    fig = graph_objects.Figure()
    fig.add_trace(graph_objects.Scatter(x = x_val, y = y_val, mode = 'markers', hovertext = hover_list))

    # (x0, y0, x1, y1) of the red guide lines: vertical at x = -1 and x = 1,
    # horizontal at the p-value cut-off; each extends slightly past the data range.
    guide_lines = [
        (-1, y_low - 0.2, -1, y_high + 0.2),
        (1, y_low - 0.2, 1, y_high + 0.2),
        (x_low - 0.1, p_val_cut, x_high + 0.1, p_val_cut),
    ]
    for start_x, start_y, end_x, end_y in guide_lines:
        fig.add_shape(type = 'line', x0 = start_x, y0 = start_y, x1 = end_x, y1 = end_y,
                      line = dict(color = 'Red',), xref = 'x', yref = 'y')

    fig.update_layout(title = title, xaxis_title = "Log2 Fold Change", yaxis_title = "P-Value (-Log10)")
    fig.show()

In [None]:
# Separate the parts required for calculations

# NOTE(review): `ge_df` is not defined in any cell visible above this one, so this
# raises NameError on a fresh kernel — presumably the filtered `df` was meant; confirm.
# NOTE(review): "Abundanxe" looks like a typo, but the key has to match the workbook
# header exactly — check the file before changing it.
# Keep only the listed top-level column groups.
first_ge_df = ge_df[["No Header", "Accession" ,"Stats - C3a/C", "Stats - C4a/C", "Phospho Adjusted Log2 Normalised Abundanxe"]]
# Keep the rows whose "Both_Phosphopeptide.Present" flag is True.
first_ge_df = first_ge_df[first_ge_df["No Header"]["Phosphopeptide Present"]["Both_Phosphopeptide.Present"] == True]

In [None]:
# Drop every row that has a NaN in any of the selected columns.
first_ge = first_ge_df.dropna()


def _stat_as_float(stat_name):
    """Return the named block under "Stats" as floats with a fresh 0..n-1 row index.

    Values that cannot be parsed as numbers become NaN (errors = "coerce").
    """
    stat_block = first_ge["Stats"][stat_name]
    return stat_block.apply(pd.to_numeric, errors = "coerce").astype(float).reset_index(drop = True)


# Protein names, used later as the index of the volcano table.
index = first_ge["Accession"]["Protein names"].reset_index(drop = True)

# The cells that build the volcano and heatmap tables read these statistics under
# the c3_ prefix, so they are defined with that prefix here.
c3_logfc = _stat_as_float("Adjusted Phosphopeptide Stats LogFC")
c3_ttest = _stat_as_float("Adjusted Phosphopeptide Stats T Test")
c3_fdr = _stat_as_float("Adjusted Phosphopeptide Stats FDR")

# The un-prefixed names stay bound for any cell that still refers to them.
logfc, ttest, fdr = c3_logfc, c3_ttest, c3_fdr

In [None]:
# Build the volcano table (rows containing NaN removed), derive the axis values and
# hover labels from it, then draw the plot.
c3_dataset = volcano_dataset_prep(index, c3_ttest, c3_logfc, c3_fdr).dropna()
x, y, hover = x_y_axes(c3_dataset)
volcano_plot(x, y, hover, "C3/C Adjusted Phospho Abundance")

In [None]:
# Give the statistic columns short names and label the index.
# NOTE(review): rename() ignores keys that are absent, so if the T-test column is not
# named exactly "Phos_Stats.Adj_C3a.C_T.Test" the "T Test" lookup below raises KeyError.
short_names = {
    "Phos_Stats.Adj_C3a.C_T.Test": "T Test",
    "Phos_Stats.Adj_LogFC": "LogFC",
    "Phos_Stats.Adj_FDR": "FDR",
}
c3_dataset = c3_dataset.rename(columns = short_names).rename_axis("Protein Names")

# -log10 of the t-test values, and the same transform of the 0.05 cut-off.
c3_dataset["P Value"] = -1 * np.log10(c3_dataset["T Test"])
p = -1 * np.log10(0.05)

# Hits: LogFC above 1 or below -1, with a t-test value at or below 0.05.
fold_change = c3_dataset["LogFC"]
volcano_list = c3_dataset[((fold_change > 1) | (fold_change < -1)) & (c3_dataset["P Value"] >= p)]
volcano_list

In [None]:
# Same table as for the volcano plot, with the columns of `heatmap_part` appended.
# NOTE(review): `heatmap_part` is not defined in any cell visible above this one —
# confirm where it comes from; as written this raises NameError on a fresh kernel.
heatmap_c3_dataset = volcano_dataset_prep(index, c3_ttest, c3_logfc, c3_fdr, heatmap_part)
# Short names for the statistic columns; keys that are absent are ignored by rename().
heatmap_c3_dataset = heatmap_c3_dataset.rename(columns = {"Phos_Stats.Adj_T.Test": "T Test", 
                                                          "Phos_Stats.Adj_LogFC": "LogFC",
                                                         "Phos_Stats.Adj_FDR": "FDR"})
# -log10 of the t-test values, inserted as the fourth column (position 3).
p_val = np.log10(heatmap_c3_dataset["T Test"]) * -1
heatmap_c3_dataset.insert(loc = 3, column = "P Value", value = p_val)
heatmap_c3_dataset.index.rename("Protein Names", inplace = True)
heatmap_c3_dataset

In [None]:
# Creating dendrogram with heatmap
# The figure combines a top dendrogram, a side dendrogram and a heatmap of the
# pairwise distances between the six sample columns.

# Keep only the six sample columns.
dendogram = heatmap_c3_dataset[['sample1', 'sample2',
       'sample3', 'sample4',
       'sample5', 'sample6']]

# Transpose so that each row is one sample; the column names become the leaf labels.
dendo = dendogram.T
labels = dendogram.columns

# Top dendrogram; its traces are moved to the secondary y axis ('y2').
fig = figure_factory.create_dendrogram(dendo, orientation = 'bottom', labels = labels)
for i in range(len(fig['data'])):
    fig['data'][i]['yaxis'] = 'y2'
    
# Side dendrogram; its traces are moved to the secondary x axis ('x2').
dendro_side = figure_factory.create_dendrogram(dendo, orientation = 'right')
for i in range(len(dendro_side['data'])):
    dendro_side['data'][i]['xaxis'] = 'x2'


# Merge the side dendrogram traces into the main figure.
for data in dendro_side['data']:
    fig.add_trace(data)

# Leaf order of the side dendrogram, converted to integers so it can be used to
# reorder the distance matrix.
# NOTE(review): the int() conversion assumes the tick text is integer-like, which
# relies on no `labels` having been passed for the side dendrogram — confirm.
dendro_leaves = dendro_side['layout']['yaxis']['ticktext']
dendro_leaves = list(map(int, dendro_leaves))
# Pairwise distances between samples as a square matrix, with rows and columns
# reordered to follow the dendrogram leaves.
data_dist = pdist(dendo)
heat_data = squareform(data_dist)
heat_data = heat_data[dendro_leaves, :]
heat_data = heat_data[:, dendro_leaves]


heatmap = [
    graph_objects.Heatmap(
        x = dendro_leaves,
        y = dendro_leaves,
        z = heat_data,
        colorscale = 'Reds'
    )
]

# Replace the x / y given above with the tick positions of the two dendrograms so
# the heatmap cells line up with the leaves.
heatmap[0]['x'] = fig['layout']['xaxis']['tickvals']
heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']

# Add Heatmap Data to Figure
for data in heatmap:
    fig.add_trace(data)
    
# Overall figure size and interaction settings.
fig.update_layout({'width':800, 'height':800,
                         'showlegend':False, 'hovermode': 'closest',
                         })
# Edit xaxis
# Main x axis takes the right 85% of the width.
fig.update_layout(xaxis={'domain': [.15, 1],
                                  'mirror': False,
                                  'showgrid': False,
                                  'showline': False,
                                  'zeroline': False,
                                  'ticks':""})
# Edit xaxis2
# Secondary x axis (side dendrogram) takes the left 15% of the width.
fig.update_layout(xaxis2={'domain': [0, .15],
                                   'mirror': False,
                                   'showgrid': False,
                                   'showline': False,
                                   'zeroline': False,
                                   'showticklabels': False,
                                   'ticks':""})

# Edit yaxis
# Main y axis takes the bottom 85% of the height.
fig.update_layout(yaxis={'domain': [0, .85],
                                  'mirror': False,
                                  'showgrid': False,
                                  'showline': False,
                                  'zeroline': False,
                                  'showticklabels': False,
                                  'ticks': ""
                        })
# Edit yaxis2
# Secondary y axis (top dendrogram) sits in the band above the heatmap.
fig.update_layout(yaxis2 = {'domain':[.825, .975],
                                   'mirror': False,
                                   'showgrid': False,
                                   'showline': False,
                                   'zeroline': False,
                                   'showticklabels': False,
                                   'ticks':""})

fig.show()

In [None]:
# Correlation analysis between the sample columns

# Compute the correlation matrix once and reuse it for the mask and the plot.
sample_corr = dendogram.corr()
# Mask the upper triangle (diagonal included) so only the lower triangle is drawn.
mask = np.triu(np.ones_like(sample_corr, dtype = bool))

# Apply the seaborn settings before the figure is created: the figure size is read
# when a figure is created, so setting it afterwards would only affect later figures
# and this one would change size between the first run and a re-run of the cell.
sns.set(rc = {'figure.figsize':(5, 5)})
plt.figure(dpi = 100)
plt.title("Correlation Analysis", fontsize = 10)
sns.heatmap(sample_corr, mask = mask, cmap = "viridis", fmt = ".0%",
            annot = True, linewidths = 0.1, linecolor = "black", annot_kws = {"size": 10})
plt.xticks(rotation = 70, fontsize = 10)
plt.yticks(rotation = 0, fontsize = 10)
plt.show()

In [None]:
# Rows with LogFC above 1 or below -1 ...
is_large_change = (heatmap_c3_dataset["LogFC"] > 1) | (heatmap_c3_dataset["LogFC"] < -1)
# ... whose "P Value" reaches the cut-off `p`.
is_significant = heatmap_c3_dataset["P Value"] >= p
c3_volc_sig = heatmap_c3_dataset[is_large_change & is_significant]
c3_volc_sig

## PCA

In [None]:
# Names of the six sample columns.
main_indeces = ['sample1', 'sample2',
       'sample3', 'sample4',
       'sample5', 'sample6']

# Label lists, three entries each.
# NOTE(review): presumably replicate labels for the rows of the PCA result table —
# they are not used in the cells visible here; confirm against the callers.
controls = ["C-1", "C-2", "C-3"]
treatment1 = ["T-1.1", "T-1.2", "T-1.3"]
treatment2 = ["T-2.1", "T-2.2", "T-2.3"]




def pca_initiation(sub_data):
    """
    Fits a PCA on the transposed and scaled data

    Parameters
    ----------
    sub_data: Table whose transpose is passed to scale(), so each original column
              becomes one row of the PCA input

    Returns
    ----------
    pca_init: The fitted PCA object
    pca_data_final: The scaled data transformed by the fitted PCA
    per_var_info: Explained variance ratio per component in percent, rounded to one decimal
    labels: List of component names "PC1", "PC2", ... one per component
    """
    standardised = scale(sub_data.T)
    pca_init = PCA().fit(standardised)
    pca_data_final = pca_init.transform(standardised)

    per_var_info = np.round(100 * pca_init.explained_variance_ratio_, decimals = 1)
    labels = [f"PC{number}" for number, _ in enumerate(per_var_info, start = 1)]
    return pca_init, pca_data_final, per_var_info, labels

def scree_plot(variance, all_labels, save = False, name = None):
    """
    Draws a bar chart of the explained variance per principal component

    Parameters
    ----------
    variance: Sequence of explained variance percentages, one per component
    all_labels: Tick labels for the bars, one per component
    save: When equal to False the figure is shown, otherwise it is written to disk
    name: Target passed to plt.savefig when the figure is saved

    Returns
    ----------
    None
    """
    plt.figure(figsize = (10, 5))
    bar_positions = range(1, len(variance) + 1)
    plt.bar(x = bar_positions, height = variance, tick_label = all_labels)
    plt.ylabel("Percentage of Explained Variance")
    plt.xlabel("Principal Component")
    plt.title("Scree Plot")

    # The comparison is kept as `== False` so that values such as None still
    # select the save path.
    if save == False:
        plt.show()
    else:
        plt.savefig(name)
        
def pca_result_df(final_pca_df ,all_labels, indx1, indx2):
    """
    Wraps the transformed PCA data in a labelled dataframe

    Parameters
    ----------
    final_pca_df: The transformed PCA data, one row per sample
    all_labels: Column names, one per principal component
    indx1: Row labels for the first group of samples
    indx2: Row labels for the second group of samples, placed after indx1

    Returns
    ----------
    A pandas dataframe with indx1 followed by indx2 as index and all_labels as columns
    """
    row_labels = list(indx1) + list(indx2)
    return pd.DataFrame(final_pca_df, index = row_labels, columns = all_labels)
        
def pca_2d_graph(pca_df_final, per_var_info,title, save = False, name = None):
    """
    Draws an annotated scatter plot of the first two principal components

    Parameters
    ----------
    pca_df_final: Pandas dataframe with columns PC1 and PC2, indexed by sample name
    per_var_info: Sequence of explained variance percentages; the first two entries
                  are written into the axis labels
    title: String, used for the title of the plot
    save: When equal to False the figure is shown, otherwise it is written to disk
    name: File name without extension; ".png" is appended when the figure is saved

    Returns
    ----------
    None

    Raises
    ----------
    ValueError: if the figure should be saved but no name was given
    """
    show_only = save == False
    if not show_only and name is None:
        # Fail before drawing anything instead of on `None + ".png"` afterwards.
        raise ValueError("name is required when the figure is saved")

    pc1 = pca_df_final.PC1
    pc2 = pca_df_final.PC2

    plt.figure(figsize = (7, 7))
    plt.scatter(pc1, pc2)

    plt.title(title)
    plt.xlabel("PC1 - {0}%".format(per_var_info[0]))
    plt.ylabel("PC2 - {0}%".format(per_var_info[1]))

    # Dashed guide lines through the origin, drawn for the shown and the saved
    # figure alike so both look the same.
    plt.plot([0, 0], [min(pc2) - 3, max(pc2) + 3], color='green',
         linestyle = 'dashed', linewidth = 2)
    plt.plot([min(pc1), max(pc1)], [0, 0], color='green',
         linestyle = 'dashed', linewidth = 2)

    # Label every point with its sample name.
    for sample in pca_df_final.index:
        plt.annotate(sample, (pc1.loc[sample], pc2.loc[sample]))

    if show_only:
        plt.show()
    else:
        plt.savefig(name + ".png")
        
def pca_multi_graph(pca_class, pca_dataframe, pca_data_transformed, dimension, title):
    """
    Draws a scatter matrix of several principal components

    Parameters
    ----------
    pca_class: Fitted PCA object; its explained_variance_ratio_ is used for the labels
    pca_dataframe: Pandas dataframe whose index is used to colour the points
    pca_data_transformed: The transformed PCA data that is plotted
    dimension: Exclusive upper bound of the plotted column positions
    title: String, used for the title of the plot

    Returns
    ----------
    None, the figure is displayed with fig.show()
    """
    explained_pct = pca_class.explained_variance_ratio_ * 100

    # Map each column position (as a string) to a label such as "PC 1 (52.3%)".
    x_y_labels = {}
    for position, pct in enumerate(explained_pct):
        x_y_labels[str(position)] = f"PC {position + 1} ({pct:.1f}%)"

    # NOTE(review): the range starts at 1, so column 0 (labelled "PC 1") is never
    # plotted — confirm that this is intended.
    plotted_columns = range(1, dimension)

    fig = express.scatter_matrix(
        pca_data_transformed,
        labels = x_y_labels,
        dimensions = plotted_columns,
        color = pca_dataframe.index,
        width = 1200,
        height = 1200,
        title = title,
    )
    fig.update_traces(diagonal_visible = False)
    fig.show()