In [None]:
# Import the libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans, k_means
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from plotly import graph_objects
from plotly import express
from plotly import figure_factory

In [None]:
# The workbook has three header rows, so the columns are read as a 3-level MultiIndex.
podo_data = pd.read_excel(
    io = "NewHeader.xlsx",
    header = [0, 1, 2],
)
podo_data.head()

In [None]:
# Work on a copy so the frame returned by read_excel stays untouched.
df = podo_data.copy()
# Report the shape of the working copy (the previous name `podo_df` was never defined).
print("dataset:", df.shape)

In [None]:
# For every 3-level column, print its three header parts, how many values are
# missing in THAT column, and its dtype.  Indexing with the full tuple avoids
# reporting the first sibling column under (col[0], col[1]) by mistake.
for col in df.columns:
    print(col[0], "/-/", col[1], "/-/", col[2], " - & Number of NaN:", df[col].isnull().sum(), "- & dtype:", df[col].dtypes)
    print("----------------")

In [None]:
# Keep only rows that (1) have a protein name, (2) have phospho-site information,
# (3) are flagged as having a phosphopeptide present and (4) are flagged as human.
df = df[
    df[("Accession", "Protein names", "Total_Inf_Accession_Protein.names")].notna()
    & df[("Phospho ID Metrics", "Phospho Sites", "Phos_Inf_Phospho.ID.Metrics_Phosphosites")].notna()
    & df[("No Header", "Phosphopeptide Present", "Both_Phosphopeptide.Present")].eq(True)
    & df[("Database", "Human", "Total_Database_Human")].eq(True)
]
df.shape

In [None]:
# Repeat the missing-value / dtype report on the filtered frame `df`
# (the names `_df` and `podo_df` used here before were never defined).
for col in df.columns:
    print(col[0], "/-/", col[1], "/-/", col[2], " - & Number of NaN:", df[col].isnull().sum(), "- & dtype:", df[col].dtypes)
    print("----------------")

## Volcano Plot

In [None]:
def volcano_dataset_prep(index_col, t_test, log_fc, fdr_col, heatmap_data = None):
    """
    Assemble the table a volcano plot is drawn from.

    The inputs are placed side by side, the protein-name column becomes the
    index, and every row that still contains a NaN is discarded.

    Parameters
    ----------
    index_col: Pandas object holding the "Total_Inf_Accession_Protein.names" column
    t_test: Pandas object holding t test values
    log_fc: Pandas object holding log fc values
    fdr_col: Pandas object holding fdr values
    heatmap_data: Optional, pandas dataframe whose columns are appended so the
                  result can also be used to create a heatmap later.
                  None (nothing appended), by default

    Returns
    ----------
    A new pandas dataframe indexed by "Total_Inf_Accession_Protein.names"
    """
    # pd.concat skips a None entry, so the optional heatmap part needs no special case.
    pieces = [index_col, t_test, log_fc, fdr_col, heatmap_data]
    return (
        pd.concat(pieces, axis = 1)
        .set_index("Total_Inf_Accession_Protein.names")
        .dropna()
    )

def x_y_axes(data):
    """
    Extract the plotting coordinates for a volcano plot.

    Parameters
    ----------
    data: Pandas dataframe prepared by volcano_dataset_prep

    Returns
    ----------
    x_values: Numpy array, values of the first column whose name contains "LogFC"
    y_values: Numpy array, negative log 10 of the first column whose name contains "Test"
    hover: List, the index labels of data (protein names), used as hover text on the plot
    """
    # The first matching column wins; an IndexError is raised when none matches.
    fold_change_col = [name for name in data.columns if "LogFC" in name][0]
    p_value_col = [name for name in data.columns if "Test" in name][0]

    return (
        data[fold_change_col].values,
        -np.log10(data[p_value_col].values),
        [label for label in data.index],
    )

def volcano_plot(x_val, y_val, hover_list, title, fc_cutoff = 1, p_cutoff = 0.05):
    """
    Creates an annotated volcano plot and displays it

    Parameters
    ----------
    x_val: Sequence, log fold change values, to be the values on x axis
    y_val: Sequence, negative log 10 of the t test values, to be the values on the y axis
    hover_list: List, containing the names of the proteins, shown when hovering over a point
    title: String, used for the title of the plot
    fc_cutoff: Number, absolute fold-change threshold; vertical red lines are drawn
               at -fc_cutoff and +fc_cutoff. 1, by default
    p_cutoff: Float, p value threshold; a horizontal red line is drawn at
              -log10(p_cutoff). 0.05, by default

    Returns
    ----------
    None. The figure is displayed with fig.show()
    """
    x_minimum = min(x_val)
    x_maximum = max(x_val)
    y_minimum = min(y_val)
    y_maximum = max(y_val)
    # The y axis is in -log10 units, so the p value threshold is converted the same way.
    p_val_cut = np.log10(p_cutoff) * -1
    red_line = dict(color = 'Red',)

    fig = graph_objects.Figure()
    fig.add_trace(graph_objects.Scatter(x = x_val, y = y_val, mode = 'markers', hovertext = hover_list))

    # Vertical threshold lines, extended slightly beyond the data range.
    for x_pos in (-fc_cutoff, fc_cutoff):
        fig.add_shape(type = 'line', x0 = x_pos, y0 = y_minimum - 0.2, x1 = x_pos, y1 = y_maximum + 0.2,
                      line = red_line, xref = 'x', yref = 'y')
    # Horizontal threshold line across the full x range.
    fig.add_shape(type = 'line', x0 = x_minimum - 0.1, y0 = p_val_cut, x1 = x_maximum + 0.1, y1 = p_val_cut,
                  line = red_line, xref = 'x', yref = 'y')

    fig.update_layout(title = title, xaxis_title = "Log2 Fold Change", yaxis_title = "P-Value (-Log10)")
    fig.show()

In [None]:
# Separate the parts required for calculations
# NOTE(review): `ge_df` is not assigned in any earlier cell visible in this notebook;
# presumably the filtered frame `df` was intended — confirm before running.
# NOTE(review): the top-level keys selected here ("Stats - C3a/C", "Stats - C4a/C") do not
# match the "Stats" key read from `first_ge` in the next cell — verify against the workbook headers.

# Select the top-level column groups needed below.
first_ge_df = ge_df[["No Header", "Accession" ,"Stats - C3a/C", "Stats - C4a/C", "Phospho Adjusted Log2 Normalised Abundanxe"]]
# Keep only the rows flagged as having a phosphopeptide present.
first_ge_df = first_ge_df[first_ge_df["No Header"]["Phosphopeptide Present"]["Both_Phosphopeptide.Present"] == True]

In [None]:
# Drop every row that contains a NaN in any of the selected columns
first_ge = first_ge_df.dropna()

# Protein names, renumbered from 0, later become the index of the volcano tables
index = first_ge["Accession"]["Protein names"].reset_index(drop = True)

# Each stats block is coerced to float (unparseable entries become NaN)
# and renumbered from 0 so it lines up with `index`
logfc, ttest, fdr = (
    first_ge["Stats"][stat_key]
    .apply(pd.to_numeric, args = ('coerce',))
    .astype(float)
    .reset_index(drop = True)
    for stat_key in (
        "Adjusted Phosphopeptide Stats LogFC",
        "Adjusted Phosphopeptide Stats T Test",
        "Adjusted Phosphopeptide Stats FDR",
    )
)

In [None]:
# Build the volcano table from the frames prepared in the previous cell
# (`ttest`, `logfc`, `fdr`; the `c3_`-prefixed names used here before were never defined).
# volcano_dataset_prep already drops NaN rows, so no extra dropna() is needed.
c3_dataset = volcano_dataset_prep(index, ttest, logfc, fdr)
x, y, hover = x_y_axes(c3_dataset)
volcano_plot(x, y, hover, "C3/C Adjusted Phospho Abundance")

In [None]:
# Give the statistics columns short names and label the index.
# NOTE(review): the t-test key here ("Phos_Stats.Adj_C3a.C_T.Test") differs from the one
# used for the heatmap dataset ("Phos_Stats.Adj_T.Test") — confirm which header is real.
c3_dataset = c3_dataset.rename(
    columns = {
        "Phos_Stats.Adj_C3a.C_T.Test": "T Test",
        "Phos_Stats.Adj_LogFC": "LogFC",
        "Phos_Stats.Adj_FDR": "FDR",
    }
)
c3_dataset.index = c3_dataset.index.rename("Protein Names")

# -log10 of the t test values, and the same transform of the 0.05 threshold
c3_dataset["P Value"] = -np.log10(c3_dataset["T Test"])
p = -np.log10(0.05)

# Proteins with |LogFC| > 1 whose -log10 p value reaches the threshold
volcano_list = c3_dataset[(c3_dataset["LogFC"].abs() > 1) & (c3_dataset["P Value"] >= p)]
volcano_list

In [None]:
# Volcano table with extra columns appended for the heatmap.  The statistics frames are the
# ones prepared earlier (`ttest`, `logfc`, `fdr`); the `c3_`-prefixed names were never defined.
# NOTE(review): `heatmap_part` is not assigned in any earlier visible cell — it must be
# defined (presumably the per-sample abundance columns) before this cell can run.
heatmap_c3_dataset = volcano_dataset_prep(index, ttest, logfc, fdr, heatmap_part)
heatmap_c3_dataset = heatmap_c3_dataset.rename(columns = {"Phos_Stats.Adj_T.Test": "T Test", 
                                                          "Phos_Stats.Adj_LogFC": "LogFC",
                                                         "Phos_Stats.Adj_FDR": "FDR"})
# -log10 of the t test values, inserted as the fourth column (position 3)
p_val = np.log10(heatmap_c3_dataset["T Test"]) * -1
heatmap_c3_dataset.insert(loc = 3, column = "P Value", value = p_val)
heatmap_c3_dataset.index.rename("Protein Names", inplace = True)
heatmap_c3_dataset

In [None]:
# Creating dendrogram with heatmap
# Builds one plotly figure: a dendrogram on top, a second dendrogram on the left side,
# and a heatmap of pairwise distances between the six sample columns in the middle.

# The six sample columns of the heatmap dataset (rows are indexed by "Protein Names").
dendogram = heatmap_c3_dataset[['sample1', 'sample2',
       'sample3', 'sample4',
       'sample5', 'sample6']]

# Transposed so that each sample is one row of the matrix passed to the calls below.
dendo = dendogram.T
labels = dendogram.columns

# Top dendrogram, labelled with the sample names; all of its traces are moved to axis y2.
fig = figure_factory.create_dendrogram(dendo, orientation = 'bottom', labels = labels)
for i in range(len(fig['data'])):
    fig['data'][i]['yaxis'] = 'y2'
    
# Side dendrogram built from the same matrix (no labels passed); its traces are moved to axis x2.
dendro_side = figure_factory.create_dendrogram(dendo, orientation = 'right')
for i in range(len(dendro_side['data'])):
    dendro_side['data'][i]['xaxis'] = 'x2'


# Copy the side dendrogram's traces into the main figure.
for data in dendro_side['data']:
    fig.add_trace(data)

# Tick text of the side dendrogram, converted to ints and used as row/column positions.
# NOTE(review): presumably these are the leaf positions because no labels were passed
# to the side dendrogram — confirm against the plotly documentation.
dendro_leaves = dendro_side['layout']['yaxis']['ticktext']
dendro_leaves = list(map(int, dendro_leaves))
# Pairwise distances between the rows of `dendo`, expanded to a square matrix
# and reordered on both axes by `dendro_leaves`.
data_dist = pdist(dendo)
heat_data = squareform(data_dist)
heat_data = heat_data[dendro_leaves, :]
heat_data = heat_data[:, dendro_leaves]


heatmap = [
    graph_objects.Heatmap(
        x = dendro_leaves,
        y = dendro_leaves,
        z = heat_data,
        colorscale = 'Reds'
    )
]

# Overwrite the heatmap coordinates with the dendrograms' tick positions
# so the cells line up with the dendrogram leaves.
heatmap[0]['x'] = fig['layout']['xaxis']['tickvals']
heatmap[0]['y'] = dendro_side['layout']['yaxis']['tickvals']

# Add Heatmap Data to Figure
for data in heatmap:
    fig.add_trace(data)
    
fig.update_layout({'width':800, 'height':800,
                         'showlegend':False, 'hovermode': 'closest',
                         })
# Edit xaxis
# Main x axis takes the right 85% of the width.
fig.update_layout(xaxis={'domain': [.15, 1],
                                  'mirror': False,
                                  'showgrid': False,
                                  'showline': False,
                                  'zeroline': False,
                                  'ticks':""})
# Edit xaxis2
# Secondary x axis (side dendrogram) takes the left 15% of the width.
fig.update_layout(xaxis2={'domain': [0, .15],
                                   'mirror': False,
                                   'showgrid': False,
                                   'showline': False,
                                   'zeroline': False,
                                   'showticklabels': False,
                                   'ticks':""})

# Edit yaxis
# Main y axis takes the lower 85% of the height.
fig.update_layout(yaxis={'domain': [0, .85],
                                  'mirror': False,
                                  'showgrid': False,
                                  'showline': False,
                                  'zeroline': False,
                                  'showticklabels': False,
                                  'ticks': ""
                        })
# Edit yaxis2
# Secondary y axis (top dendrogram) sits in the band from 0.825 to 0.975 of the height.
fig.update_layout(yaxis2 = {'domain':[.825, .975],
                                   'mirror': False,
                                   'showgrid': False,
                                   'showline': False,
                                   'zeroline': False,
                                   'showticklabels': False,
                                   'ticks':""})

fig.show()

In [None]:
# creating correlation analysis table

# Pairwise correlation between the sample columns, computed once and reused below.
sample_corr = dendogram.corr()
# Hide the upper triangle (diagonal included) so each pair of samples appears only once.
mask = np.triu(np.ones_like(sample_corr, dtype = bool))

# Apply the seaborn settings BEFORE creating the figure: rcParams changed after
# plt.figure() only affect figures created later, which made the size of this
# plot depend on whether the cell had been run before.
sns.set(rc = {'figure.figsize':(5, 5)})
plt.figure(dpi = 100)
plt.title("Correlation Analysis", fontsize = 10)
sns.heatmap(sample_corr, mask = mask, cmap = "viridis", fmt = ".0%",
            annot = True, linewidths = 0.1, linecolor = "black", annot_kws = {"size": 10})
plt.xticks(rotation = 70, fontsize = 10)
plt.yticks(rotation = 0, fontsize = 10)
plt.show()

In [None]:
# Rows with |LogFC| > 1 whose "P Value" (-log10 scale) reaches the threshold `p`
c3_volc_sig = heatmap_c3_dataset[
    (heatmap_c3_dataset["LogFC"].abs() > 1)
    & (heatmap_c3_dataset["P Value"] >= p)
]
c3_volc_sig

## Clustering Analysis

In [None]:
# Names of the six abundance columns: sample1 ... sample6
main_indeces = [f"sample{number}" for number in range(1, 7)]

# Replicate labels for the control group and the two treatment groups
controls = [f"C-{replicate}" for replicate in (1, 2, 3)]
treatment1 = [f"T-1.{replicate}" for replicate in (1, 2, 3)]
treatment2 = [f"T-2.{replicate}" for replicate in (1, 2, 3)]



def k_finder(data, max_k = 20):
    """
    Calculates inertia for a range of cluster counts to help choose
    k for the k-means clustering algorithm (elbow plot)

    Parameters
    ----------
    data: Pandas dataframe (or array-like), data that is ready for cluster analysis
    max_k: Int, upper bound of the tested range; k runs over range(2, max_k),
           so max_k itself is not tested. 20, by default

    Returns
    ----------
    None. A matplotlib line plot of inertia against k is displayed
    """
    # Fit on the data passed in, not on a notebook-level variable.
    values = np.asarray(data)
    k_num = list(range(2, max_k))
    inertia = [
        KMeans(n_clusters = k, random_state = 69).fit(values).inertia_
        for k in k_num
    ]

    fig, ax = plt.subplots(1, figsize = (15, 6))
    # Plot against the real k values so the x ticks need no relabelling.
    ax.plot(k_num, inertia)
    ax.set_xticks(k_num)
    # Only the font size of the y tick labels is changed; the tick values stay
    # the ones matplotlib derives from the data.
    ax.tick_params(axis = "y", labelsize = 15)
    plt.xlabel("Number of Clusters", fontsize = 20)
    plt.ylabel("Inertia Score", fontsize = 20)
    plt.title("Inertia per K", fontsize = 20)
    plt.show()
    
    
def k_cluster(data, cluster_num = 2):
    """
    Accepts the data prepared for clustering, fits the k-means clustering
    algorithm to it and reports how many rows fall into each cluster

    Parameters
    ----------
    data: Pandas dataframe (or array-like), data that is ready for cluster analysis
    cluster_num: Int, number of clusters. 2, by default

    Returns
    ----------
    Pandas dataframe with a single row and one column per cluster
    ("Cluster 1" ... "Cluster <cluster_num>") holding the member counts
    """
    kmeans = KMeans(n_clusters = cluster_num, random_state = 69)
    # Fit on the data passed in, not on a notebook-level variable.
    predicted = kmeans.fit_predict(np.asarray(data))
    # bincount with minlength always yields cluster_num entries, even when a
    # cluster label ends up with no members.
    count = np.bincount(predicted, minlength = cluster_num).reshape(1, cluster_num)
    cols = ["Cluster " + str(x) for x in range(1, cluster_num + 1)]
    return pd.DataFrame(count, columns = cols)


def optimized_kcluster(data, n_comp, k_clust):
    """
    K-means clustering on PCA-reduced data

    Parameters
    ----------
    data: Pandas dataframe, data prepared for clustering analysis
    n_comp: Int, number of principal components to keep (see best_comp)
    k_clust: Int, number of clusters (see k_finder)

    Returns
    ----------
    Pandas dataframe with a single row and one column per cluster
    ("Cluster 1" ... "Cluster <k_clust>") holding the member counts
    """
    pca = PCA(n_components = n_comp, random_state = 42)
    pca_mod = pca.fit_transform(data)
    kmean = KMeans(n_clusters = k_clust, random_state = 69)
    km = kmean.fit_predict(pca_mod)
    # bincount with minlength always yields k_clust entries, even when a
    # cluster label ends up with no members.
    count = np.bincount(km, minlength = k_clust).reshape(1, k_clust)
    cols = ["Cluster " + str(x) for x in range(1, k_clust + 1)]
    return pd.DataFrame(count, columns = cols)


def best_comp(data):
    """
    Helps to work out the ideal number of components for PCA analysis

    Fits a PCA with as many components as the data allows, prints the total
    and per-component explained variance, and plots the explained variance ratio.

    Parameters
    ----------
    data: Pandas dataframe, data prepared for clustering analysis

    Returns
    ----------
    None. A report is printed and a matplotlib plot is displayed
    """
    values = np.asarray(data)
    # PCA cannot extract more components than min(n_samples, n_features).
    comp_num = min(values.shape)
    pca = PCA(n_components = comp_num, random_state = 42)
    pca.fit(values)
    total_var = sum(pca.explained_variance_)
    print(f"Total Variance: {round(total_var, 2)}")
    print()

    # Components are numbered from 0 in the printed report.
    values_per_component = zip(range(0, comp_num), pca.explained_variance_)
    report = pd.DataFrame(values_per_component, columns = ["PCA Components", "Explained Variance"])
    print(report)
    print()

    plt.figure(figsize = (15, 7))
    plt.plot(pca.explained_variance_ratio_, linewidth = 2.5, c = "b")
    plt.xlabel("Number of Components", fontsize = 15)
    plt.ylabel("Explained Ratio", fontsize = 15)
    plt.show()
    

def pca_initiation(sub_data, n_comp):
    """
    Fits a PCA to the given data and returns the transformed values.

    The data is passed to PCA as provided; no scaling is applied inside
    this function.

    Parameters
    ----------
    sub_data: Pandas dataframe required for PCA analysis
    n_comp: Int, number of principal components to keep (see best_comp)

    Returns
    ----------
    pca_data_final: Numpy array, the input projected onto the principal components
    per_var_info: Numpy array, explained variance ratio of each component
                  expressed as a percentage rounded to one decimal
    labels: List, "PC1", "PC2", ... one label per component, used for plotting
    """
    model = PCA(n_components = n_comp, random_state = 42)
    transformed = model.fit_transform(sub_data)
    variance_pct = np.round(model.explained_variance_ratio_ * 100, decimals = 1)
    component_names = [f"PC{number}" for number, _ in enumerate(variance_pct, start = 1)]
    return transformed, variance_pct, component_names


def scree_plot(variance, all_labels, save = False, name = None):
    """
    Accepts the PCA variance information and creates the scree plot

    Parameters
    ----------
    variance: Numpy array, the second result returned from pca_initiation function
    all_labels: List, the third result returned from pca_initiation function
    save: Boolean, when True the plot is written to disk instead of being shown.
          False, by default.
    name: String, file name (or path) handed to plt.savefig; required when save is True.
          None, by default

    Returns
    ---------
    None. The plot is either displayed or saved

    Raises
    ---------
    ValueError: if save is True and no name was given
    """
    if save and name is None:
        raise ValueError("scree_plot: 'name' is required when save is True")

    # The figure is built once; only the final step differs between show and save.
    plt.figure(figsize = (10, 5))
    plt.bar(x = range(1, len(variance) + 1), height = variance, tick_label = all_labels)
    plt.ylabel("Percentage of Explained Variance", fontsize = 15)
    plt.xlabel("Principal Components", fontsize = 15)
    plt.title("Scree Plot")

    if save:
        plt.savefig(name)
    else:
        plt.show()
        
        
def pca_result_df(final_pca_df ,all_labels, sample_names = None):
    """
    Creates a Pandas dataframe from the data whose values have been transformed with PCA analysis.

    Parameters
    ----------
    final_pca_df: Numpy array, the first result returned from pca_initiation function
    all_labels: List, the third result returned from pca_initiation function
    sample_names: Optional list, one name per row of final_pca_df, used as the index.
                  None, by default, in which case the notebook-level lists
                  `cs` and `c4as` are concatenated as before.
                  NOTE(review): `cs` and `c4as` are not defined in the visible part of
                  this notebook — pass sample_names explicitly or define them first.

    Returns
    ----------
    Pandas dataframe with principal components as columns and the name of samples as index
    """
    if sample_names is None:
        # Backward-compatible default: rely on the notebook-level sample lists.
        sample_names = [*cs, *c4as]

    return pd.DataFrame(final_pca_df, index = list(sample_names), columns = all_labels)

        
def pca_2d_graph(pca_df_final, per_var_info, title, save = False, name = None):
    """
    Accepts data whose values have been transformed with PCA algorithm and creates
    a scatter plot with the first and second principal components of each sample.

    Parameters
    ----------
    pca_df_final: Pandas dataframe with PC1 and PC2 columns, result of pca_result_df function
    per_var_info: Numpy array, the second result returned from pca_initiation function
    title: String, title needed for the scatter plot
    save: Boolean, saves the plot as a png file if True. False by default
    name: String, file name without extension (".png" is appended); required when save is True

    Returns
    ----------
    None. The plot is either displayed or saved as "<name>.png"

    Raises
    ----------
    ValueError: if save is True and no name was given
    """
    if save and name is None:
        raise ValueError("pca_2d_graph: 'name' is required when save is True")

    pc1 = pca_df_final.PC1
    pc2 = pca_df_final.PC2

    plt.figure(figsize = (7, 7))
    plt.scatter(pc1, pc2)

    plt.title(title)
    plt.xlabel("PC1 - {0}%".format(per_var_info[0]))
    plt.ylabel("PC2 - {0}%".format(per_var_info[1]))

    if not save:
        # Dashed guide lines through the origin; as before, they are drawn only
        # on the interactive plot, not on the saved one.
        plt.plot([0, 0], [min(pc2) - 3, max(pc2) + 3], color='green',
             linestyle = 'dashed', linewidth = 2)
        plt.plot([min(pc1), max(pc1)], [0, 0], color='green',
             linestyle = 'dashed', linewidth = 2)

    # Label every point with its sample name.
    for sample in pca_df_final.index:
        plt.annotate(sample, (pc1.loc[sample], pc2.loc[sample]))

    if save:
        plt.savefig(name + ".png")
    else:
        plt.show()
        
        
def pca_multi_graph(var_ratio, pca_dataframe, pca_data_transformed, dimension, title):
    """
    Creates a correlograph of all the principal components from the PCA analysis

    Parameters
    ----------
    var_ratio: Numpy array, explained variance of each component; it is multiplied
               by 100 here to build the axis labels.
               NOTE(review): pca_initiation already returns percentages — passing its
               second result would scale the labels twice; confirm what callers pass.
    pca_dataframe: Pandas dataframe, result of pca_result_df function; its index colours the points
    pca_data_transformed: Numpy array, the first result returned from pca_initiation function
    dimension: Int, number of principal components (columns of pca_data_transformed) to plot
    title: String, used as the title of the correlograph

    Returns
    ----------
    None. An interactive correlograph is displayed
    """
    # Axis labels are keyed by the column position as a string: "0" -> "PC 1 (..%)".
    x_y_labels = {
        str(i): f"PC {i + 1} ({var:.1f}%)"
        for i, var in enumerate(var_ratio * 100)
    }

    fig = express.scatter_matrix(
        pca_data_transformed,
        labels = x_y_labels,
        # Start at column 0 so the first principal component is included.
        dimensions = range(dimension),
        color = pca_dataframe.index, width = 1200, height = 1200, title = title)
    fig.update_traces(diagonal_visible = False)
    fig.show()