In [None]:
# Import the libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import scale
from sklearn.cluster import KMeans, k_means
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
from plotly import graph_objects
from plotly import express
from plotly import figure_factory

In [None]:
# Load the workbook; header=[0, 1, 2] makes the first three rows a three-level column index.
podo_data = pd.read_excel(io="C4C3-Podo-NewHeader.xlsx", header=[0, 1, 2])
podo_data.head(n=5)

In [None]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched.
podo_df = podo_data.copy(deep=True)
print("Podo dataset:", podo_df.shape)

In [None]:
# Per-column report: the three header levels, the number of missing values and the dtype.
for col in podo_df.columns:
    # Select the column with its full (level 0, level 1, level 2) key. Walking the
    # first two levels and then taking element [0] of the NaN counts reports the
    # FIRST sub-column of that group, which is wrong whenever a group holds several
    # columns, and relies on positional Series indexing that pandas has deprecated.
    print(col[0], "/-/", col[1], "/-/", col[2], " - & Number of NaN:", podo_df[col].isnull().sum(), "- & dtype:", podo_df[col].dtypes)
    print("----------------")

In [None]:
# Keep only rows that satisfy all four conditions:
#   - a protein name is present,
#   - a phospho-site entry is present,
#   - the "Phosphopeptide Present" flag is True,
#   - the "Human" database flag is True.
# Columns are addressed with their full three-level key. The explicit `== True`
# comparison always produces a boolean mask, whatever dtype the flag column has.
podo_df = podo_df[
    podo_df[("Accession", "Protein names", "Total_Inf_Accession_Protein.names")].notna()
    & podo_df[("Phospho ID Metrics", "Phospho Sites", "Phos_Inf_Phospho.ID.Metrics_Phosphosites")].notna()
    & (podo_df[("No Header", "Phosphopeptide Present", "Both_Phosphopeptide.Present")] == True)
    & (podo_df[("Database", "Human", "Total_Database_Human")] == True)
]
podo_df.shape

In [None]:
# Same per-column report as before the filtering step, now on the filtered frame:
# the three header levels, the number of missing values and the dtype.
for col in podo_df.columns:
    # Select the column with its full (level 0, level 1, level 2) key. Walking the
    # first two levels and then taking element [0] of the NaN counts reports the
    # FIRST sub-column of that group, which is wrong whenever a group holds several
    # columns, and relies on positional Series indexing that pandas has deprecated.
    print(col[0], "/-/", col[1], "/-/", col[2], " - & Number of NaN:", podo_df[col].isnull().sum(), "- & dtype:", podo_df[col].dtypes)
    print("----------------")

## Volcano Plot

In [None]:
def volcano_dataset_prep(index_col, t_test, log_fc, fdr_col, heatmap_data = None):
    """
    Separates the columns required for making a volcano plot.

    The inputs are concatenated side by side, the index column becomes the
    index of the result, and every row containing a NaN is dropped.

    Parameters
    ----------
    index_col: Pandas series holding index values
    t_test: Pandas series holding t test values
    log_fc: Pandas series holding log fc values
    fdr_col: Pandas series holding fdr values
    heatmap_data: Optional, pandas dataframe that can be used to create heatmap later

    Returns
    ----------
    A new pandas dataframe for volcano plot, indexed by the index column.
    The index is taken from the column "Total_Inf_Accession_Protein.names"
    when it exists; otherwise from the column named like index_col.

    Raises
    ----------
    KeyError: if neither of those columns exists in the concatenated data
    """
    legacy_index_name = "Total_Inf_Accession_Protein.names"

    # pd.concat silently skips None entries, so an omitted heatmap_data is fine.
    dataset = pd.concat((index_col, t_test, log_fc, fdr_col, heatmap_data), axis = 1)

    # Prefer the historical hard-coded label so existing callers get exactly the
    # same result; only fall back to the series' own name when that label is absent.
    if legacy_index_name in dataset.columns:
        index_name = legacy_index_name
    elif isinstance(index_col, pd.Series) and index_col.name is not None:
        index_name = index_col.name
    else:
        index_name = legacy_index_name  # set_index raises KeyError, as before

    dataset = dataset.set_index(index_name)
    dataset = dataset.dropna()
    return dataset

def x_y_axes(data):
    """
    Builds the x values, y values and hover labels for a volcano plot.

    Parameters
    ----------
    data: Pandas dataframe prepared by volcano_dataset_prep; it needs one column
          whose name contains "LogFC" and one whose name contains "Test"

    Returns
    ----------
    x_values: NumPy array, values of the first column whose name contains "LogFC"
    y_values: NumPy array, negative log 10 of the first column whose name contains "Test"
    hover: List, the index labels of data, used to name the points on the plot

    Raises
    ----------
    IndexError: if no column name contains "LogFC" (or "Test")
    """
    def first_label_containing(fragment):
        # Scan every column label, then take the first hit (IndexError when none).
        matches = []
        for label in data.columns:
            if fragment in label:
                matches.append(label)
        return matches[0]

    fold_change_label = first_label_containing("LogFC")
    test_label = first_label_containing("Test")

    fold_changes = data[fold_change_label].values
    neg_log_p = -np.log10(data[test_label].values)
    labels = [label for label in data.index]
    return fold_changes, neg_log_p, labels