In [1]:
from pathlib import Path

import pandas as pd

# Step 1: Load the mRNA, proteomics, phosphoproteomics, and metadata tables.
# The first CSV column of each file becomes the row index of its frame.
mrna_df = pd.read_csv("data_input/mrna.csv", index_col=0)
proteo_df = pd.read_csv("data_input/proteo.csv", index_col=0)
phospho_df = pd.read_csv("data_input/phospho.csv", index_col=0)
metadata_df = pd.read_csv("data_input/metadata.csv", index_col=0)

# Strip surrounding whitespace so that column names compare equal across tables
mrna_df.columns = mrna_df.columns.str.strip()
proteo_df.columns = proteo_df.columns.str.strip()
phospho_df.columns = phospho_df.columns.str.strip()

# Step 2: Clean column names for proteomics and phosphoproteomics data.
# Literal (non-regex) replacement, so every occurrence of the marker text is removed.

# For proteomics data (remove "iBAQ ")
proteo_df.columns = proteo_df.columns.str.replace("iBAQ ", "", regex=False)

# For phosphoproteomics data (remove "Intensity " and "_phos")
phospho_df.columns = (
    phospho_df.columns
    .str.replace("Intensity ", "", regex=False)
    .str.replace("_phos", "", regex=False)
)

# Step 3: Get sample name lists
mrna_list = list(mrna_df.columns)
proteo_list = list(proteo_df.columns)
phospho_list = list(phospho_df.columns)

# Step 4: Keep only the names present in all three tables.
# Sorting makes the list order reproducible (set iteration order is not).
common_values = set(mrna_list) & set(proteo_list) & set(phospho_list)
strings_to_filter = sorted(common_values)

# Fail here with a clear message instead of writing empty CSVs and
# hitting an opaque error later in the analysis.
if not strings_to_filter:
    raise ValueError(
        "No column names are shared by the mRNA, proteomics, and "
        "phosphoproteomics tables; check the name cleaning in Step 2."
    )

# Step 5: Filter the dataframes to keep only common samples.
# A boolean mask preserves each table's own column order.
mrna_df_filtered = mrna_df.loc[:, mrna_df.columns.isin(strings_to_filter)]
proteo_df_filtered = proteo_df.loc[:, proteo_df.columns.isin(strings_to_filter)]
phospho_df_filtered = phospho_df.loc[:, phospho_df.columns.isin(strings_to_filter)]

# Step 6: Save filtered dataframes as CSV.
# Create the output directory first so a fresh checkout can run this cell.
Path("data_output").mkdir(parents=True, exist_ok=True)
mrna_df_filtered.to_csv("data_output/mrna_filtered.csv")
phospho_df_filtered.to_csv("data_output/phospho_filtered.csv")
proteo_df_filtered.to_csv("data_output/proteo_filtered.csv")

In [2]:
# Transpose and clean function
def format_expression_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` transposed, with its axes named for downstream use.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; its columns become the rows of the result.

    Returns
    -------
    pandas.DataFrame
        The transpose of ``df`` whose row axis is named ``'Sample_ID'`` and
        whose column axis is named ``'Features'``. The axis names of ``df``
        itself are left unchanged.
    """
    df_t = df.T
    # The transposed frame can reuse the input's Index objects, so assigning
    # `.name` on them would also rename the caller's axes. `Index.rename`
    # returns a renamed copy, which is then attached to the new frame only.
    df_t.index = df_t.index.rename('Sample_ID')
    df_t.columns = df_t.columns.rename('Features')
    return df_t


# Transpose each filtered table in one pass; the target names are unchanged.
mrna_df_t, proteo_df_t, phospho_df_t = (
    format_expression_matrix(filtered_frame)
    for filtered_frame in (mrna_df_filtered, proteo_df_filtered, phospho_df_filtered)
)

In [3]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly.express as px

def perform_pca_and_plot(df, title, metadata_df, missingness_threshold=0.9):
    """Run a two-component PCA on ``df`` and show it as a plotly scatter plot.

    Rows of ``df`` are treated as observations and columns as features. Only
    rows whose index label also appears in ``metadata_df.index`` are used.
    Missing values are mean-imputed per feature and features are standardised
    before the PCA is fitted.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations-by-features matrix.
    title : str
        Text inserted into the figure title.
    metadata_df : pandas.DataFrame
        Table indexed by the same labels as ``df`` with a ``'Label'`` column,
        which is used to colour the points.
    missingness_threshold : float, default 0.9
        Features whose fraction of missing values is greater than or equal
        to this value are dropped before imputation.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If fewer than two rows or fewer than two features remain, since a
        two-component PCA cannot be fitted in that case.
    """
    # Match samples between df and metadata
    df = df.loc[df.index.intersection(metadata_df.index)]
    labels = metadata_df.loc[df.index, 'Label']

    # Step 1: Remove features with >= missingness_threshold missing values
    missing_pct = df.isnull().mean()
    df = df.loc[:, missing_pct < missingness_threshold]

    # Fail early with an actionable message rather than an opaque
    # scikit-learn error raised from inside the imputer or the PCA.
    n_samples, n_features = df.shape
    if n_samples < 2 or n_features < 2:
        raise ValueError(
            f"PCA of {title} needs at least 2 samples and 2 features after "
            f"filtering, got {n_samples} sample(s) and {n_features} feature(s)."
        )

    # Step 2: Impute missing values
    imputer = SimpleImputer(strategy='mean')
    df_pca_imputed = imputer.fit_transform(df)

    # Step 3: Scale the data
    scaler = StandardScaler()
    df_pca_scaled = scaler.fit_transform(df_pca_imputed)

    # Step 4: Perform PCA
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(df_pca_scaled)
    explained_variance = pca.explained_variance_ratio_ * 100

    # Step 5: Prepare dataframe for plotting
    pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'], index=df.index)
    # Positional assignment: `labels` was looked up with df.index above
    pca_df['Label'] = labels.values

    # Step 6: Plot
    # plotly express keys `labels` by column name, so the legend title is
    # overridden through the 'Label' column rather than the `color` argument.
    fig = px.scatter(
        pca_df, x='PC1', y='PC2', color='Label',
        labels={'Label': 'Sample Label'}, hover_name=pca_df.index
    )
    fig.update_traces(marker=dict(size=10, line=dict(width=2, color='DarkSlateGrey')))
    fig.update_layout(
        autosize=False, width=400, height=300,
        xaxis_title=f'PC 1 ({explained_variance[0]:.2f}% Variance)',
        yaxis_title=f'PC 2 ({explained_variance[1]:.2f}% Variance)',
        title=f'PCA Analysis of {title}'
    )
    fig.show()


# Run the PCA plot once per omics layer, in the same order as before
for omics_frame, omics_title in (
    (mrna_df_t, 'RNA Data'),
    (proteo_df_t, 'Protein Data'),
    (phospho_df_t, 'Phospho Data'),
):
    perform_pca_and_plot(omics_frame, omics_title, metadata_df)


KeyboardInterrupt: 