In [1]:
from pathlib import Path

import pandas as pd

In [2]:
import pandas as pd

# Step 1: Load the mRNA, proteomics, phosphoproteomics and metadata tables.
# The first CSV column of every file becomes the row index.
mrna_df = pd.read_csv("data_input/mrna.csv", index_col=0)
proteo_df = pd.read_csv("data_input/proteo.csv", index_col=0)
phospho_df = pd.read_csv("data_input/phospho.csv", index_col=0)
metadata_df = pd.read_csv("data_input/metadata.csv", index_col=0)

# Step 2: Normalise the column (sample) names so they are comparable across
# the three tables. Whitespace is stripped first so that the anchored patterns
# below see the real start and end of each name.
mrna_df.columns = mrna_df.columns.str.strip()
proteo_df.columns = proteo_df.columns.str.strip()
phospho_df.columns = phospho_df.columns.str.strip()

# Proteomics: remove the leading "iBAQ " only. The pattern is anchored so the
# same text occurring elsewhere inside a name is left untouched.
proteo_df.columns = proteo_df.columns.str.replace(r"^iBAQ ", "", regex=True)

# Phosphoproteomics: remove the leading "Intensity " and the trailing "_phos".
phospho_df.columns = (
    phospho_df.columns
    .str.replace(r"^Intensity ", "", regex=True)
    .str.replace(r"_phos$", "", regex=True)
)

# Step 3: Collect the sample names of each table.
mrna_list = list(mrna_df.columns)
proteo_list = list(proteo_df.columns)
phospho_list = list(phospho_df.columns)

# Step 4: Keep only the sample names present in all three tables.
common_values = set(mrna_list) & set(proteo_list) & set(phospho_list)
if not common_values:
    # Fail here with a clear message instead of writing empty CSV files.
    raise ValueError(
        "No sample names are shared by the mRNA, proteomics and "
        "phosphoproteomics tables; check the column-name clean-up in Step 2."
    )
# Sorted so the list is stable between kernel sessions (set order is not).
strings_to_filter = sorted(common_values)

# Step 5: Restrict every table to the shared samples. The boolean mask keeps
# each table's own column order.
mrna_df_filtered = mrna_df.loc[:, mrna_df.columns.isin(strings_to_filter)]
proteo_df_filtered = proteo_df.loc[:, proteo_df.columns.isin(strings_to_filter)]
phospho_df_filtered = phospho_df.loc[:, phospho_df.columns.isin(strings_to_filter)]

# Step 6: Save the filtered tables. The output directory is created first
# because DataFrame.to_csv does not create missing parent directories.
output_dir = Path("data_output")
output_dir.mkdir(parents=True, exist_ok=True)
mrna_df_filtered.to_csv(output_dir / "mrna_filtered.csv")
phospho_df_filtered.to_csv(output_dir / "phospho_filtered.csv")
proteo_df_filtered.to_csv(output_dir / "proteo_filtered.csv")

In [3]:
# Transpose and clean function
def format_expression_matrix(df):
    """Return the transpose of ``df`` with named axes.

    The rows of the result are the columns of ``df`` and are labelled
    ``'Sample_ID'``; the columns of the result are the rows of ``df`` and are
    labelled ``'Features'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix to transpose.

    Returns
    -------
    pandas.DataFrame
        A new transposed frame. ``df`` itself, including the names of its
        axes, is left unchanged.
    """
    # rename_axis attaches the names to fresh axis objects. Assigning
    # ``.name`` directly on the transposed frame's axes can also rename the
    # axes of ``df``, because the transpose may share those objects with it.
    return df.T.rename_axis(index='Sample_ID', columns='Features')


# Transpose the three filtered tables in one pass; the results are unpacked
# in the same order as the inputs.
mrna_df_t, proteo_df_t, phospho_df_t = (
    format_expression_matrix(filtered_table)
    for filtered_table in (mrna_df_filtered, proteo_df_filtered, phospho_df_filtered)
)

# Show the transposed mRNA table as the cell output.
mrna_df_t

Features,A1BG,A1CF,A2ML1,A4GALT,AAAS,AACS,AADACL3,AADACP1,AADAT,AAGAB,AAMDC,AAMP,AANAT,AAR2,AARD,AARS,AARSD1,AASDH,AASDHPPT,AATBC,AATF,ABCA10,ABCA13,ABCA2,ABCA4,ABCA7,ABCA9,ABCB1,ABCB6,ABCB8,ABCB9,ABCC1,ABCC10,ABCC2,ABCC4,ABCC5,ABCC6,ABCC6P1,ABCC6P2,ABCC9,...,ZNF93,ZNFX1,ZNHIT1,ZNHIT2,ZNHIT3,ZNHIT6,ZNRD1,ZNRD1ASP,ZNRF1,ZNRF2P3,ZNRF3,ZP2,ZPR1,ZRANB2,ZRANB3,ZSCAN1,ZSCAN16-AS1,ZSCAN18,ZSCAN2,ZSCAN20,ZSCAN21,ZSCAN22,ZSCAN25,ZSCAN26,ZSCAN31,ZSCAN32,ZSCAN9,ZSWIM3,ZSWIM5,ZSWIM6,ZSWIM7,ZSWIM8,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZYG11A,ZYX
Sample_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
LUAD_17T,1.252622,0.001838,0.632023,6.169563,11.282293,1.031983,,0.027434,1.112645,10.177802,3.261947,17.283050,0.030260,8.383416,0.519271,16.800976,2.510232,1.448635,1.673144,0.180012,8.248414,0.249007,0.017746,9.088212,0.247213,8.839390,0.315913,0.360238,5.880853,3.203712,0.521883,4.862625,7.939100,0.086895,0.655643,2.401721,0.634267,0.295805,0.197957,0.793018,...,0.660582,3.719091,10.181022,10.714254,2.524142,3.435559,2.975146,0.103697,2.923079,0.175672,1.242587,,6.062768,5.161016,0.248139,0.022274,0.817506,2.651290,0.485367,0.666799,0.966627,0.588387,1.356366,1.601835,0.910390,1.382151,2.387111,1.329858,0.132748,2.152100,2.370425,12.206495,0.889204,7.229544,1.748943,9.168108,1.146935,1.076125,0.382825,43.010414
LUAD_18T,0.307195,,0.013609,7.185639,7.060750,2.347547,,0.113883,4.846401,9.079245,4.560252,16.768810,0.048619,7.497659,0.226683,23.170509,1.897431,2.148948,1.810496,1.684233,5.676728,1.306653,1.408678,3.258852,0.088987,12.793041,0.438380,0.749819,6.038588,1.307187,0.412596,8.826334,3.754687,0.142432,7.384445,4.570783,3.793606,2.211666,3.158964,1.807780,...,2.961873,4.592275,4.513997,7.852625,2.576679,2.884488,4.823029,0.259612,1.949223,0.046092,2.541149,0.091440,1.950675,14.653729,0.568355,0.160314,1.781420,3.139017,1.210263,0.683682,0.829400,0.701834,1.378820,1.517884,4.607198,1.795389,2.095057,1.310186,2.168601,4.987443,2.456766,6.226201,2.953023,4.025210,2.162702,11.245055,0.902083,2.038218,0.212627,24.580304
LUAD_19T,0.503481,0.003264,0.003225,1.389186,5.970035,0.809333,,,2.157599,6.603201,5.731292,14.763637,0.013701,8.655994,,21.058881,2.051125,1.657589,4.184340,0.648222,7.950481,0.470894,0.841269,2.313709,0.591552,5.792327,0.033991,0.144004,5.348247,1.305646,0.494430,10.491004,1.767872,0.044392,4.357050,1.799496,2.096081,1.393150,0.583133,0.071467,...,0.220651,3.735461,5.943639,10.228874,2.919144,2.310816,2.394970,0.129798,0.899021,0.112017,2.602377,,2.134030,3.764133,0.349282,,0.286038,0.525772,0.313045,0.839533,0.678822,1.327932,0.766983,1.100983,1.375226,2.042520,1.327603,2.898701,3.102655,2.258368,1.570395,5.819468,1.510892,13.625922,2.701282,5.756810,0.952277,1.526294,0.051018,18.925070
LUAD_20T,0.634032,0.008952,,0.399586,5.875602,1.332691,,0.016045,1.173181,8.264278,1.873769,17.710344,0.170725,8.967878,0.042893,27.352571,1.721263,2.000351,4.088020,0.940071,6.761549,0.877137,1.140937,3.392354,5.402778,4.924643,0.227196,1.104411,4.258697,2.498916,0.675657,5.172456,4.836075,0.066878,2.312741,2.644484,2.151322,1.060072,2.689122,0.522176,...,0.940480,5.963404,8.067601,7.112373,2.870101,3.549905,2.936355,0.268572,0.927400,0.374053,1.629025,,4.563389,14.980269,0.497083,,0.669408,2.193501,0.722506,0.717774,1.337220,0.888533,2.188282,1.022274,2.851974,1.128845,1.107632,2.020485,1.614699,3.160669,2.016577,6.961311,2.517310,10.005023,2.928872,7.024621,1.298478,2.227100,0.169097,28.924445
LUAD_22T,1.118911,0.006170,0.004319,1.850052,5.809641,1.529900,,0.036447,0.852748,6.975206,3.602387,20.633110,0.025449,14.058970,0.333852,18.529975,2.824410,2.196749,3.783415,1.501987,7.927684,0.782533,0.269739,2.778749,1.043826,7.267595,0.307137,0.484265,5.385512,3.217176,0.559487,5.506605,3.017974,0.113347,3.420505,4.569262,0.937405,0.879103,1.134740,0.838591,...,1.018630,5.674616,16.487572,6.864786,3.977406,5.850745,6.113176,0.572506,0.828519,0.043021,1.230601,0.106373,3.711304,18.602074,0.595244,0.383642,1.046365,4.221563,0.552471,1.368599,3.096824,0.812085,4.691810,1.969374,2.908232,1.762307,1.935450,2.147098,3.938765,2.917129,3.302545,6.854756,1.757088,8.429169,3.149157,6.890823,1.015556,1.726142,0.410542,27.144227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
LUAD_57N,1.277264,0.002982,0.021549,3.214655,4.374093,0.854574,,0.051114,1.135120,2.696285,17.056247,21.994630,0.083825,9.864068,0.423236,13.117393,2.272439,0.639706,0.880997,0.271984,6.427694,1.434728,0.019582,1.763682,0.094926,4.673958,1.386303,0.141664,6.344769,2.258348,0.358161,4.662453,1.231042,0.066267,1.818017,1.366144,2.584423,0.759119,1.776407,2.361897,...,0.161324,5.084391,13.134462,11.169448,1.682021,1.532241,4.190784,0.106130,1.930592,0.113295,1.744107,0.007592,2.231262,8.907879,0.205223,0.071713,2.279834,3.767535,0.281181,0.116611,0.848717,0.509803,0.914353,1.124540,0.803271,1.331419,0.405801,1.253120,0.721540,2.414844,5.691168,2.932313,1.101770,1.701239,1.235247,0.707701,0.514289,1.449045,,63.778512
LUAD_60N,0.752805,0.002819,0.019420,2.814836,5.528797,0.804052,,0.350666,1.190712,3.623042,8.227143,16.402868,0.005656,10.924535,1.627948,16.585801,1.722675,2.178614,1.958207,1.072556,5.425864,1.382460,0.856256,2.398771,0.070760,4.601583,1.514251,0.686856,4.313037,1.932349,0.209033,5.959869,2.129636,0.122090,2.753154,1.996148,2.977692,0.580849,1.537675,2.250620,...,0.299408,6.278811,10.074702,11.681693,2.271197,1.731866,2.924911,0.080778,1.709075,0.070670,1.607035,,4.234457,8.962636,0.189924,0.162269,2.442304,3.375730,0.981163,0.277342,1.019736,0.665609,0.696537,1.102293,1.350622,0.981638,0.602561,1.717670,0.953493,3.085884,4.235395,5.079898,1.615513,4.014860,1.554295,1.360537,0.612685,1.472120,0.021812,52.894053
LUAD_61N,0.561870,,0.009612,3.955617,5.623969,0.835335,,0.138909,1.523528,6.057772,10.174935,19.313436,0.042205,9.314484,0.719886,14.932923,1.609937,2.223277,1.928578,0.296349,5.960685,1.674708,0.313140,3.115224,0.096109,2.142701,2.335670,1.021356,5.080308,1.838581,0.154232,6.137741,1.307628,0.080832,2.558359,2.375474,1.962497,0.622870,1.389304,2.763080,...,0.333747,5.159931,9.491311,9.351439,1.793538,2.046173,2.761208,0.069329,1.706975,,2.793582,0.008196,3.651972,10.003966,0.375640,0.189310,1.301189,4.800013,0.258763,0.470798,1.120563,0.646859,1.911546,1.152364,1.010570,0.916030,0.727044,0.652443,1.007317,3.562959,4.183173,6.530700,2.018263,3.308089,1.355302,0.597081,1.245324,2.463351,0.012059,55.002900
LUAD_90N,1.246367,0.005103,,6.084662,5.178132,0.888747,,0.140024,0.529743,4.562461,21.693128,17.981380,0.016326,12.359743,1.584775,16.383221,2.312282,0.665050,1.412137,0.241081,6.944288,0.403474,0.036688,2.033857,0.061385,5.063430,0.272035,0.541962,4.578546,3.041170,0.345422,1.865003,0.896864,0.043768,0.859318,1.143306,2.477160,0.334176,1.515931,1.002275,...,0.124579,4.206557,17.230065,18.868435,0.835349,0.205571,4.074930,0.047430,1.953866,0.115298,1.055056,,2.160837,3.689992,0.126317,0.102420,3.989711,5.069665,0.335824,0.110779,1.279606,0.322509,0.811337,0.564737,0.251009,1.207805,0.601240,0.836306,0.537175,0.285583,4.383727,3.423122,0.852592,0.645479,0.700467,0.354679,0.289876,0.835166,,112.265995


In [5]:
# PCA function
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import plotly.express as px


def perform_pca_and_plot(df, title, metadata_df, width=400, height=300, random_state=0):
    """Run a two-component PCA on ``df`` and show a labelled scatter plot.

    Only rows of ``df`` whose index value also appears in
    ``metadata_df.index`` are used. Missing values are replaced by the
    per-column mean, every column is standardised, and the first two
    principal components are plotted with points coloured by the ``'Label'``
    column of ``metadata_df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, one column per feature.
    title : str
        Dataset name inserted into the figure title.
    metadata_df : pandas.DataFrame
        Indexed by the same sample identifiers as ``df``; must contain a
        ``'Label'`` column.
    width, height : int, optional
        Figure size in pixels. Defaults are 400 and 300.
    random_state : int or None, optional
        Passed to ``PCA`` so repeated runs give the same projection even if
        scikit-learn selects a randomized solver. Default is 0.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``. It is deliberately not
        returned, so a call on the last line of a cell renders it only once.

    Raises
    ------
    ValueError
        If fewer than two samples are shared with ``metadata_df`` or fewer
        than two features contain any observed value; two components cannot
        be computed in either case.
    """
    shared_samples = df.index.intersection(metadata_df.index)
    if len(shared_samples) < 2:
        raise ValueError(
            f"PCA of {title} needs at least 2 samples present in metadata_df; "
            f"found {len(shared_samples)}."
        )
    df = df.loc[shared_samples]
    labels = metadata_df.loc[df.index, 'Label']

    # Mean imputation has nothing to average for a feature that is missing in
    # every sample, so such features are dropped explicitly up front.
    df = df.dropna(axis=1, how='all')
    if df.shape[1] < 2:
        raise ValueError(
            f"PCA of {title} needs at least 2 features with observed values; "
            f"found {df.shape[1]}."
        )

    imputer = SimpleImputer(strategy='mean')
    df_pca_imputed = imputer.fit_transform(df)

    scaler = StandardScaler()
    df_pca_scaled = scaler.fit_transform(df_pca_imputed)

    pca = PCA(n_components=2, random_state=random_state)
    pca_result = pca.fit_transform(df_pca_scaled)
    explained_variance = pca.explained_variance_ratio_ * 100

    pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'], index=df.index)
    pca_df['Label'] = labels.values

    # Keys of `labels` must be column names of the plotted frame, so the
    # legend title is renamed through 'Label' (the colour column).
    fig = px.scatter(
        pca_df, x='PC1', y='PC2', color='Label',
        labels={'Label': 'Sample Label'}, hover_name=pca_df.index
    )
    fig.update_traces(marker=dict(size=10, line=dict(width=2, color='DarkSlateGrey')))
    fig.update_layout(
        autosize=False, width=width, height=height,
        xaxis_title=f'PC 1 ({explained_variance[0]:.2f}% Variance)',
        yaxis_title=f'PC 2 ({explained_variance[1]:.2f}% Variance)',
        title=f'PCA Analysis of {title}'
    )
    fig.show()

# Run the PCA plot once per omics layer, all against the same sample metadata.
perform_pca_and_plot(df=mrna_df_t, title='RNA Data', metadata_df=metadata_df)
perform_pca_and_plot(df=proteo_df_t, title='Protein Data', metadata_df=metadata_df)
perform_pca_and_plot(df=phospho_df_t, title='Phospho Data', metadata_df=metadata_df)
