In [15]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.covariance import EmpiricalCovariance
from scipy.spatial import distance
import seaborn as sns
import warnings
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')
from F01_data_preprocessing_function import data_preprocessing

In [16]:
# Read the input dataset from its CSV file into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="your_dataset_name.csv")

In [17]:
# Dimensions of the raw table as (rows, columns). The notebook displays the
# value of a cell's last expression, so wrapping it in print() is unnecessary.
df.shape

(184, 9876)


In [18]:
# Preview the raw table. A full slice (df.iloc[:, :]) selects every row and
# column, so displaying df itself shows the same preview without creating a
# second DataFrame object first.
df

Unnamed: 0,VAMP1|P23763,KCTD13|Q8WZ19,TXNDC12|O95881,PDHX|O00330,APIP|Q96GX9,CIAO1|O76071,NNT|Q13423,KIF3B|O15066,ULK1|O75385,POLR2L|P62875,...,999925948,999925957,999925981,999925982,999925983,999926062,msex,educ,age_death,cogdx
0,-0.278255,,-0.366508,-0.144993,0.145339,-0.186780,-0.035720,-0.100272,-0.043788,,...,0.631479,,1.064746,1.112526,0.666095,,0.0,16.0,91.000000,1.0
1,0.410722,,-0.498610,0.155336,-0.417854,-0.163890,0.240252,-0.168145,-0.039358,,...,0.706243,0.402799,0.377119,0.505797,0.454829,,0.0,18.0,91.000000,4.0
2,-0.112329,,-0.276421,0.050900,-0.843172,-0.161632,0.061352,-0.118060,-0.117369,,...,1.519027,,0.807341,0.790639,0.533103,,1.0,14.0,91.000000,1.0
3,-0.162641,,-0.354865,0.079455,0.577196,-0.144342,0.075293,0.024694,-0.030365,,...,0.989240,,1.449183,0.975184,,1.455196,0.0,11.0,84.832307,1.0
4,0.117440,,-0.321546,-0.071422,0.221551,-0.196930,0.013169,-0.106460,-0.023631,,...,1.031685,1.142469,1.013210,1.169732,,,1.0,24.0,82.370979,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,-0.196350,,-0.341186,-0.118638,-0.441684,-0.196678,-0.080199,-0.093338,0.009522,0.167130,...,0.969836,,1.244216,1.233083,1.000000,,0.0,12.0,89.308693,4.0
180,-0.068904,,-0.418761,0.068269,-0.263557,-0.182733,-0.000759,-0.116606,-0.050693,0.023869,...,0.909698,0.574822,1.439049,0.880873,1.008289,,0.0,13.0,91.000000,1.0
181,-0.073641,,-0.521676,-0.132236,0.713746,-0.167275,-0.117487,-0.219575,-0.200840,0.096183,...,0.694491,,1.463939,0.973781,1.728835,,0.0,18.0,91.000000,4.0
182,0.084443,0.225794,-0.295169,-0.145319,0.236361,-0.209734,-0.173279,-0.222953,0.243730,0.079022,...,1.280072,,1.102607,0.925441,,,0.0,21.0,87.947981,4.0


In [19]:
### Expected layout of df:
###   - samples across the rows, features across the columns
###     (features include protein abundances, metabolite abundances, and clinical features)
###   - protein expression values expressed as log2 ratios (no change = 0)
###   - metabolite expression values expressed as plain ratios, not log2-normalized (no change = 1)
###   - no missing values in the target variable column (patient diagnosis)

## Define slice objects that can be used to slice the dataframe within the
## pre-processing function.
## NOTE(review): col_start ... col_end4 are not assigned in any cell visible
## here; they look like placeholders that must be replaced with (or defined as)
## the column bounds of each feature group before this cell is run, otherwise
## the slice() calls below raise NameError.
cols_prot = slice(col_start, col_end)  # protein columns
cols_met = slice(col_start2, col_end2)  # metabolite columns
cols_clin_con = slice(col_start3, col_end3)  # presumably the continuous clinical columns - confirm
cols_clin_bin = slice(col_start4, col_end4)  # presumably the binary clinical columns - confirm

In [20]:
#### In my case, the input dataframe must contain only protein, metabolite and clinical features,
#### and the age column must already hold "91" in place of "90+".

In [21]:
# Run the imported pre-processing routine on the raw table, handing it the
# column slices for each feature group.
# NOTE(review): the recorded result_df.head() output shows msex as roughly
# -1.99 / 1.99 rather than 0/1 - confirm that the positional order of
# cols_clin_bin and cols_clin_con matches data_preprocessing's signature
# (the function body is not visible in this notebook).
result_df = data_preprocessing(
    df,
    cols_prot,
    cols_met,
    cols_clin_bin,
    cols_clin_con,
)

Proteomics data pre-processing completed
Metabolomics data pre-processing completed
Missing values have been replaced
14 samples were removed after filtering outliers based on proteomics data
28 samples were removed after filtering outliers based on metabolomics data
Clinical data pre-processing completed


In [22]:
# Dimensions of the pre-processed table as (rows, columns). The notebook
# displays the value of a cell's last expression, so print() is unnecessary.
result_df.shape

(142, 7096)


In [23]:
# Show the first five rows of the pre-processed table as a quick sanity check.
result_df.head(n=5)

Unnamed: 0,VAMP1|P23763,TXNDC12|O95881,PDHX|O00330,APIP|Q96GX9,CIAO1|O76071,NNT|Q13423,KIF3B|O15066,ULK1|O75385,MEF2C|Q06413,MT3|P25713,...,999925855,999925884,999925936,999925948,999925981,999925982,msex,educ,age_death,cogdx
0,-0.207101,-0.019232,-0.205116,0.373932,-1.2e-05,-0.023599,0.008489,0.002501,0.20228,-0.278368,...,0.436035,0.084535,-0.173663,-0.579595,0.139316,0.190447,-1.989986,0.450011,1.994412,1.0
1,0.475842,-0.157368,0.089179,-0.195295,0.016843,0.246339,-0.065419,0.000896,-0.090978,-0.413591,...,0.020955,-0.075851,-0.314899,0.037763,-0.902173,-0.490832,-1.989986,0.797655,1.994412,4.0
2,-0.098888,-0.01499,0.011931,0.798387,0.035024,0.080013,0.126054,0.008523,-0.046719,-0.070404,...,0.372553,-0.053076,-1.813697,0.025903,0.54196,-0.041731,-1.989986,-0.655922,0.847522,1.0
3,0.181075,0.01821,-0.139065,0.442624,-0.017682,0.01777,-0.005218,0.015139,0.09483,-0.293929,...,0.366699,0.290755,0.143212,0.308691,0.24783,0.442878,1.994412,1.646768,0.366377,4.0
4,-0.039547,0.444358,-0.148357,-0.026292,-0.078914,-0.189799,0.008934,-0.009613,0.216311,0.250805,...,0.764707,-0.072413,-0.864451,0.613156,0.547153,0.208624,1.994412,1.252641,1.994412,4.0


In [24]:
# Persist the pre-processed table to CSV; the row index is not written out.
result_df.to_csv(path_or_buf='M01_output_data.csv', index=False)